bug-m4
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: poor m4 hash performance


From: Paul Eggert
Subject: Re: poor m4 hash performance
Date: Sun, 04 Jun 2006 21:24:43 -0700
User-agent: Gnus/5.1008 (Gnus v5.10.8) Emacs/21.4 (gnu/linux)

Eric Blake <address@hidden> writes:

> right now, both the 1.4 branch and head use an algorithm borrowed from emacs:

That algorithm could be improved.  M4 should do all its hash
computation using size_t, not a mixture of int and size_t.  And it
shouldn't slow itself down to do case-folding, since M4 is
case-sensitive.  And it shouldn't assume that int is 32 bits wide.

Worst of all, M4 shouldn't assume that the absolute value of an 'int'
must be nonnegative.  That is not true for INT_MIN.  On most current
hosts, m4-1.4.4 has an approximately one in 2**32 chance of using
INT_MIN % 509 as a subscript, which on most hosts will lead to a
subscript error that will cause serious trouble.

Here is a patch to m4 1.4.4 that fixes these problems.  It results in
a very slight performance improvement to Autoconf, and a very
slightly-smaller M4 executable.  It changes M4 to use the same hash
algorithm as gnulib/lib/hash.c.

2006-06-04  Paul Eggert  <address@hidden>

        * src/m4.h (hash_table_size): Now size_t instead of int.
        * src/m4.c (hash_table_size): Likewise.
        (main): Adjust to this; use atol rather than atoi.
        * src/symtab.c: Include <limits.h>, for CHAR_BIT.
        (symtab_init, lookup_symbol, hack_all_symbols):
        Use size_t for sizes and indexes, not int.
        (hash): Likewise.  Don't case-fold in the hash function.
        Shift by 7, not 3, for consistency with gnulib/lib/hash.c.
        Don't assume hash word is 32 bits.

diff -pru m4-1.4.4/src/m4.c m4-1.4.4-hash/src/m4.c
--- m4-1.4.4/src/m4.c   2005-10-18 04:48:58.000000000 -0700
+++ m4-1.4.4-hash/src/m4.c      2006-06-04 20:54:36.000000000 -0700
@@ -42,7 +42,7 @@ int sync_output = 0;
 int debug_level = 0;
 
 /* Hash table size (should be a prime) (-Hsize).  */
-int hash_table_size = HASHMAX;
+size_t hash_table_size = HASHMAX;
 
 /* Disable GNU extensions (-G).  */
 int no_gnu_extensions = 0;
@@ -329,8 +329,8 @@ main (int argc, char *const *argv, char 
        break;
 
       case 'H':
-       hash_table_size = atoi (optarg);
-       if (hash_table_size <= 0)
+       hash_table_size = atol (optarg);
+       if (hash_table_size == 0)
          hash_table_size = HASHMAX;
        break;
 
diff -pru m4-1.4.4/src/m4.h m4-1.4.4-hash/src/m4.h
--- m4-1.4.4/src/m4.h   2005-05-01 04:37:09.000000000 -0700
+++ m4-1.4.4-hash/src/m4.h      2006-06-04 20:54:56.000000000 -0700
@@ -150,7 +150,7 @@ typedef struct token_data token_data;
 /* Option flags.  */
 extern int sync_output;                        /* -s */
 extern int debug_level;                        /* -d */
-extern int hash_table_size;            /* -H */
+extern size_t hash_table_size;         /* -H */
 extern int no_gnu_extensions;          /* -G */
 extern int prefix_all_builtins;                /* -P */
 extern int max_debug_argument_length;  /* -l */
diff -pru m4-1.4.4/src/symtab.c m4-1.4.4-hash/src/symtab.c
--- m4-1.4.4/src/symtab.c       2005-05-01 04:35:20.000000000 -0700
+++ m4-1.4.4-hash/src/symtab.c  2006-06-04 21:18:39.000000000 -0700
@@ -32,6 +32,7 @@
    will then always be the first found.  */
 
 #include "m4.h"
+#include <limits.h>
 
 /*----------------------------------------------------------------------.
 | Initialise the symbol table, by allocating the necessary storage, and |
@@ -44,34 +45,29 @@ symbol **symtab;
 void
 symtab_init (void)
 {
-  int i;
+  size_t i;
   symbol **s;
 
   s = symtab = (symbol **) xmalloc (hash_table_size * sizeof (symbol *));
 
-  for (i = hash_table_size; --i >= 0;)
-    *s++ = NULL;
+  for (i = 0; i < hash_table_size; i++)
+    s[i] = NULL;
 }
 
 /*--------------------------------------------------.
 | Return a hashvalue for a string, from GNU-emacs.  |
 `--------------------------------------------------*/
 
-static int
+static size_t
 hash (const char *s)
 {
-  register int val = 0;
+  register size_t val = 0;
 
   register const char *ptr = s;
   register char ch;
 
   while ((ch = *ptr++) != '\0')
-    {
-      if (ch >= 0140)
-       ch -= 40;
-      val = ((val << 3) + (val >> 28) + ch);
-    };
-  val = (val < 0) ? -val : val;
+    val = (val << 7) + (val >> (sizeof (val) * CHAR_BIT - 7)) + ch;
   return val % hash_table_size;
 }
 
@@ -105,7 +101,8 @@ free_symbol (symbol *sym)
 symbol *
 lookup_symbol (const char *name, symbol_lookup mode)
 {
-  int h, cmp = 1;
+  size_t h;
+  int cmp = 1;
   symbol *sym, *prev;
   symbol **spp;
 
@@ -209,7 +206,7 @@ lookup_symbol (const char *name, symbol_
 void
 hack_all_symbols (hack_symbol *func, const char *data)
 {
-  int h;
+  size_t h;
   symbol *sym;
 
   for (h = 0; h < hash_table_size; h++)




reply via email to

[Prev in Thread] Current Thread [Next in Thread]