m4-patches
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

minor regex speedup


From: Eric Blake
Subject: minor regex speedup
Date: Fri, 15 Feb 2008 23:39:04 +0000 (UTC)
User-agent: Loom/3.14 (http://gmane.org/)

I noticed in the GNU regex documentation that the use of fastmaps is 
recommended to speed up searches on long input strings.  With just a one-line 
change to builtin.c, I saw about a 1% difference in repeated trials of 
running 'autoconf -f' on coreutils, enough to say that the improvement is more 
than just noise in the timing runs.  Then in the process, I noticed that using 
re_compile_fastmap is a much faster way to do what we were already doing for 
changeword.  This is the patch for the branch, the patch for head only touches 
modules/gnu.c.

From: Eric Blake <address@hidden>
Date: Fri, 15 Feb 2008 16:29:14 -0700
Subject: [PATCH] Use fastmaps for better regex performance.

* src/builtin.c (compile_pattern): Allocate a fastmap.
* src/input.c (word_start): Delete.
(set_word_regexp): Compile a fastmap instead.
(peek_token, next_token): Use fastmap.
(pop_wrapup): Free memory on exit.

Signed-off-by: Eric Blake <address@hidden>
---
 ChangeLog     |    9 +++++++++
 src/builtin.c |    2 ++
 src/input.c   |   28 +++++++++++++---------------
 3 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 099d643..6425c7d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2008-02-15  Eric Blake  <address@hidden>
+
+       Use fastmaps for better regex performance.
+       * src/builtin.c (compile_pattern): Allocate a fastmap.
+       * src/input.c (word_start): Delete.
+       (set_word_regexp): Compile a fastmap instead.
+       (peek_token, next_token): Use fastmap.
+       (pop_wrapup): Free memory on exit.
+
 2008-02-11  Eric Blake  <address@hidden>
 
        Document behavior of __gnu__().
diff --git a/src/builtin.c b/src/builtin.c
index c89ad44..a48e7a0 100644
--- a/src/builtin.c
+++ b/src/builtin.c
@@ -298,6 +298,8 @@ compile_pattern (const char *str, size_t len, struct 
re_pattern_buffer **buf,
       free (new_buf);
       return msg;
     }
+  /* Use a fastmap for speed; it is freed by regfree.  */
+  new_buf->fastmap = xcharalloc (256);
 
   /* Now, find a victim slot.  Decrease the count of all entries, then
      prime the count of the victim slot at REGEX_CACHE_SIZE.  This
diff --git a/src/input.c b/src/input.c
index 7788562..a0de36f 100644
--- a/src/input.c
+++ b/src/input.c
@@ -165,9 +165,6 @@ string_pair curr_comm;
 
 # define DEFAULT_WORD_REGEXP "[_a-zA-Z][_a-zA-Z0-9]*"
 
-/* Table of characters that can start a word.  */
-static char word_start[256];
-
 /* Current regular expression for detecting words.  */
 static struct re_pattern_buffer word_regexp;
 
@@ -637,6 +634,9 @@ pop_wrapup (void)
       obstack_free (&file_names, NULL);
       obstack_free (wrapup_stack, NULL);
       free (wrapup_stack);
+#ifdef ENABLE_CHANGEWORD
+      regfree (&word_regexp);
+#endif /* ENABLE_CHANGEWORD */
       return false;
     }
 
@@ -1203,7 +1203,6 @@ set_comment (const char *bc, const char *ec)
 void
 set_word_regexp (const char *caller, const char *regexp)
 {
-  int i;
   const char *msg;
   struct re_pattern_buffer new_word_regexp;
 
@@ -1225,21 +1224,20 @@ set_word_regexp (const char *caller, const char *regexp)
       return;
     }
 
-  /* If compilation worked, retry using the word_regexp struct.
-     Can't rely on struct assigns working, so redo the compilation.  */
-  regfree (&word_regexp);
+  /* If compilation worked, retry using the word_regexp struct.  We
+     can't rely on struct assigns working, so redo the compilation.
+     The fastmap can be reused between compilations, and will be freed
+     by the final regfree.  */
+  if (!word_regexp.fastmap)
+    word_regexp.fastmap = xcharalloc (256);
   msg = re_compile_pattern (regexp, strlen (regexp), &word_regexp);
   assert (!msg);
   re_set_registers (&word_regexp, &regs, regs.num_regs, regs.start, regs.end);
+  if (re_compile_fastmap (&word_regexp))
+    assert (false);
 
   default_word_regexp = false;
   set_quote_age ();
-
-  for (i = 1; i < 256; i++)
-    {
-      char test = i;
-      word_start[i] = re_match (&word_regexp, &test, 1, 0, NULL) > 0;
-    }
 }
 
 #endif /* ENABLE_CHANGEWORD */
@@ -1421,7 +1419,7 @@ next_token (token_data *td, int *line, struct obstack 
*obs, const char *caller)
 
 #ifdef ENABLE_CHANGEWORD
 
-  else if (!default_word_regexp && word_start[ch])
+  else if (!default_word_regexp && word_regexp.fastmap[ch])
     {
       obstack_1grow (&token_stack, ch);
       while (1)
@@ -1587,7 +1585,7 @@ peek_token (void)
     }
   else if ((default_word_regexp && (isalpha (ch) || ch == '_'))
 #ifdef ENABLE_CHANGEWORD
-      || (!default_word_regexp && word_start[ch])
+      || (!default_word_regexp && word_regexp.fastmap[ch])
 #endif /* ENABLE_CHANGEWORD */
       )
     {
-- 
1.5.4







reply via email to

[Prev in Thread] Current Thread [Next in Thread]