m4-patches
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: branch-1_4 regexp coredump


From: Eric Blake
Subject: Re: branch-1_4 regexp coredump
Date: Fri, 18 Aug 2006 16:24:55 +0000 (UTC)
User-agent: Loom/3.14 (http://gmane.org/)

Eric Blake <ebb9 <at> byu.net> writes:

> 
> I still need to work on this.  We have a memory leak (and have done, since m4 
> 0.75 when regexp was introduced), because re_search allocates memory in 
> re_registers on success if we haven't done it ourselves.
> 

With this patch, I no longer run out of memory on:

$ m4
divert(-1)include(examples/forloop.m4)
forloop(i,1,1000000,`regexp(abc,\(\(b\)\))')
m4exit

I imagine this will improve the performance of m4 as used by automake, which 
does its fair share of regexp and patsubst, now that they don't leak memory, 
although I haven't actually tried profiling it.  There is also a minor 
performance tweak for when changeword is enabled.

2006-08-18  Eric Blake  <address@hidden>

        Regular expressions were leaking memory.
        * src/builtin.c (init_pattern_buffer, free_pattern_buffer): New
        helper methods.
        (m4_regexp, m4_patsubst): Avoid memory leak.
        * src/input.c (init_pattern_buffer) [ENABLE_CHANGEWORD]: Make
        static.
        (set_word_regexp) [ENABLE_CHANGEWORD]: Avoid memory leak.  Change
        from O(n^2) to O(n) for calculating word_start.
        (next_token, peek_token) [ENABLE_CHANGEWORD]: Treat word_start as
        O(1) bitmap, not O(n) search string.
        * NEWS: Document this fix.

Index: NEWS
===================================================================
RCS file: /sources/m4/m4/NEWS,v
retrieving revision 1.1.1.1.2.53
diff -u -r1.1.1.1.2.53 NEWS
--- NEWS        18 Aug 2006 04:00:59 -0000      1.1.1.1.2.53
+++ NEWS        18 Aug 2006 15:52:26 -0000
@@ -7,6 +7,7 @@
 * Fix buffer overruns in regexp and patsubst macros when handed a trailing
   backslash in the replacement text, or when handling \n substitutions
   beyond the number of \(\) groups.
+* Fix memory leak in regexp, patsubst, and changeword macros.
 * The format macro now understands %F, %g, and %G.
 * When loading frozen files, m4 now exits with status 63 if version
   mismatch is detected.
Index: src/builtin.c
===================================================================
RCS file: /sources/m4/m4/src/Attic/builtin.c,v
retrieving revision 1.1.1.1.2.34
diff -u -r1.1.1.1.2.34 builtin.c
--- src/builtin.c       18 Aug 2006 03:39:11 -0000      1.1.1.1.2.34
+++ src/builtin.c       18 Aug 2006 15:52:26 -0000
@@ -483,7 +483,6 @@
                "INTERNAL ERROR: bad token data type in define_macro ()"));
       abort ();
     }
-  return;
 }
 
 static void
@@ -1703,6 +1702,33 @@
     }
 }
 
+/*------------------------------------------.
+| Initialize regular expression variables.  |
+`------------------------------------------*/
+
+static void
+init_pattern_buffer (struct re_pattern_buffer *buf, struct re_registers *regs)
+{
+  buf->translate = NULL;
+  buf->fastmap = NULL;
+  buf->buffer = NULL;
+  buf->allocated = 0;
+  regs->start = NULL;
+  regs->end = NULL;
+}
+
+/*----------------------------------------.
+| Clean up regular expression variables.  |
+`----------------------------------------*/
+
+static void
+free_pattern_buffer (struct re_pattern_buffer *buf, struct re_registers *regs)
+{
+  regfree (buf);
+  free (regs->start);
+  free (regs->end);
+}
+
 /*--------------------------------------------------------------------------.
 | Regular expression version of index.  Given two arguments, expand to the  |
 | index of the first match of the second argument (a regexp) in the first.  |
@@ -1729,31 +1755,26 @@
   victim = TOKEN_DATA_TEXT (argv[1]);
   regexp = TOKEN_DATA_TEXT (argv[2]);
 
-  buf.buffer = NULL;
-  buf.allocated = 0;
-  buf.fastmap = NULL;
-  buf.translate = NULL;
+  init_pattern_buffer (&buf, &regs);
   msg = re_compile_pattern (regexp, strlen (regexp), &buf);
 
   if (msg != NULL)
     {
       M4ERROR ((warning_status, 0,
                "bad regular expression: `%s': %s", regexp, msg));
+      free_pattern_buffer (&buf, &regs);
       return;
     }
 
   length = strlen (victim);
-  startpos = re_search (&buf, victim, length, 0, length, &regs);
-  free (buf.buffer);
+  /* Avoid overhead of allocating regs if we won't use it.  */
+  startpos = re_search (&buf, victim, length, 0, length,
+                        argc == 3 ? NULL : &regs);
 
   if (startpos  == -2)
-    {
-      M4ERROR ((warning_status, 0,
-               "error matching regular expression \"%s\"", regexp));
-      return;
-    }
-
-  if (argc == 3)
+     M4ERROR ((warning_status, 0,
+              "error matching regular expression \"%s\"", regexp));
+  else if (argc == 3)
     shipout_int (obs, startpos);
   else if (startpos >= 0)
     {
@@ -1761,7 +1782,7 @@
       substitute (obs, victim, repl, &regs);
     }
 
-  return;
+  free_pattern_buffer (&buf, &regs);
 }
 
 /*--------------------------------------------------------------------------.
@@ -1789,10 +1810,7 @@
 
   regexp = TOKEN_DATA_TEXT (argv[2]);
 
-  buf.buffer = NULL;
-  buf.allocated = 0;
-  buf.fastmap = NULL;
-  buf.translate = NULL;
+  init_pattern_buffer (&buf, &regs);
   msg = re_compile_pattern (regexp, strlen (regexp), &buf);
 
   if (msg != NULL)
@@ -1846,8 +1864,7 @@
     }
   obstack_1grow (obs, '\0');
 
-  free (buf.buffer);
-  return;
+  free_pattern_buffer (&buf, &regs);
 }
 
 /* Finally, a placeholder builtin.  This builtin is not installed by
Index: src/input.c
===================================================================
RCS file: /sources/m4/m4/src/Attic/input.c,v
retrieving revision 1.1.1.1.2.19
diff -u -r1.1.1.1.2.19 input.c
--- src/input.c 8 Aug 2006 23:17:44 -0000       1.1.1.1.2.19
+++ src/input.c 18 Aug 2006 15:52:26 -0000
@@ -682,12 +682,12 @@
 
 #ifdef ENABLE_CHANGEWORD
 
-void
+static void
 init_pattern_buffer (struct re_pattern_buffer *buf)
 {
-  buf->translate = 0;
-  buf->fastmap = 0;
-  buf->buffer = 0;
+  buf->translate = NULL;
+  buf->fastmap = NULL;
+  buf->buffer = NULL;
   buf->allocated = 0;
 }
 
@@ -719,7 +719,9 @@
 
   /* If compilation worked, retry using the word_regexp struct.
      Can't rely on struct assigns working, so redo the compilation.  */
+  regfree (&word_regexp);
   msg = re_compile_pattern (regexp, strlen (regexp), &word_regexp);
+  re_set_registers (&word_regexp, &regs, regs.num_regs, regs.start, regs.end);
 
   if (msg != NULL)
     {
@@ -738,8 +740,7 @@
   for (i = 1; i < 256; i++)
     {
       test[0] = i;
-      if (re_search (&word_regexp, test, 1, 0, 0, &regs) >= 0)
-       strcat (word_start, test);
+      word_start[i] = re_search (&word_regexp, test, 1, 0, 0, NULL) >= 0;
     }
 }
 
@@ -826,7 +827,7 @@
 
 #ifdef ENABLE_CHANGEWORD
 
-  else if (!default_word_regexp && strchr (word_start, ch))
+  else if (!default_word_regexp && word_start[ch])
     {
       obstack_1grow (&token_stack, ch);
       while (1)
@@ -960,7 +961,7 @@
 
   if ((default_word_regexp && (isalpha (ch) || ch == '_'))
 #ifdef ENABLE_CHANGEWORD
-      || (! default_word_regexp && strchr (word_start, ch))
+      || (! default_word_regexp && word_start[ch])
 #endif /* ENABLE_CHANGEWORD */
       )
     {






reply via email to

[Prev in Thread] Current Thread [Next in Thread]