m4-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[SCM] GNU M4 source repository branch, master, updated. 394083c7c6df499e


From: Eric Blake
Subject: [SCM] GNU M4 source repository branch, master, updated. 394083c7c6df499eedb10173ac838aced7d00a34
Date: Tue, 09 Oct 2007 20:05:13 +0000

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU M4 source repository".

http://git.sv.gnu.org/gitweb/?p=m4.git;a=commitdiff;h=394083c7c6df499eedb10173ac838aced7d00a34

The branch, master has been updated
       via  394083c7c6df499eedb10173ac838aced7d00a34 (commit)
       via  2243fca7d2467700d05ea05c650115702cb49ab0 (commit)
      from  06ff2073f88f397ecdab5f2b095c98f3a4cf9f9f (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 394083c7c6df499eedb10173ac838aced7d00a34
Author: Eric Blake <address@hidden>
Date:   Tue Oct 9 13:55:37 2007 -0600

    Fix regexp regression of 2007-09-29.
    
    * modules/gnu.c (substitute): Allow NULL buf when no
    subexpressions were present.
    (regexp): Handle \ escapes even with empty regex.
    * doc/m4.texinfo (Regexp, Patsubst): Catch this bug.
    
    Signed-off-by: Eric Blake <address@hidden>

commit 2243fca7d2467700d05ea05c650115702cb49ab0
Author: Eric Blake <address@hidden>
Date:   Tue Oct 9 12:16:29 2007 -0600

    Cache regex compilation for another autoconf speedup.
    
    * modules/gnu.c     (gnu_buf): Replace...
    (REGEX_CACHE_SIZE, regex_cache): ...with new declarations.
    (m4_pattern_buffer): Add fields.
    (m4_regexp_compile): Rename...
    (regexp_compile): ...to this, and drop no_sub parameter.
    Implement caching.
    (M4FINISH_HANDLER): Clean up entire cache.
    (m4_regexp_search): Rename...
    (regexp_search): ...to this, adjust to new struct contents, and add
    no_sub parameter.
    (m4_regexp_substitute): Rename...
    (regexp_substitute): ...to this.
    (substitute, patsubst, regexp, renamesyms): Adjust callers.
    
    Signed-off-by: Eric Blake <address@hidden>

-----------------------------------------------------------------------

Summary of changes:
 ChangeLog      |   23 +++++++
 doc/m4.texinfo |    8 +-
 modules/gnu.c  |  199 ++++++++++++++++++++++++++++++++++++++-----------------
 3 files changed, 164 insertions(+), 66 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 4bff9c4..e762099 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,26 @@
+2007-10-09  Eric Blake  <address@hidden>
+
+       Fix regexp regression of 2007-09-29.
+       * modules/gnu.c (substitute): Allow NULL buf when no
+       subexpressions were present.
+       (regexp): Handle \ escapes even with empty regex.
+       * doc/m4.texinfo (Regexp, Patsubst): Catch this bug.
+
+       Cache regex compilation for another autoconf speedup.
+       * modules/gnu.c (gnu_buf): Replace...
+       (REGEX_CACHE_SIZE, regex_cache): ...with new declarations.
+       (m4_pattern_buffer): Add fields.
+       (m4_regexp_compile): Rename...
+       (regexp_compile): ...to this, and drop no_sub parameter.
+       Implement caching.
+       (M4FINISH_HANDLER): Clean up entire cache.
+       (m4_regexp_search): Rename...
+       (regexp_search): ...to this, adjust to new struct contents, and add
+       no_sub parameter.
+       (m4_regexp_substitute): Rename...
+       (regexp_substitute): ...to this.
+       (substitute, patsubst, regexp, renamesyms): Adjust callers.
+
 2007-10-02  Eric Blake  <address@hidden>
 
        Document quoting pitfalls in capitalize.
diff --git a/doc/m4.texinfo b/doc/m4.texinfo
index d3f3b8f..5cedc18 100644
--- a/doc/m4.texinfo
+++ b/doc/m4.texinfo
@@ -5837,8 +5837,8 @@ regexp(`abc')
 @result{}0
 regexp(`abc', `')
 @result{}0
-regexp(`abc', `', `def')
-@result{}def
+regexp(`abc', `', `\\def')
+@result{}\def
 @end example
 
 If @var{resyntax} is given, @var{regexp} must be given according to
@@ -6127,8 +6127,8 @@ patsubst(`abc')
 @result{}abc
 patsubst(`abc', `')
 @result{}abc
-patsubst(`abc', `', `-')
-@result{}-a-b-c-
+patsubst(`abc', `', `\\-')
+@result{}\-a\-b\-c\-
 @end example
 
 @node Format
diff --git a/modules/gnu.c b/modules/gnu.c
index 900ae15..5c69396 100644
--- a/modules/gnu.c
+++ b/modules/gnu.c
@@ -102,70 +102,134 @@ m4_macro m4_macro_table[] =
 
 
 
-/* The regs_allocated field in an re_pattern_buffer refers to the
-   state of the re_registers struct used in successive matches with
-   the same compiled pattern:  */
-
+/* Regular expressions.  Reuse re_registers among multiple
+   re_pattern_buffer allocations to reduce malloc usage.  */
+
+/* Maybe this is worth making runtime tunable.  Too small, and nothing
+   gets cached because the working set of active regex is larger than
+   the cache, and we are always swapping out entries.  Too large, and
+   the time spent searching the cache for a match overtakes the time
+   saved by caching.  For now, this size proved reasonable for the
+   typical working set of Autoconf 2.62.  */
+#define REGEX_CACHE_SIZE 16
+
+/* Structure for using a compiled regex, as well as making it easier
+   to cache frequently used expressions.  */
 typedef struct {
-  struct re_pattern_buffer pat;        /* compiled regular expression */
-  struct re_registers regs;    /* match registers */
+  unsigned count;                      /* usage counter */
+  int resyntax;                                /* flavor of regex */
+  size_t len;                          /* length of string */
+  char *str;                           /* copy of compiled string */
+  struct re_pattern_buffer *pat;       /* compiled regex, allocated */
+  struct re_registers regs;            /* match registers, reused */
 } m4_pattern_buffer;
 
-static m4_pattern_buffer gnu_buf;      /* compiled regular expression */
+/* Storage for the cache of regular expressions.  */
+static m4_pattern_buffer regex_cache[REGEX_CACHE_SIZE];
 
-/* Compile a REGEXP using the RESYNTAX bits, and return the buffer.
-   Report errors on behalf of CALLER.  If NO_SUB, optimize the
-   compilation to skip filling out the regs member of the buffer.  */
+/* Compile a REGEXP using the RESYNTAX flavor, and return the buffer.
+   On error, report the problem on behalf of CALLER, and return
+   NULL.  */
 
 static m4_pattern_buffer *
-m4_regexp_compile (m4 *context, const char *caller,
-                  const char *regexp, int resyntax, bool no_sub)
+regexp_compile (m4 *context, const char *caller, const char *regexp,
+               int resyntax)
 {
-  /* gnu_buf is guaranteed to start life 0-initialized, which works in the
-     below algorithm.
+  /* regex_cache is guaranteed to start life 0-initialized, which
+     works in the algorithm below.
 
      FIXME - this method is not reentrant, since re_compile_pattern
      mallocs memory, depends on the global variable re_syntax_options
      for its syntax (but at least the compiled regex remembers its
      syntax even if the global variable changes later), and since we
      use a static variable.  To be reentrant, we would need a mutex in
-     this method, and move the storage for gnu_buf into context.  */
+     this method, and move the storage for regex_cache into context.  */
 
   const char *msg;             /* error message from re_compile_pattern */
-
+  int i;                       /* iterator */
+  m4_pattern_buffer *victim;   /* cache slot to replace */
+  unsigned victim_count;       /* track which victim to replace */
+  struct re_pattern_buffer *pat;/* newly compiled regex */
+  size_t len = strlen (regexp);        /* regex length */
+
+  /* First, check if REGEXP is already cached with the given RESYNTAX.
+     If so, increase its use count and return it.  */
+  for (i = 0; i < REGEX_CACHE_SIZE; i++)
+    if (len == regex_cache[i].len && resyntax == regex_cache[i].resyntax
+       && regex_cache[i].str && memcmp (regexp, regex_cache[i].str, len) == 0)
+      {
+       regex_cache[i].count++;
+       return &regex_cache[i];
+      }
+
+  /* Next, check if REGEXP can be compiled.  */
+  pat = xzalloc (sizeof *pat);
   re_set_syntax (resyntax);
-  regfree (&gnu_buf.pat);
-  gnu_buf.pat.no_sub = no_sub;
-  msg = re_compile_pattern (regexp, strlen (regexp), &gnu_buf.pat);
+  msg = re_compile_pattern (regexp, len, pat);
 
   if (msg != NULL)
     {
       m4_error (context, 0, 0, _("%s: bad regular expression `%s': %s"),
                caller, regexp, msg);
+      regfree (pat);
+      free (pat);
       return NULL;
     }
 
-  re_set_registers (&gnu_buf.pat, &gnu_buf.regs, gnu_buf.regs.num_regs,
-                   gnu_buf.regs.start, gnu_buf.regs.end);
-  return &gnu_buf;
+  /* Now, find a victim slot.  Decrease the count of all entries, then
+     prime the count of the victim slot at REGEX_CACHE_SIZE.  This
+     way, frequently used entries and newly created entries are least
+     likely to be victims next time we have a cache miss.  */
+  victim = regex_cache;
+  victim_count = victim->count;
+  if (victim_count)
+    victim->count--;
+  for (i = 1; i < REGEX_CACHE_SIZE; i++)
+    {
+      if (regex_cache[i].count < victim_count)
+       {
+         victim_count = regex_cache[i].count;
+         victim = &regex_cache[i];
+       }
+      if (regex_cache[i].count)
+       regex_cache[i].count--;
+    }
+  victim->count = REGEX_CACHE_SIZE;
+  victim->resyntax = resyntax;
+  victim->len = len;
+  if (victim->str)
+    {
+      free (victim->str);
+      regfree (victim->pat);
+      free (victim->pat);
+    }
+  victim->str = xstrdup (regexp);
+  victim->pat = pat;
+  re_set_registers (pat, &victim->regs, victim->regs.num_regs,
+                   victim->regs.start, victim->regs.end);
+  return victim;
 }
 
 
-/* Wrap up GNU Regex re_search call to work with an m4_pattern_buffer.  */
+/* Wrap up GNU Regex re_search call to work with an m4_pattern_buffer.
+   If NO_SUB, then storing matches in buf->regs is not necessary.  */
 
 static int
-m4_regexp_search (m4_pattern_buffer *buf, const char *string,
-                 const int size, const int start, const int range)
+regexp_search (m4_pattern_buffer *buf, const char *string, const int size,
+              const int start, const int range, bool no_sub)
 {
-  return re_search (&buf->pat, string, size, start, range, &buf->regs);
+  return re_search (buf->pat, string, size, start, range,
+                   no_sub ? NULL : &buf->regs);
 }
 
 
-/* Function to perform substitution by regular expressions.  Used by the
-   builtins regexp, patsubst and renamesyms.  The changed text is placed on
-   the obstack.  The substitution is REPL, with \& substituted by this part
-   of VICTIM matched by the last whole regular expression, and \N
-   substituted by the text matched by the Nth parenthesized sub-expression.  */
+/* Function to perform substitution by regular expressions.  Used by
+   the builtins regexp, patsubst and renamesyms.  The changed text is
+   placed on the obstack OBS.  The substitution is REPL, with \&
+   substituted by this part of VICTIM matched by the last whole
+   regular expression, and \N substituted by the text matched by the
+   Nth parenthesized sub-expression in BUF.  Any warnings are issued
+   on behalf of CALLER.  BUF may be NULL for the empty regex.  */
 
 static void
 substitute (m4 *context, m4_obstack *obs, const char *caller,
@@ -185,14 +249,15 @@ substitute (m4 *context, m4_obstack *obs, const char *caller,
       switch ((ch = *repl++))
        {
        case '&':
-         obstack_grow (obs, victim + buf->regs.start[0],
-                       buf->regs.end[0] - buf->regs.start[0]);
+         if (buf)
+           obstack_grow (obs, victim + buf->regs.start[0],
+                         buf->regs.end[0] - buf->regs.start[0]);
          break;
 
        case '1': case '2': case '3': case '4': case '5': case '6':
        case '7': case '8': case '9':
          ch -= '0';
-         if (buf->pat.re_nsub < ch)
+         if (!buf || buf->pat->re_nsub < ch)
            m4_warn (context, 0, _("%s: sub-expression %d not present"),
                     caller, ch);
          else if (buf->regs.end[ch] > 0)
@@ -214,17 +279,17 @@ substitute (m4 *context, m4_obstack *obs, const char *caller,
 
 
 /* For each match against compiled REGEXP (held in BUF -- as returned
-   by m4_regexp_compile) in VICTIM, substitute REPLACE.  Non-matching
+   by regexp_compile) in VICTIM, substitute REPLACE.  Non-matching
    characters are copied verbatim, and the result copied to the
    obstack.  Errors are reported on behalf of CALLER.  Return true if
    a substitution was made.  If IGNORE_DUPLICATES is set, don't worry
    about completing the obstack when returning false.  */
 
 static bool
-m4_regexp_substitute (m4 *context, m4_obstack *obs, const char *caller,
-                     const char *victim, const char *regexp,
-                     m4_pattern_buffer *buf, const char *replace,
-                     bool ignore_duplicates)
+regexp_substitute (m4 *context, m4_obstack *obs, const char *caller,
+                  const char *victim, const char *regexp,
+                  m4_pattern_buffer *buf, const char *replace,
+                  bool ignore_duplicates)
 {
   int matchpos = 0;            /* start position of match */
   int offset   = 0;            /* current match offset */
@@ -233,8 +298,8 @@ m4_regexp_substitute (m4 *context, m4_obstack *obs, const char *caller,
 
   while (offset <= length)
     {
-      matchpos = m4_regexp_search (buf, victim, length,
-                                  offset, length - offset);
+      matchpos = regexp_search (buf, victim, length, offset, length - offset,
+                               false);
 
       if (matchpos < 0)
        {
@@ -284,12 +349,19 @@ m4_regexp_substitute (m4 *context, m4_obstack *obs, const char *caller,
 /* Reclaim memory used by this module.  */
 M4FINISH_HANDLER(gnu)
 {
-  regfree (&gnu_buf.pat);
-  free (gnu_buf.regs.start);
-  free (gnu_buf.regs.end);
+  int i;
+  for (i = 0; i < REGEX_CACHE_SIZE; i++)
+    if (regex_cache[i].str)
+      {
+       free (regex_cache[i].str);
+       regfree (regex_cache[i].pat);
+       free (regex_cache[i].pat);
+       free (regex_cache[i].regs.start);
+       free (regex_cache[i].regs.end);
+      }
   /* If this module was preloaded, then we need to explicitly reset
      the memory in case it gets reloaded.  */
-  memset (&gnu_buf, 0, sizeof gnu_buf);
+  memset (&regex_cache, 0, sizeof regex_cache);
 }
 
 
@@ -635,10 +707,12 @@ M4BUILTIN_HANDLER (mkdtemp)
 }
 
 
-/* Substitute all matches of a regexp occuring in a string.  Each match of
-   the second argument (a regexp) in the first argument is changed to the
-   third argument, with \& substituted by the matched text, and \N
-   substituted by the text matched by the Nth parenthesized sub-expression.  */
+/* Substitute all matches of a regexp occurring in a string.  Each
+   match of the second argument (a regexp) in the first argument is
+   changed to the optional third argument, with \& substituted by the
+   matched text, and \N substituted by the text matched by the Nth
+   parenthesized sub-expression.  The optional fourth argument changes
+   the regex flavor.  */
 
 /**
  * patsubst(VICTIM, REGEXP, [REPLACEMENT], [RESYNTAX])
@@ -672,19 +746,20 @@ M4BUILTIN_HANDLER (patsubst)
       return;
     }
 
-  buf = m4_regexp_compile (context, me, pattern, resyntax, false);
+  buf = regexp_compile (context, me, pattern, resyntax);
   if (!buf)
     return;
 
-  m4_regexp_substitute (context, obs, me, M4ARG (1), pattern, buf,
-                       replace, false);
+  regexp_substitute (context, obs, me, M4ARG (1), pattern, buf,
+                    replace, false);
 }
 
 
-/* Regular expression version of index.  Given two arguments, expand to the
-   index of the first match of the second argument (a regexp) in the first.
-   Expand to -1 if here is no match.  Given a third argument, it changes
-   the expansion to this argument.  */
+/* Regular expression version of index.  Given two arguments, expand
+   to the index of the first match of the second argument (a regexp)
+   in the first.  Expand to -1 if there is no match.  Given a third
+   argument, a match is substituted according to this argument.  The
+   optional fourth argument changes the regex flavor.  */
 
 /**
  * regexp(VICTIM, REGEXP, RESYNTAX)
@@ -735,18 +810,18 @@ M4BUILTIN_HANDLER (regexp)
     {
       /* The empty regex matches everything.  */
       if (replace)
-       obstack_grow (obs, replace, strlen (replace));
+       substitute (context, obs, me, M4ARG (1), replace, NULL);
       else
        m4_shipout_int (obs, 0);
       return;
     }
 
-  buf = m4_regexp_compile (context, me, pattern, resyntax, replace == NULL);
+  buf = regexp_compile (context, me, pattern, resyntax);
   if (!buf)
     return;
 
   length = strlen (M4ARG (1));
-  startpos = m4_regexp_search (buf, M4ARG (1), length, 0, length);
+  startpos = regexp_search (buf, M4ARG (1), length, 0, length, replace == NULL);
 
   if (startpos == -2)
     {
@@ -797,7 +872,7 @@ M4BUILTIN_HANDLER (renamesyms)
            return;
        }
 
-      buf = m4_regexp_compile (context, me, regexp, resyntax, false);
+      buf = regexp_compile (context, me, regexp, resyntax);
       if (!buf)
        return;
 
@@ -810,8 +885,8 @@ M4BUILTIN_HANDLER (renamesyms)
        {
          const char *name = data.base[0];
 
-         if (m4_regexp_substitute (context, &rename_obs, me, name, regexp,
-                                   buf, replace, true))
+         if (regexp_substitute (context, &rename_obs, me, name, regexp,
+                                buf, replace, true))
            {
              const char *renamed = obstack_finish (&rename_obs);
 


hooks/post-receive
--
GNU M4 source repository




reply via email to

[Prev in Thread] Current Thread [Next in Thread]