bug-gnulib
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 1/2] dfa: use backward set in removal of epsilon closure


From: Paul Eggert
Subject: [PATCH 1/2] dfa: use backward set in removal of epsilon closure
Date: Sat, 12 Sep 2020 18:54:56 -0700

From: Norihiro Tanaka <noritnk@kcn.ne.jp>

When removing in epsilon closure, the code searched all nodes
sequentially, and this was slow for some cases.  Build a backward
set before search, and only check previous position with the set.
Problem reported in <https://bugs.gnu.org/40634>.
* lib/dfa.c (struct dfa): New member 'epsilon'.
(addtok_mb): Check whether a pattern has epsilon node or not.
(epsclosure): New arg BACKWORD; caller changed.  When removing
epsilon node and reconnecting, check only previous positions.
Treat BEG as if it were character.
(dfaanalyze): Build backward set.
---
 lib/dfa.c | 65 +++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 51 insertions(+), 14 deletions(-)

diff --git a/lib/dfa.c b/lib/dfa.c
index 1f0587a7a..7c8eb6685 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -488,6 +488,7 @@ struct dfa
   bool fast;                   /* The DFA is fast.  */
   token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales.  */
   mbstate_t mbs;               /* Multibyte conversion state.  */
+  bool epsilon;
 
   /* The following are valid only if MB_CUR_MAX > 1.  */
 
@@ -1615,13 +1616,21 @@ addtok_mb (struct dfa *dfa, token t, char mbprop)
       dfa->parse.depth--;
       break;
 
-    case BACKREF:
-      dfa->fast = false;
+    case BEGLINE:
+    case ENDLINE:
+    case BEGWORD:
+    case ENDWORD:
+    case LIMWORD:
+    case NOTLIMWORD:
+    case EMPTY:
+      dfa->epsilon = true;
       FALLTHROUGH;
+
     default:
-      dfa->nleaves++;
-      FALLTHROUGH;
-    case EMPTY:
+      if (t == BACKREF)
+        dfa->fast = false;
+      if (t != EMPTY)
+        dfa->nleaves++;
       dfa->parse.depth++;
       break;
     }
@@ -2297,14 +2306,15 @@ state_index (struct dfa *d, position_set const *s, int 
context)
    constraint.  Repeat exhaustively until no funny positions are left.
    S->elems must be large enough to hold the result.  */
 static void
-epsclosure (struct dfa const *d)
+epsclosure (struct dfa const *d, position_set *backword)
 {
   position_set tmp;
   alloc_position_set (&tmp, d->nleaves);
   for (idx_t i = 0; i < d->tindex; i++)
     if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR
         && d->tokens[i] != BACKREF && d->tokens[i] != ANYCHAR
-        && d->tokens[i] != MBCSET && d->tokens[i] < CSET)
+        && d->tokens[i] != MBCSET && d->tokens[i] < CSET
+        && d->tokens[i] != BEG)
       {
         unsigned int constraint;
         switch (d->tokens[i])
@@ -2327,16 +2337,21 @@ epsclosure (struct dfa const *d)
           case NOTLIMWORD:
             constraint = NOTLIMWORD_CONSTRAINT;
             break;
-          default:
+          case EMPTY:
             constraint = NO_CONSTRAINT;
             break;
+          default:
+            abort ();
           }
 
         delete (i, &d->follows[i]);
 
-        for (idx_t j = 0; j < d->tindex; j++)
-          if (i != j && d->follows[j].nelem > 0)
-            replace (&d->follows[j], i, &d->follows[i], constraint, &tmp);
+        for (idx_t j = 0; j < backword[i].nelem; j++)
+          replace (&d->follows[backword[i].elems[j].index], i, &d->follows[i],
+                   constraint, &tmp);
+        for (idx_t j = 0; j < d->follows[i].nelem; j++)
+          replace (&backword[d->follows[i].elems[j].index], i, &backword[i],
+                   NO_CONSTRAINT, &tmp);
       }
   free (tmp.elems);
 }
@@ -2643,6 +2658,7 @@ dfaanalyze (struct dfa *d, bool searchflag)
   /* Firstpos and lastpos elements.  */
   position *firstpos = posalloc;
   position *lastpos = firstpos + d->nleaves;
+  position_set *backword = NULL;
   position pos;
   position_set tmp;
 
@@ -2675,6 +2691,9 @@ dfaanalyze (struct dfa *d, bool searchflag)
   alloc_position_set (&merged, d->nleaves);
   d->follows = xcalloc (d->tindex, sizeof *d->follows);
 
+  if (d->epsilon)
+    backword = xcalloc (d->tindex, sizeof *backword);
+
   for (idx_t i = 0; i < d->tindex; i++)
     {
       switch (d->tokens[i])
@@ -2712,6 +2731,17 @@ dfaanalyze (struct dfa *d, bool searchflag)
         case CAT:
           /* Every element in the firstpos of the second argument is in the
              follow of every element in the lastpos of the first argument.  */
+          if (d->epsilon)
+            {
+              tmp.nelem = stk[-2].nlastpos;
+              tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
+              position *p = firstpos - stk[-1].nfirstpos;
+              for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
+                {
+                  merge (&tmp, &backword[p[j].index], &merged);
+                  copy (&merged, &backword[p[j].index]);
+                }
+            }
           {
             tmp.nelem = stk[-1].nfirstpos;
             tmp.elems = firstpos - stk[-1].nfirstpos;
@@ -2801,9 +2831,15 @@ dfaanalyze (struct dfa *d, bool searchflag)
 #endif
     }
 
-  /* For each follow set that is the follow set of a real position, replace
-     it with its epsilon closure.  */
-  epsclosure (d);
+  if (d->epsilon)
+    {
+      /* For each follow set that is the follow set of a real position,
+         replace it with its epsilon closure.  */
+      epsclosure (d, backword);
+
+      for (idx_t i = 0; i < d->tindex; i++)
+        free (backword[i].elems);
+    }
 
   dfaoptimize (d);
 
@@ -2865,6 +2901,7 @@ dfaanalyze (struct dfa *d, bool searchflag)
   d->min_trcount++;
   d->trcount = 0;
 
+  free (backword);
   free (posalloc);
   free (stkalloc);
   free (merged.elems);
-- 
2.17.1




reply via email to

[Prev in Thread] Current Thread [Next in Thread]