bug#40634: Massive pattern list handling with -E format seems very slow since 2.28.

Norihiro Tanaka Sat, 18 Apr 2020 19:11:13 -0700

On Sun, 19 Apr 2020 07:41:49 +0900
Norihiro Tanaka <nori...@kcn.ne.jp> wrote:


> 
> On Sat, 18 Apr 2020 00:22:26 +0900
> Norihiro Tanaka <nori...@kcn.ne.jp> wrote:
> 
> > 
> > On Fri, 17 Apr 2020 10:24:42 +0900
> > Norihiro Tanaka <nori...@kcn.ne.jp> wrote:
> > 
> > > 
> > > On Fri, 17 Apr 2020 09:35:36 +0900
> > > Norihiro Tanaka <nori...@kcn.ne.jp> wrote:
> > > 
> > > > 
> > > > On Thu, 16 Apr 2020 16:00:29 -0700
> > > > Paul Eggert <egg...@cs.ucla.edu> wrote:
> > > > 
> > > > > On 4/16/20 3:53 PM, Norihiro Tanaka wrote:
> > > > > 
> > > > > > I have had no idea to solve the problem yet.  If we revert it, 
> > > > > > bug#33357
> > > > > > will come back.
> > > > > 
> > > > > Yes, I'd rather not revert if we can help it.
> > > > > 
> > > > > My own thought was to not analyze the regular expression if we 
> > > > > discover that the input is empty. :-)
> > > > 
> > > > Now, I have a idea, it is that we build indexes of epsilon nodes
> > > > including in follows before remove epsilon nodes.
> > > 
> > > 
> > > I wrote fix for the bug, but it will be slower then at grep 2.27 yet.
> > 
> > It was improved previous patch.
> 
> Sorry, correct patch is here.

I made the previous patch even simpler.

before:

$ env LC_ALL=C time -p src/grep -E -v -m1 -f grep-patterns.txt /dev/null
real 7.24
user 7.14
sys 0.09

after:

$ env LC_ALL=C time -p src/grep -E -v -m1 -f grep-patterns.txt /dev/null
real 0.62
user 0.52
sys 0.10

From fffe326a7cd7d452f86ea51cae6fe08168a1391e Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Sun, 19 Apr 2020 11:01:18 +0900
Subject: [PATCH] dfa: use backword set in removal of epsilon closure

When remove epsilon closure, so far searched nodes including epsilon closure
in all nodes sequentially, but it is slow for some cases.  Now build backword
set before search, and only check previos position with the set.
Problem reported in: https://debbugs.gnu.org/cgi/bugreport.cgi?bug=40634

* lib/dfa.c (struct dfa): New member 'epsilon'.
(addtok_mb): Check whether a pattern has epsilon node or not.
(epsclosure): When remove epsilon node and reconnect, only check previos
positions.
(dfaanalyze): Build backword set.
---
 lib/dfa.c |   65 +++++++++++++++++++++++++++++++++++++++++++++++-------------
 1 files changed, 51 insertions(+), 14 deletions(-)

diff --git a/lib/dfa.c b/lib/dfa.c
index 9939d22..811c41e 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -486,6 +486,7 @@ struct dfa
   bool fast;                   /* The DFA is fast.  */
   token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales.  */
   mbstate_t mbs;               /* Multibyte conversion state.  */
+  bool epsilon;
 
   /* The following are valid only if MB_CUR_MAX > 1.  */
 
@@ -1613,13 +1614,21 @@ addtok_mb (struct dfa *dfa, token t, char mbprop)
       dfa->parse.depth--;
       break;
 
-    case BACKREF:
-      dfa->fast = false;
+    case BEGLINE:
+    case ENDLINE:
+    case BEGWORD:
+    case ENDWORD:
+    case LIMWORD:
+    case NOTLIMWORD:
+    case EMPTY:
+      dfa->epsilon = true;
       FALLTHROUGH;
+
     default:
-      dfa->nleaves++;
-      FALLTHROUGH;
-    case EMPTY:
+      if (t == BACKREF)
+        dfa->fast = false;
+      if (t != EMPTY)
+        dfa->nleaves++;
       dfa->parse.depth++;
       break;
     }
@@ -2295,14 +2304,15 @@ state_index (struct dfa *d, position_set const *s, int 
context)
    constraint.  Repeat exhaustively until no funny positions are left.
    S->elems must be large enough to hold the result.  */
 static void
-epsclosure (struct dfa const *d)
+epsclosure (struct dfa const *d, position_set *backword)
 {
   position_set tmp;
   alloc_position_set (&tmp, d->nleaves);
   for (idx_t i = 0; i < d->tindex; i++)
     if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR
         && d->tokens[i] != BACKREF && d->tokens[i] != ANYCHAR
-        && d->tokens[i] != MBCSET && d->tokens[i] < CSET)
+        && d->tokens[i] != MBCSET && d->tokens[i] < CSET
+        && d->tokens[i] != BEG)
       {
         unsigned int constraint;
         switch (d->tokens[i])
@@ -2325,16 +2335,21 @@ epsclosure (struct dfa const *d)
           case NOTLIMWORD:
             constraint = NOTLIMWORD_CONSTRAINT;
             break;
-          default:
+          case EMPTY:
             constraint = NO_CONSTRAINT;
             break;
+          default:
+            abort ();
           }
 
         delete (i, &d->follows[i]);
 
-        for (idx_t j = 0; j < d->tindex; j++)
-          if (i != j && d->follows[j].nelem > 0)
-            replace (&d->follows[j], i, &d->follows[i], constraint, &tmp);
+        for (idx_t j = 0; j < backword[i].nelem; j++)
+          replace (&d->follows[backword[i].elems[j].index], i, &d->follows[i],
+                   constraint, &tmp);
+        for (idx_t j = 0; j < d->follows[i].nelem; j++)
+          replace (&backword[d->follows[i].elems[j].index], i, &backword[i],
+                   NO_CONSTRAINT, &tmp);
       }
   free (tmp.elems);
 }
@@ -2641,6 +2656,7 @@ dfaanalyze (struct dfa *d, bool searchflag)
   /* Firstpos and lastpos elements.  */
   position *firstpos = posalloc;
   position *lastpos = firstpos + d->nleaves;
+  position_set *backword = NULL;
   position pos;
   position_set tmp;
 
@@ -2673,6 +2689,9 @@ dfaanalyze (struct dfa *d, bool searchflag)
   alloc_position_set (&merged, d->nleaves);
   d->follows = xcalloc (d->tindex, sizeof *d->follows);
 
+  if (d->epsilon)
+    backword = xcalloc (d->tindex, sizeof *backword);
+
   for (idx_t i = 0; i < d->tindex; i++)
     {
       switch (d->tokens[i])
@@ -2710,6 +2729,17 @@ dfaanalyze (struct dfa *d, bool searchflag)
         case CAT:
           /* Every element in the firstpos of the second argument is in the
              follow of every element in the lastpos of the first argument.  */
+          if (d->epsilon)
+            {
+              tmp.nelem = stk[-2].nlastpos;
+              tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
+              position *p = firstpos - stk[-1].nfirstpos;
+              for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
+                {
+                  merge (&tmp, &backword[p[j].index], &merged);
+                  copy (&merged, &backword[p[j].index]);
+                }
+            }
           {
             tmp.nelem = stk[-1].nfirstpos;
             tmp.elems = firstpos - stk[-1].nfirstpos;
@@ -2799,9 +2829,15 @@ dfaanalyze (struct dfa *d, bool searchflag)
 #endif
     }
 
-  /* For each follow set that is the follow set of a real position, replace
-     it with its epsilon closure.  */
-  epsclosure (d);
+  if (d->epsilon)
+    {
+      /* For each follow set that is the follow set of a real position,
+         replace it with its epsilon closure.  */
+      epsclosure (d, backword);
+
+      for (idx_t i = 0; i < d->tindex; i++)
+        free (backword[i].elems);
+    }
 
   dfaoptimize (d);
 
@@ -2863,6 +2899,7 @@ dfaanalyze (struct dfa *d, bool searchflag)
   d->min_trcount++;
   d->trcount = 0;
 
+  free (backword);
   free (posalloc);
   free (stkalloc);
   free (merged.elems);
-- 
1.7.1

bug#40634: Massive pattern list handling with -E format seems very slow since 2.28.

Reply via email to