bug#32750: [PATCH 2/2] dfa: optmization of alternation in NFA

Norihiro Tanaka Mon, 17 Sep 2018 07:16:38 -0700

Hi,

Even when similer states exists in alternation, DFA treats them as
separate items.  It may complicate the transition in NFA and cause
slowdown.  This change assembles the states into one.


For example, ab|ac is changed into a(b|c).

This change speeds-up matching for many branched pattern.  For
example, grep speeds-up more than 30x in following case.

  seq 10000 | sed 's/$/ abcdefghijklmnopqrstuvwxyz/; s/$/./' >in
  time -p env LC_ALL=C grep -vf in in

  (before) real 63.43 user 61.67 system 1.65
  (after)  real  1.64 user  1.61 system 0.03

  # If we do not add '.' at last in pattern, not dfa but kwset is used.

grep also speeds-up about 3x in following case.

  time -p env LC_ALL=C grep -vf /usr/share/dict/linux.words 
/usr/share/dict/linux.words

  (before) real  2.48 user  2.09 system 0.38
  (after)  real  7.69 user  6.32 system 1.29

Thanks,
Norihiro

From 3193191730d6ecb3a0c4e38b461484deaf819f87 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Mon, 17 Sep 2018 22:20:37 +0900
Subject: [PATCH 1/2] dfa: simplify initial state

To simplify initial state enables to be easy to optimization of NFA.

dfa.c (enum token): Add new element BEG.
(prtok): Adjust due to adding element BEG.
(dfaparse): Put BEG at a head of tokens.
(state_index): Adjust due to adding element BEG.
(dfaanalyze): Concatenate BEG to other tokens, and simplify to build initial
state.
(dfamust): Adjust due to adding element BEG.  DFAMUST ignores it.
---
 lib/dfa.c |   37 +++++++++++++++++++++----------------
 1 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/lib/dfa.c b/lib/dfa.c
index e33084d..c0b24fc 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -223,12 +223,15 @@ typedef ptrdiff_t state_num;
 /* Predefined token values.  */
 enum
 {
-  END = -1,                     /* END is a terminal symbol that matches the
+  END = -2,                     /* END is a terminal symbol that matches the
                                    end of input; any value of END or less in
                                    the parse tree is such a symbol.  Accepting
                                    states of the DFA are those that would have
                                    a transition on END.  */
 
+  BEG = -1,                     /* BEG is a beginning symbol that matches the
+                                   biginning of input.  */
+
   /* Ordinary character values are terminal symbols that match themselves.  */
 
   EMPTY = NOTCHAR,              /* EMPTY is a terminal symbol that matches
@@ -630,9 +633,9 @@ mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct 
dfa *d)
 static void
 prtok (token t)
 {
-  if (t < 0)
+  if (t <= END)
     fprintf (stderr, "END");
-  else if (t < NOTCHAR)
+  else if (0 <= t && t < NOTCHAR)
     {
       unsigned int ch = t;
       fprintf (stderr, "0x%02x", ch);
@@ -642,6 +645,9 @@ prtok (token t)
       char const *s;
       switch (t)
         {
+        case BEG:
+          s = "BEG";
+          break;
         case EMPTY:
           s = "EMPTY";
           break;
@@ -1967,6 +1973,9 @@ dfaparse (char const *s, size_t len, struct dfa *d)
   if (!d->syntax.syntax_bits_set)
     dfaerror (_("no syntax specified"));
 
+  if (!d->nregexps)
+    addtok (d, BEG);
+
   d->parse.tok = lex (d);
   d->parse.depth = d->depth;
 
@@ -2180,7 +2189,7 @@ state_index (struct dfa *d, position_set const *s, int 
context)
   for (state_num j = 0; j < s->nelem; j++)
     {
       int c = s->elems[j].constraint;
-      if (d->tokens[s->elems[j].index] < 0)
+      if (d->tokens[s->elems[j].index] <= END)
         {
           if (succeeds_in_context (c, context, CTX_ANY))
             constraint |= c;
@@ -2380,6 +2389,8 @@ dfaanalyze (struct dfa *d, bool searchflag)
 
   position_set merged;          /* Result of merging sets.  */
 
+  addtok (d, CAT);
+
 #ifdef DEBUG
   fprintf (stderr, "dfaanalyze:\n");
   for (size_t i = 0; i < d->tindex; ++i)
@@ -2540,26 +2551,20 @@ dfaanalyze (struct dfa *d, bool searchflag)
       }
 #endif
 
-  /* Get the epsilon closure of the firstpos of the regexp.  The result will
-     be the set of positions of state 0.  */
-  merged.nelem = 0;
-  for (size_t i = 0; i < stk[-1].nfirstpos; ++i)
-    insert (firstpos[i], &merged);
-
   /* For each follow set that is the follow set of a real position, replace
      it with its epsilon closure.  */
-  epsclosure (&merged, d);
+  epsclosure (&d->follows[0], d);
 
   /* Context wanted by some position.  */
-  int separate_contexts = state_separate_contexts (&merged);
+  int separate_contexts = state_separate_contexts (&d->follows[0]);
 
   /* Build the initial state.  */
   if (separate_contexts & CTX_NEWLINE)
-    state_index (d, &merged, CTX_NEWLINE);
+    state_index (d, &d->follows[0], CTX_NEWLINE);
   d->initstate_notbol = d->min_trcount
-    = state_index (d, &merged, separate_contexts ^ CTX_ANY);
+    = state_index (d, &d->follows[0], separate_contexts ^ CTX_ANY);
   if (separate_contexts & CTX_LETTER)
-    d->min_trcount = state_index (d, &merged, CTX_LETTER);
+    d->min_trcount = state_index (d, &d->follows[0], CTX_LETTER);
   d->min_trcount++;
   d->trcount = 0;
 
@@ -3783,7 +3788,7 @@ dfamust (struct dfa const *d)
   bool need_endline = false;
   bool case_fold_unibyte = d->syntax.case_fold && MB_CUR_MAX == 1;
 
-  for (size_t ri = 0; ri < d->tindex; ++ri)
+  for (size_t ri = 1; ri < d->tindex - 1; ++ri)
     {
       token t = d->tokens[ri];
       switch (t)
-- 
1.7.1

From 4728286325834dd02026bf1234c9d481a5de7ee5 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Mon, 17 Sep 2018 22:46:25 +0900
Subject: [PATCH 2/2] dfa: optmization of alternation in NFA

Even when similer states exists in alternation, DFA treats them as
separate items.  It may complicate the transition in NFA and cause
slowdown.  This change assembles the states into one.  For example,
ab|ac is changed into a(b|c).  This change speeds-up matching for
many branched pattern.  For example, grep speeds-up more than 30x
in following case.

  seq 10000 | sed 's/$/ abcdefghijklmnopqrstuvwxyz/; s/$/./' >in
  time -p env LC_ALL=C grep -vf in in

dfa.c (prune): New function.
(merge_nfa_state): New function.  It merges same NFA states.
(dfaoptimize): New function.  It seeks merged and removed nodes.
(dfaanalyze): Call new function.
(dfautf8noss): Change name from dfaoptimize because of addition of new
function.
(dfacomp): Update caller.
---
 lib/dfa.c |  146 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 144 insertions(+), 2 deletions(-)

diff --git a/lib/dfa.c b/lib/dfa.c
index c0b24fc..3991112 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -2121,6 +2121,22 @@ delete (size_t del, position_set *s)
   return 0;
 }
 
+static void
+prune (position_set *s)
+{
+  size_t d = 0;
+
+  for (size_t i = 0; i < s->nelem; i++)
+    {
+      if (s->elems[i].constraint == 0)
+        continue;
+
+      s->elems[d++] = s->elems[i];
+    }
+
+  s->nelem = d;
+}
+
 /* Replace a position with the followed set.  */
 static void
 replace (position_set *dst, size_t del, position_set *add,
@@ -2314,6 +2330,130 @@ state_separate_contexts (position_set const *s)
   return separate_contexts;
 }
 
+enum
+{
+  /* Single token is repeated.  It is distinguished from non-repeated.  */
+  OPT_REPEAT = (1 << 0),
+
+  /* Multiple tokens are repeated.  This flag is on at head of tokens.  The
+     node is not merged.  */
+  OPT_LPAREN = (1 << 1),
+
+  /* Multiple branches are joined.  The node is not merged.  */
+  OPT_RPAREN = (1 << 2),
+
+  /* The node is walked.  If the node is found in walking again, OPT_RPAREN
+     flag is turned on. */
+  OPT_WALKED = (1 << 3),
+
+  /* The node is queued.  The node is not queued again.  */
+  OPT_QUEUED = (1 << 4)
+};
+
+static size_t
+merge_nfa_state (struct dfa *d, size_t *queue, size_t nqueue, size_t tindex,
+                 int *flags, position_set *merged)
+{
+  size_t sindex;
+  size_t dindex;
+
+  for (size_t i = 0; i < d->follows[tindex].nelem; i++)
+    {
+      dindex = d->follows[tindex].elems[i].index;
+
+      /* Skip the node as pruned in future.  */
+      if (d->follows[tindex].elems[i].constraint == 0)
+        continue;
+
+      if (tindex < dindex && !(flags[dindex] & OPT_QUEUED))
+        {
+          queue[nqueue++] = dindex;
+          flags[dindex] |= OPT_QUEUED;
+        }
+
+      if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
+        continue;
+
+      for (size_t j = i + 1; j < d->follows[tindex].nelem; j++)
+        {
+          sindex = d->follows[tindex].elems[j].index;
+
+          if (d->follows[tindex].elems[j].constraint == 0)
+            continue;
+
+          if (flags[sindex] & (OPT_LPAREN | OPT_RPAREN))
+            continue;
+
+          if (d->tokens[sindex] != d->tokens[dindex])
+            continue;
+
+          if (d->follows[tindex].elems[i].constraint !=
+              d->follows[tindex].elems[j].constraint)
+            continue;
+
+          if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
+            continue;
+
+          if (flags[sindex] & OPT_REPEAT)
+            delete (sindex, &d->follows[sindex]);
+
+          merge (&d->follows[dindex], &d->follows[sindex], merged);
+          copy (merged, &d->follows[dindex]);
+
+          /* Mark it as pruned in future.  */
+          d->follows[tindex].elems[j].constraint = 0;
+        }
+    }
+
+  prune (&d->follows[tindex]);
+
+  return nqueue;
+}
+
+static void
+dfaoptimize (struct dfa *d)
+{
+  int *flags;
+  size_t *queue;
+  size_t nqueue;
+  position_set merged0;
+  position_set *merged;
+
+  flags = xnmalloc (d->tindex, sizeof *flags);
+  queue = xnmalloc (d->nleaves, sizeof *queue);
+
+  for (size_t i = 0; i < d->tindex; ++i)
+    flags[i] = 0;
+
+  for (size_t i = 0; i < d->tindex; ++i)
+    {
+      for (size_t j = 0; j < d->follows[i].nelem; j++)
+        {
+          if (d->follows[i].elems[j].index == i)
+            flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
+          else if (d->follows[i].elems[j].index < i)
+            flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
+          else if (flags[d->follows[i].elems[j].index] &=
+                   OPT_WALKED)
+            flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
+          else
+            flags[d->follows[i].elems[j].index] |= OPT_WALKED;
+        }
+    }
+
+  merged = &merged0;
+  alloc_position_set (merged, d->nleaves);
+
+  nqueue = 0;
+  queue[nqueue++] = 0;
+
+  for (size_t i = 0; i < nqueue; i++)
+    nqueue = merge_nfa_state (d, queue, nqueue, queue[i], flags, merged);
+
+  free (merged->elems);
+  free (queue);
+  free (flags);
+}
 
 /* Perform bottom-up analysis on the parse tree, computing various functions.
    Note that at this point, we're pretending constructs like \< are real
@@ -2533,6 +2673,8 @@ dfaanalyze (struct dfa *d, bool searchflag)
 #endif
     }
 
+  dfaoptimize (d);
+
 #ifdef DEBUG
   for (size_t i = 0; i < d->tindex; ++i)
     if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
@@ -3353,7 +3495,7 @@ dfa_supported (struct dfa const *d)
 }
 
 static void
-dfaoptimize (struct dfa *d)
+dfautf8noss (struct dfa *d)
 {
   if (!d->localeinfo.using_utf8)
     return;
@@ -3481,7 +3623,7 @@ dfacomp (char const *s, size_t len, struct dfa *d, bool 
searchflag)
 
   if (dfa_supported (d))
     {
-      dfaoptimize (d);
+      dfautf8noss (d);
       dfaanalyze (d, searchflag);
     }
   else
-- 
1.7.1

bug#32750: [PATCH 2/2] dfa: optmization of alternation in NFA

Reply via email to