Re: [PATCH] dfa: addition of new state on demand

Paul Eggert Fri, 25 Nov 2016 11:04:22 -0800

Norihiro Tanaka wrote:

Can anyone review this change?

Thanks for doing all that, and sorry about the late review. I reviewed it,tweaked its commit message and propagated that into ChangeLog (the gnulibpractice), and installed the result into gnulib, with two followup patches thatI hope are self-explanatory. All three patches are attached. Please let us knowof any problems you see with the result.

CC'ing this to grep-devel since that's where the hardy band of dfa consumershang out.

From 41ed6b2c88e9fa7c2ab7f29d1ec1bea20f095614 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Fri, 25 Nov 2016 10:43:38 -0800
Subject: [PATCH 1/3] dfa: addition of new state on demand

* src/dfa.c (dfastate): Add argument UC, the current input character.
Fill only a group including the character in transition table.
(realloc_trans_if_necessary): Add the dummy state which means that a
transition table is assigned but the next state is not assigned.
(build_state): Return the next state.  All callers updated.
(transit_state_singlebyte): If we get the dummy state,
fill the transition table.
(dfaexec_main): Handle the dummy state.
(free_mbdata, dfafree): Consider the dummy state.
---
 ChangeLog |  13 +++
 lib/dfa.c | 347 +++++++++++++++++++++++++-------------------------------------
 2 files changed, 155 insertions(+), 205 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index b8f4423..ef6d40e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,16 @@
+2016-11-25  Norihiro Tanaka  <nori...@kcn.ne.jp>
+
+	dfa: addition of new state on demand
+	* src/dfa.c (dfastate): Add argument UC, the current input character.
+	Fill only a group including the character in transition table.
+	(realloc_trans_if_necessary): Add the dummy state which means that a
+	transition table is assigned but the next state is not assigned.
+	(build_state): Return the next state.  All callers updated.
+	(transit_state_singlebyte): If we get the dummy state,
+	fill the transition table.
+	(dfaexec_main): Handle the dummy state.
+	(free_mbdata, dfafree): Consider the dummy state.
+
 2016-11-24  Daiki Ueno  <u...@gnu.org>
 
 	srclist: sync with released gettext
diff --git a/lib/dfa.c b/lib/dfa.c
index 7b80a1a..a182b05 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -477,7 +477,8 @@ struct dfa
 
   /* Fields filled by dfaexec.  */
   state_num tralloc;            /* Number of transition tables that have
-                                   slots so far, not counting trans[-1].  */
+                                   slots so far, not counting trans[-1] and
+                                   trans[-2].  */
   int trcount;                  /* Number of transition tables that have
                                    actually been built.  */
   int min_trcount;              /* Minimum of number of transition tables.
@@ -490,7 +491,8 @@ struct dfa
                                    state could possibly accept, its entry in
                                    this table is NULL.  This points to one
                                    past the start of the allocated array,
-                                   and trans[-1] is always NULL.  */
+                                   and trans[-1] and trans[-2] are always
+                                   NULL.  */
   state_num **fails;            /* Transition tables after failing to accept
                                    on a state that potentially could do so.  */
   int *success;                 /* Table of acceptance conditions used in
@@ -507,7 +509,8 @@ struct dfa
                                    do not distinguish between their contexts,
                                    as not supported word.  */
   position_set mb_follows;      /* Follow set added by ANYCHAR on demand.  */
-  state_num **mb_trans;      /* Transition tables for states with ANYCHAR.  */
+  state_num **mb_trans;         /* Transition tables for states with
+                                   ANYCHAR.  */
   state_num mb_trcount;         /* Number of transition tables for states with
                                    ANYCHAR that have actually been built.  */
 
@@ -2510,19 +2513,12 @@ dfaanalyze (struct dfa *d, bool searchflag)
    If after comparing with every group there are characters remaining in C,
    create a new group labeled with the characters of C and insert this
    position in that group.  */
-static void
-dfastate (state_num s, struct dfa *d, state_num trans[])
+static state_num
+dfastate (state_num s, struct dfa *d, unsigned char uc, state_num trans[])
 {
-  leaf_set grps[NOTCHAR];       /* As many as will ever be needed.  */
-  charclass labels[NOTCHAR];    /* Labels corresponding to the groups.  */
-  size_t ngrps = 0;             /* Number of groups actually used.  */
-  position pos;                 /* Current position being considered.  */
+  leaf_set group;               /* As many as will ever be needed.  */
+  charclass labels;             /* Labels corresponding to the group.  */
   charclass matches;            /* Set of matching characters.  */
-  charclass_word matchesf;	/* Nonzero if matches is nonempty.  */
-  charclass intersect;          /* Intersection with some label set.  */
-  charclass_word intersectf;	/* Nonzero if intersect is nonempty.  */
-  charclass leftovers;          /* Stuff in the label that didn't match.  */
-  charclass_word leftoversf;	/* Nonzero if leftovers is nonempty.  */
   position_set follows;         /* Union of the follows of some group.  */
   position_set tmp;             /* Temporary space for merging sets.  */
   int possible_contexts;        /* Contexts that this group can match.  */
@@ -2537,18 +2533,36 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
   fprintf (stderr, "build state %td\n", s);
 #endif
 
-  zeroset (matches);
+  group.elems = xnmalloc (d->nleaves, sizeof *group.elems);
+  group.nelem = 0;
+
+  zeroset (labels);
+  notset (labels);
 
   for (i = 0; i < d->states[s].elems.nelem; ++i)
     {
-      pos = d->states[s].elems.elems[i];
+      position pos = d->states[s].elems.elems[i];
+      bool matched = false;
       if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
-        setbit (d->tokens[pos.index], matches);
+        {
+          zeroset (matches);
+          setbit (d->tokens[pos.index], matches);
+          if (d->tokens[pos.index] == uc)
+            matched = true;
+        }
       else if (d->tokens[pos.index] >= CSET)
-        copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
-      else if (d->tokens[pos.index] == ANYCHAR)
         {
+          zeroset (matches);
+          copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
+          if (tstbit (uc, d->charclasses[d->tokens[pos.index] - CSET]))
+            matched = true;
+        }
+       else if (d->tokens[pos.index] == ANYCHAR)
+         {
+          zeroset (matches);
           copyset (d->charclasses[d->canychar], matches);
+          if (tstbit (uc, d->charclasses[d->canychar]))
+            matched = true;
 
           /* ANYCHAR must match with a single character, so we must put
              it to D->states[s].mbps which contains the positions which
@@ -2603,113 +2617,31 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
       fprintf (stderr, "\n");
 #endif
 
-      for (j = 0; j < ngrps; ++j)
+      if (matched)
         {
-          /* If matches contains a single character only, and the current
-             group's label doesn't contain that character, go on to the
-             next group.  */
-          if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
-              && !tstbit (d->tokens[pos.index], labels[j]))
-            continue;
-
-          /* Check if this group's label has a nonempty intersection with
-             matches.  */
-          intersectf = 0;
-          for (k = 0; k < CHARCLASS_WORDS; ++k)
-            intersectf |= intersect[k] = matches[k] & labels[j][k];
-          if (!intersectf)
-            continue;
-
-          /* It does; now find the set differences both ways.  */
-          leftoversf = matchesf = 0;
           for (k = 0; k < CHARCLASS_WORDS; ++k)
-            {
-              /* Even an optimizing compiler can't know this for sure.  */
-              charclass_word match = matches[k], label = labels[j][k];
-
-              leftoversf |= leftovers[k] = label & ~match;
-              matchesf |= matches[k] = match & ~label;
-            }
-
-          /* If there were leftovers, create a new group labeled with them.  */
-          if (leftoversf)
-            {
-              copyset (leftovers, labels[ngrps]);
-              copyset (intersect, labels[j]);
-              grps[ngrps].elems = xnmalloc (d->nleaves,
-                                            sizeof *grps[ngrps].elems);
-              memcpy (grps[ngrps].elems, grps[j].elems,
-                      sizeof (grps[j].elems[0]) * grps[j].nelem);
-              grps[ngrps].nelem = grps[j].nelem;
-              ++ngrps;
-            }
-
-          /* Put the position in the current group.  The constraint is
-             irrelevant here.  */
-          grps[j].elems[grps[j].nelem++] = pos.index;
-
-          /* If every character matching the current position has been
-             accounted for, we're done.  */
-          if (!matchesf)
-            break;
+            labels[k] &= matches[k];
+          group.elems[group.nelem++] = pos.index;
         }
-
-      /* If we've passed the last group, and there are still characters
-         unaccounted for, then we'll have to create a new group.  */
-      if (j == ngrps)
+      else
         {
-          copyset (matches, labels[ngrps]);
-          zeroset (matches);
-          grps[ngrps].elems = xnmalloc (d->nleaves, sizeof *grps[ngrps].elems);
-          grps[ngrps].nelem = 1;
-          grps[ngrps].elems[0] = pos.index;
-          ++ngrps;
+          for (k = 0; k < CHARCLASS_WORDS; ++k)
+            labels[k] &= ~matches[k];
         }
     }
 
   alloc_position_set (&follows, d->nleaves);
   alloc_position_set (&tmp, d->nleaves);
 
-  /* If we are a searching matcher, the default transition is to a state
-     containing the positions of state 0, otherwise the default transition
-     is to fail miserably.  */
-  if (d->searchflag)
-    {
-      int c;
-
-      state_newline = 0;
-      state_letter = d->min_trcount - 1;
-      state = d->initstate_notbol;
-
-      for (c = 0; c < NOTCHAR; ++c)
-        {
-          switch (d->syntax.sbit[c])
-            {
-            case CTX_NEWLINE:
-              trans[c] = state_newline;
-              break;
-            case CTX_LETTER:
-              trans[c] = state_letter;
-              break;
-            default:
-              trans[c] = state;
-              break;
-            }
-        }
-    }
-  else
-    for (i = 0; i < NOTCHAR; ++i)
-      trans[i] = -1;
-
-  for (i = 0; i < ngrps; ++i)
+  if (group.nelem > 0)
     {
       follows.nelem = 0;
 
       /* Find the union of the follows of the positions of the group.
          This is a hideously inefficient loop.  Fix it someday.  */
-      for (j = 0; j < grps[i].nelem; ++j)
-        for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
-          insert (d->follows[grps[i].elems[j]].elems[k], &follows);
+      for (j = 0; j < group.nelem; ++j)
+        for (k = 0; k < d->follows[group.elems[j]].nelem; ++k)
+          insert (d->follows[group.elems[j]].elems[k], &follows);
 
       if (d->localeinfo.multibyte)
         {
@@ -2751,7 +2683,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         }
 
       /* Find out if the new state will want any context information.  */
-      possible_contexts = charclass_context (d, labels[i]);
+      possible_contexts = charclass_context (d, labels);
       separate_contexts = state_separate_contexts (&follows);
 
       /* Find the state(s) corresponding to the union of the follows.  */
@@ -2767,50 +2699,43 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         state_letter = state_index (d, &follows, CTX_LETTER);
       else
         state_letter = state;
+    }
 
-#ifdef DEBUG
-      fprintf (stderr, "group %zu\n nextpos:", i);
-      for (j = 0; j < grps[i].nelem; ++j)
-        {
-          fprintf (stderr, " %zu:", grps[i].elems[j]);
-          prtok (d->tokens[grps[i].elems[j]]);
-        }
-      fprintf (stderr, "\n follows:");
-      for (j = 0; j < follows.nelem; ++j)
-        {
-          fprintf (stderr, " %zu:", follows.elems[j].index);
-          prtok (d->tokens[follows.elems[j].index]);
-        }
-      fprintf (stderr, "\n states:");
-      if (possible_contexts & CTX_NEWLINE)
-        fprintf (stderr, " CTX_NEWLINE:%td", state_newline);
-      if (possible_contexts & CTX_LETTER)
-        fprintf (stderr, " CTX_LETTER:%td", state_letter);
-      if (possible_contexts & CTX_NONE)
-        fprintf (stderr, " CTX_NONE:%td", state);
-      fprintf (stderr, "\n");
-#endif
+  /* If we are a searching matcher, the default transition is to a state
+     containing the positions of state 0, otherwise the default transition
+     is to fail miserably.  */
+  else if (d->searchflag)
+    {
+      state_newline = 0;
+      state_letter = d->min_trcount - 1;
+      state = d->initstate_notbol;
+    }
+  else
+    {
+      state_newline = -1;
+      state_letter = -1;
+      state = -1;
+    }
 
-      /* Set the transitions for each character in the current label.  */
-      for (j = 0; j < CHARCLASS_WORDS; ++j)
-        for (k = 0; k < CHARCLASS_WORD_BITS; ++k)
-          if (labels[i][j] >> k & 1)
+  /* Set the transitions for each character in the current label.  */
+  int c;
+  for (c = 0; c < NOTCHAR; ++c)
+    {
+      if (tstbit (c, labels))
+        {
+          switch (d->syntax.sbit[c])
             {
-              int c = j * CHARCLASS_WORD_BITS + k;
-
-              switch (d->syntax.sbit[c])
-                {
-                case CTX_NEWLINE:
-                  trans[c] = state_newline;
-                  break;
-                case CTX_LETTER:
-                  trans[c] = state_letter;
-                  break;
-                default:
-                  trans[c] = state;
-                  break;
-                }
+            case CTX_NEWLINE:
+              trans[c] = state_newline;
+              break;
+            case CTX_LETTER:
+              trans[c] = state_letter;
+              break;
+            default:
+              trans[c] = state;
+              break;
             }
+        }
     }
 
 #ifdef DEBUG
@@ -2824,10 +2749,19 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
   fprintf (stderr, "\n");
 #endif
 
-  for (i = 0; i < ngrps; ++i)
-    free (grps[i].elems);
+  free (group.elems);
   free (follows.elems);
   free (tmp.elems);
+
+  /* Keep the newline transition in a special place so we can use it as
+     a sentinel.  */
+  if (tstbit (d->syntax.eolbyte, labels))
+    {
+      d->newlines[s] = trans[d->syntax.eolbyte];
+      trans[d->syntax.eolbyte] = -1;
+    }
+
+  return trans[uc];
 }
 
 /* Make sure D's state arrays are large enough to hold NEW_STATE.  */
@@ -2837,23 +2771,23 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state)
   state_num oldalloc = d->tralloc;
   if (oldalloc <= new_state)
     {
-      state_num **realtrans = d->trans ? d->trans - 1 : NULL;
+      state_num **realtrans = d->trans ? d->trans - 2 : NULL;
       size_t newalloc, newalloc1;
-      newalloc1 = new_state + 1;
+      newalloc1 = realtrans ? new_state + 2 : 0;
       realtrans = x2nrealloc (realtrans, &newalloc1, sizeof *realtrans);
-      realtrans[0] = NULL;
-      d->trans = realtrans + 1;
-      d->tralloc = newalloc = newalloc1 - 1;
+      realtrans[0] = realtrans[1] = NULL;
+      d->trans = realtrans + 2;
+      d->tralloc = newalloc = newalloc1 - 2;
       d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
       d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
       d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
       if (d->localeinfo.multibyte)
         {
-          realtrans = d->mb_trans ? d->mb_trans - 1 : NULL;
+          realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
           realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
           if (oldalloc == 0)
-            realtrans[0] = NULL;
-          d->mb_trans = realtrans + 1;
+            realtrans[0] = realtrans[1] = NULL;
+          d->mb_trans = realtrans + 2;
         }
       for (; oldalloc < newalloc; oldalloc++)
         {
@@ -2872,37 +2806,44 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state)
    If it has no table at all, then d->trans[state] is NULL.
    TODO: Improve this comment, get rid of the unnecessary redundancy.  */
 
-static void
-build_state (state_num s, struct dfa *d)
+static state_num
+build_state (state_num s, struct dfa *d, unsigned char uc)
 {
   state_num *trans;             /* The new transition table.  */
   state_num i, maxstate;
 
-  /* Set an upper limit on the number of transition tables that will ever
-     exist at once.  MAX_TRCOUNT is arbitrary.  The idea is that the frequently
-     used transition tables will be quickly rebuilt, whereas the ones that
-     were only needed once or twice will be cleared away.  However, do not
-     clear the initial D->min_trcount states, since they are always used.  */
-  if (MAX_TRCOUNT <= d->trcount)
+  if (d->trans[s] != NULL)
+    trans = d->trans[s];
+  if (d->fails[s] != NULL)
+    trans = d->fails[s];
+  else
     {
-      for (i = d->min_trcount; i < d->tralloc; ++i)
-        {
-          free (d->trans[i]);
-          free (d->fails[i]);
-          d->trans[i] = d->fails[i] = NULL;
-        }
-      d->trcount = d->min_trcount;
-
-      if (d->localeinfo.multibyte)
+      /* Set an upper limit on the number of transition tables that will ever
+         exist at once.  MAX_TRCOUNT is arbitrary.  The idea is that the frequently
+         used transition tables will be quickly rebuilt, whereas the ones that
+         were only needed once or twice will be cleared away.  However, do not
+         clear the initial D->min_trcount states, since they are always used.  */
+      if (MAX_TRCOUNT <= d->trcount)
         {
-          for (i = d->min_trcount; i < d->tralloc; i++)
+          for (i = d->min_trcount; i < d->tralloc; ++i)
             {
-              free (d->mb_trans[i]);
-              d->mb_trans[i] = NULL;
+              free (d->trans[i]);
+              free (d->fails[i]);
+              d->trans[i] = d->fails[i] = NULL;
             }
-          free (d->mb_trans[-1]);
-          d->mb_trans[-1] = NULL;
+          d->trcount = d->min_trcount;
         }
+      trans = xmalloc (NOTCHAR * sizeof *trans);
+
+      /* Fill transition table with default value which means that
+         transited state has not been culculated yet.  */
+      for (i = 0; i < NOTCHAR; i++)
+        trans[i] = -2;
+
+      if (ACCEPTING (s, *d))
+        d->fails[s] = trans;
+      else
+        d->trans[s] = trans;
     }
 
   ++d->trcount;
@@ -2916,8 +2857,7 @@ build_state (state_num s, struct dfa *d)
   if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NONE, s, *d))
     d->success[s] |= CTX_NONE;
 
-  trans = xmalloc (NOTCHAR * sizeof *trans);
-  dfastate (s, d, trans);
+  s = dfastate (s, d, uc, trans);
 
   /* Now go through the new transition table, and make sure that the trans
      and fail arrays are allocated large enough to hold a pointer for the
@@ -2928,15 +2868,7 @@ build_state (state_num s, struct dfa *d)
       maxstate = trans[i];
   realloc_trans_if_necessary (d, maxstate);
 
-  /* Keep the newline transition in a special place so we can use it as
-     a sentinel.  */
-  d->newlines[s] = trans[d->syntax.eolbyte];
-  trans[d->syntax.eolbyte] = -1;
-
-  if (ACCEPTING (s, *d))
-    d->fails[s] = trans;
-  else
-    d->trans[s] = trans;
+  return s;
 }
 
 /* Multibyte character handling sub-routines for dfaexec.  */
@@ -2956,7 +2888,7 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
     t = d->fails[s];
   else
     {
-      build_state (s, d);
+      build_state (s, d, **pp);
       if (d->trans[s])
         t = d->trans[s];
       else
@@ -2966,6 +2898,9 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
         }
     }
 
+  if (t[**pp] == -2)
+    build_state (s, d, **pp);
+
   return t[*(*pp)++];
 }
 
@@ -3031,7 +2966,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
   else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
     return d->mb_trans[s][d->states[s1].mb_trindex];
 
-  if (s < 0)
+  if (s == -1)
     copy (&d->states[s1].mbps, &d->mb_follows);
   else
     merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
@@ -3139,10 +3074,7 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
     }
 
   if (!d->tralloc)
-    {
-      realloc_trans_if_necessary (d, 1);
-      build_state (0, d);
-    }
+    realloc_trans_if_necessary (d, 0);
 
   s = s1 = 0;
   p = mbp = (unsigned char const *) begin;
@@ -3210,7 +3142,7 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
             }
         }
 
-      if (s < 0)
+      if (s == -1)
         {
           if ((char *) p > end || p[-1] != eol || d->newlines[s1] < 0)
             {
@@ -3228,6 +3160,11 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
                : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
                : d->initstate_notbol);
         }
+      else if (s == -2)
+        {
+          s = build_state (s1, d, p[-1]);
+          trans = d->trans;
+        }
       else if (d->fails[s])
         {
           if ((d->success[s] & d->syntax.sbit[*p])
@@ -3256,7 +3193,7 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
         }
       else
         {
-          build_state (s, d);
+          build_state (s, d, p[0]);
           trans = d->trans;
         }
     }
@@ -3336,7 +3273,7 @@ free_mbdata (struct dfa *d)
       state_num s;
       for (s = -1; s < d->tralloc; s++)
         free (d->mb_trans[s]);
-      free (d->mb_trans - 1);
+      free (d->mb_trans - 2);
     }
 }
 
@@ -3545,7 +3482,7 @@ dfafree (struct dfa *d)
           free (d->fails[i]);
         }
 
-      free (d->trans - 1);
+      free (d->trans - 2);
       free (d->fails);
       free (d->newlines);
       free (d->success);
-- 
2.7.4

From c36918f5e752eccc89678083a784eea92b4f30b3 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Fri, 25 Nov 2016 10:43:38 -0800
Subject: [PATCH 2/3] dfa: fix glitches with on-demand states

Also, adjust commentary to better match new code.
Some of these glitches predate the recent change.
* lib/dfa.c (dfaanalyze): Clear trcount here, so that it counts
only non-initial states.
(dfastate): Rename locals to better match new roles.
Move them into nested scopes if this is easy.
Omit unnecessary cdalls to zeroset.
Simplify test for whether to throw in the positions of state 0.
Omit C99-ism (decl after statement) since Gawk still wants C89.
(build_state): Omit unnecessary test and assignment.
Fix some confusion that counted transition tables inaccurately
and could cause a memory leak.
(dfaexec_main): Redo to make it clearer to the compiler that
-1 and -2 are the only negative state numbers here.
---
 ChangeLog |  18 +++++
 lib/dfa.c | 240 +++++++++++++++++++++++++++++---------------------------------
 2 files changed, 131 insertions(+), 127 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index ef6d40e..01fa184 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,21 @@
+2016-11-25  Paul Eggert  <egg...@cs.ucla.edu>
+
+	dfa: fix glitches with on-demand states
+	Also, adjust commentary to better match new code.
+	Some of these glitches predate the recent change.
+	* lib/dfa.c (dfaanalyze): Clear trcount here, so that it counts
+	only non-initial states.
+	(dfastate): Rename locals to better match new roles.
+	Move them into nested scopes if this is easy.
+	Omit unnecessary cdalls to zeroset.
+	Simplify test for whether to throw in the positions of state 0.
+	Omit C99-ism (decl after statement) since Gawk still wants C89.
+	(build_state): Omit unnecessary test and assignment.
+	Fix some confusion that counted transition tables inaccurately
+	and could cause a memory leak.
+	(dfaexec_main): Redo to make it clearer to the compiler that
+	-1 and -2 are the only negative state numbers here.
+
 2016-11-25  Norihiro Tanaka  <nori...@kcn.ne.jp>
 
 	dfa: addition of new state on demand
diff --git a/lib/dfa.c b/lib/dfa.c
index a182b05..234f29e 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -314,7 +314,8 @@ typedef struct
                                    ANYCHAR.  */
 } dfa_state;
 
-/* Maximum for any transition table count that exceeds min_trcount.  */
+/* Maximum for any transition table count.  This should be at least 3,
+   for the initial state setup.  */
 enum { MAX_TRCOUNT = 1024 };
 
 /* A bracket operator.
@@ -480,11 +481,11 @@ struct dfa
                                    slots so far, not counting trans[-1] and
                                    trans[-2].  */
   int trcount;                  /* Number of transition tables that have
-                                   actually been built.  */
-  int min_trcount;              /* Minimum of number of transition tables.
-                                   Always keep the number, even after freeing
-                                   the transition tables.  It is also the
-                                   number of initial states.  */
+                                   been built, other than for initial
+                                   states.  */
+  int min_trcount;              /* Number of initial states.  Equivalently,
+                                   the minimum state number for which trcount
+                                   counts transitions.  */
   state_num **trans;            /* Transition tables for states that can
                                    never accept.  If the transitions for a
                                    state have not yet been computed, or the
@@ -494,7 +495,9 @@ struct dfa
                                    and trans[-1] and trans[-2] are always
                                    NULL.  */
   state_num **fails;            /* Transition tables after failing to accept
-                                   on a state that potentially could do so.  */
+                                   on a state that potentially could do so.
+                                   If trans[i] is non-null, fails[i] must
+                                   be null.  */
   int *success;                 /* Table of acceptance conditions used in
                                    dfaexec and computed in build_state.  */
   state_num *newlines;          /* Transitions on newlines.  The entry for a
@@ -2475,6 +2478,7 @@ dfaanalyze (struct dfa *d, bool searchflag)
   if (separate_contexts & CTX_LETTER)
     d->min_trcount = state_index (d, &merged, CTX_LETTER);
   d->min_trcount++;
+  d->trcount = 0;
 
   free (posalloc);
   free (stkalloc);
@@ -2483,31 +2487,31 @@ dfaanalyze (struct dfa *d, bool searchflag)
 }
 
 
-/* Find, for each character, the transition out of state s of d, and store
-   it in the appropriate slot of trans.
+/* Return the transition out of state s of d for the input character uc,
+   updating the slots in trans accordingly.
 
-   We divide the positions of s into groups (positions can appear in more
-   than one group).  Each group is labeled with a set of characters that
+   Do not worry about all possible input characters; calculate just the group
+   of positions that match uc.  Label it with the set of characters that
    every position in the group matches (taking into account, if necessary,
-   preceding context information of s).  For each group, find the union
-   of the its elements' follows.  This set is the set of positions of the
+   preceding context information of s).  Then find the union
+   of these positions' follows, i.e., the set of positions of the
    new state.  For each character in the group's label, set the transition
    on this character to be to a state corresponding to the set's positions,
    and its associated backward context information, if necessary.
 
-   If we are building a searching matcher, we include the positions of state
+   When building a searching matcher, include the positions of state
    0 in every state.
 
-   The collection of groups is constructed by building an equivalence-class
+   The group is constructed by building an equivalence-class
    partition of the positions of s.
 
    For each position, find the set of characters C that it matches.  Eliminate
    any characters from C that fail on grounds of backward context.
 
-   Search through the groups, looking for a group whose label L has nonempty
+   Check whether the group's label L has nonempty
    intersection with C.  If L - C is nonempty, create a new group labeled
    L - C and having the same positions as the current group, and set L to
-   the intersection of L and C.  Insert the position in this group, set
+   the intersection of L and C.  Insert the position in the group, set
    C = C - L, and resume scanning.
 
    If after comparing with every group there are characters remaining in C,
@@ -2516,17 +2520,13 @@ dfaanalyze (struct dfa *d, bool searchflag)
 static state_num
 dfastate (state_num s, struct dfa *d, unsigned char uc, state_num trans[])
 {
-  leaf_set group;               /* As many as will ever be needed.  */
-  charclass labels;             /* Labels corresponding to the group.  */
-  charclass matches;            /* Set of matching characters.  */
-  position_set follows;         /* Union of the follows of some group.  */
+  leaf_set group;               /* Positions that match the input char.  */
+  charclass label;              /* The group's label.  */
+  position_set follows;         /* Union of the follows of the group.  */
   position_set tmp;             /* Temporary space for merging sets.  */
-  int possible_contexts;        /* Contexts that this group can match.  */
-  int separate_contexts;        /* Context that new state wants to know.  */
   state_num state;              /* New state.  */
   state_num state_newline;      /* New state on a newline transition.  */
   state_num state_letter;       /* New state on a letter transition.  */
-  bool next_isnt_1st_byte = false; /* We can't add state0.  */
   size_t i, j, k;
 
 #ifdef DEBUG
@@ -2536,11 +2536,12 @@ dfastate (state_num s, struct dfa *d, unsigned char uc, state_num trans[])
   group.elems = xnmalloc (d->nleaves, sizeof *group.elems);
   group.nelem = 0;
 
-  zeroset (labels);
-  notset (labels);
+  zeroset (label);
+  notset (label);
 
   for (i = 0; i < d->states[s].elems.nelem; ++i)
     {
+      charclass matches;            /* Set of matching characters.  */
       position pos = d->states[s].elems.elems[i];
       bool matched = false;
       if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
@@ -2552,14 +2553,12 @@ dfastate (state_num s, struct dfa *d, unsigned char uc, state_num trans[])
         }
       else if (d->tokens[pos.index] >= CSET)
         {
-          zeroset (matches);
           copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
           if (tstbit (uc, d->charclasses[d->tokens[pos.index] - CSET]))
             matched = true;
         }
        else if (d->tokens[pos.index] == ANYCHAR)
          {
-          zeroset (matches);
           copyset (d->charclasses[d->canychar], matches);
           if (tstbit (uc, d->charclasses[d->canychar]))
             matched = true;
@@ -2620,13 +2619,13 @@ dfastate (state_num s, struct dfa *d, unsigned char uc, state_num trans[])
       if (matched)
         {
           for (k = 0; k < CHARCLASS_WORDS; ++k)
-            labels[k] &= matches[k];
+            label[k] &= matches[k];
           group.elems[group.nelem++] = pos.index;
         }
       else
         {
           for (k = 0; k < CHARCLASS_WORDS; ++k)
-            labels[k] &= ~matches[k];
+            label[k] &= ~matches[k];
         }
     }
 
@@ -2635,6 +2634,9 @@ dfastate (state_num s, struct dfa *d, unsigned char uc, state_num trans[])
 
   if (group.nelem > 0)
     {
+      int possible_contexts;    /* Contexts that the group can match.  */
+      int separate_contexts;    /* Context that new state wants to know.  */
+
       follows.nelem = 0;
 
       /* Find the union of the follows of the positions of the group.
@@ -2643,47 +2645,40 @@ dfastate (state_num s, struct dfa *d, unsigned char uc, state_num trans[])
         for (k = 0; k < d->follows[group.elems[j]].nelem; ++k)
           insert (d->follows[group.elems[j]].elems[k], &follows);
 
-      if (d->localeinfo.multibyte)
+      /* If we are building a searching matcher, throw in the positions
+         of state 0 as well, if possible.  */
+      if (d->searchflag)
         {
           /* If a token in follows.elems is not 1st byte of a multibyte
              character, or the states of follows must accept the bytes
              which are not 1st byte of the multibyte character.
-             Then, if a state of follows encounter a byte, it must not be
-             a 1st byte of a multibyte character nor single byte character.
-             We cansel to add state[0].follows to next state, because
-             state[0] must accept 1st-byte
-
-             For example, we assume <sb a> is a certain single byte
-             character, <mb A> is a certain multibyte character, and the
-             codepoint of <sb a> equals the 2nd byte of the codepoint of
-             <mb A>.
-             When state[0] accepts <sb a>, state[i] transit to state[i+1]
-             by accepting accepts 1st byte of <mb A>, and state[i+1]
-             accepts 2nd byte of <mb A>, if state[i+1] encounter the
-             codepoint of <sb a>, it must not be <sb a> but 2nd byte of
-             <mb A>, so we cannot add state[0].  */
-
-          next_isnt_1st_byte = false;
-          for (j = 0; j < follows.nelem; ++j)
+             Then, if a state of follows encounters a byte, it must not be
+             a 1st byte of a multibyte character nor a single byte character.
+             In this case, do not add state[0].follows to next state, because
+             state[0] must accept 1st-byte.
+
+             For example, suppose <sb a> is a certain single byte character,
+             <mb A> is a certain multibyte character, and the codepoint of
+             <sb a> equals the 2nd byte of the codepoint of <mb A>.  When
+             state[0] accepts <sb a>, state[i] transits to state[i+1] by
+             accepting the 1st byte of <mb A>, and state[i+1] accepts the
+             2nd byte of <mb A>, if state[i+1] encounters the codepoint of
+             <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
+             not add state[0].  */
+
+          bool mergeit = !d->localeinfo.multibyte;
+          if (!mergeit)
+            for (mergeit = true, j = 0; mergeit && j < follows.nelem; j++)
+              mergeit &= d->multibyte_prop[follows.elems[j].index];
+          if (mergeit)
             {
-              if (!(d->multibyte_prop[follows.elems[j].index] & 1))
-                {
-                  next_isnt_1st_byte = true;
-                  break;
-                }
+              merge (&d->states[0].elems, &follows, &tmp);
+              copy (&tmp, &follows);
             }
         }
 
-      /* If we are building a searching matcher, throw in the positions
-         of state 0 as well.  */
-      if (d->searchflag && (!d->localeinfo.multibyte || !next_isnt_1st_byte))
-        {
-          merge (&d->states[0].elems, &follows, &tmp);
-          copy (&tmp, &follows);
-        }
-
       /* Find out if the new state will want any context information.  */
-      possible_contexts = charclass_context (d, labels);
+      possible_contexts = charclass_context (d, label);
       separate_contexts = state_separate_contexts (&follows);
 
       /* Find the state(s) corresponding to the union of the follows.  */
@@ -2717,26 +2712,21 @@ dfastate (state_num s, struct dfa *d, unsigned char uc, state_num trans[])
       state = -1;
     }
 
-  /* Set the transitions for each character in the current label.  */
-  int c;
-  for (c = 0; c < NOTCHAR; ++c)
-    {
-      if (tstbit (c, labels))
+  /* Set the transitions for each character in the label.  */
+  for (i = 0; i < NOTCHAR; i++)
+    if (tstbit (i, label))
+      switch (d->syntax.sbit[i])
         {
-          switch (d->syntax.sbit[c])
-            {
-            case CTX_NEWLINE:
-              trans[c] = state_newline;
-              break;
-            case CTX_LETTER:
-              trans[c] = state_letter;
-              break;
-            default:
-              trans[c] = state;
-              break;
-            }
+        case CTX_NEWLINE:
+          trans[i] = state_newline;
+          break;
+        case CTX_LETTER:
+          trans[i] = state_letter;
+          break;
+        default:
+          trans[i] = state;
+          break;
         }
-    }
 
 #ifdef DEBUG
   fprintf (stderr, "trans table %td", s);
@@ -2755,7 +2745,7 @@ dfastate (state_num s, struct dfa *d, unsigned char uc, state_num trans[])
 
   /* Keep the newline transition in a special place so we can use it as
      a sentinel.  */
-  if (tstbit (d->syntax.eolbyte, labels))
+  if (tstbit (d->syntax.eolbyte, label))
     {
       d->newlines[s] = trans[d->syntax.eolbyte];
       trans[d->syntax.eolbyte] = -1;
@@ -2799,12 +2789,9 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state)
     }
 }
 
-/* Some routines for manipulating a compiled dfa's transition tables.
-   Each state may or may not have a transition table; if it does, and it
-   is a non-accepting state, then d->trans[state] points to its table.
-   If it is an accepting state then d->fails[state] points to its table.
-   If it has no table at all, then d->trans[state] is NULL.
-   TODO: Improve this comment, get rid of the unnecessary redundancy.  */
+/* Calculate the transition table for a new state derived from state s
+   for a compiled dfa d after input character uc, and return the new
+   state number.  */
 
 static state_num
 build_state (state_num s, struct dfa *d, unsigned char uc)
@@ -2812,42 +2799,39 @@ build_state (state_num s, struct dfa *d, unsigned char uc)
   state_num *trans;             /* The new transition table.  */
   state_num i, maxstate;
 
-  if (d->trans[s] != NULL)
-    trans = d->trans[s];
   if (d->fails[s] != NULL)
     trans = d->fails[s];
   else
     {
-      /* Set an upper limit on the number of transition tables that will ever
-         exist at once.  MAX_TRCOUNT is arbitrary.  The idea is that the frequently
-         used transition tables will be quickly rebuilt, whereas the ones that
-         were only needed once or twice will be cleared away.  However, do not
-         clear the initial D->min_trcount states, since they are always used.  */
-      if (MAX_TRCOUNT <= d->trcount)
+      state_num **ptrans = (ACCEPTING (s, *d) ? d->fails : d->trans) + s;
+      if (!*ptrans)
         {
-          for (i = d->min_trcount; i < d->tralloc; ++i)
+          /* MAX_TRCOUNT is an arbitrary upper limit on the number of
+             transition tables that can exist at once, other than for
+             initial states.  Often-used transition tables are quickly
+             rebuilt, whereas rarely-used ones are cleared away.  */
+          if (MAX_TRCOUNT <= d->trcount)
             {
-              free (d->trans[i]);
-              free (d->fails[i]);
-              d->trans[i] = d->fails[i] = NULL;
+              for (i = d->min_trcount; i < d->tralloc; i++)
+                {
+                  free (d->trans[i]);
+                  free (d->fails[i]);
+                  d->trans[i] = d->fails[i] = NULL;
+                }
+              d->trcount = 0;
             }
-          d->trcount = d->min_trcount;
+
+          d->trcount++;
+          *ptrans = xmalloc (NOTCHAR * sizeof *trans);
         }
-      trans = xmalloc (NOTCHAR * sizeof *trans);
+      trans = *ptrans;
 
-      /* Fill transition table with default value which means that
-         transited state has not been culculated yet.  */
+      /* Fill transition table with a default value which means that the
+         transited state has not been calculated yet.  */
       for (i = 0; i < NOTCHAR; i++)
         trans[i] = -2;
-
-      if (ACCEPTING (s, *d))
-        d->fails[s] = trans;
-      else
-        d->trans[s] = trans;
     }
 
-  ++d->trcount;
-
   /* Set up the success bits for this state.  */
   d->success[s] = 0;
   if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NEWLINE, s, *d))
@@ -3142,28 +3126,30 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
             }
         }
 
-      if (s == -1)
+      if (s < 0)
         {
-          if ((char *) p > end || p[-1] != eol || d->newlines[s1] < 0)
+          if (s == -2)
+            {
+              s = build_state (s1, d, p[-1]);
+              trans = d->trans;
+            }
+          else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
+            {
+              /* The previous character was a newline.  Count it, and skip
+                 checking of multibyte character boundary until here.  */
+              nlcount++;
+              mbp = p;
+
+              s = (allow_nl ? d->newlines[s1]
+                   : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
+                   : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
+                   : d->initstate_notbol);
+            }
+          else
             {
               p = NULL;
               goto done;
             }
-
-          /* The previous character was a newline, count it, and skip
-             checking of multibyte character boundary until here.  */
-          nlcount++;
-          mbp = p;
-
-          s = (allow_nl ? d->newlines[s1]
-               : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
-               : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
-               : d->initstate_notbol);
-        }
-      else if (s == -2)
-        {
-          s = build_state (s1, d, p[-1]);
-          trans = d->trans;
         }
       else if (d->fails[s])
         {
-- 
2.7.4

From cbfffe5902c9cea8bb963e0b0c1977fccdc1339b Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Fri, 25 Nov 2016 10:43:38 -0800
Subject: [PATCH 3/3] dfa: simplify with new function fillset

* lib/dfa.c (fillset): New function.
Use it for clarity when applicable.
---
 ChangeLog |  4 ++++
 lib/dfa.c | 34 ++++++++++++++++++++--------------
 2 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 01fa184..2796a03 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
 2016-11-25  Paul Eggert  <egg...@cs.ucla.edu>
 
+	dfa: simplify with new function fillset
+	* lib/dfa.c (fillset): New function.
+	Use it for clarity when applicable.
+
 	dfa: fix glitches with on-demand states
 	Also, adjust commentary to better match new code.
 	Some of these glitches predate the recent change.
diff --git a/lib/dfa.c b/lib/dfa.c
index 234f29e..5578232 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -695,10 +695,17 @@ zeroset (charclass s)
 }
 
 static void
-notset (charclass s)
+fillset (charclass s)
 {
   int i;
+  for (i = 0; i < CHARCLASS_WORDS; i++)
+    s[i] = CHARCLASS_WORD_MASK;
+}
 
+static void
+notset (charclass s)
+{
+  int i;
   for (i = 0; i < CHARCLASS_WORDS; ++i)
     s[i] = CHARCLASS_WORD_MASK & ~s[i];
 }
@@ -1409,8 +1416,7 @@ lex (struct dfa *dfa)
             goto normal_char;
           if (dfa->canychar == (size_t) -1)
             {
-              zeroset (ccl);
-              notset (ccl);
+              fillset (ccl);
               if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
                 clrbit ('\n', ccl);
               if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
@@ -2536,8 +2542,7 @@ dfastate (state_num s, struct dfa *d, unsigned char uc, state_num trans[])
   group.elems = xnmalloc (d->nleaves, sizeof *group.elems);
   group.nelem = 0;
 
-  zeroset (label);
-  notset (label);
+  fillset (label);
 
   for (i = 0; i < d->states[s].elems.nelem; ++i)
     {
@@ -3333,7 +3338,6 @@ static void
 dfassbuild (struct dfa *d)
 {
   size_t i, j;
-  charclass ccl;
   bool have_achar = false;
   bool have_nchar = false;
   struct dfa *sup = dfaalloc ();
@@ -3370,14 +3374,16 @@ dfassbuild (struct dfa *d)
         case ANYCHAR:
         case MBCSET:
         case BACKREF:
-          zeroset (ccl);
-          notset (ccl);
-          sup->tokens[j++] = CSET + charclass_index (sup, ccl);
-          sup->tokens[j++] = STAR;
-          if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
-              || d->tokens[i + 1] == PLUS)
-            i++;
-          have_achar = true;
+          {
+            charclass ccl;
+            fillset (ccl);
+            sup->tokens[j++] = CSET + charclass_index (sup, ccl);
+            sup->tokens[j++] = STAR;
+            if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
+                || d->tokens[i + 1] == PLUS)
+              i++;
+            have_achar = true;
+          }
           break;
         case BEGWORD:
         case ENDWORD:
-- 
2.7.4

Re: [PATCH] dfa: addition of new state on demand

Reply via email to