bug#23932: dfa: use algorithm for single byte character to any single byte character in input text always

Paul Eggert Thu, 01 Sep 2016 11:51:57 -0700

Thanks for that set of patches too. I rebased it and tweaked NEWS and installedthe resulting patch set (attached) into Savannah master.

>From a95ba0203afcc43feea0fa99e4d1b8fb923b1aeb Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Thu, 1 Sep 2016 11:45:18 -0700
Subject: [PATCH 1/4] dfa: use single-byte algorithm even in non-UTF-8


Even in non-UTF8 locales, if the current input character
is single byte, we can use CSET to match ANYCHAR.
* src/dfa.c (struct dfa): New member canychar.
Cache index of CSET for ANYCHAR.
(lex): Make CSET for ANYCHAR.
(state_index): Simplify.
(dfastate): Consider CSET for ANYCHAR.
(transit_state_singlebyte, transit_state): Remove handling for eolbyte,
as we assume that eolbyte does not appear at current position.
(dfaexec_main): Use algorithm for single byte character to any single
byte character in input text always.
(dfasyntax): Initialize canychar.
---
 src/dfa.c | 115 ++++++++++++++++++++++++--------------------------------------
 1 file changed, 45 insertions(+), 70 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 0f5109e..e39d82a 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -401,6 +401,7 @@ struct dfa
   charclass *charclasses;       /* Array of character sets for CSET tokens.  */
   size_t cindex;                /* Index for adding new charclasses.  */
   size_t calloc;                /* Number of charclasses allocated.  */
+  size_t canychar;              /* Index of anychar class, or (size_t) -1.  */
 
   /* Scanner state */
   struct lexer_state lex;
@@ -1398,21 +1399,24 @@ lex (struct dfa *dfa)
         case '.':
           if (backslash)
             goto normal_char;
-          if (dfa->localeinfo.multibyte)
+          if (dfa->canychar == (size_t) -1)
             {
-              /* In multibyte environment period must match with a single
-                 character not a byte.  So we use ANYCHAR.  */
-              dfa->lex.laststart = false;
-              return dfa->lex.lasttok = ANYCHAR;
+              zeroset (ccl);
+              notset (ccl);
+              if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
+                clrbit ('\n', ccl);
+              if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
+                clrbit ('\0', ccl);
+              if (dfa->localeinfo.multibyte)
+                for (c2 = 0; c2 < NOTCHAR; c2++)
+                  if (dfa->localeinfo.sbctowc[c2] == WEOF)
+                    clrbit (c2, ccl);
+              dfa->canychar = charclass_index (dfa, ccl);
             }
-          zeroset (ccl);
-          notset (ccl);
-          if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
-            clrbit ('\n', ccl);
-          if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
-            clrbit ('\0', ccl);
           dfa->lex.laststart = false;
-          return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
+          return dfa->lex.lasttok = (dfa->localeinfo.multibyte
+                                     ? ANYCHAR
+                                     : CSET + dfa->canychar);
 
         case 's':
         case 'S':
@@ -2077,7 +2081,7 @@ state_index (struct dfa *d, position_set const *s, int context)
         }
       else if (d->tokens[s->elems[j].index] == BACKREF)
         constraint = NO_CONSTRAINT;
-      if (d->localeinfo.multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
+      if (d->tokens[s->elems[j].index] == ANYCHAR)
         {
           int acceptable
             = ((SUCCEEDS_IN_CONTEXT (c, context, CTX_NEWLINE)
@@ -2554,13 +2558,15 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         setbit (d->tokens[pos.index], matches);
       else if (d->tokens[pos.index] >= CSET)
         copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
-      else if (d->localeinfo.multibyte && d->tokens[pos.index] == ANYCHAR)
+      else if (d->tokens[pos.index] == ANYCHAR)
         {
-          /* ANYCHAR must match a single character, so put it to
-             D->states[s].mbps which contains the positions which can
-             match with a single character not a byte.  If all
-             positions with ANYCHAR do not depend on the context of
-             the next character, put its follows instead to
+          copyset (d->charclasses[d->canychar], matches);
+
+          /* ANYCHAR must match with a single character, so we must put
+             it to D->states[s].mbps which contains the positions which
+             can match with a single character not a byte.  If all
+             positions which has ANYCHAR does not depend on context of
+             next character, we put the follows instead of it to
              D->states[s].mbps to optimize.  */
           if (d->states[s].curr_dependent)
             {
@@ -2572,7 +2578,8 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
                                         d->states[s].context, CTX_ANY))
             {
               if (d->states[s].mbps.nelem == 0)
-                alloc_position_set (&d->states[s].mbps, 1);
+                alloc_position_set (&d->states[s].mbps,
+                                    d->follows[pos.index].nelem);
               for (j = 0; j < d->follows[pos.index].nelem; j++)
                 insert (d->follows[pos.index].elems[j], &d->states[s].mbps);
             }
@@ -2950,16 +2957,6 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
 {
   state_num *t;
 
-  if (**pp == d->syntax.eolbyte)
-    {
-      /* S is always an initial state in transit_state, so the
-         transition table for the state must have been built already.  */
-      assert (d->trans[s] || d->fails[s]);
-
-      ++*pp;
-      return d->newlines[s];
-    }
-
   if (d->trans[s])
     t = d->trans[s];
   else if (d->fails[s])
@@ -2986,15 +2983,12 @@ static state_num
 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
                unsigned char const *end)
 {
-  state_num s1;
+  state_num s1, s2;
   wint_t wc;
   int separate_contexts;
-  state_num state, state_newline, mb_index;
   size_t i, j;
 
   int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
-  int context = wc == d->syntax.eolbyte ? CTX_NEWLINE : CTX_NONE;
-  bool context_newline = context == CTX_NEWLINE;
 
   /* This state has some operators which can match a multibyte character.  */
   d->mb_follows.nelem = 0;
@@ -3016,7 +3010,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
       for (i = 0; i < d->states[s1].mbps.nelem; i++)
         {
           if (!SUCCEEDS_IN_CONTEXT (d->states[s1].mbps.elems[i].constraint,
-                                    d->states[s1].context, context))
+                                    d->states[s1].context, CTX_NONE))
             continue;
           for (j = 0; j < d->follows[d->states[s1].mbps.elems[i].index].nelem;
                j++)
@@ -3025,10 +3019,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
         }
 
       separate_contexts = state_separate_contexts (&d->mb_follows);
-      if (context_newline && separate_contexts & CTX_NEWLINE)
-        s = state_index (d, &d->mb_follows, CTX_NEWLINE);
-      else
-        s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
+      s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
       realloc_trans_if_necessary (d, s);
 
       return s;
@@ -3041,11 +3032,11 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
     {
       if (MAX_TRCOUNT <= d->mb_trcount)
         {
-          state_num s2;
-          for (s2 = -1; s2 < d->tralloc; s2++)
+          state_num s3;
+          for (s3 = -1; s3 < d->tralloc; s3++)
             {
-              free (d->mb_trans[s2]);
-              d->mb_trans[s2] = NULL;
+              free (d->mb_trans[s3]);
+              d->mb_trans[s3] = NULL;
             }
 
           for (i = 0; i < d->sindex; i++)
@@ -3055,22 +3046,16 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
       d->states[s1].mb_trindex = d->mb_trcount++;
     }
 
-  mb_index = d->states[s1].mb_trindex * 2;
-
   if (! d->mb_trans[s])
     {
       enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
-      enum { TRANSALLOC_SIZE = 2 * MAX_TRCOUNT * TRANSPTR_SIZE };
+      enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
       d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
-      for (i = 0; i < 2 * MAX_TRCOUNT; i++)
+      for (i = 0; i < MAX_TRCOUNT; i++)
         d->mb_trans[s][i] = -1;
     }
-  else
-    {
-      state = d->mb_trans[s][mb_index + context_newline];
-      if (0 <= state)
-        return state;
-    }
+  else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
+    return d->mb_trans[s][d->states[s1].mb_trindex];
 
   if (s < 0)
     copy (&d->states[s1].mbps, &d->mb_follows);
@@ -3078,17 +3063,12 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
     merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
 
   separate_contexts = state_separate_contexts (&d->mb_follows);
-  state = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
-  if (separate_contexts & CTX_NEWLINE)
-    state_newline = state_index (d, &d->mb_follows, CTX_NEWLINE);
-  else
-    state_newline = state;
-  realloc_trans_if_necessary (d, state_newline);
+  s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
+  realloc_trans_if_necessary (d, s2);
 
-  d->mb_trans[s][mb_index] = state;
-  d->mb_trans[s][mb_index + 1] = state_newline;
+  d->mb_trans[s][d->states[s1].mb_trindex] = s2;
 
-  return context_newline ? state_newline : state;
+  return s2;
 }
 
 /* The initial state may encounter a byte which is not a single byte character
@@ -3215,10 +3195,8 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
                     }
                 }
 
-              if (d->states[s].mbps.nelem == 0 || (*p == eol && !allow_nl)
-                  || (*p == '\n' && !(d->syntax.syntax_bits & RE_DOT_NEWLINE))
-                  || (*p == '\0' && (d->syntax.syntax_bits & RE_DOT_NOT_NULL))
-                  || (char *) p >= end)
+              if (d->states[s].mbps.nelem == 0
+                  || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
                 {
                   /* If an input character does not match ANYCHAR, do it
                      like a single-byte character.  */
@@ -3227,8 +3205,6 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
               else
                 {
                   s = transit_state (d, s, &p, (unsigned char *) end);
-                  if (s >= 0 && p[-1] == eol)
-                    nlcount++;
                   mbp = p;
                   trans = d->trans;
                 }
@@ -3297,8 +3273,6 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
           else
             {
               s = transit_state (d, s, &p, (unsigned char *) end);
-              if (s >= 0 && p[-1] == eol)
-                nlcount++;
               mbp = p;
               trans = d->trans;
             }
@@ -4106,6 +4080,7 @@ dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
 
   dfa->fast = !dfa->localeinfo.multibyte;
 
+  dfa->canychar = -1;
   dfa->lex.cur_mb_len = 1;
   dfa->syntax.syntax_bits_set = true;
   dfa->syntax.syntax_bits = bits;
-- 
2.7.4

>From 5644d1c3884286e2a8a6f51b5d0ef00a8fbe4578 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Thu, 1 Sep 2016 11:45:18 -0700
Subject: [PATCH 2/4] dfa: avoid invalid character matching period

* dfa.c (transit_state): Avoid invalid character matching period.
---
 src/dfa.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/dfa.c b/src/dfa.c
index e39d82a..075576c 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3000,6 +3000,12 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
     s = transit_state_singlebyte (d, s, pp);
   *pp += mbclen - i;
 
+  if (wc == WEOF)
+    {
+      /* It is an invalid character, so ANYCHAR is not accepted.  */
+      return s;
+    }
+
   if (d->states[s1].curr_dependent)
     {
       if (s < 0)
-- 
2.7.4

>From 83db3ea1a09cbff41aa2a9c7d1bd6dc01612c862 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Thu, 1 Sep 2016 11:45:18 -0700
Subject: [PATCH 3/4] dfa: document previous change

* NEWS: Adjust to match previous change.
---
 NEWS | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/NEWS b/NEWS
index 91f1bfc..56c735f 100644
--- a/NEWS
+++ b/NEWS
@@ -13,8 +13,8 @@ GNU grep NEWS                                    -*- outline -*-
   grep -iF is typically much faster in a multibyte locale, if the
   pattern and its case counterparts contain only single byte characters.
 
-  In multibyte locales that are not UTF-8, grep now handles leading
-  "." in patterns more efficiently.
+  In multibyte locales, grep now handles leading "." in patterns more
+  efficiently.
 
   grep now prints a "FILENAME:LINENO: " prefix when diagnosing an
   invalid regular expression that was read from an '-f'-specified file.
-- 
2.7.4

>From 980bcbff1046559d6a5299055f22ba6b8703ed8a Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Thu, 1 Sep 2016 11:45:18 -0700
Subject: [PATCH 4/4] dfa: remove separation by context in transition in
 non-UTF8 multibyte locales

* src/dfa.c (struct dfa): Remove member curr_dependent.  All uses
removed.
---
 src/dfa.c | 53 +++--------------------------------------------------
 1 file changed, 3 insertions(+), 50 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 075576c..00562ea 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -303,9 +303,6 @@ typedef struct
   size_t hash;                  /* Hash of the positions of this state.  */
   position_set elems;           /* Positions this state could match.  */
   unsigned char context;        /* Context from previous state.  */
-  bool curr_dependent;          /* True if the follows of any positions with
-                                   ANYCHAR depends on the next character's
-                                   context.  */
   unsigned short constraint;    /* Constraint for this state to accept.  */
   token first_end;              /* Token value of the first END in elems.  */
   position_set mbps;            /* Positions which can match multibyte
@@ -2027,7 +2024,6 @@ state_index (struct dfa *d, position_set const *s, int context)
   size_t hash = 0;
   int constraint = 0;
   state_num i, j;
-  bool curr_dependent = false;
   token first_end = 0;
 
   for (i = 0; i < s->nelem; ++i)
@@ -2081,17 +2077,6 @@ state_index (struct dfa *d, position_set const *s, int context)
         }
       else if (d->tokens[s->elems[j].index] == BACKREF)
         constraint = NO_CONSTRAINT;
-      if (d->tokens[s->elems[j].index] == ANYCHAR)
-        {
-          int acceptable
-            = ((SUCCEEDS_IN_CONTEXT (c, context, CTX_NEWLINE)
-                ? CTX_NEWLINE : 0)
-               | (SUCCEEDS_IN_CONTEXT (c, context, CTX_LETTER)
-                  ? CTX_LETTER : 0)
-               | (SUCCEEDS_IN_CONTEXT (c, context, CTX_NONE)
-                  ? CTX_NONE : 0));
-          curr_dependent |= acceptable && (context & ~acceptable);
-        }
     }
 
 
@@ -2102,7 +2087,6 @@ state_index (struct dfa *d, position_set const *s, int context)
   alloc_position_set (&d->states[i].elems, s->nelem);
   copy (s, &d->states[i].elems);
   d->states[i].context = context;
-  d->states[i].curr_dependent = curr_dependent;
   d->states[i].constraint = constraint;
   d->states[i].first_end = first_end;
   d->states[i].mbps.nelem = 0;
@@ -2568,14 +2552,8 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
              positions which has ANYCHAR does not depend on context of
              next character, we put the follows instead of it to
              D->states[s].mbps to optimize.  */
-          if (d->states[s].curr_dependent)
-            {
-              if (d->states[s].mbps.nelem == 0)
-                alloc_position_set (&d->states[s].mbps, 1);
-              insert (pos, &d->states[s].mbps);
-            }
-          else if (SUCCEEDS_IN_CONTEXT (pos.constraint,
-                                        d->states[s].context, CTX_ANY))
+          if (SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context,
+                                   CTX_NONE))
             {
               if (d->states[s].mbps.nelem == 0)
                 alloc_position_set (&d->states[s].mbps,
@@ -2986,7 +2964,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
   state_num s1, s2;
   wint_t wc;
   int separate_contexts;
-  size_t i, j;
+  size_t i;
 
   int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
 
@@ -3006,31 +2984,6 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
       return s;
     }
 
-  if (d->states[s1].curr_dependent)
-    {
-      if (s < 0)
-        d->mb_follows.nelem = 0;
-      else
-        copy (&d->states[s].elems, &d->mb_follows);
-
-      for (i = 0; i < d->states[s1].mbps.nelem; i++)
-        {
-          if (!SUCCEEDS_IN_CONTEXT (d->states[s1].mbps.elems[i].constraint,
-                                    d->states[s1].context, CTX_NONE))
-            continue;
-          for (j = 0; j < d->follows[d->states[s1].mbps.elems[i].index].nelem;
-               j++)
-            insert (d->follows[d->states[s1].mbps.elems[i].index].elems[j],
-                    &d->mb_follows);
-        }
-
-      separate_contexts = state_separate_contexts (&d->mb_follows);
-      s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
-      realloc_trans_if_necessary (d, s);
-
-      return s;
-    }
-
   /* If all positions which have ANYCHAR do not depend on the context
      of the next character, calculate the next state with
      pre-calculated follows and cache the result.  */
-- 
2.7.4

bug#23932: dfa: use algorithm for single byte character to any single byte character in input text always

Reply via email to