bug#23932: dfa: use algorithm for single byte character to any single byte character in input text always

Norihiro Tanaka Tue, 16 Aug 2016 07:39:20 -0700

On Sun, 10 Jul 2016 18:51:43 +0900
Norihiro Tanaka <nori...@kcn.ne.jp> wrote:


> In multibyte locales, if a pattern start with period expression,
> matching is still slow, as transition table is built at run time,
> even when next character is single byte in input text.
> 
> This patch changes it into as use algorithm for single byte character to
> any single byte character in input text always.  If transition table has
> been built already and a next character in input text is single byte,
> transit to next state by reference of only pre-built transition table,
> even if from a state including ANYCHAR.
> 
> $ yes "$(printf 'a%038db\n' 0)" | head -1000000 >in
> $ env LC_ALL=C gcc -v
> Reading specs from /usr/local/lib/gcc/x86_64-pc-linux-gnu/4.4.7/specs
> Target: x86_64-pc-linux-gnu
> Configured with: ./configure --with-as=/usr/local/bin/as 
> --with-ld=/usr/local/bin/ld --with-system-zlib --enable-__cxa_atexit
> Thread model: posix
> gcc version 4.4.7 (GCC)
> 
> patch#21486 is required before this patch.  grep will speed up by this
> patch additionaly.

I updated the patch due to change in bug#21486, and added a patch
including a minor change.

From ff7d171868b98b7203137579958969a2e7771edd Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Tue, 16 Aug 2016 18:50:03 +0900
Subject: [PATCH 1/2] dfa: use algorithm for single byte character to any single 
byte character in input text always

Even in non-UTF8 locales, if a character at current position in input
text is single byte, we can use CSET to match ANYCHAR.

* src/dfa.c (charclass_index_anychar): New var.  Cache index of CSET
for ANYCHAR.
(lex): Make CSET for ANYCHAR.
(state_index): Simplify.
(dfastate): Consider CSET for ANYCHAR.
(transit_state_singlebyte, transit_state): Remove handling for eolbyte,
as we assume that eolbyte does not appear at current position.
(dfaexec_main): use algorithm for single byte character to any single
byte character in input text always.
---
 src/dfa.c |  126 ++++++++++++++++++++++++-------------------------------------
 1 files changed, 50 insertions(+), 76 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index b64a176..71a8ab7 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -79,6 +79,8 @@ enum
   CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
 };
 
+static size_t charclass_index_anychar = -1;
+
 /* Sets of unsigned characters are stored as bit vectors in arrays of ints.  */
 typedef charclass_word charclass[CHARCLASS_WORDS];
 
@@ -1430,21 +1432,25 @@ lex (void)
         case '.':
           if (backslash)
             goto normal_char;
-          if (dfa->multibyte)
+          if (charclass_index_anychar == (size_t) -1)
             {
-              /* In multibyte environment period must match with a single
-                 character not a byte.  So we use ANYCHAR.  */
-              laststart = false;
-              return lasttok = ANYCHAR;
+              zeroset (ccl);
+              notset (ccl);
+              if (!(syntax_bits & RE_DOT_NEWLINE))
+                clrbit ('\n', ccl);
+              if (syntax_bits & RE_DOT_NOT_NULL)
+                clrbit ('\0', ccl);
+              if (dfa->multibyte)
+                {
+                  for (c2 = 0; c2 < NOTCHAR; ++c2)
+                    if (mbrtowc_cache[c2] == WEOF)
+                      clrbit (c2, ccl);
+                }
+              charclass_index_anychar = charclass_index (ccl);
             }
-          zeroset (ccl);
-          notset (ccl);
-          if (!(syntax_bits & RE_DOT_NEWLINE))
-            clrbit ('\n', ccl);
-          if (syntax_bits & RE_DOT_NOT_NULL)
-            clrbit ('\0', ccl);
           laststart = false;
-          return lasttok = CSET + charclass_index (ccl);
+          return lasttok =
+            dfa->multibyte ? ANYCHAR : CSET + charclass_index_anychar;
 
         case 's':
         case 'S':
@@ -2113,7 +2119,7 @@ state_index (struct dfa *d, position_set const *s, int 
context)
         }
       else if (d->tokens[s->elems[j].index] == BACKREF)
         constraint = NO_CONSTRAINT;
-      if (d->multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
+      if (d->tokens[s->elems[j].index] == ANYCHAR)
         {
           int acceptable
             = ((SUCCEEDS_IN_CONTEXT (c, context, CTX_NEWLINE)
@@ -2590,13 +2596,15 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         setbit (d->tokens[pos.index], matches);
       else if (d->tokens[pos.index] >= CSET)
         copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
-      else if (d->multibyte && d->tokens[pos.index] == ANYCHAR)
+      else if (d->tokens[pos.index] == ANYCHAR)
         {
-          /* ANYCHAR must match a single character, so put it to
-             D->states[s].mbps which contains the positions which can
-             match with a single character not a byte.  If all
-             positions with ANYCHAR do not depend on the context of
-             the next character, put its follows instead to
+          copyset (d->charclasses[charclass_index_anychar], matches);
+
+          /* ANYCHAR must match with a single character, so we must put
+             it to D->states[s].mbps which contains the positions which
+             can match with a single character not a byte.  If all
+             positions which has ANYCHAR does not depend on context of
+             next character, we put the follows instead of it to
              D->states[s].mbps to optimize.  */
           if (d->states[s].curr_dependent)
             {
@@ -2608,9 +2616,11 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
                                         d->states[s].context, CTX_ANY))
             {
               if (d->states[s].mbps.nelem == 0)
-                alloc_position_set (&d->states[s].mbps, 1);
-              for (j = 0; j < d->follows[pos.index].nelem; j++)
-                insert (d->follows[pos.index].elems[j], &d->states[s].mbps);
+                alloc_position_set (&d->states[s].mbps,
+                                    d->follows[pos.index].nelem);
+              for (j = 0; j < d->follows[pos.index].nelem; ++j)
+                insert (d->follows[pos.index].elems[j],
+                        &(d->states[s].mbps));
             }
         }
       else
@@ -2986,16 +2996,6 @@ transit_state_singlebyte (struct dfa *d, state_num s, 
unsigned char const **pp)
 {
   state_num *t;
 
-  if (**pp == eolbyte)
-    {
-      /* S is always an initial state in transit_state, so the
-         transition table for the state must have been built already.  */
-      assert (d->trans[s] || d->fails[s]);
-
-      ++*pp;
-      return d->newlines[s];
-    }
-
   if (d->trans[s])
     t = d->trans[s];
   else if (d->fails[s])
@@ -3022,15 +3022,12 @@ static state_num
 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
                unsigned char const *end)
 {
-  state_num s1;
+  state_num s1, s2;
   wint_t wc;
   int separate_contexts;
-  state_num state, state_newline, mb_index;
   size_t i, j;
 
   int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
-  int context = wc == eolbyte ? CTX_NEWLINE : CTX_NONE;
-  bool context_newline = context == CTX_NEWLINE;
 
   /* This state has some operators which can match a multibyte character.  */
   d->mb_follows.nelem = 0;
@@ -3052,7 +3049,7 @@ transit_state (struct dfa *d, state_num s, unsigned char 
const **pp,
       for (i = 0; i < d->states[s1].mbps.nelem; i++)
         {
           if (!SUCCEEDS_IN_CONTEXT (d->states[s1].mbps.elems[i].constraint,
-                                    d->states[s1].context, context))
+                                    d->states[s1].context, CTX_NONE))
             continue;
           for (j = 0; j < d->follows[d->states[s1].mbps.elems[i].index].nelem;
                j++)
@@ -3061,10 +3058,7 @@ transit_state (struct dfa *d, state_num s, unsigned char 
const **pp,
         }
 
       separate_contexts = state_separate_contexts (&d->mb_follows);
-      if (context_newline && separate_contexts & CTX_NEWLINE)
-        s = state_index (d, &d->mb_follows, CTX_NEWLINE);
-      else
-        s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
+      s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
       realloc_trans_if_necessary (d, s);
 
       return s;
@@ -3077,11 +3071,11 @@ transit_state (struct dfa *d, state_num s, unsigned 
char const **pp,
     {
       if (MAX_TRCOUNT <= d->mb_trcount)
         {
-          state_num s2;
-          for (s2 = -1; s2 < d->tralloc; s2++)
+          state_num s3;
+          for (s3 = -1; s3 < d->tralloc; s3++)
             {
-              free (d->mb_trans[s2]);
-              d->mb_trans[s2] = NULL;
+              free (d->mb_trans[s3]);
+              d->mb_trans[s3] = NULL;
             }
 
           for (i = 0; i < d->sindex; i++)
@@ -3091,22 +3085,16 @@ transit_state (struct dfa *d, state_num s, unsigned 
char const **pp,
       d->states[s1].mb_trindex = d->mb_trcount++;
     }
 
-  mb_index = d->states[s1].mb_trindex * 2;
-
   if (! d->mb_trans[s])
     {
       enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
-      enum { TRANSALLOC_SIZE = 2 * MAX_TRCOUNT * TRANSPTR_SIZE };
+      enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
       d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
-      for (i = 0; i < 2 * MAX_TRCOUNT; i++)
+      for (i = 0; i < MAX_TRCOUNT; i++)
         d->mb_trans[s][i] = -1;
     }
-  else
-    {
-      state = d->mb_trans[s][mb_index + context_newline];
-      if (0 <= state)
-        return state;
-    }
+  else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
+    return d->mb_trans[s][d->states[s1].mb_trindex];
 
   if (s < 0)
     copy (&d->states[s1].mbps, &d->mb_follows);
@@ -3114,17 +3102,12 @@ transit_state (struct dfa *d, state_num s, unsigned 
char const **pp,
     merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
 
   separate_contexts = state_separate_contexts (&d->mb_follows);
-  state = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
-  if (separate_contexts & CTX_NEWLINE)
-    state_newline = state_index (d, &d->mb_follows, CTX_NEWLINE);
-  else
-    state_newline = state;
-  realloc_trans_if_necessary (d, state_newline);
+  s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
+  realloc_trans_if_necessary (d, s2);
 
-  d->mb_trans[s][mb_index] = state;
-  d->mb_trans[s][mb_index + 1] = state_newline;
+  d->mb_trans[s][d->states[s1].mb_trindex] = s2;
 
-  return context_newline ? state_newline : state;
+  return s2;
 }
 
 /* The initial state may encounter a byte which is not a single byte character
@@ -3251,10 +3234,8 @@ dfaexec_main (struct dfa *d, char const *begin, char 
*end, bool allow_nl,
                     }
                 }
 
-              if (d->states[s].mbps.nelem == 0 || (*p == eol && !allow_nl)
-                  || (*p == '\n' && !(syntax_bits & RE_DOT_NEWLINE))
-                  || (*p == '\0' && (syntax_bits & RE_DOT_NOT_NULL))
-                  || (char *) p >= end)
+              if (d->states[s].mbps.nelem == 0
+                  || mbrtowc_cache[*p] != WEOF || (char *) p >= end)
                 {
                   /* If an input character does not match ANYCHAR, do it
                      like a single-byte character.  */
@@ -3263,8 +3244,6 @@ dfaexec_main (struct dfa *d, char const *begin, char 
*end, bool allow_nl,
               else
                 {
                   s = transit_state (d, s, &p, (unsigned char *) end);
-                  if (s >= 0 && p[-1] == eol)
-                    nlcount++;
                   mbp = p;
                   trans = d->trans;
                 }
@@ -3321,10 +3300,7 @@ dfaexec_main (struct dfa *d, char const *begin, char 
*end, bool allow_nl,
 
           s1 = s;
           if (!multibyte || d->states[s].mbps.nelem == 0
-              || (*p == eol && !allow_nl)
-              || (*p == '\n' && !(syntax_bits & RE_DOT_NEWLINE))
-              || (*p == '\0' && (syntax_bits & RE_DOT_NOT_NULL))
-              || (char *) p >= end)
+              || mbrtowc_cache[*p] != WEOF || (char *) p >= end)
             {
               /* If a input character does not match ANYCHAR, do it
                  like a single-byte character.  */
@@ -3333,8 +3309,6 @@ dfaexec_main (struct dfa *d, char const *begin, char 
*end, bool allow_nl,
           else
             {
               s = transit_state (d, s, &p, (unsigned char *) end);
-              if (s >= 0 && p[-1] == eol)
-                nlcount++;
               mbp = p;
               trans = d->trans;
             }
-- 
1.7.1

From e9286b7e92f094b7c0342c6a38c67d8d30c2a0de Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Tue, 16 Aug 2016 23:23:19 +0900
Subject: [PATCH 2/2] dfa: avoid invalid character matches period

* dfa.c (transit_state): Avoid invalid character matches period.
---
 src/dfa.c |    6 ++++++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 71a8ab7..f79d67e 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3039,6 +3039,12 @@ transit_state (struct dfa *d, state_num s, unsigned char 
const **pp,
     s = transit_state_singlebyte (d, s, pp);
   *pp += mbclen - i;
 
+  if (wc == WEOF)
+    {
+      /* It is invalid character.  Then ANYCHAR is not accepted.  */
+      return s;
+    }
+
   if (d->states[s1].curr_dependent)
     {
       if (s < 0)
-- 
1.7.1

bug#23932: dfa: use algorithm for single byte character to any single byte character in input text always

Reply via email to