bug#23932: dfa: use algorithm for single byte character to any single byte character in input text always

Norihiro Tanaka Sun, 10 Jul 2016 02:54:07 -0700

In multibyte locales, if a pattern start with period expression,
matching is still slow, as transition table is built at run time,
even when next character is single byte in input text.


This patch changes it into as use algorithm for single byte character to
any single byte character in input text always.  If transition table has
been built already and a next character in input text is single byte,
transit to next state by reference of only pre-built transition table,
even if from a state including ANYCHAR.

$ yes "$(printf 'a%038db\n' 0)" | head -1000000 >in
$ env LC_ALL=C gcc -v
Reading specs from /usr/local/lib/gcc/x86_64-pc-linux-gnu/4.4.7/specs
Target: x86_64-pc-linux-gnu
Configured with: ./configure --with-as=/usr/local/bin/as 
--with-ld=/usr/local/bin/ld --with-system-zlib --enable-__cxa_atexit
Thread model: posix
gcc version 4.4.7 (GCC)

patch#21486 is required before this patch.  grep will speed up by this
patch additionaly.

[grep-2.25]
$ time -p env LC_ALL=ja_JP.eucjp grep .a.b in
real 4.78
user 4.42
sys 0.16
$ time -p env LC_ALL=ja_JP.eucjp grep '.\{41\}' in
real 46.23
user 43.98
sys 0.21

[after patch#21486]
$ time -p env LC_ALL=ja_JP.eucjp src/grep .a.b in
real 1.26
user 1.08
sys 0.08
$ time -p env LC_ALL=ja_JP.eucjp src/grep '.\{41\}' in
real 1.14
user 1.00
sys 0.10

[after this patch]
$ time -p env LC_ALL=ja_JP.eucjp src/grep .a.b in
real 0.47
user 0.36
sys 0.07
$ time -p env LC_ALL=ja_JP.eucjp src/grep '.\{41\}' in
real 0.24
user 0.18
sys 0.05

[locale C (ref.)]
$ time -p env LC_ALL=C src/grep .a.b in
real 0.23
user 0.11
sys 0.09
$ time -p env LC_ALL=C src/grep '.\{41\}' in
real 0.22
user 0.13
sys 0.06

From 3d0c130808c974f1271561c7433b2aa661c49507 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Sun, 10 Jul 2016 10:26:39 +0900
Subject: [PATCH] dfa: use algorithm for single byte character to any single byte
 character in input text always

Even in non-UTF8 locales, if a character at current position in input
text is single byte, we can use CSET to match ANYCHAR.

* src/dfa.c (charclass_index_anychar): New var.  Cache index of CSET
for ANYCHAR.
(lex): Make CSET for ANYCHAR.
(state_index): Simplify.
(dfastate): Consider CSET for ANYCHAR.
(transit_state_singlebyte, transit_state): Remove handling for eolbyte,
as we assume that eolbyte does not appear at current position.
(dfaexec_main): use algorithm for single byte character to any single
byte character in input text always.
---
 src/dfa.c |  115 ++++++++++++++++++++++++------------------------------------
 1 files changed, 46 insertions(+), 69 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 3bd0c68..e932b2c 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -79,6 +79,8 @@ enum
   CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
 };
 
+static size_t charclass_index_anychar = -1;
+
 /* Sets of unsigned characters are stored as bit vectors in arrays of ints.  */
 typedef charclass_word charclass[CHARCLASS_WORDS];
 
@@ -1429,21 +1431,25 @@ lex (void)
         case '.':
           if (backslash)
             goto normal_char;
-          if (dfa->multibyte)
+          if (charclass_index_anychar == (size_t) -1)
             {
-              /* In multibyte environment period must match with a single
-                 character not a byte.  So we use ANYCHAR.  */
-              laststart = false;
-              return lasttok = ANYCHAR;
+              zeroset (ccl);
+              notset (ccl);
+              if (!(syntax_bits & RE_DOT_NEWLINE))
+                clrbit ('\n', ccl);
+              if (syntax_bits & RE_DOT_NOT_NULL)
+                clrbit ('\0', ccl);
+              if (dfa->multibyte)
+                {
+                  for (c2 = 0; c2 < NOTCHAR; ++c2)
+                    if (mbrtowc_cache[c2] == WEOF)
+                      clrbit (c2, ccl);
+                }
+              charclass_index_anychar = charclass_index (ccl);
             }
-          zeroset (ccl);
-          notset (ccl);
-          if (!(syntax_bits & RE_DOT_NEWLINE))
-            clrbit ('\n', ccl);
-          if (syntax_bits & RE_DOT_NOT_NULL)
-            clrbit ('\0', ccl);
           laststart = false;
-          return lasttok = CSET + charclass_index (ccl);
+          return lasttok =
+            dfa->multibyte ? ANYCHAR : CSET + charclass_index_anychar;
 
         case 's':
         case 'S':
@@ -2125,7 +2131,7 @@ state_index (struct dfa *d, position_set const *s, int 
context)
         }
       else if (d->tokens[s->elems[j].index] == BACKREF)
         d->states[i].constraint = NO_CONSTRAINT;
-      if (d->multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
+      else if (d->tokens[s->elems[j].index] == ANYCHAR)
         {
           int acceptable = 0;
           if (SUCCEEDS_IN_CONTEXT (constraint, context, CTX_NEWLINE))
@@ -2588,14 +2594,16 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         setbit (d->tokens[pos.index], matches);
       else if (d->tokens[pos.index] >= CSET)
         copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
-      else if (d->multibyte && d->tokens[pos.index] == ANYCHAR)
-        /* ANYCHAR must match with a single character, so we must put
-           it to D->states[s].mbps which contains the positions which
-           can match with a single character not a byte.  If all
-           positions which has ANYCHAR does not depend on context of
-           next character, we put the follows instead of it to
-           D->states[s].mbps to optimize.  */
+      else if (d->tokens[pos.index] == ANYCHAR)
         {
+          copyset (d->charclasses[charclass_index_anychar], matches);
+
+          /* ANYCHAR must match with a single character, so we must put
+             it to D->states[s].mbps which contains the positions which
+             can match with a single character not a byte.  If all
+             positions which has ANYCHAR does not depend on context of
+             next character, we put the follows instead of it to
+             D->states[s].mbps to optimize.  */
           if (d->states[s].curr_dependent)
             {
               if (d->states[s].mbps.nelem == 0)
@@ -2606,9 +2614,11 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
                                         d->states[s].context, CTX_ANY))
             {
               if (d->states[s].mbps.nelem == 0)
-                alloc_position_set (&d->states[s].mbps, 1);
+                alloc_position_set (&d->states[s].mbps,
+                                    d->follows[pos.index].nelem);
               for (j = 0; j < d->follows[pos.index].nelem; ++j)
-                insert (d->follows[pos.index].elems[j], &(d->states[s].mbps));
+                insert (d->follows[pos.index].elems[j],
+                        &(d->states[s].mbps));
             }
         }
       else
@@ -2984,16 +2994,6 @@ transit_state_singlebyte (struct dfa *d, state_num s, 
unsigned char const **pp)
 {
   state_num *t;
 
-  if (**pp == eolbyte)
-    {
-      /* S is always an initial state in transit_state, so the
-         transition table for the state must have been built already.  */
-      assert (d->trans[s] || d->fails[s]);
-
-      ++*pp;
-      return d->newlines[s];
-    }
-
   if (d->trans[s])
     t = d->trans[s];
   else if (d->fails[s])
@@ -3020,15 +3020,12 @@ static state_num
 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
                unsigned char const *end)
 {
-  state_num s1;
+  state_num s1, s2;
   wint_t wc;
   int separate_contexts;
-  state_num state, state_newline;
   size_t i, j;
 
   int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
-  int context = wc == eolbyte ? CTX_NEWLINE : CTX_NONE;
-  bool context_newline = context == CTX_NEWLINE;
 
   /* This state has some operators which can match a multibyte character.  */
   d->mb_follows.nelem = 0;
@@ -3051,7 +3048,7 @@ transit_state (struct dfa *d, state_num s, unsigned char 
const **pp,
       for (i = 0; i < d->states[s1].mbps.nelem; ++i)
         {
           if (!SUCCEEDS_IN_CONTEXT (d->states[s1].mbps.elems[i].constraint,
-                                    d->states[s1].context, context))
+                                    d->states[s1].context, CTX_NONE))
             continue;
           for (j = 0; j < d->follows[d->states[s1].mbps.elems[i].index].nelem;
                j++)
@@ -3060,10 +3057,7 @@ transit_state (struct dfa *d, state_num s, unsigned char 
const **pp,
         }
 
       separate_contexts = state_separate_contexts (&d->mb_follows);
-      if (context_newline && separate_contexts & CTX_NEWLINE)
-        s = state_index (d, &d->mb_follows, CTX_NEWLINE);
-      else
-        s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
+      s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
       realloc_trans_if_necessary (d, s);
 
       return s;
@@ -3091,17 +3085,14 @@ transit_state (struct dfa *d, state_num s, unsigned 
char const **pp,
       d->states[s1].mb_trindex = d->mb_trcount++;
     }
 
-  size_t mb_index = d->states[s1].mb_trindex << 1 | (context_newline
-                                                     ? 1 : 0);
-
   if (d->mb_trans[s] == NULL)
     {
-      d->mb_trans[s] = xnmalloc (2 * MAX_TRCOUNT, sizeof *d->mb_trans[s]);
-      for (i = 0; i < 2 * MAX_TRCOUNT; i++)
+      d->mb_trans[s] = xnmalloc (MAX_TRCOUNT, sizeof *d->mb_trans[s]);
+      for (i = 0; i < MAX_TRCOUNT; i++)
         d->mb_trans[s][i] = -1;
     }
-  else if (d->mb_trans[s][mb_index] >= 0)
-    return d->mb_trans[s][mb_index];
+  else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
+    return d->mb_trans[s][d->states[s1].mb_trindex];
 
   if (s < 0)
     copy (&d->states[s1].mbps, &d->mb_follows);
@@ -3109,17 +3100,12 @@ transit_state (struct dfa *d, state_num s, unsigned 
char const **pp,
     merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
 
   separate_contexts = state_separate_contexts (&d->mb_follows);
-  state = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
-  if (separate_contexts & CTX_NEWLINE)
-    state_newline = state_index (d, &d->mb_follows, CTX_NEWLINE);
-  else
-    state_newline = state;
-  realloc_trans_if_necessary (d, state_newline);
+  s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
+  realloc_trans_if_necessary (d, s2);
 
-  d->mb_trans[s][mb_index & ~0] = state;
-  d->mb_trans[s][mb_index | 1] = state_newline;
+  d->mb_trans[s][d->states[s1].mb_trindex] = s2;
 
-  return context_newline ? state_newline : state;
+  return s2;
 }
 
 /* The initial state may encounter a byte which is not a single byte character
@@ -3246,10 +3232,8 @@ dfaexec_main (struct dfa *d, char const *begin, char 
*end, bool allow_nl,
                     }
                 }
 
-              if (d->states[s].mbps.nelem == 0 || (*p == eol && !allow_nl)
-                  || (*p == '\n' && !(syntax_bits & RE_DOT_NEWLINE))
-                  || (*p == '\0' && (syntax_bits & RE_DOT_NOT_NULL))
-                  || (char *) p >= end)
+              if (d->states[s].mbps.nelem == 0
+                  || mbrtowc_cache[*p] != WEOF || (char *) p >= end)
                 {
                   /* If an input character does not match ANYCHAR, do it
                      like a single-byte character.  */
@@ -3258,8 +3242,6 @@ dfaexec_main (struct dfa *d, char const *begin, char 
*end, bool allow_nl,
               else
                 {
                   s = transit_state (d, s, &p, (unsigned char *) end);
-                  if (s >= 0 && p[-1] == eol)
-                    nlcount++;
                   mbp = p;
                   trans = d->trans;
                 }
@@ -3311,10 +3293,7 @@ dfaexec_main (struct dfa *d, char const *begin, char 
*end, bool allow_nl,
 
           s1 = s;
           if (!multibyte || d->states[s].mbps.nelem == 0
-              || (*p == eol && !allow_nl)
-              || (*p == '\n' && !(syntax_bits & RE_DOT_NEWLINE))
-              || (*p == '\0' && (syntax_bits & RE_DOT_NOT_NULL))
-              || (char *) p >= end)
+              || mbrtowc_cache[*p] != WEOF || (char *) p >= end)
             {
               /* If a input character does not match ANYCHAR, do it
                  like a single-byte character.  */
@@ -3323,8 +3302,6 @@ dfaexec_main (struct dfa *d, char const *begin, char 
*end, bool allow_nl,
           else
             {
               s = transit_state (d, s, &p, (unsigned char *) end);
-              if (s >= 0 && p[-1] == eol)
-                nlcount++;
               mbp = p;
               trans = d->trans;
             }
-- 
1.7.1

bug#23932: dfa: use algorithm for single byte character to any single byte character in input text always

Reply via email to