bug#21486: [PATCH 3/3] dfa: cache transition from a state with dot expression in non-UTF8 multibyte locales

Norihiro Tanaka Tue, 15 Sep 2015 06:49:16 -0700

For many patterns, matching in non-UTF8 multibyte locales is as fast as
in single byte du to many improvement.  However, for patterns to begin
with one or more than period, it is still slow.  This change improves
performance for them.


Compare the run times of this command before and after this change:
(on a i5-4570 CPU @ 3.20GHz using rawhide (~fedora 22) and compiled
with gcc 5.1.1 20150618)
yes "$(printf 'a%38db\n' 0)" | head -1000000 >in

env LC_ALL=ja_JP.eucJP time -p src/grep '.a.b' in
  Before: 2.33
   After: 0.69

env LC_ALL=ja_JP.eucJP time -p src/grep ^..............................$ in
  Before: 2.35
  After : 0.45

$ env LC_ALL=ja_JP.eucJP time -p src/grep 
.......................................... in
  Before: 19.10
  After :  0.55

By the way, before applying this patch, two patches in bug#21266 needs
to be applied.

From 7ceaf089564d74b8ca14be930477d2bb21739687 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Sat, 12 Sep 2015 12:28:09 +0900
Subject: [PATCH 3/3] dfa: cache transition from a state with dot expression in
 non-UTF8 multibyte locales

In non-UTF8 locales, matching dot expression is very slow, as the next
state is calculated on depmand.  This change caches the result for the
typical case.

Compare the run times of this command before and after this change:
(on a i5-4570 CPU @ 3.20GHz using rawhide (~fedora 22) and compiled
with gcc 5.1.1 20150618)
yes "$(printf 'a%38db\n' 0)" | head -1000000 >in
env LC_ALL=ja_JP.eucJP time -p \
src/grep .......................................... in
  Before: 19.10
  After :  0.55

* NEWS: Document this.
* src/dfa.c: (struct dfa_state) [curr_dependent, mb_trindex]: New member.
(enum MAX_TRCOUNT): New enum.
(struct dfa) [mb_trans, mb_trcount]: New member.
(state_index): Initialize new members of struct dfa_state and calculate
dependency on context of next character for positions for dot.
(dfastate): Calculate follows positions for dot if enable.
(realloc_trans_if_necessary): Allocate D->mb_trans.
(build_state): Use new enum and Reset D->mb_trans.
(transit_state): Use cache for transition from a state with dot
expression.
(free_mbdata): Deallocate D->mb_trans.
---
 src/dfa.c | 220 ++++++++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 171 insertions(+), 49 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index f8c42fc..af72c14 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -283,16 +283,25 @@ typedef struct
   position_set elems;           /* Positions this state could match.  */
   unsigned char context;        /* Context from previous state.  */
   unsigned short constraint;    /* Constraint for this state to accept.  */
+  bool curr_dependent;          /* True if follows of any positions which
+                                   has ANYCHAR depends on context of next
+                                   character.  */
   token first_end;              /* Token value of the first END in elems.  */
   position_set mbps;            /* Positions which can match multibyte
-                                   characters, e.g., period.
+                                   characters, or the follows, e.g., period.
                                    Used only if MB_CUR_MAX > 1.  */
+  size_t mb_trindex;            /* Index of this state in MB_TRANS.  If
+                                   the state does not have ANYCHAR, this
+                                   value is -1.  */
 } dfa_state;
 
 /* States are indexed by state_num values.  These are normally
    nonnegative but -1 is used as a special value.  */
 typedef ptrdiff_t state_num;
 
+/*  Maximum of number of transition tables.  */
+enum { MAX_TRCOUNT = 1024 };
+
 /* A bracket operator.
    e.g., [a-c], [[:alpha:]], etc.  */
 struct mb_char_classes
@@ -415,8 +424,10 @@ struct dfa
                                    context in multibyte locales, in which we
                                    do not distinguish between their contexts,
                                    as not supported word.  */
-  position_set mb_follows;      /* Follow set added by ANYCHAR and/or MBCSET
-                                   on demand.  */
+  position_set mb_follows;      /* Follow set added by ANYCHAR on demand.  */
+  state_num **mb_trans;         /* Transition tables for states with ANYCHAR.  
*/
+  size_t mb_trcount;            /* Number of transition tables for states with
+                                   ANYCHAR that have actually been built.  */
 };
 
 /* Some macros for user access to dfa internals.  */
@@ -2100,21 +2111,37 @@ state_index (struct dfa *d, position_set const *s, int 
context)
   copy (s, &d->states[i].elems);
   d->states[i].context = context;
   d->states[i].constraint = 0;
+  d->states[i].curr_dependent = false;
   d->states[i].first_end = 0;
   d->states[i].mbps.nelem = 0;
   d->states[i].mbps.elems = NULL;
+  d->states[i].mb_trindex = -1;
 
   for (j = 0; j < s->nelem; ++j)
-    if (d->tokens[s->elems[j].index] < 0)
-      {
-        constraint = s->elems[j].constraint;
-        if (SUCCEEDS_IN_CONTEXT (constraint, context, CTX_ANY))
-          d->states[i].constraint |= constraint;
-        if (!d->states[i].first_end)
-          d->states[i].first_end = d->tokens[s->elems[j].index];
-      }
-    else if (d->tokens[s->elems[j].index] == BACKREF)
-      d->states[i].constraint = NO_CONSTRAINT;
+    {
+      constraint = s->elems[j].constraint;
+      if (d->tokens[s->elems[j].index] < 0)
+        {
+          if (SUCCEEDS_IN_CONTEXT (constraint, context, CTX_ANY))
+            d->states[i].constraint |= constraint;
+          if (!d->states[i].first_end)
+            d->states[i].first_end = d->tokens[s->elems[j].index];
+        }
+      else if (d->tokens[s->elems[j].index] == BACKREF)
+        d->states[i].constraint = NO_CONSTRAINT;
+      if (d->multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
+        {
+          int acceptable = 0;
+          if (SUCCEEDS_IN_CONTEXT (constraint, context, CTX_NEWLINE))
+            acceptable |= CTX_NEWLINE;
+          if (SUCCEEDS_IN_CONTEXT (constraint, context, CTX_LETTER))
+            acceptable |= CTX_LETTER;
+          if (SUCCEEDS_IN_CONTEXT (constraint, context, CTX_NONE))
+            acceptable |= CTX_NONE;
+          if (acceptable != 0 && (acceptable & context) != context)
+            d->states[i].curr_dependent = true;
+        }
+    }
 
   ++d->sindex;
 
@@ -2565,20 +2592,31 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         setbit (d->tokens[pos.index], matches);
       else if (d->tokens[pos.index] >= CSET)
         copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
-      else
+      else if (d->multibyte && d->tokens[pos.index] == ANYCHAR)
+        /* ANYCHAR must match with a single character, so we must put
+           it to D->states[s].mbps which contains the positions which
+           can match with a single character not a byte.  If all
+           positions which has ANYCHAR does not depend on context of
+           next character, we put the follows instead of it to
+           D->states[s].mbps to optimize.  */
         {
-          if (d->tokens[pos.index] == MBCSET
-              || d->tokens[pos.index] == ANYCHAR)
+          if (d->states[s].curr_dependent)
             {
-              /* ANYCHAR and MBCSET must match with a single character, so we
-                 must put it to d->states[s].mbps, which contains the positions
-                 which can match with a single character not a byte.  */
               if (d->states[s].mbps.nelem == 0)
                 alloc_position_set (&d->states[s].mbps, 1);
               insert (pos, &(d->states[s].mbps));
             }
-          continue;
+          else if (SUCCEEDS_IN_CONTEXT (pos.constraint,
+                                        d->states[s].context, CTX_ANY))
+            {
+              if (d->states[s].mbps.nelem == 0)
+                alloc_position_set (&d->states[s].mbps, 1);
+              for (j = 0; j < d->follows[pos.index].nelem; ++j)
+                insert (d->follows[pos.index].elems[j], &(d->states[s].mbps));
+            }
         }
+      else
+        continue;
 
       /* Some characters may need to be eliminated from matches because
          they fail in the current context.  */
@@ -2847,10 +2885,20 @@ realloc_trans_if_necessary (struct dfa *d, state_num 
new_state)
       d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
       d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
       d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
+      if (d->multibyte)
+        {
+          realtrans = d->mb_trans ? d->mb_trans - 1 : NULL;
+          realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
+          if (oldalloc == 0)
+            realtrans[0] = NULL;
+          d->mb_trans = realtrans + 1;
+        }
       for (; oldalloc < newalloc; oldalloc++)
         {
           d->trans[oldalloc] = NULL;
           d->fails[oldalloc] = NULL;
+          if (d->multibyte)
+            d->mb_trans[oldalloc] = NULL;
         }
     }
 }
@@ -2869,11 +2917,11 @@ build_state (state_num s, struct dfa *d)
   state_num i, maxstate;
 
   /* Set an upper limit on the number of transition tables that will ever
-     exist at once.  1024 is arbitrary.  The idea is that the frequently
+     exist at once.  MAX_TRCOUNT is arbitrary.  The idea is that the frequently
      used transition tables will be quickly rebuilt, whereas the ones that
      were only needed once or twice will be cleared away.  However, do not
      clear the initial D->min_trcount states, since they are always used.  */
-  if (d->trcount >= 1024)
+  if (d->trcount >= MAX_TRCOUNT)
     {
       for (i = d->min_trcount; i < d->tralloc; ++i)
         {
@@ -2882,6 +2930,17 @@ build_state (state_num s, struct dfa *d)
           d->trans[i] = d->fails[i] = NULL;
         }
       d->trcount = d->min_trcount;
+
+      if (d->multibyte)
+        {
+          for (i = d->min_trcount; i < d->tralloc; ++i)
+            {
+              free (d->mb_trans[i]);
+              d->mb_trans[i] = NULL;
+            }
+          free (d->mb_trans[-1]);
+          d->mb_trans[-1] = NULL;
+        }
     }
 
   ++d->trcount;
@@ -2966,53 +3025,108 @@ static state_num
 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
                unsigned char const *end)
 {
-  state_num s1, s2;
-  int mbclen;  /* The length of current input multibyte character.  */
+  state_num s1;
+  size_t mbclen;  /* The length of current input multibyte character.  */
   wint_t wc;
+  bool context_newline;
   int context;
-  size_t i, j;
-  int k;
   int separate_contexts;
+  size_t mb_index;
+  state_num state, state_newline;
+  size_t i, j;
 
   /* Note: caller must free the return value of this function.  */
   mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
 
-  context = (wc == (wchar_t) eolbyte || wc == 0) ? CTX_NEWLINE : CTX_NONE;
-
-  /* This state has some operators which can match a multibyte character.  */
-  d->mb_follows.nelem = 0;
+  context_newline = (wc == (wchar_t) eolbyte || wc == 0);
+  context = context_newline ? CTX_NEWLINE : CTX_NONE;
 
   /* Calculate the state which can be reached from the state 's' by
      consuming 'mbclen' single bytes from the buffer.  */
   s1 = s;
-  for (k = 0; k < mbclen; k++)
+  for (i = 0; i < mbclen && s >= 0; i++)
+    s = transit_state_singlebyte (d, s, pp);
+  for (; i < mbclen; i++)
+    (*pp)++;
+
+  if (d->states[s1].curr_dependent)
     {
-      s2 = s1;
-      s1 = transit_state_singlebyte (d, s2, pp);
+      if (s >= 0)
+        copy (&d->states[s].elems, &d->mb_follows);
+      else
+        d->mb_follows.nelem = 0;
+
+      for (i = 0; i < d->states[s1].mbps.nelem; ++i)
+        {
+          if (!SUCCEEDS_IN_CONTEXT (d->states[s1].mbps.elems[i].constraint,
+                                    d->states[s1].context, context))
+            continue;
+          for (j = 0; j < d->follows[d->states[s1].mbps.elems[i].index].nelem;
+               j++)
+            insert (d->follows[d->states[s1].mbps.elems[i].index].elems[j],
+                    &d->mb_follows);
+        }
+
+      separate_contexts = state_separate_contexts (&d->mb_follows);
+      if (context == CTX_NEWLINE && separate_contexts & CTX_NEWLINE)
+        s = state_index (d, &d->mb_follows, CTX_NEWLINE);
+      else
+        s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
+      realloc_trans_if_necessary (d, s);
+
+      return s;
     }
-  copy (&d->states[s1].elems, &d->mb_follows);
 
-  /* Add all of the positions which can be reached from 's' by consuming
-     a single character.  */
-  for (i = 0; i < d->states[s].mbps.nelem; i++)
+  /* If all positions which has ANYCHAR does not depend on context of
+     next character, calculate next state with pre-calculated follows
+     and cache the result.  */
+  if (d->states[s1].mb_trindex == (size_t) -1)
     {
-      if (!SUCCEEDS_IN_CONTEXT (d->states[s].mbps.elems[i].constraint,
-                                d->states[s].context, context))
-        continue;
-      for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
-           j++)
-        insert (d->follows[d->states[s].mbps.elems[i].index].elems[j],
-                &d->mb_follows);
+      if (d->mb_trcount >= MAX_TRCOUNT)
+        {
+          for (i = 0; i < d->tralloc; ++i)
+            {
+              free (d->mb_trans[i]);
+              d->mb_trans[i] = NULL;
+            }
+          free (d->mb_trans[-1]);
+          d->mb_trans[-1] = NULL;
+
+          for (i = 0; i < d->sindex; ++i)
+            d->states[i].mb_trindex = -1;
+          d->mb_trcount = 0;
+        }
+      d->states[s1].mb_trindex = d->mb_trcount++;
     }
 
+  mb_index = d->states[s1].mb_trindex << 1 | (context_newline ? 1 : 0);
+
+  if (d->mb_trans[s] == NULL)
+    {
+      d->mb_trans[s] = xnmalloc (2 * MAX_TRCOUNT, sizeof *d->mb_trans[s]);
+      for (i = 0; i < 2 * MAX_TRCOUNT; i++)
+        d->mb_trans[s][i] = -1;
+    }
+  else if (d->mb_trans[s][mb_index] >= 0)
+    return d->mb_trans[s][mb_index];
+
+  if (s < 0)
+    copy (&d->states[s1].mbps, &d->mb_follows);
+  else
+    merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
+
   separate_contexts = state_separate_contexts (&d->mb_follows);
-  if (context == CTX_NEWLINE && separate_contexts & CTX_NEWLINE)
-    s1 = state_index (d, &d->mb_follows, CTX_NEWLINE);
+  state = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
+  if (separate_contexts & CTX_NEWLINE)
+    state_newline = state_index (d, &d->mb_follows, CTX_NEWLINE);
   else
-    s1 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
-  realloc_trans_if_necessary (d, s1);
+    state_newline = state;
+  realloc_trans_if_necessary (d, state_newline);
 
-  return s1;
+  d->mb_trans[s][mb_index & ~0] = state;
+  d->mb_trans[s][mb_index | 1] = state_newline;
+
+  return (context == CTX_NEWLINE) ? state_newline : state;
 }
 
 /* The initial state may encounter a byte which is not a single byte character
@@ -3295,6 +3409,14 @@ free_mbdata (struct dfa *d)
 
   free (d->mbcsets);
   free (d->mb_follows.elems);
+
+  if (d->mb_trans)
+    {
+      for (i = 0; i < d->tralloc; ++i)
+        free (d->mb_trans[i]);
+      free (d->mb_trans[-1]);
+      free (d->mb_trans - 1);
+    }
 }
 
 /* Initialize the components of a dfa that the other routines don't
-- 
2.4.6

bug#21486: [PATCH 3/3] dfa: cache transition from a state with dot expression in non-UTF8 multibyte locales

Reply via email to