When a pattern contains a constraint (such as `^'), grep may produce a
wrong result in non-UTF-8 locales.  The following example fails on the
current master:

  $ pattern=$(printf '^x\|\244\263')
  $ printf '\263\244\263\244\n' |
     env LC_ALL=ja_JP.eucJP src/grep "$pattern" && echo FAIL

skip_remains_mb is run only in state 0, but that is wrong.  When a
constraint is set, the DFA may transition to a state other than state 0
after a failure.
From 060bcdbdfde4fb73fb0c90c05c6298cd37be6663 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Sat, 11 Oct 2014 11:38:09 +0900
Subject: [PATCH] dfa: treat a multibyte character even with constraints
 correctly

* src/dfa.c (struct dfa): Add new members `min_trcount',
`initstate_letter' and `initstate_others'.
(dfaanalyze): Build initial states not only for the newline context
but also for the other contexts.
(build_state): Don't release initial states.
(dfaexec_main): If there are multiple initial states, transition to
the appropriate one after skipping past the middle of a multibyte
character.
* tests/euc-mb: Add a new test.
---
 src/dfa.c    | 68 ++++++++++++++++++++++++++++++++++++++++++++++++------------
 tests/euc-mb |  1 +
 2 files changed, 56 insertions(+), 13 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 58a4b83..9899749 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -405,6 +405,10 @@ struct dfa
                                    slots so far, not counting trans[-1].  */
   int trcount;                  /* Number of transition tables that have
                                    actually been built.  */
+  int min_trcount;              /* Minimum of a number of transition tables.
+                                   Always keep the number, even if release
+                                   transition tables.  It also a number of
+                                   initial states.  */
   state_num **trans;            /* Transition tables for states that can
                                    never accept.  If the transitions for a
                                    state have not yet been computed, or the
@@ -423,6 +427,8 @@ struct dfa
                                    newline is stored separately and handled
                                    as a special case.  Newline is also used
                                    as a sentinel at the end of the buffer.  */
+  state_num initstate_letter;   /* Initial state for letter context.  */
+  state_num initstate_others;   /* Initial state for other contexts.  */
   struct dfamust *musts;        /* List of strings, at least one of which
                                    is known to appear in any r.e. matching
                                    the dfa.  */
@@ -2517,9 +2523,16 @@ dfaanalyze (struct dfa *d, int searchflag)
 
   /* Build the initial state.  */
   separate_contexts = state_separate_contexts (&merged);
-  state_index (d, &merged,
-               (separate_contexts & CTX_NEWLINE
-                ? CTX_NEWLINE : separate_contexts ^ CTX_ANY));
+  if (separate_contexts & CTX_NEWLINE)
+    state_index (d, &merged, CTX_NEWLINE);
+  d->initstate_others = d->min_trcount
+    = state_index (d, &merged, separate_contexts ^ CTX_ANY);
+  if (separate_contexts & CTX_LETTER)
+    d->initstate_letter = d->min_trcount
+      = state_index (d, &merged, CTX_LETTER);
+  else
+    d->initstate_letter = d->initstate_others;
+  d->min_trcount++;
 
   free (posalloc);
   free (stkalloc);
@@ -2859,13 +2872,13 @@ build_state (state_num s, struct dfa *d)
      not clear the initial state, as it's always used.  */
   if (d->trcount >= 1024)
     {
-      for (i = 1; i < d->tralloc; ++i)
+      for (i = d->min_trcount; i < d->tralloc; ++i)
         {
           free (d->trans[i]);
           free (d->fails[i]);
           d->trans[i] = d->fails[i] = NULL;
         }
-      d->trcount = 1;
+      d->trcount = d->min_trcount;
     }
 
   ++d->trcount;
@@ -3316,20 +3329,49 @@ dfaexec_main (struct dfa *d, char const *begin, char *end,
             {
               s1 = s;
 
-              if (s == 0)
+              if (s < d->min_trcount)
                 {
-                  if (d->states[s].mbps.nelem == 0)
+                  if (d->min_trcount == 1)
                     {
-                      do
+                      if (d->states[s].mbps.nelem == 0)
                         {
-                          while (t[*p] == 0)
-                            p++;
-                          p = mbp = skip_remains_mb (d, p, mbp, end);
+                          do
+                            {
+                              while (t[*p] == 0)
+                                p++;
+                              p = mbp = skip_remains_mb (d, p, mbp, end);
+                            }
+                          while (t[*p] == 0);
                         }
-                      while (t[*p] == 0);
+                      else
+                        p = mbp = skip_remains_mb (d, p, mbp, end);
                     }
                   else
-                    p = mbp = skip_remains_mb (d, p, mbp, end);
+                    {
+                      mbp = skip_remains_mb (d, p, mbp, end);
+
+                      /* If d->min_trcount is greater than 1, maybe
+                         transit to another initial state after skip.  */
+                      if (p < mbp)
+                        {
+                          if (*p == eol)
+                            s = 0;
+                          else if (d->initstate_letter == d->initstate_others)
+                            s = d->initstate_others;
+                          else
+                            {
+                              wint_t wc;
+                              mbs_to_wchar (&wc, (char const *) p,
+                                            (unsigned char *) end - p, d);
+                              if (wchar_context (wc))
+                                s = d->initstate_letter;
+                              else
+                                s = d->initstate_others;
+                            }
+                          p = mbp;
+                          s1 = s;
+                        }
+                    }
                 }
 
               if (d->states[s].mbps.nelem == 0)
diff --git a/tests/euc-mb b/tests/euc-mb
index 6a9a845..b625046 100755
--- a/tests/euc-mb
+++ b/tests/euc-mb
@@ -39,6 +39,7 @@ make_input BABAAB |euc_grep AB > out || fail=1
 make_input BABAAB > exp || framework_failure_
 compare exp out || fail=1
 make_input BABABA |euc_grep AB; test $? = 1 || fail=1
+make_input BABABA |euc_grep '^x\|AB'; test $? = 1 || fail=1
 
 # -P supports only unibyte and UTF-8 locales.
 LC_ALL=$locale grep -P x /dev/null
-- 
2.1.1

Reply via email to