bug#21266: [PATCH] dfa: simplify for non-POSIX locales

Paul Eggert Wed, 06 Jul 2016 10:56:57 -0700

Thanks, I merged those two patches and installed them into the grep master, with minor adjustments to the commit messages. I also installed two minor fixup patches, mostly fixing minor style issues. I did notice one minor technical issue; the second patch had code that looked like this:


 +  context = (wc == (wchar_t) eolbyte || wc == 0) ? CTX_NEWLINE : CTX_NONE;

I realize this came from the old wchar_context function, but I don't see why that "|| wc == 0" is there, so I removed it. The tests still pass. If you (or someone else) can explain why it's needed I can put it back in.

Attached are the four patches I installed; the first and third are your patches and the second and fourth are the fixups.

>From f0951ff04e023c24db8755fb5213f54491b795a9 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Wed, 6 Jul 2016 18:41:57 +0200
Subject: [PATCH 1/4] dfa: simplify for non-POSIX locales

Simplify the dfa code, since it no longer supports ranges,
collating elements, and equivalent classes in non-POSIX locales.
* src/dfa.c (struct dfa): Remove mb_match_lens.
(enum status_transit_state, match_anychar)
(check_matching_with_multibyte_ops, transit_state_consume_1char):
(State_transition): Remove.
(transit_state_singlebyte): Accepts pointer-to-pointer position,
instead of pointer, and no longer accept pointer to next state.
Return next state instead of status_transit_state.  All callers
changed.
(transit_state_singlebyte, transit_state): Simplify.
(dfaexec_main): Now transit_state is called only when next character
matches with ANYCHAR.
---
 src/dfa.c | 321 ++++++++++++++++----------------------------------------------
 1 file changed, 81 insertions(+), 240 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 19363ce..74833ba 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -411,9 +411,6 @@ struct dfa
   state_num initstate_others;   /* Initial state for other contexts.  */
   position_set mb_follows;	/* Follow set added by ANYCHAR and/or MBCSET
                                    on demand.  */
-  int *mb_match_lens;           /* Array of length reduced by ANYCHAR and/or
-                                   MBCSET.  Null if mb_follows.elems has not
-                                   been allocated.  */
 };
 
 /* Some macros for user access to dfa internals.  */
@@ -2930,132 +2927,66 @@ build_state (state_num s, struct dfa *d)
 
 /* Multibyte character handling sub-routines for dfaexec.  */
 
-/* Return values of transit_state_singlebyte, and
-   transit_state_consume_1char.  */
-typedef enum
-{
-  TRANSIT_STATE_IN_PROGRESS,    /* State transition has not finished.  */
-  TRANSIT_STATE_DONE,           /* State transition has finished.  */
-  TRANSIT_STATE_END_BUFFER      /* Reach the end of the buffer.  */
-} status_transit_state;
-
 /* Consume a single byte and transit state from 's' to '*next_state'.
    This function is almost same as the state transition routin in dfaexec.
    But state transition is done just once, otherwise matching succeed or
    reach the end of the buffer.  */
-static status_transit_state
-transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const *p,
-                          state_num * next_state)
+static state_num
+transit_state_singlebyte (struct dfa *d, state_num const s,
+                          unsigned char const **pp)
 {
   state_num *t;
-  state_num works = s;
-
-  status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
 
-  while (rval == TRANSIT_STATE_IN_PROGRESS)
+  if (**pp == eolbyte)
     {
-      if ((t = d->trans[works]) != NULL)
-        {
-          works = t[*p];
-          rval = TRANSIT_STATE_DONE;
-          if (works < 0)
-            works = 0;
-        }
-      else if (works < 0)
-        works = 0;
-      else if (d->fails[works])
-        {
-          works = d->fails[works][*p];
-          rval = TRANSIT_STATE_DONE;
-        }
-      else
-        {
-          build_state (works, d);
-        }
-    }
-  *next_state = works;
-  return rval;
-}
+      /* S is always an initial state in transit_state in order that the
+         newline is the single.  When transit_state is called, the
+         transition table for the state must have been built already.  */
+      assert (d->trans[s] != NULL || d->fails[s] != NULL);
 
-/* Match a "." against the current context.  Return the length of the
-   match, in bytes.  POS is the position of the ".".  */
-static int
-match_anychar (struct dfa *d, state_num s, position pos,
-               wint_t wc, size_t mbclen)
-{
-  int context;
-
-  /* Check syntax bits.  */
-  if (wc == (wchar_t) '\n')
-    {
-      if (!(syntax_bits & RE_DOT_NEWLINE))
-        return 0;
-    }
-  else if (wc == (wchar_t) '\0')
-    {
-      if (syntax_bits & RE_DOT_NOT_NULL)
-        return 0;
+      ++*pp;
+      return d->newlines[s];
     }
-  else if (wc == WEOF)
-    return 0;
-
-  context = wchar_context (wc);
-  if (!SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, context))
-    return 0;
-
-  return mbclen;
-}
-
-/* Check whether each of 'd->states[s].mbps.elem' can match.  Then return the
-   array which corresponds to 'd->states[s].mbps.elem'; each element of the
-   array contains the number of bytes with which the element can match.
 
-   The caller MUST free the array which this function return.  */
-static int *
-check_matching_with_multibyte_ops (struct dfa *d, state_num s,
-                                   char const *p, wint_t wc, size_t mbclen)
-{
-  size_t i;
-  int *rarray;
-
-  rarray = d->mb_match_lens;
-  for (i = 0; i < d->states[s].mbps.nelem; ++i)
+  if (d->trans[s] != NULL)
+    t = d->trans[s];
+  else if (d->fails[s] != NULL)
+    t = d->fails[s];
+  else
     {
-      position pos = d->states[s].mbps.elems[i];
-      switch (d->tokens[pos.index])
-        {
-        case ANYCHAR:
-          rarray[i] = match_anychar (d, s, pos, wc, mbclen);
-          break;
-        default:
-          break;                /* cannot happen.  */
-        }
+      build_state (s, d);
+      if (d->trans[s])
+        t = d->trans[s];
+      else if (d->fails[s])
+        t = d->fails[s];
+      else
+        abort ();
     }
-  return rarray;
-}
 
-/* Consume a single character and enumerate all of the positions which can
-   be the next position from the state 's'.
-
-   'match_lens' is the input.  It can be NULL, but it can also be the output
-   of check_matching_with_multibyte_ops for optimization.
+  return t[*(*pp)++];
+}
 
-   'mbclen' and 'pps' are the output.  'mbclen' is the length of the
-   character consumed, and 'pps' is the set this function enumerates.  */
-static status_transit_state
-transit_state_consume_1char (struct dfa *d, state_num s,
-                             unsigned char const **pp,
-                             wint_t wc, size_t mbclen,
-                             int *match_lens)
+/* Transit state from s, then return new state and update the pointer of
+   the buffer.  This function is for a period operator which can match a
+   multi-byte character.  */
+static state_num
+transit_state (struct dfa *d, state_num s, unsigned char const **pp,
+               unsigned char const *end)
 {
+  state_num s1, s2;
+  int mbclen;  /* The length of current input multibyte character.  */
+  wint_t wc;
+  int context;
   size_t i, j;
   int k;
-  state_num s1, s2;
-  status_transit_state rs = TRANSIT_STATE_DONE;
 
-  if (! match_lens && d->states[s].mbps.nelem != 0)
-    match_lens = check_matching_with_multibyte_ops (d, s, (char const *) *pp,
-                                                    wc, mbclen);
+  /* Note: caller must free the return value of this function.  */
+  mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
+
+  context = wchar_context (wc);
+
+  /* This state has some operators which can match a multibyte character.  */
+  d->mb_follows.nelem = 0;
 
   /* Calculate the state which can be reached from the state 's' by
      consuming 'mbclen' single bytes from the buffer.  */
@@ -3063,7 +2994,7 @@ transit_state_consume_1char (struct dfa *d, state_num s,
   for (k = 0; k < mbclen; k++)
     {
       s2 = s1;
-      rs = transit_state_singlebyte (d, s2, (*pp)++, &s1);
+      s1 = transit_state_singlebyte (d, s2, pp);
     }
   copy (&d->states[s1].elems, &d->mb_follows);
 
@@ -3071,94 +3002,18 @@ transit_state_consume_1char (struct dfa *d, state_num s,
      a single character.  */
   for (i = 0; i < d->states[s].mbps.nelem; i++)
     {
-      if (match_lens[i] == mbclen)
-        for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
-             j++)
-          insert (d->follows[d->states[s].mbps.elems[i].index].elems[j],
-                  &d->mb_follows);
-    }
-
-  /* FIXME: this return value is always ignored.  */
-  return rs;
-}
-
-/* Transit state from s, then return new state and update the pointer of the
-   buffer.  This function is for some operator which can match with a multi-
-   byte character or a collating element (which may be multi characters).  */
-static state_num
-transit_state (struct dfa *d, state_num s, unsigned char const **pp,
-               unsigned char const *end)
-{
-  state_num s1;
-  int mbclen;  /* The length of current input multibyte character.  */
-  int maxlen = 0;
-  size_t i, j;
-  int *match_lens = NULL;
-  size_t nelem = d->states[s].mbps.nelem;       /* Just a alias.  */
-  unsigned char const *p1 = *pp;
-  wint_t wc;
-
-  if (nelem > 0)
-    /* This state has (a) multibyte operator(s).
-       We check whether each of them can match or not.  */
-    {
-      /* Note: caller must free the return value of this function.  */
-      mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
-      match_lens = check_matching_with_multibyte_ops (d, s, (char const *) *pp,
-                                                      wc, mbclen);
-
-      for (i = 0; i < nelem; i++)
-        /* Search the operator which match the longest string,
-           in this state.  */
-        {
-          if (match_lens[i] > maxlen)
-            maxlen = match_lens[i];
-        }
-    }
-
-  if (nelem == 0 || maxlen == 0)
-    /* This state has no multibyte operator which can match.
-       We need to check only one single byte character.  */
-    {
-      status_transit_state rs;
-      rs = transit_state_singlebyte (d, s, *pp, &s1);
-
-      /* We must update the pointer if state transition succeeded.  */
-      if (rs == TRANSIT_STATE_DONE)
-        ++*pp;
-
-      return s1;
+      if (!SUCCEEDS_IN_CONTEXT (d->states[s].mbps.elems[i].constraint,
+                                d->states[s].context, context))
+        continue;
+      for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
+           j++)
+        insert (d->follows[d->states[s].mbps.elems[i].index].elems[j],
+                &d->mb_follows);
     }
 
-  /* This state has some operators which can match a multibyte character.  */
-  d->mb_follows.nelem = 0;
-
-  /* 'maxlen' may be longer than the length of a character, because it may
-     not be a character but a (multi character) collating element.
-     We enumerate all of the positions which 's' can reach by consuming
-     'maxlen' bytes.  */
-  transit_state_consume_1char (d, s, pp, wc, mbclen, match_lens);
-
   s1 = state_index (d, &d->mb_follows, wchar_context (wc));
   realloc_trans_if_necessary (d, s1);
 
-  while (*pp - p1 < maxlen)
-    {
-      mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
-      transit_state_consume_1char (d, s1, pp, wc, mbclen, NULL);
-
-      for (i = 0; i < nelem; i++)
-        {
-          if (match_lens[i] == *pp - p1)
-            for (j = 0;
-                 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
-              insert (d->follows[d->states[s1].mbps.elems[i].index].elems[j],
-                      &d->mb_follows);
-        }
-
-      s1 = state_index (d, &d->mb_follows, wchar_context (wc));
-      realloc_trans_if_necessary (d, s1);
-    }
   return s1;
 }
 
@@ -3238,11 +3093,8 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
   if (multibyte)
     {
       memset (&d->mbs, 0, sizeof d->mbs);
-      if (! d->mb_match_lens)
-        {
-          d->mb_match_lens = xnmalloc (d->nleaves, sizeof *d->mb_match_lens);
-          alloc_position_set (&d->mb_follows, d->nleaves);
-        }
+      if (d->mb_follows.alloc == 0)
+        alloc_position_set (&d->mb_follows, d->nleaves);
     }
 
   for (;;)
@@ -3293,44 +3145,21 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
                     }
                 }
 
-              if (d->states[s].mbps.nelem == 0)
+              if (d->states[s].mbps.nelem == 0 || (*p == eol && !allow_nl)
+                  || (*p == '\n' && !(syntax_bits & RE_DOT_NEWLINE))
+                  || (*p == '\0' && (syntax_bits & RE_DOT_NOT_NULL))
+                  || (char *) p >= end)
+                /* If a input character does not match ANYCHAR, do it
+                   like a single-byte character.  */
+                s = t[*p++];
+              else
                 {
-                  s = t[*p++];
-                  continue;
+                  s = transit_state (d, s, &p, (unsigned char *) end);
+                  if (s >= 0 && p[-1] == eol)
+                    nlcount++;
+                  mbp = p;
+                  trans = d->trans;
                 }
-
-              /* The following code is used twice.
-                 Use a macro to avoid the risk that they diverge.  */
-#define State_transition()                                              \
-  do {                                                                  \
-              /* Can match with a multibyte character (and multi-character \
-                 collating element).  Transition table might be updated.  */ \
-              s = transit_state (d, s, &p, (unsigned char *) end);      \
-                                                                        \
-              /* If previous character is newline after a transition    \
-                 for ANYCHAR or MBCSET in non-UTF8 multibyte locales,   \
-                 check whether current position is beyond the end of    \
-                 the input buffer.  Also, transit to initial state if   \
-                 !ALLOW_NL, even if RE_DOT_NEWLINE is set. */           \
-              if (p[-1] == eol)                                         \
-                {                                                       \
-                  if ((char *) p > end)                                 \
-                    {                                                   \
-                      p = NULL;                                         \
-                      goto done;                                        \
-                    }                                                   \
-                                                                        \
-                  nlcount++;                                            \
-                                                                        \
-                  if (!allow_nl)                                        \
-                    s = 0;                                              \
-                }                                                       \
-                                                                        \
-              mbp = p;                                                  \
-              trans = d->trans;                                         \
-  } while (false)
-
-              State_transition();
             }
         }
       else
@@ -3378,10 +3207,24 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
             goto done;
 
           s1 = s;
-          if (multibyte)
-            State_transition();
-          else
+          if (!multibyte || d->states[s].mbps.nelem == 0
+              || (*p == eol && !allow_nl)
+              || (*p == '\n' && !(syntax_bits & RE_DOT_NEWLINE))
+              || (*p == '\0' && (syntax_bits & RE_DOT_NOT_NULL))
+              || (char *) p >= end)
+           /* If a input character does not match ANYCHAR, do it
+              like a single-byte character.  */
             s = d->fails[s][*p++];
+          else
+            {
+              s = transit_state (d, s, &p, (unsigned char *) end);
+
+              if (s >= 0 && p[-1] == eol)
+                nlcount++;
+
+              mbp = p;
+              trans = d->trans;
+            }
         }
       else
         {
@@ -3461,8 +3304,6 @@ free_mbdata (struct dfa *d)
 
   free (d->mbcsets);
   free (d->mb_follows.elems);
-  free (d->mb_match_lens);
-  d->mb_match_lens = NULL;
 }
 
 /* Initialize the components of a dfa that the other routines don't
-- 
2.5.5

>From 8ac05f9cc80e1f0c1cbaeb4215d59822080421c5 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Wed, 6 Jul 2016 18:53:25 +0200
Subject: [PATCH 2/4] dfa: minor cleanups for non-POSIX simplification

* src/dfa.c (transit_state_singlebyte): Remove unnecessary 'const'
from arg; we usually don't bother with 'const' on locals.
(transit_state_singlebyte): Omit '!= NULL' in boolean context.
Use assert rather than abort.
---
 src/dfa.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 74833ba..9116df4 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -2932,35 +2932,34 @@ build_state (state_num s, struct dfa *d)
    But state transition is done just once, otherwise matching succeed or
    reach the end of the buffer.  */
 static state_num
-transit_state_singlebyte (struct dfa *d, state_num const s,
-                          unsigned char const **pp)
+transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
 {
   state_num *t;
 
   if (**pp == eolbyte)
     {
-      /* S is always an initial state in transit_state in order that the
-         newline is the single.  When transit_state is called, the
+      /* S is always an initial state in transit_state, so the
          transition table for the state must have been built already.  */
-      assert (d->trans[s] != NULL || d->fails[s] != NULL);
+      assert (d->trans[s] || d->fails[s]);
 
       ++*pp;
       return d->newlines[s];
     }
 
-  if (d->trans[s] != NULL)
+  if (d->trans[s])
     t = d->trans[s];
-  else if (d->fails[s] != NULL)
+  else if (d->fails[s])
     t = d->fails[s];
   else
     {
       build_state (s, d);
       if (d->trans[s])
         t = d->trans[s];
-      else if (d->fails[s])
-        t = d->fails[s];
       else
-        abort ();
+        {
+          t = d->fails[s];
+          assert (t);
+        }
     }
 
   return t[*(*pp)++];
@@ -3005,8 +3004,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
       if (!SUCCEEDS_IN_CONTEXT (d->states[s].mbps.elems[i].constraint,
                                 d->states[s].context, context))
         continue;
-      for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
-           j++)
+      for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem; j++)
         insert (d->follows[d->states[s].mbps.elems[i].index].elems[j],
                 &d->mb_follows);
     }
@@ -3149,9 +3147,11 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
                   || (*p == '\n' && !(syntax_bits & RE_DOT_NEWLINE))
                   || (*p == '\0' && (syntax_bits & RE_DOT_NOT_NULL))
                   || (char *) p >= end)
-                /* If a input character does not match ANYCHAR, do it
-                   like a single-byte character.  */
-                s = t[*p++];
+                {
+                  /* If an input character does not match ANYCHAR, do it
+                     like a single-byte character.  */
+                  s = t[*p++];
+                }
               else
                 {
                   s = transit_state (d, s, &p, (unsigned char *) end);
@@ -3212,16 +3212,16 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
               || (*p == '\n' && !(syntax_bits & RE_DOT_NEWLINE))
               || (*p == '\0' && (syntax_bits & RE_DOT_NOT_NULL))
               || (char *) p >= end)
-           /* If a input character does not match ANYCHAR, do it
-              like a single-byte character.  */
-            s = d->fails[s][*p++];
+            {
+              /* If a input character does not match ANYCHAR, do it
+                 like a single-byte character.  */
+              s = d->fails[s][*p++];
+            }
           else
             {
               s = transit_state (d, s, &p, (unsigned char *) end);
-
               if (s >= 0 && p[-1] == eol)
                 nlcount++;
-
               mbp = p;
               trans = d->trans;
             }
-- 
2.5.5

>From 7c0d855bfa8d6e5aa91ebd60681bc7afbbe1f8d4 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Wed, 6 Jul 2016 19:10:04 +0200
Subject: [PATCH 3/4] dfa: don't distingish letter in non-POSIX locales

For non-POSIX locales, dfa does not support word delimiter
support, so remove distinction between letters and non-letters.
* src/dfa.c (struct dfa): Remove members initstate_letter,
initstate_others.  All uses removed.  New member initstate_notbol.
(dfaanalyze, dfaexec_main): Replace old members with new member.
(wchar_context): Remove.  Update callers.
---
 src/dfa.c | 47 ++++++++++++++++++-----------------------------
 1 file changed, 18 insertions(+), 29 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 9116df4..d5ffe72 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -407,9 +407,11 @@ struct dfa
                                    newline is stored separately and handled
                                    as a special case.  Newline is also used
                                    as a sentinel at the end of the buffer.  */
-  state_num initstate_letter;   /* Initial state for letter context.  */
-  state_num initstate_others;   /* Initial state for other contexts.  */
-  position_set mb_follows;	/* Follow set added by ANYCHAR and/or MBCSET
+  state_num initstate_notbol;   /* Initial state for CTX_LETTER and CTX_NONE
+                                   context in multibyte locales, in which we
+                                   do not distinguish between their contexts,
+                                   as not supported word.  */
+  position_set mb_follows;      /* Follow set added by ANYCHAR and/or MBCSET
                                    on demand.  */
 };
 
@@ -676,16 +678,6 @@ char_context (unsigned char c)
   return CTX_NONE;
 }
 
-static int
-wchar_context (wint_t wc)
-{
-  if (wc == (wchar_t) eolbyte || wc == 0)
-    return CTX_NEWLINE;
-  if (wc == L'_' || iswalnum (wc))
-    return CTX_LETTER;
-  return CTX_NONE;
-}
-
 /* Entry point to set syntax options.  */
 void
 dfasyntax (reg_syntax_t bits, bool fold, unsigned char eol)
@@ -2490,13 +2482,10 @@ dfaanalyze (struct dfa *d, bool searchflag)
   separate_contexts = state_separate_contexts (&merged);
   if (separate_contexts & CTX_NEWLINE)
     state_index (d, &merged, CTX_NEWLINE);
-  d->initstate_others = d->min_trcount
+  d->initstate_notbol = d->min_trcount
     = state_index (d, &merged, separate_contexts ^ CTX_ANY);
   if (separate_contexts & CTX_LETTER)
-    d->initstate_letter = d->min_trcount
-      = state_index (d, &merged, CTX_LETTER);
-  else
-    d->initstate_letter = d->initstate_others;
+    d->min_trcount = state_index (d, &merged, CTX_LETTER);
   d->min_trcount++;
 
   free (posalloc);
@@ -2978,11 +2967,12 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
   int context;
   size_t i, j;
   int k;
+  int separate_contexts;
 
   /* Note: caller must free the return value of this function.  */
   mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
 
-  context = wchar_context (wc);
+  context = (wc == (wchar_t) eolbyte || wc == 0) ? CTX_NEWLINE : CTX_NONE;
 
   /* This state has some operators which can match a multibyte character.  */
   d->mb_follows.nelem = 0;
@@ -3009,7 +2999,11 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
                 &d->mb_follows);
     }
 
-  s1 = state_index (d, &d->mb_follows, wchar_context (wc));
+  separate_contexts = state_separate_contexts (&d->mb_follows);
+  if (context == CTX_NEWLINE && separate_contexts & CTX_NEWLINE)
+    s1 = state_index (d, &d->mb_follows, CTX_NEWLINE);
+  else
+    s1 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
   realloc_trans_if_necessary (d, s1);
 
   return s1;
@@ -3129,16 +3123,11 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
                          transit to another initial state after skip.  */
                       if (p < mbp)
                         {
-                          int context = wchar_context (wc);
-                          if (context == CTX_LETTER)
-                            s = d->initstate_letter;
-                          else
-                            /* It's CTX_NONE.  CTX_NEWLINE cannot happen,
-                               as we assume that a newline is always a
-                               single byte character.  */
-                            s = d->initstate_others;
+                          /* It's CTX_LETTER or CTX_NONE.  CTX_NEWLINE
+                             cannot happen, as we assume that a newline
+                             is always a single byte character.  */
+                          s1 = s = d->initstate_notbol;
                           p = mbp;
-                          s1 = s;
                         }
                     }
                 }
-- 
2.5.5

>From 4a3f8c93d8848ea42cabb747282aa05a32ae8c13 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Wed, 6 Jul 2016 19:25:20 +0200
Subject: [PATCH 4/4] dfa: don't treat null bytes specially

* src/dfa.c (transit_state): Do not treat null byte specially
when eolbyte == '\n'.
---
 src/dfa.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index d5ffe72..8f9f0bc 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -2962,17 +2962,13 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
                unsigned char const *end)
 {
   state_num s1, s2;
-  int mbclen;  /* The length of current input multibyte character.  */
   wint_t wc;
-  int context;
   size_t i, j;
   int k;
   int separate_contexts;
 
-  /* Note: caller must free the return value of this function.  */
-  mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
-
-  context = (wc == (wchar_t) eolbyte || wc == 0) ? CTX_NEWLINE : CTX_NONE;
+  int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
+  int context = wc == eolbyte ? CTX_NEWLINE : CTX_NONE;
 
   /* This state has some operators which can match a multibyte character.  */
   d->mb_follows.nelem = 0;
@@ -3000,10 +2996,9 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
     }
 
   separate_contexts = state_separate_contexts (&d->mb_follows);
-  if (context == CTX_NEWLINE && separate_contexts & CTX_NEWLINE)
-    s1 = state_index (d, &d->mb_follows, CTX_NEWLINE);
-  else
-    s1 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
+  if (! (context == CTX_NEWLINE || separate_contexts & CTX_NEWLINE))
+    context = separate_contexts ^ CTX_ANY;
+  s1 = state_index (d, &d->mb_follows, context);
   realloc_trans_if_necessary (d, s1);
 
   return s1;
-- 
2.5.5

bug#21266: [PATCH] dfa: simplify for non-POSIX locales

Reply via email to