Norihiro Tanaka <nori...@kcn.ne.jp> wrote:
> I do not distinguish between eol and newline in previous patch, so I
> fixed it.   In addition, I wrote the patch that dfa does not distingish
> letter and not letter in non-POSIX locales.

Previous two patches was buggy.  e.g. the line is returned in following
test case after patching.

  printf 'a00000000000000000000000000000000000000b\n' >in
  LC_ALL=ja_JP.eucJP src/grep '^..............................$' in

I fixed the bug, and improved other unexpected behaviors.  In addition,
removed unused member `mb_match_lens' of struct dfa.
From d73f38ac97e05679be988b7a529c067c723d21b0 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Mon, 10 Aug 2015 20:18:23 +0900
Subject: [PATCH 1/3] dfa: simplify for non-POSIX locales

Now dfa is not support range, collating element, equivalent class in
non-POSIX locales.  We can do dfa more simply by removing codes for them.

* src/dfa.c (enum status_transit_state): Remove function.
(transit_state_singlebyte): Update arguments and now returns next state
instead of status_transit_state.
(match_anychar): Remove function.
(check_matching_with_multibyte_ops): Remove function.
(transit_state_consume_1char): Remove function.
(transit_state): Simplify it.
(dfaexec_main): Now transit_state is called only when next character
matches with ANYCHAR.
(State_transition): Remove macro.
---
 src/dfa.c | 322 ++++++++++++++++----------------------------------------------
 1 file changed, 82 insertions(+), 240 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index ac5129b..aadc03e 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -415,9 +415,6 @@ struct dfa
   state_num initstate_others;   /* Initial state for other contexts.  */
   position_set mb_follows;     /* Follow set added by ANYCHAR and/or MBCSET
                                    on demand.  */
-  int *mb_match_lens;           /* Array of length reduced by ANYCHAR and/or
-                                   MBCSET.  Null if mb_follows.elems has not
-                                   been allocated.  */
 };
 
 /* Some macros for user access to dfa internals.  */
@@ -2934,132 +2931,66 @@ build_state (state_num s, struct dfa *d)
 
 /* Multibyte character handling sub-routines for dfaexec.  */
 
-/* Return values of transit_state_singlebyte, and
-   transit_state_consume_1char.  */
-typedef enum
-{
-  TRANSIT_STATE_IN_PROGRESS,    /* State transition has not finished.  */
-  TRANSIT_STATE_DONE,           /* State transition has finished.  */
-  TRANSIT_STATE_END_BUFFER      /* Reach the end of the buffer.  */
-} status_transit_state;
-
 /* Consume a single byte and transit state from 's' to '*next_state'.
    This function is almost same as the state transition routin in dfaexec.
    But state transition is done just once, otherwise matching succeed or
    reach the end of the buffer.  */
-static status_transit_state
-transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const *p,
-                          state_num * next_state)
+static state_num
+transit_state_singlebyte (struct dfa *d, state_num const s,
+                          unsigned char const **pp)
 {
   state_num *t;
-  state_num works = s;
-
-  status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
 
-  while (rval == TRANSIT_STATE_IN_PROGRESS)
+  if (**pp == eolbyte)
     {
-      if ((t = d->trans[works]) != NULL)
-        {
-          works = t[*p];
-          rval = TRANSIT_STATE_DONE;
-          if (works < 0)
-            works = 0;
-        }
-      else if (works < 0)
-        works = 0;
-      else if (d->fails[works])
-        {
-          works = d->fails[works][*p];
-          rval = TRANSIT_STATE_DONE;
-        }
-      else
-        {
-          build_state (works, d);
-        }
-    }
-  *next_state = works;
-  return rval;
-}
+      /* S is always an initial state in transit_state in order that the
+         newline is the single.  When transit_state is called, the
+         transition table for the state must have been built already.  */
+      assert (d->trans[s] != NULL || d->fails[s] != NULL);
 
-/* Match a "." against the current context.  Return the length of the
-   match, in bytes.  POS is the position of the ".".  */
-static int
-match_anychar (struct dfa *d, state_num s, position pos,
-               wint_t wc, size_t mbclen)
-{
-  int context;
-
-  /* Check syntax bits.  */
-  if (wc == (wchar_t) '\n')
-    {
-      if (!(syntax_bits & RE_DOT_NEWLINE))
-        return 0;
-    }
-  else if (wc == (wchar_t) '\0')
-    {
-      if (syntax_bits & RE_DOT_NOT_NULL)
-        return 0;
+      ++*pp;
+      return d->newlines[s];
     }
-  else if (wc == WEOF)
-    return 0;
-
-  context = wchar_context (wc);
-  if (!SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, context))
-    return 0;
-
-  return mbclen;
-}
-
-/* Check whether each of 'd->states[s].mbps.elem' can match.  Then return the
-   array which corresponds to 'd->states[s].mbps.elem'; each element of the
-   array contains the number of bytes with which the element can match.
 
-   The caller MUST free the array which this function return.  */
-static int *
-check_matching_with_multibyte_ops (struct dfa *d, state_num s,
-                                   char const *p, wint_t wc, size_t mbclen)
-{
-  size_t i;
-  int *rarray;
-
-  rarray = d->mb_match_lens;
-  for (i = 0; i < d->states[s].mbps.nelem; ++i)
+  if (d->trans[s] != NULL)
+    t = d->trans[s];
+  else if (d->fails[s] != NULL)
+    t = d->fails[s];
+  else
     {
-      position pos = d->states[s].mbps.elems[i];
-      switch (d->tokens[pos.index])
-        {
-        case ANYCHAR:
-          rarray[i] = match_anychar (d, s, pos, wc, mbclen);
-          break;
-        default:
-          break;                /* cannot happen.  */
-        }
+      build_state (s, d);
+      if (d->trans[s])
+        t = d->trans[s];
+      else if (d->fails[s])
+        t = d->fails[s];
+      else
+        abort ();
     }
-  return rarray;
-}
-
-/* Consume a single character and enumerate all of the positions which can
-   be the next position from the state 's'.
 
-   'match_lens' is the input.  It can be NULL, but it can also be the output
-   of check_matching_with_multibyte_ops for optimization.
+  return t[*(*pp)++];
+}
 
-   'mbclen' and 'pps' are the output.  'mbclen' is the length of the
-   character consumed, and 'pps' is the set this function enumerates.  */
-static status_transit_state
-transit_state_consume_1char (struct dfa *d, state_num s,
-                             unsigned char const **pp,
-                             wint_t wc, size_t mbclen,
-                             int *match_lens)
+/* Transit state from s, then return new state and update the pointer of
+   the buffer.  This function is for a period operator which can match a
+   multi-byte character.  */
+static state_num
+transit_state (struct dfa *d, state_num s, unsigned char const **pp,
+               unsigned char const *end)
 {
+  state_num s1, s2;
+  int mbclen;  /* The length of current input multibyte character.  */
+  wint_t wc;
+  int context;
   size_t i, j;
   int k;
-  state_num s1, s2;
-  status_transit_state rs = TRANSIT_STATE_DONE;
 
-  if (! match_lens && d->states[s].mbps.nelem != 0)
-    match_lens = check_matching_with_multibyte_ops (d, s, (char const *) *pp,
-                                                    wc, mbclen);
+  /* Note: caller must free the return value of this function.  */
+  mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
+
+  context = wchar_context (wc);
+
+  /* This state has some operators which can match a multibyte character.  */
+  d->mb_follows.nelem = 0;
 
   /* Calculate the state which can be reached from the state 's' by
      consuming 'mbclen' single bytes from the buffer.  */
@@ -3067,7 +2998,7 @@ transit_state_consume_1char (struct dfa *d, state_num s,
   for (k = 0; k < mbclen; k++)
     {
       s2 = s1;
-      rs = transit_state_singlebyte (d, s2, (*pp)++, &s1);
+      s1 = transit_state_singlebyte (d, s2, pp);
     }
   copy (&d->states[s1].elems, &d->mb_follows);
 
@@ -3075,94 +3006,18 @@ transit_state_consume_1char (struct dfa *d, state_num s,
      a single character.  */
   for (i = 0; i < d->states[s].mbps.nelem; i++)
     {
-      if (match_lens[i] == mbclen)
-        for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
-             j++)
-          insert (d->follows[d->states[s].mbps.elems[i].index].elems[j],
-                  &d->mb_follows);
-    }
-
-  /* FIXME: this return value is always ignored.  */
-  return rs;
-}
-
-/* Transit state from s, then return new state and update the pointer of the
-   buffer.  This function is for some operator which can match with a multi-
-   byte character or a collating element (which may be multi characters).  */
-static state_num
-transit_state (struct dfa *d, state_num s, unsigned char const **pp,
-               unsigned char const *end)
-{
-  state_num s1;
-  int mbclen;  /* The length of current input multibyte character.  */
-  int maxlen = 0;
-  size_t i, j;
-  int *match_lens = NULL;
-  size_t nelem = d->states[s].mbps.nelem;       /* Just a alias.  */
-  unsigned char const *p1 = *pp;
-  wint_t wc;
-
-  if (nelem > 0)
-    /* This state has (a) multibyte operator(s).
-       We check whether each of them can match or not.  */
-    {
-      /* Note: caller must free the return value of this function.  */
-      mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
-      match_lens = check_matching_with_multibyte_ops (d, s, (char const *) *pp,
-                                                      wc, mbclen);
-
-      for (i = 0; i < nelem; i++)
-        /* Search the operator which match the longest string,
-           in this state.  */
-        {
-          if (match_lens[i] > maxlen)
-            maxlen = match_lens[i];
-        }
-    }
-
-  if (nelem == 0 || maxlen == 0)
-    /* This state has no multibyte operator which can match.
-       We need to check only one single byte character.  */
-    {
-      status_transit_state rs;
-      rs = transit_state_singlebyte (d, s, *pp, &s1);
-
-      /* We must update the pointer if state transition succeeded.  */
-      if (rs == TRANSIT_STATE_DONE)
-        ++*pp;
-
-      return s1;
+      if (!SUCCEEDS_IN_CONTEXT (d->states[s].mbps.elems[i].constraint,
+                                d->states[s].context, context))
+        continue;
+      for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
+           j++)
+        insert (d->follows[d->states[s].mbps.elems[i].index].elems[j],
+                &d->mb_follows);
     }
 
-  /* This state has some operators which can match a multibyte character.  */
-  d->mb_follows.nelem = 0;
-
-  /* 'maxlen' may be longer than the length of a character, because it may
-     not be a character but a (multi character) collating element.
-     We enumerate all of the positions which 's' can reach by consuming
-     'maxlen' bytes.  */
-  transit_state_consume_1char (d, s, pp, wc, mbclen, match_lens);
-
   s1 = state_index (d, &d->mb_follows, wchar_context (wc));
   realloc_trans_if_necessary (d, s1);
 
-  while (*pp - p1 < maxlen)
-    {
-      mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
-      transit_state_consume_1char (d, s1, pp, wc, mbclen, NULL);
-
-      for (i = 0; i < nelem; i++)
-        {
-          if (match_lens[i] == *pp - p1)
-            for (j = 0;
-                 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
-              insert (d->follows[d->states[s1].mbps.elems[i].index].elems[j],
-                      &d->mb_follows);
-        }
-
-      s1 = state_index (d, &d->mb_follows, wchar_context (wc));
-      realloc_trans_if_necessary (d, s1);
-    }
   return s1;
 }
 
@@ -3237,11 +3092,8 @@ dfaexec_main (struct dfa *d, char const *begin, char 
*end, int allow_nl,
   if (multibyte)
     {
       memset (&d->mbs, 0, sizeof d->mbs);
-      if (! d->mb_match_lens)
-        {
-          d->mb_match_lens = xnmalloc (d->nleaves, sizeof *d->mb_match_lens);
-          alloc_position_set (&d->mb_follows, d->nleaves);
-        }
+      if (d->mb_follows.alloc == 0)
+        alloc_position_set (&d->mb_follows, d->nleaves);
     }
 
   for (;;)
@@ -3292,44 +3144,22 @@ dfaexec_main (struct dfa *d, char const *begin, char 
*end, int allow_nl,
                     }
                 }
 
-              if (d->states[s].mbps.nelem == 0)
+              if (d->states[s].mbps.nelem == 0 || (*p == eol && !allow_nl)
+                  || (*p == '\n' && !(syntax_bits & RE_DOT_NEWLINE))
+                  || (*p == '\0' && (syntax_bits & RE_DOT_NOT_NULL))
+                  || (char *) p >= end)
+                /* If a input character does not match ANYCHAR, do it
+                   like a single-byte character.  */
+                s = t[*p++];
+              else
                 {
-                  s = t[*p++];
-                  continue;
-                }
+                  s = transit_state (d, s, &p, (unsigned char *) end);
 
-              /* The following code is used twice.
-                 Use a macro to avoid the risk that they diverge.  */
-#define State_transition()                                              \
-  do {                                                                  \
-              /* Can match with a multibyte character (and multi-character \
-                 collating element).  Transition table might be updated.  */ \
-              s = transit_state (d, s, &p, (unsigned char *) end);      \
-                                                                        \
-              /* If previous character is newline after a transition    \
-                 for ANYCHAR or MBCSET in non-UTF8 multibyte locales,   \
-                 check whether current position is beyond the end of    \
-                 the input buffer.  Also, transit to initial state if   \
-                 !ALLOW_NL, even if RE_DOT_NEWLINE is set. */           \
-              if (p[-1] == eol)                                         \
-                {                                                       \
-                  if ((char *) p > end)                                 \
-                    {                                                   \
-                      p = NULL;                                         \
-                      goto done;                                        \
-                    }                                                   \
-                                                                        \
-                  nlcount++;                                            \
-                                                                        \
-                  if (!allow_nl)                                        \
-                    s = 0;                                              \
-                }                                                       \
-                                                                        \
-              mbp = p;                                                  \
-              trans = d->trans;                                         \
-  } while (0)
-
-              State_transition();
+                  if (s >= 0 && p[-1] == eol)
+                    nlcount++;
+                  mbp = p;
+                  trans = d->trans;
+                }
             }
         }
       else
@@ -3378,10 +3208,24 @@ dfaexec_main (struct dfa *d, char const *begin, char 
*end, int allow_nl,
             goto done;
 
           s1 = s;
-          if (multibyte)
-            State_transition();
-          else
+          if (!multibyte || d->states[s].mbps.nelem == 0
+              || (*p == eol && !allow_nl)
+              || (*p == '\n' && !(syntax_bits & RE_DOT_NEWLINE))
+              || (*p == '\0' && (syntax_bits & RE_DOT_NOT_NULL))
+              || (char *) p >= end)
+           /* If a input character does not match ANYCHAR, do it
+              like a single-byte character.  */
             s = d->fails[s][*p++];
+          else
+            {
+              s = transit_state (d, s, &p, (unsigned char *) end);
+
+              if (s >= 0 && p[-1] == eol)
+                nlcount++;
+
+              mbp = p;
+              trans = d->trans;
+            }
         }
       else
         {
@@ -3462,8 +3306,6 @@ free_mbdata (struct dfa *d)
 
   free (d->mbcsets);
   free (d->mb_follows.elems);
-  free (d->mb_match_lens);
-  d->mb_match_lens = NULL;
 }
 
 /* Initialize the components of a dfa that the other routines don't
-- 
2.4.6

From 4b2e0641496b6421aee8e81db3b8f62f798957cd Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Mon, 10 Aug 2015 21:46:50 +0900
Subject: [PATCH 2/3] dfa: not distingish letter and not letter in non-POSIX
 locales

For non-POSIX locales, dfa is not support word delimiter support, so
remove distinction between letter and not letter.

* dfa.c (struct dfa) [initstate_letter, initstate_others,
mb_match_lens]: Remove members and all uses.
(struct dfa) [initstate_notbol]: New member.
(dfaanalyze, dfaexec_main): Replace old members with new member.
(wchar_context): Remove function.  Update callers.
---
 src/dfa.c | 47 ++++++++++++++++++-----------------------------
 1 file changed, 18 insertions(+), 29 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index aadc03e..f8c42fc 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -411,9 +411,11 @@ struct dfa
                                    newline is stored separately and handled
                                    as a special case.  Newline is also used
                                    as a sentinel at the end of the buffer.  */
-  state_num initstate_letter;   /* Initial state for letter context.  */
-  state_num initstate_others;   /* Initial state for other contexts.  */
-  position_set mb_follows;     /* Follow set added by ANYCHAR and/or MBCSET
+  state_num initstate_notbol;   /* Initial state for CTX_LETTER and CTX_NONE
+                                   context in multibyte locales, in which we
+                                   do not distinguish between their contexts,
+                                   as not supported word.  */
+  position_set mb_follows;      /* Follow set added by ANYCHAR and/or MBCSET
                                    on demand.  */
 };
 
@@ -691,16 +693,6 @@ char_context (unsigned char c)
   return CTX_NONE;
 }
 
-static int
-wchar_context (wint_t wc)
-{
-  if (wc == (wchar_t) eolbyte || wc == 0)
-    return CTX_NEWLINE;
-  if (wc == L'_' || iswalnum (wc))
-    return CTX_LETTER;
-  return CTX_NONE;
-}
-
 /* Entry point to set syntax options.  */
 void
 dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
@@ -2494,13 +2486,10 @@ dfaanalyze (struct dfa *d, int searchflag)
   separate_contexts = state_separate_contexts (&merged);
   if (separate_contexts & CTX_NEWLINE)
     state_index (d, &merged, CTX_NEWLINE);
-  d->initstate_others = d->min_trcount
+  d->initstate_notbol = d->min_trcount
     = state_index (d, &merged, separate_contexts ^ CTX_ANY);
   if (separate_contexts & CTX_LETTER)
-    d->initstate_letter = d->min_trcount
-      = state_index (d, &merged, CTX_LETTER);
-  else
-    d->initstate_letter = d->initstate_others;
+    d->min_trcount = state_index (d, &merged, CTX_LETTER);
   d->min_trcount++;
 
   free (posalloc);
@@ -2983,11 +2972,12 @@ transit_state (struct dfa *d, state_num s, unsigned 
char const **pp,
   int context;
   size_t i, j;
   int k;
+  int separate_contexts;
 
   /* Note: caller must free the return value of this function.  */
   mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
 
-  context = wchar_context (wc);
+  context = (wc == (wchar_t) eolbyte || wc == 0) ? CTX_NEWLINE : CTX_NONE;
 
   /* This state has some operators which can match a multibyte character.  */
   d->mb_follows.nelem = 0;
@@ -3015,7 +3005,11 @@ transit_state (struct dfa *d, state_num s, unsigned char 
const **pp,
                 &d->mb_follows);
     }
 
-  s1 = state_index (d, &d->mb_follows, wchar_context (wc));
+  separate_contexts = state_separate_contexts (&d->mb_follows);
+  if (context == CTX_NEWLINE && separate_contexts & CTX_NEWLINE)
+    s1 = state_index (d, &d->mb_follows, CTX_NEWLINE);
+  else
+    s1 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
   realloc_trans_if_necessary (d, s1);
 
   return s1;
@@ -3130,16 +3124,11 @@ dfaexec_main (struct dfa *d, char const *begin, char 
*end, int allow_nl,
                          transit to another initial state after skip.  */
                       if (p < mbp)
                         {
-                          int context = wchar_context (wc);
-                          if (context == CTX_LETTER)
-                            s = d->initstate_letter;
-                          else
-                            /* It's CTX_NONE.  CTX_NEWLINE cannot happen,
-                               as we assume that a newline is always a
-                               single byte character.  */
-                            s = d->initstate_others;
+                          /* It's CTX_LETTER or CTX_NONE.  CTX_NEWLINE
+                             cannot happen, as we assume that a newline
+                             is always a single byte character.  */
+                          s1 = s = d->initstate_notbol;
                           p = mbp;
-                          s1 = s;
                         }
                     }
                 }
-- 
2.4.6

Reply via email to