Now dfa is not support range, collating element, equivalent class in
non-POSIX locales.  We can do dfa more simply by removing codes for them.


First, I made update dfaexec_main.  Now transit_state is called only
when next character matches with ANYCHAR, and match_anychar always
returns the length of the multibyte character for next character at all
positions in the state.  So we can remove match_anychar.  Further more,
we can also remove check_matching_with_multibyte_ops, which is caller
for match_anychar.

Second, dfa does not support collating element, we can also simplify
transit_state.

In addition, simplify transit_state_singlebyte.
From b94e2adefb22a5394324ec3681521214c51dc638 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Mon, 10 Aug 2015 20:18:23 +0900
Subject: [PATCH] dfa: simplify for non-POSIX locales

Now dfa is not support range, collating element, equivalent class in
non-POSIX locales.  We can do dfa more simply by removing codes for them.

* src/dfa.c (enum status_transit_state): Remove function.
(transit_state_singlebyte): Update arguments and now returns next state
instead of status_transit_state.
(match_anychar): Remove function.
(check_matching_with_multibyte_ops): Remove function.
(transit_state_consume_1char): Remove function.
(transit_state): Simplify it.
(dfaexec_main): Now transit_state is called only when next character
matches with ANYCHAR.
(State_transition): Remove macro.
---
 src/dfa.c | 303 +++++++++++++++-----------------------------------------------
 1 file changed, 73 insertions(+), 230 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index ac5129b..b5c31f7 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -2934,132 +2934,63 @@ build_state (state_num s, struct dfa *d)
 
 /* Multibyte character handling sub-routines for dfaexec.  */
 
-/* Return values of transit_state_singlebyte, and
-   transit_state_consume_1char.  */
-typedef enum
-{
-  TRANSIT_STATE_IN_PROGRESS,    /* State transition has not finished.  */
-  TRANSIT_STATE_DONE,           /* State transition has finished.  */
-  TRANSIT_STATE_END_BUFFER      /* Reach the end of the buffer.  */
-} status_transit_state;
-
 /* Consume a single byte and transit state from 's' to '*next_state'.
    This function is almost same as the state transition routin in dfaexec.
    But state transition is done just once, otherwise matching succeed or
    reach the end of the buffer.  */
-static status_transit_state
-transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const *p,
-                          state_num * next_state)
+static state_num
+transit_state_singlebyte (struct dfa *d, state_num const s,
+                          unsigned char const **pp)
 {
   state_num *t;
-  state_num works = s;
 
-  status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
-
-  while (rval == TRANSIT_STATE_IN_PROGRESS)
+  if (**pp == eolbyte)
     {
-      if ((t = d->trans[works]) != NULL)
-        {
-          works = t[*p];
-          rval = TRANSIT_STATE_DONE;
-          if (works < 0)
-            works = 0;
-        }
-      else if (works < 0)
-        works = 0;
-      else if (d->fails[works])
-        {
-          works = d->fails[works][*p];
-          rval = TRANSIT_STATE_DONE;
-        }
-      else
-        {
-          build_state (works, d);
-        }
-    }
-  *next_state = works;
-  return rval;
-}
-
-/* Match a "." against the current context.  Return the length of the
-   match, in bytes.  POS is the position of the ".".  */
-static int
-match_anychar (struct dfa *d, state_num s, position pos,
-               wint_t wc, size_t mbclen)
-{
-  int context;
+      /* S is always an initial state in transit_state in order that the
+         newline is the single.  When transit_state is called, the
+         transition table for the state must have been built already.  */
+      assert (d->trans[s] != NULL || d->fails[s] != NULL);
 
-  /* Check syntax bits.  */
-  if (wc == (wchar_t) '\n')
-    {
-      if (!(syntax_bits & RE_DOT_NEWLINE))
-        return 0;
-    }
-  else if (wc == (wchar_t) '\0')
-    {
-      if (syntax_bits & RE_DOT_NOT_NULL)
-        return 0;
+      ++*pp;
+      return d->newlines[s];
     }
-  else if (wc == WEOF)
-    return 0;
-
-  context = wchar_context (wc);
-  if (!SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, context))
-    return 0;
-
-  return mbclen;
-}
-
-/* Check whether each of 'd->states[s].mbps.elem' can match.  Then return the
-   array which corresponds to 'd->states[s].mbps.elem'; each element of the
-   array contains the number of bytes with which the element can match.
-
-   The caller MUST free the array which this function return.  */
-static int *
-check_matching_with_multibyte_ops (struct dfa *d, state_num s,
-                                   char const *p, wint_t wc, size_t mbclen)
-{
-  size_t i;
-  int *rarray;
 
-  rarray = d->mb_match_lens;
-  for (i = 0; i < d->states[s].mbps.nelem; ++i)
+  if (d->trans[s] != NULL)
+    t = d->trans[s];
+  else if (d->fails[s] != NULL)
+    t = d->fails[s];
+  else
     {
-      position pos = d->states[s].mbps.elems[i];
-      switch (d->tokens[pos.index])
-        {
-        case ANYCHAR:
-          rarray[i] = match_anychar (d, s, pos, wc, mbclen);
-          break;
-        default:
-          break;                /* cannot happen.  */
-        }
+      build_state (s, d);
+      if (d->trans[s])
+        t = d->trans[s];
+      else if (d->fails[s])
+        t = d->fails[s];
+      else
+        abort ();
     }
-  return rarray;
-}
-
-/* Consume a single character and enumerate all of the positions which can
-   be the next position from the state 's'.
 
-   'match_lens' is the input.  It can be NULL, but it can also be the output
-   of check_matching_with_multibyte_ops for optimization.
+  return t[*(*pp)++];
+}
 
-   'mbclen' and 'pps' are the output.  'mbclen' is the length of the
-   character consumed, and 'pps' is the set this function enumerates.  */
-static status_transit_state
-transit_state_consume_1char (struct dfa *d, state_num s,
-                             unsigned char const **pp,
-                             wint_t wc, size_t mbclen,
-                             int *match_lens)
+/* Transit state from s, then return new state and update the pointer of the
+   buffer.  This function is for some operator which can match with a multi-
+   byte character or a collating element (which may be multi characters).  */
+static state_num
+transit_state (struct dfa *d, state_num s, unsigned char const **pp,
+               unsigned char const *end)
 {
+  state_num s1, s2;
+  int mbclen;  /* The length of current input multibyte character.  */
+  wint_t wc;
   size_t i, j;
   int k;
-  state_num s1, s2;
-  status_transit_state rs = TRANSIT_STATE_DONE;
 
-  if (! match_lens && d->states[s].mbps.nelem != 0)
-    match_lens = check_matching_with_multibyte_ops (d, s, (char const *) *pp,
-                                                    wc, mbclen);
+  /* Note: caller must free the return value of this function.  */
+  mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
+
+  /* This state has some operators which can match a multibyte character.  */
+  d->mb_follows.nelem = 0;
 
   /* Calculate the state which can be reached from the state 's' by
      consuming 'mbclen' single bytes from the buffer.  */
@@ -3067,7 +2998,7 @@ transit_state_consume_1char (struct dfa *d, state_num s,
   for (k = 0; k < mbclen; k++)
     {
       s2 = s1;
-      rs = transit_state_singlebyte (d, s2, (*pp)++, &s1);
+      s1 = transit_state_singlebyte (d, s2, pp);
     }
   copy (&d->states[s1].elems, &d->mb_follows);
 
@@ -3075,94 +3006,15 @@ transit_state_consume_1char (struct dfa *d, state_num s,
      a single character.  */
   for (i = 0; i < d->states[s].mbps.nelem; i++)
     {
-      if (match_lens[i] == mbclen)
-        for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
-             j++)
-          insert (d->follows[d->states[s].mbps.elems[i].index].elems[j],
-                  &d->mb_follows);
-    }
-
-  /* FIXME: this return value is always ignored.  */
-  return rs;
-}
-
-/* Transit state from s, then return new state and update the pointer of the
-   buffer.  This function is for some operator which can match with a multi-
-   byte character or a collating element (which may be multi characters).  */
-static state_num
-transit_state (struct dfa *d, state_num s, unsigned char const **pp,
-               unsigned char const *end)
-{
-  state_num s1;
-  int mbclen;  /* The length of current input multibyte character.  */
-  int maxlen = 0;
-  size_t i, j;
-  int *match_lens = NULL;
-  size_t nelem = d->states[s].mbps.nelem;       /* Just a alias.  */
-  unsigned char const *p1 = *pp;
-  wint_t wc;
-
-  if (nelem > 0)
-    /* This state has (a) multibyte operator(s).
-       We check whether each of them can match or not.  */
-    {
-      /* Note: caller must free the return value of this function.  */
-      mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
-      match_lens = check_matching_with_multibyte_ops (d, s, (char const *) *pp,
-                                                      wc, mbclen);
-
-      for (i = 0; i < nelem; i++)
-        /* Search the operator which match the longest string,
-           in this state.  */
-        {
-          if (match_lens[i] > maxlen)
-            maxlen = match_lens[i];
-        }
-    }
-
-  if (nelem == 0 || maxlen == 0)
-    /* This state has no multibyte operator which can match.
-       We need to check only one single byte character.  */
-    {
-      status_transit_state rs;
-      rs = transit_state_singlebyte (d, s, *pp, &s1);
-
-      /* We must update the pointer if state transition succeeded.  */
-      if (rs == TRANSIT_STATE_DONE)
-        ++*pp;
-
-      return s1;
+      for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
+           j++)
+        insert (d->follows[d->states[s].mbps.elems[i].index].elems[j],
+                &d->mb_follows);
     }
 
-  /* This state has some operators which can match a multibyte character.  */
-  d->mb_follows.nelem = 0;
-
-  /* 'maxlen' may be longer than the length of a character, because it may
-     not be a character but a (multi character) collating element.
-     We enumerate all of the positions which 's' can reach by consuming
-     'maxlen' bytes.  */
-  transit_state_consume_1char (d, s, pp, wc, mbclen, match_lens);
-
   s1 = state_index (d, &d->mb_follows, wchar_context (wc));
   realloc_trans_if_necessary (d, s1);
 
-  while (*pp - p1 < maxlen)
-    {
-      mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
-      transit_state_consume_1char (d, s1, pp, wc, mbclen, NULL);
-
-      for (i = 0; i < nelem; i++)
-        {
-          if (match_lens[i] == *pp - p1)
-            for (j = 0;
-                 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
-              insert (d->follows[d->states[s1].mbps.elems[i].index].elems[j],
-                      &d->mb_follows);
-        }
-
-      s1 = state_index (d, &d->mb_follows, wchar_context (wc));
-      realloc_trans_if_necessary (d, s1);
-    }
   return s1;
 }
 
@@ -3292,44 +3144,22 @@ dfaexec_main (struct dfa *d, char const *begin, char 
*end, int allow_nl,
                     }
                 }
 
-              if (d->states[s].mbps.nelem == 0)
+              if (d->states[s].mbps.nelem == 0 ||
+                  (*p == eol && !(allow_nl && (syntax_bits & RE_DOT_NEWLINE)))
+                  || (*p == '\0' && (syntax_bits & RE_DOT_NOT_NULL))
+                  || (char *) p >= end)
+                /* If a input character does not match ANYCHAR, do it
+                   like a single-byte character.  */
+                s = t[*p++];
+              else
                 {
-                  s = t[*p++];
-                  continue;
-                }
-
-              /* The following code is used twice.
-                 Use a macro to avoid the risk that they diverge.  */
-#define State_transition()                                              \
-  do {                                                                  \
-              /* Can match with a multibyte character (and multi-character \
-                 collating element).  Transition table might be updated.  */ \
-              s = transit_state (d, s, &p, (unsigned char *) end);      \
-                                                                        \
-              /* If previous character is newline after a transition    \
-                 for ANYCHAR or MBCSET in non-UTF8 multibyte locales,   \
-                 check whether current position is beyond the end of    \
-                 the input buffer.  Also, transit to initial state if   \
-                 !ALLOW_NL, even if RE_DOT_NEWLINE is set. */           \
-              if (p[-1] == eol)                                         \
-                {                                                       \
-                  if ((char *) p > end)                                 \
-                    {                                                   \
-                      p = NULL;                                         \
-                      goto done;                                        \
-                    }                                                   \
-                                                                        \
-                  nlcount++;                                            \
-                                                                        \
-                  if (!allow_nl)                                        \
-                    s = 0;                                              \
-                }                                                       \
-                                                                        \
-              mbp = p;                                                  \
-              trans = d->trans;                                         \
-  } while (0)
+                  s = transit_state (d, s, &p, (unsigned char *) end);
 
-              State_transition();
+                  if (s >= 0 && p[-1] == eol)
+                    nlcount++;
+                  mbp = p;
+                  trans = d->trans;
+                }
             }
         }
       else
@@ -3378,10 +3208,23 @@ dfaexec_main (struct dfa *d, char const *begin, char 
*end, int allow_nl,
             goto done;
 
           s1 = s;
-          if (multibyte)
-            State_transition();
-          else
+          if (!multibyte || d->states[s].mbps.nelem == 0 ||
+              (*p == eol && !(allow_nl && (syntax_bits & RE_DOT_NEWLINE)))
+              || (*p == '\0' && (syntax_bits & RE_DOT_NOT_NULL))
+              || (char *) p >= end)
+           /* If a input character does not match ANYCHAR, do it
+              like a single-byte character.  */
             s = d->fails[s][*p++];
+          else
+            {
+              s = transit_state (d, s, &p, (unsigned char *) end);
+
+              if (s >= 0 && p[-1] == eol)
+                nlcount++;
+
+              mbp = p;
+              trans = d->trans;
+            }
         }
       else
         {
-- 
2.4.6

Reply via email to