This patch adds support to dfa for the case where regex's newline_anchor
option is not set.  grep always uses newline_anchor, so the new code is
not used by grep itself.  I expect it to be used by sed and gawk.

However, the patch adds an argument to dfasyntax().  To make it easy to
keep grep and dfa synchronized, I hope it can be applied before dfa is
moved to gnulib.
From b31ebd2bb5aae54ba46ac3bc88161872b50f9513 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Thu, 11 Aug 2016 11:53:24 +0900
Subject: [PATCH 2/2] dfa: support not newline_anchor of regex

* src/dfa.c (char_context): Define the context used when
newline_anchor is not set.
(dfasyntax): Add argument newline_anchor.  Update all callers.
(lex): Use cached values to check whether each character is a letter
or not.
(charclass_context): Avoid hard-coding the context for the EOL byte.
(dfastate): Use cached values to check whether each character is a
newline, a letter, or neither.
(dfaexec_main): Define the transition taken after a newline is found
in the input, and the acceptance condition, when newline_anchor is
not set.
---
 src/dfa.c             |   51 +++++++++++++++++++++++++++++++-----------------
 src/dfa.h             |    2 +-
 src/dfasearch.c       |    2 +-
 tests/dfa-match-aux.c |    2 +-
 4 files changed, 36 insertions(+), 21 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 59bb3bc..1609ad6 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -681,9 +681,9 @@ unibyte_word_constituent (unsigned char c)
 }
 
 static int
-char_context (unsigned char c)
+char_context (unsigned char c, bool newline_anchor)
 {
-  if (c == eolbyte)
+  if (c == eolbyte && newline_anchor)
     return CTX_NEWLINE;
   if (unibyte_word_constituent (c))
     return CTX_LETTER;
@@ -692,7 +692,7 @@ char_context (unsigned char c)
 
 /* Entry point to set syntax options.  */
 void
-dfasyntax (reg_syntax_t bits, bool fold, unsigned char eol)
+dfasyntax (reg_syntax_t bits, bool fold, unsigned char eol, bool newline_anchor)
 {
   int i;
   syntax_bits_set = true;
@@ -709,7 +709,7 @@ dfasyntax (reg_syntax_t bits, bool fold, unsigned char eol)
       mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF;
 
       /* Now that mbrtowc_cache[uc] is set, use it to calculate sbit.  */
-      sbit[uc] = char_context (uc);
+      sbit[uc] = char_context (uc, newline_anchor);
       switch (sbit[uc])
         {
         case CTX_LETTER:
@@ -1486,7 +1486,7 @@ lex (void)
             {
               zeroset (ccl);
               for (c2 = 0; c2 < NOTCHAR; ++c2)
-                if (unibyte_word_constituent (c2))
+                if (sbit[c2] == CTX_LETTER)
                   setbit (c2, ccl);
               if (c == 'W')
                 notset (ccl);
@@ -2221,11 +2221,10 @@ charclass_context (charclass c)
   int context = 0;
   unsigned int j;
 
-  if (tstbit (eolbyte, c))
-    context |= CTX_NEWLINE;
-
   for (j = 0; j < CHARCLASS_WORDS; ++j)
     {
+      if (c[j] & newline[j])
+        context |= CTX_NEWLINE;
       if (c[j] & letters[j])
         context |= CTX_LETTER;
       if (c[j] & ~(letters[j] | newline[j]))
@@ -2736,8 +2735,9 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         state_letter = state;
 
       for (i = 0; i < NOTCHAR; ++i)
-        trans[i] = unibyte_word_constituent (i) ? state_letter : state;
-      trans[eolbyte] = state_newline;
+        trans[i] = sbit[i] == CTX_LETTER ? state_letter : state;
+      if (sbit[eolbyte] == CTX_NEWLINE)
+        trans[eolbyte] = state_newline;
     }
   else
     for (i = 0; i < NOTCHAR; ++i)
@@ -2840,12 +2840,21 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
             {
               int c = j * CHARCLASS_WORD_BITS + k;
 
-              if (c == eolbyte)
-                trans[c] = state_newline;
-              else if (unibyte_word_constituent (c))
-                trans[c] = state_letter;
-              else if (c < NOTCHAR)
-                trans[c] = state;
+              if (c >= NOTCHAR)
+                break;
+
+              switch (sbit[c])
+                {
+                case CTX_NEWLINE:
+                  trans[c] = state_newline;
+                  break;
+                case CTX_LETTER:
+                  trans[c] = state_letter;
+                  break;
+                default:
+                  trans[c] = state;
+                  break;
+                }
             }
     }
 
@@ -3276,11 +3285,17 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
           nlcount++;
           mbp = p;
 
-          s = allow_nl ? d->newlines[s1] : 0;
+          s = (allow_nl ? d->newlines[s1]
+               : (sbit[eol] == CTX_NEWLINE ? 0
+                  : (sbit[eol] == CTX_LETTER ? d->min_trcount - 1
+                     : d->initstate_notbol)));
         }
       else if (d->fails[s])
         {
-          if (d->success[s] & sbit[*p])
+          if (d->success[s] & sbit[*p]
+              || ((char *) p == end
+                  && ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NEWLINE, s,
+                                         *d)))
             goto done;
 
           if (multibyte && s < d->min_trcount)
diff --git a/src/dfa.h b/src/dfa.h
index 60da0e4..0e259bf 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -53,7 +53,7 @@ extern void dfamustfree (struct dfamust *);
 /* dfasyntax() takes three arguments; the first sets the syntax bits described
    earlier in this file, the second sets the case-folding flag, and the
    third specifies the line terminator. */
-extern void dfasyntax (reg_syntax_t, bool, unsigned char);
+extern void dfasyntax (reg_syntax_t, bool, unsigned char, bool);
 
 /* Compile the given string of the given length into the given struct dfa.
    Final argument is a flag specifying whether to build a searching or an
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 9a523c8..17d6a74 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -128,7 +128,7 @@ GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
   if (match_icase)
     syntax_bits |= RE_ICASE;
   re_set_syntax (syntax_bits);
-  dfasyntax (syntax_bits, match_icase, eolbyte);
+  dfasyntax (syntax_bits, match_icase, eolbyte, true);
 
   /* For GNU regex, pass the patterns separately to detect errors like
      "[\nallo\n]\n", where the patterns are "[", "allo" and "]", and
diff --git a/tests/dfa-match-aux.c b/tests/dfa-match-aux.c
index af933ff..f8db72c 100644
--- a/tests/dfa-match-aux.c
+++ b/tests/dfa-match-aux.c
@@ -54,7 +54,7 @@ main (int argc, char **argv)
 
   setlocale (LC_ALL, "");
 
-  dfasyntax (RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0, '\n');
+  dfasyntax (RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0, '\n', 1);
   dfa = dfaalloc ();
   dfacomp (argv[1], strlen (argv[1]), dfa, 0);
 
-- 
1.7.1

Reply via email to