bug#24262: [PATCH 2/2] dfa: support not newline_anchor of regex

Norihiro Tanaka Fri, 19 Aug 2016 08:54:43 -0700

On Thu, 18 Aug 2016 23:57:27 +0900
Norihiro Tanaka <nori...@kcn.ne.jp> wrote:


> The patch introduces not newline_anchor option of regex to dfa.  grep is
> always newline_anchor, so newer codes is not used.  I expect it is used
> by sed and gawk.
> 
> However, the patch adds an argument to dfasyntax().   To synchronize
> between grep and dfa easily, I expect it is applied before dfa is moved
> to gnulib.

Applying the patch, if we generate multiple dfa instance, three global
vars. (sbit, letters, newline) may be overrided.  So I moved them to
member of struct dfa in modified patch.

From ec2fcd6e4386c7e1ce34883f643cdff71552a9d6 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Thu, 11 Aug 2016 11:53:24 +0900
Subject: [PATCH 2/2] dfa: support not newline_anchor of regex

* src/dfa.c (struct dfa): Add member sbit, letters and newline.
(sbit, letters, newline): Remove global vars.
(newline_anchor): Add new global var.
(char_context): Define context for not newline_anchor.
(dfasyntax): Add argument newline_anchor.  Update prototype and all
callers.  And remove checking context.
(lex): Use cached values to check whether each character is letter or
not.
(charclass_context): Avoid context from hard-coded for EOL byte
(dfastate): Use cached values to check whether each character is
newline, letter or none.
letter or not.
(dfaexec_main): Define transition after found newline in input and
accepted condition for not newline_anchor.
(dfainit): Add checking context.
---
 src/dfa.c             |  118 +++++++++++++++++++++++++++++--------------------
 src/dfa.h             |    2 +-
 src/dfasearch.c       |    2 +-
 tests/dfa-match-aux.c |    2 +-
 4 files changed, 73 insertions(+), 51 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 59bb3bc..7db96e1 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -316,6 +316,11 @@ struct mb_char_classes
 /* A compiled regular expression.  */
 struct dfa
 {
+   /* Fields filled by the initializer.  */
+   int sbit[NOTCHAR];           /* Cache of char-context values.  */
+   charclass letters;           /* Set of characters considered letters.  */
+   charclass newline;           /* Set of characters that are newline.  */
+
   /* Fields filled by the scanner.  */
   charclass *charclasses;       /* Array of character sets for CSET tokens.  */
   size_t cindex;                /* Index for adding new charclasses.  */
@@ -661,18 +666,12 @@ static bool case_fold;
 /* End-of-line byte in data.  */
 static unsigned char eolbyte;
 
-/* Cache of char-context values.  */
-static int sbit[NOTCHAR];
-
 /* If never_trail[B], the byte B cannot be a non-initial byte in a
    multibyte character.  */
 static bool never_trail[NOTCHAR];
 
-/* Set of characters considered letters.  */
-static charclass letters;
-
-/* Set of characters that are newline.  */
-static charclass newline;
+/* Whether ^ and $ match newline or not.  */
+static bool newline_anchor;
 
 static bool
 unibyte_word_constituent (unsigned char c)
@@ -683,7 +682,7 @@ unibyte_word_constituent (unsigned char c)
 static int
 char_context (unsigned char c)
 {
-  if (c == eolbyte)
+  if (c == eolbyte && newline_anchor)
     return CTX_NEWLINE;
   if (unibyte_word_constituent (c))
     return CTX_LETTER;
@@ -692,13 +691,14 @@ char_context (unsigned char c)
 
 /* Entry point to set syntax options.  */
 void
-dfasyntax (reg_syntax_t bits, bool fold, unsigned char eol)
+dfasyntax (reg_syntax_t bits, bool fold, unsigned char eol, bool anchor)
 {
   int i;
   syntax_bits_set = true;
   syntax_bits = bits;
   case_fold = fold;
   eolbyte = eol;
+  newline_anchor = anchor;
 
   for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
     {
@@ -707,23 +707,6 @@ dfasyntax (reg_syntax_t bits, bool fold, unsigned char eol)
       mbstate_t s = { 0 };
       wchar_t wc;
       mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF;
-
-      /* Now that mbrtowc_cache[uc] is set, use it to calculate sbit.  */
-      sbit[uc] = char_context (uc);
-      switch (sbit[uc])
-        {
-        case CTX_LETTER:
-          setbit (uc, letters);
-          break;
-        case CTX_NEWLINE:
-          setbit (uc, newline);
-          break;
-        }
-
-      /* POSIX requires that the five bytes in "\n\r./" (including the
-         terminating NUL) cannot occur inside a multibyte character.  */
-      never_trail[uc] = (using_utf8 () ? (uc & 0xc0) != 0x80
-                         : strchr ("\n\r./", uc) != NULL);
     }
 }
 
@@ -1486,7 +1469,7 @@ lex (void)
             {
               zeroset (ccl);
               for (c2 = 0; c2 < NOTCHAR; ++c2)
-                if (unibyte_word_constituent (c2))
+                if (dfa->sbit[c2] == CTX_LETTER)
                   setbit (c2, ccl);
               if (c == 'W')
                 notset (ccl);
@@ -2216,19 +2199,18 @@ epsclosure (position_set *s, struct dfa const *d, char 
*visited)
    character included in C.  */
 
 static int
-charclass_context (charclass c)
+charclass_context (struct dfa *d, charclass c)
 {
   int context = 0;
   unsigned int j;
 
-  if (tstbit (eolbyte, c))
-    context |= CTX_NEWLINE;
-
   for (j = 0; j < CHARCLASS_WORDS; ++j)
     {
-      if (c[j] & letters[j])
+      if (c[j] & d->newline[j])
+        context |= CTX_NEWLINE;
+      if (c[j] & d->letters[j])
         context |= CTX_LETTER;
-      if (c[j] & ~(letters[j] | newline[j]))
+      if (c[j] & ~(d->letters[j] | d->newline[j]))
         context |= CTX_NONE;
     }
 
@@ -2623,15 +2605,15 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
           if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
                                     d->states[s].context, CTX_NEWLINE))
             for (j = 0; j < CHARCLASS_WORDS; ++j)
-              matches[j] &= ~newline[j];
+              matches[j] &= ~d->newline[j];
           if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
                                     d->states[s].context, CTX_LETTER))
             for (j = 0; j < CHARCLASS_WORDS; ++j)
-              matches[j] &= ~letters[j];
+              matches[j] &= ~d->letters[j];
           if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
                                     d->states[s].context, CTX_NONE))
             for (j = 0; j < CHARCLASS_WORDS; ++j)
-              matches[j] &= letters[j] | newline[j];
+              matches[j] &= d->letters[j] | d->newline[j];
 
           /* If there are no characters left, there's no point in going on.  */
           for (j = 0; j < CHARCLASS_WORDS && !matches[j]; ++j)
@@ -2736,8 +2718,9 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         state_letter = state;
 
       for (i = 0; i < NOTCHAR; ++i)
-        trans[i] = unibyte_word_constituent (i) ? state_letter : state;
-      trans[eolbyte] = state_newline;
+        trans[i] = d->sbit[i] == CTX_LETTER ? state_letter : state;
+      if (d->sbit[eolbyte] == CTX_NEWLINE)
+        trans[eolbyte] = state_newline;
     }
   else
     for (i = 0; i < NOTCHAR; ++i)
@@ -2793,7 +2776,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         }
 
       /* Find out if the new state will want any context information.  */
-      possible_contexts = charclass_context (labels[i]);
+      possible_contexts = charclass_context (d, labels[i]);
       separate_contexts = state_separate_contexts (&follows);
 
       /* Find the state(s) corresponding to the union of the follows.  */
@@ -2840,12 +2823,21 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
             {
               int c = j * CHARCLASS_WORD_BITS + k;
 
-              if (c == eolbyte)
-                trans[c] = state_newline;
-              else if (unibyte_word_constituent (c))
-                trans[c] = state_letter;
-              else if (c < NOTCHAR)
-                trans[c] = state;
+              if (c >= NOTCHAR)
+                break;
+
+              switch (d->sbit[c])
+                {
+                case CTX_NEWLINE:
+                  trans[c] = state_newline;
+                  break;
+                case CTX_LETTER:
+                  trans[c] = state_letter;
+                  break;
+                default:
+                  trans[c] = state;
+                  break;
+                }
             }
     }
 
@@ -3276,11 +3268,17 @@ dfaexec_main (struct dfa *d, char const *begin, char 
*end, bool allow_nl,
           nlcount++;
           mbp = p;
 
-          s = allow_nl ? d->newlines[s1] : 0;
+          s = (allow_nl ? d->newlines[s1]
+               : (d->sbit[eol] == CTX_NEWLINE ? 0
+                  : (d->sbit[eol] == CTX_LETTER ? d->min_trcount - 1
+                     : d->initstate_notbol)));
         }
       else if (d->fails[s])
         {
-          if (d->success[s] & sbit[*p])
+          if (d->success[s] & d->sbit[*p]
+              || ((char *) p == end
+                  && ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NEWLINE, s,
+                                         *d)))
             goto done;
 
           if (multibyte && s < d->min_trcount)
@@ -3397,10 +3395,34 @@ free_mbdata (struct dfa *d)
 static void
 dfainit (struct dfa *d)
 {
+  int i;
+
   memset (d, 0, sizeof *d);
   d->multibyte = MB_CUR_MAX > 1;
   d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb;
   d->fast = !d->multibyte;
+
+  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
+    {
+      unsigned char uc = i;
+
+      /* Now that mbrtowc_cache[uc] is set, use it to calculate sbit.  */
+      d->sbit[uc] = char_context (uc);
+      switch (d->sbit[uc])
+        {
+        case CTX_LETTER:
+          setbit (uc, d->letters);
+          break;
+        case CTX_NEWLINE:
+          setbit (uc, d->newline);
+          break;
+        }
+
+      /* POSIX requires that the five bytes in "\n\r./" (including the
+         terminating NUL) cannot occur inside a multibyte character.  */
+      never_trail[uc] = (using_utf8 () ? (uc & 0xc0) != 0x80
+                         : strchr ("\n\r./", uc) != NULL);
+    }
 }
 
 /* Return true if every construct in D is supported by this DFA matcher.  */
diff --git a/src/dfa.h b/src/dfa.h
index 60da0e4..0e259bf 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -53,7 +53,7 @@ extern void dfamustfree (struct dfamust *);
 /* dfasyntax() takes three arguments; the first sets the syntax bits described
    earlier in this file, the second sets the case-folding flag, and the
    third specifies the line terminator. */
-extern void dfasyntax (reg_syntax_t, bool, unsigned char);
+extern void dfasyntax (reg_syntax_t, bool, unsigned char, bool);
 
 /* Compile the given string of the given length into the given struct dfa.
    Final argument is a flag specifying whether to build a searching or an
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 9a523c8..17d6a74 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -128,7 +128,7 @@ GEAcompile (char const *pattern, size_t size, reg_syntax_t 
syntax_bits)
   if (match_icase)
     syntax_bits |= RE_ICASE;
   re_set_syntax (syntax_bits);
-  dfasyntax (syntax_bits, match_icase, eolbyte);
+  dfasyntax (syntax_bits, match_icase, eolbyte, true);
 
   /* For GNU regex, pass the patterns separately to detect errors like
      "[\nallo\n]\n", where the patterns are "[", "allo" and "]", and
diff --git a/tests/dfa-match-aux.c b/tests/dfa-match-aux.c
index af933ff..f8db72c 100644
--- a/tests/dfa-match-aux.c
+++ b/tests/dfa-match-aux.c
@@ -54,7 +54,7 @@ main (int argc, char **argv)
 
   setlocale (LC_ALL, "");
 
-  dfasyntax (RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0, '\n');
+  dfasyntax (RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0, '\n', 1);
   dfa = dfaalloc ();
   dfacomp (argv[1], strlen (argv[1]), dfa, 0);
 
-- 
1.7.1

bug#24262: [PATCH 2/2] dfa: support not newline_anchor of regex

Reply via email to