If a pattern includes an unsupported expression which is labeled as
BACKREF, a text will almost fail to match the pattern.

First patch makes changes to return immediately and avoid pidding search
if a pattern includes an unsupported expression.

Second patch removes word constraint support which is as labeled BEGWORD,
ENDWORD, LIMWORD and NOTLIMWORD for multibyte-locales including UTF-8
from DFA, as we use regex instead of DFA for a pattern including word
constraint so that it does not correctly treat word constraint.
From e2a434467a616d1da72f190176b90ba6d3ea578f Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Mon, 3 Nov 2014 08:12:40 +0900
Subject: [PATCH 1/2] dfa: avoid execution for a pattern including an
 unsupported expression

If a pattern includes unsupported expression by DFA matcher, the
search with the matcher will fail in most cases.  So dfaexec immediately
returns for the pattern.

src/dfa.c (struct dfa_state): Remove 'has_backref' and 'has_mbcset'.
All uses removed.
(dfaexec_main): Remove 'backref'.
All uses and callers removed.
(dfaexec_br): New function, always sets a backreference bit and returns
a beginning pointer in an input.  When a pattern includes any
unsupported expression by DFA matcher, the function is used.
(dfabackref): Test wether patterns has any unsupported expression by DFA
matcher or not.
(dfassbuild): Remove no longer unsed code.
(dfacomp): If A pattern has any unsupported expression by DFA matcher,
avoid analysis for the pattern.
---
 src/dfa.c | 68 +++++++++++++++++++++++++++++++--------------------------------
 1 file changed, 33 insertions(+), 35 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 806cb04..49c9505 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -284,8 +284,6 @@ typedef struct
   size_t hash;                  /* Hash of the positions of this state.  */
   position_set elems;           /* Positions this state could match.  */
   unsigned char context;        /* Context from previous state.  */
-  bool has_backref;            /* This state matches a \<digit>.  */
-  bool has_mbcset;             /* This state matches a MBCSET.  */
   unsigned short constraint;    /* Constraint for this state to accept.  */
   token first_end;              /* Token value of the first END in elems.  */
   position_set mbps;            /* Positions which can match multibyte
@@ -2152,8 +2150,6 @@ state_index (struct dfa *d, position_set const *s, int 
context)
   alloc_position_set (&d->states[i].elems, s->nelem);
   copy (s, &d->states[i].elems);
   d->states[i].context = context;
-  d->states[i].has_backref = false;
-  d->states[i].has_mbcset = false;
   d->states[i].constraint = 0;
   d->states[i].first_end = 0;
   d->states[i].mbps.nelem = 0;
@@ -2169,10 +2165,7 @@ state_index (struct dfa *d, position_set const *s, int 
context)
           d->states[i].first_end = d->tokens[s->elems[j].index];
       }
     else if (d->tokens[s->elems[j].index] == BACKREF)
-      {
-        d->states[i].constraint = NO_CONSTRAINT;
-        d->states[i].has_backref = true;
-      }
+      d->states[i].constraint = NO_CONSTRAINT;
 
   ++d->sindex;
 
@@ -2627,9 +2620,6 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
           if (d->tokens[pos.index] == MBCSET
               || d->tokens[pos.index] == ANYCHAR)
             {
-              /* MB_CUR_MAX > 1 */
-              if (d->tokens[pos.index] == MBCSET)
-                d->states[s].has_mbcset = true;
               /* ANYCHAR and MBCSET must match with a single character, so we
                  must put it to d->states[s].mbps, which contains the positions
                  which can match with a single character not a byte.  */
@@ -3304,8 +3294,8 @@ skip_remains_mb (struct dfa *d, unsigned char const *p,
    encoding-error bytes.  Otherwise, the input consists of single-byte
    characters.  */
 static inline char *
-dfaexec_main (struct dfa *d, char const *begin, char *end,
-             int allow_nl, size_t *count, int *backref, bool multibyte)
+dfaexec_main (struct dfa *d, char const *begin, char *end, int allow_nl,
+             size_t *count, bool multibyte)
 {
   state_num s, s1;              /* Current state.  */
   unsigned char const *p, *mbp; /* Current input character.  */
@@ -3395,16 +3385,6 @@ dfaexec_main (struct dfa *d, char const *begin, char 
*end,
                  Use a macro to avoid the risk that they diverge.  */
 #define State_transition()                                              \
   do {                                                                  \
-              /* Falling back to the glibc matcher in this case gives   \
-                 better performance (up to 25% better on [a-z], for     \
-                 example) and enables support for collating symbols and \
-                 equivalence classes.  */                               \
-              if (d->states[s].has_mbcset && backref)                   \
-                {                                                       \
-                  *backref = 1;                                         \
-                  goto done;                                            \
-                }                                                       \
-                                                                        \
               /* Can match with a multibyte character (and multi-character \
                  collating element).  Transition table might be updated.  */ \
               s = transit_state (d, s, &p, (unsigned char *) end);      \
@@ -3478,11 +3458,7 @@ dfaexec_main (struct dfa *d, char const *begin, char 
*end,
       if (d->fails[s])
         {
           if (d->success[s] & sbit[*p])
-            {
-              if (backref)
-                *backref = d->states[s].has_backref;
-              goto done;
-            }
+            goto done;
 
           s1 = s;
           if (multibyte)
@@ -3512,14 +3488,22 @@ static char *
 dfaexec_mb (struct dfa *d, char const *begin, char *end,
             int allow_nl, size_t *count, int *backref)
 {
-  return dfaexec_main (d, begin, end, allow_nl, count, backref, true);
+  return dfaexec_main (d, begin, end, allow_nl, count, true);
 }
 
 static char *
 dfaexec_sb (struct dfa *d, char const *begin, char *end,
             int allow_nl, size_t *count, int *backref)
 {
-  return dfaexec_main (d, begin, end, allow_nl, count, backref, false);
+  return dfaexec_main (d, begin, end, allow_nl, count, false);
+}
+
+static char *
+dfaexec_br (struct dfa *d, char const *begin, char *end,
+            int allow_nl, size_t *count, int *backref)
+{
+  *backref = 1;
+  return (char *) begin;
 }
 
 /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, BACKREF, D->multibyte),
@@ -3585,6 +3569,19 @@ dfainit (struct dfa *d)
   d->fast = !d->multibyte;
 }
 
+bool
+dfabackref (struct dfa *d)
+{
+  size_t i;
+  for (i = 0; i < d->tindex; i++)
+    if (d->tokens[i] == BACKREF || d->tokens[i] == MBCSET)
+      {
+        d->dfaexec = dfaexec_br;
+        return true;
+      }
+  return false;
+}
+
 static void
 dfaoptimize (struct dfa *d)
 {
@@ -3683,10 +3680,8 @@ dfassbuild (struct dfa *d)
           if (d->multibyte)
             {
               /* These constraints aren't supported in a multibyte locale.
-                 Ignore them in the superset DFA, and treat them as
-                 backreferences in the main DFA.  */
+                 Ignore them in the superset DFA.  */
               sup->tokens[j++] = EMPTY;
-              d->tokens[i] = BACKREF;
               break;
             }
         default:
@@ -3717,8 +3712,11 @@ dfacomp (char const *s, size_t len, struct dfa *d, int 
searchflag)
   dfaparse (s, len, d);
   dfamust (d);
   dfassbuild (d);
-  dfaoptimize (d);
-  dfaanalyze (d, searchflag);
+  if (!dfabackref (d))
+    {
+      dfaoptimize (d);
+      dfaanalyze (d, searchflag);
+    }
   if (d->superset)
     {
       d->fast = true;
-- 
2.2.0

From 5b2b45a50463acad08b403594612d43377271642 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Sun, 7 Dec 2014 20:16:41 +0900
Subject: [PATCH 2/2] dfa: remove a support of a word delimiter expression for
 non-multibyte locales

DFA supports a word delimiter expression, but it does not behave
correctly for non-multibyte locales.  Even if it is fixed, will not be
faster than regex.  So remove a support of a word delimiter expression
for non-multibyte locales from DFA.

* src/dfa.c (dfabackref): If a pattern has any word delimiter expression
for non-multibyte locales, sets a backreference bit and returns.
---
 src/dfa.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/dfa.c b/src/dfa.c
index 49c9505..adc397e 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3574,8 +3574,18 @@ dfabackref (struct dfa *d)
 {
   size_t i;
   for (i = 0; i < d->tindex; i++)
-    if (d->tokens[i] == BACKREF || d->tokens[i] == MBCSET)
+    switch (d->tokens[i])
       {
+      case BEGWORD:
+      case ENDWORD:
+      case LIMWORD:
+      case NOTLIMWORD:
+        if (!d->multibyte)
+          continue;
+        /* fallthrough */
+
+      case BACKREF:
+      case MBCSET:
         d->dfaexec = dfaexec_br;
         return true;
       }
-- 
2.2.0

Reply via email to