On Sun, 19 Jul 2015 05:24:01 -0600
arn...@skeeve.com wrote:

> I have a minor suggestion, which is to make dfabackref into a
> static function.

dfabackref() is not called from external, so it should be static, as you
say.  I fixed it.

Thanks.
From d928dd7f791cab38b9f852d03ee1d63e3f4464aa Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Mon, 3 Nov 2014 08:12:40 +0900
Subject: [PATCH 1/2] dfa: avoid execution for a pattern including an
 unsupported expression

If a pattern includes unsupported expression by DFA matcher, the
search with the matcher will fail in most cases.  So dfaexec immediately
returns for the pattern.

src/dfa.c (struct dfa_state): Remove 'has_backref' and 'has_mbcset'.
All uses removed.
(dfaexec_main): Remove 'backref'.
All uses and callers removed.
(dfaexec_br): New function, always sets a backreference bit and returns
a beginning pointer in an input.  When a pattern includes any
unsupported expression by DFA matcher, the function is used.
(dfabackref): Test wether patterns has any unsupported expression by DFA
matcher or not.
(dfassbuild): Remove no longer unsed code.
(dfacomp): If A pattern has any unsupported expression by DFA matcher,
avoid analysis for the pattern.
---
 src/dfa.c | 68 +++++++++++++++++++++++++++++++--------------------------------
 1 file changed, 33 insertions(+), 35 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index f9b3da9..7a0d4f6 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -282,8 +282,6 @@ typedef struct
   size_t hash;                  /* Hash of the positions of this state.  */
   position_set elems;           /* Positions this state could match.  */
   unsigned char context;        /* Context from previous state.  */
-  bool has_backref;            /* This state matches a \<digit>.  */
-  bool has_mbcset;             /* This state matches a MBCSET.  */
   unsigned short constraint;    /* Constraint for this state to accept.  */
   token first_end;              /* Token value of the first END in elems.  */
   position_set mbps;            /* Positions which can match multibyte
@@ -2146,8 +2144,6 @@ state_index (struct dfa *d, position_set const *s, int 
context)
   alloc_position_set (&d->states[i].elems, s->nelem);
   copy (s, &d->states[i].elems);
   d->states[i].context = context;
-  d->states[i].has_backref = false;
-  d->states[i].has_mbcset = false;
   d->states[i].constraint = 0;
   d->states[i].first_end = 0;
   d->states[i].mbps.nelem = 0;
@@ -2163,10 +2159,7 @@ state_index (struct dfa *d, position_set const *s, int 
context)
           d->states[i].first_end = d->tokens[s->elems[j].index];
       }
     else if (d->tokens[s->elems[j].index] == BACKREF)
-      {
-        d->states[i].constraint = NO_CONSTRAINT;
-        d->states[i].has_backref = true;
-      }
+      d->states[i].constraint = NO_CONSTRAINT;
 
   ++d->sindex;
 
@@ -2621,9 +2614,6 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
           if (d->tokens[pos.index] == MBCSET
               || d->tokens[pos.index] == ANYCHAR)
             {
-              /* MB_CUR_MAX > 1 */
-              if (d->tokens[pos.index] == MBCSET)
-                d->states[s].has_mbcset = true;
               /* ANYCHAR and MBCSET must match with a single character, so we
                  must put it to d->states[s].mbps, which contains the positions
                  which can match with a single character not a byte.  */
@@ -3298,8 +3288,8 @@ skip_remains_mb (struct dfa *d, unsigned char const *p,
    encoding-error bytes.  Otherwise, the input consists of single-byte
    characters.  */
 static inline char *
-dfaexec_main (struct dfa *d, char const *begin, char *end,
-             int allow_nl, size_t *count, int *backref, bool multibyte)
+dfaexec_main (struct dfa *d, char const *begin, char *end, int allow_nl,
+             size_t *count, bool multibyte)
 {
   state_num s, s1;              /* Current state.  */
   unsigned char const *p, *mbp; /* Current input character.  */
@@ -3389,16 +3379,6 @@ dfaexec_main (struct dfa *d, char const *begin, char 
*end,
                  Use a macro to avoid the risk that they diverge.  */
 #define State_transition()                                              \
   do {                                                                  \
-              /* Falling back to the glibc matcher in this case gives   \
-                 better performance (up to 25% better on [a-z], for     \
-                 example) and enables support for collating symbols and \
-                 equivalence classes.  */                               \
-              if (d->states[s].has_mbcset && backref)                   \
-                {                                                       \
-                  *backref = 1;                                         \
-                  goto done;                                            \
-                }                                                       \
-                                                                        \
               /* Can match with a multibyte character (and multi-character \
                  collating element).  Transition table might be updated.  */ \
               s = transit_state (d, s, &p, (unsigned char *) end);      \
@@ -3472,11 +3452,7 @@ dfaexec_main (struct dfa *d, char const *begin, char 
*end,
       if (d->fails[s])
         {
           if (d->success[s] & sbit[*p])
-            {
-              if (backref)
-                *backref = d->states[s].has_backref;
-              goto done;
-            }
+            goto done;
 
           s1 = s;
           if (multibyte)
@@ -3506,14 +3482,22 @@ static char *
 dfaexec_mb (struct dfa *d, char const *begin, char *end,
             int allow_nl, size_t *count, int *backref)
 {
-  return dfaexec_main (d, begin, end, allow_nl, count, backref, true);
+  return dfaexec_main (d, begin, end, allow_nl, count, true);
 }
 
 static char *
 dfaexec_sb (struct dfa *d, char const *begin, char *end,
             int allow_nl, size_t *count, int *backref)
 {
-  return dfaexec_main (d, begin, end, allow_nl, count, backref, false);
+  return dfaexec_main (d, begin, end, allow_nl, count, false);
+}
+
+static char *
+dfaexec_br (struct dfa *d, char const *begin, char *end,
+            int allow_nl, size_t *count, int *backref)
+{
+  *backref = 1;
+  return (char *) begin;
 }
 
 /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, BACKREF, D->multibyte),
@@ -3579,6 +3563,19 @@ dfainit (struct dfa *d)
   d->fast = !d->multibyte;
 }
 
+static bool
+dfabackref (struct dfa *d)
+{
+  size_t i;
+  for (i = 0; i < d->tindex; i++)
+    if (d->tokens[i] == BACKREF || d->tokens[i] == MBCSET)
+      {
+        d->dfaexec = dfaexec_br;
+        return true;
+      }
+  return false;
+}
+
 static void
 dfaoptimize (struct dfa *d)
 {
@@ -3676,10 +3673,8 @@ dfassbuild (struct dfa *d)
           if (d->multibyte)
             {
               /* These constraints aren't supported in a multibyte locale.
-                 Ignore them in the superset DFA, and treat them as
-                 backreferences in the main DFA.  */
+                 Ignore them in the superset DFA.  */
               sup->tokens[j++] = EMPTY;
-              d->tokens[i] = BACKREF;
               break;
             }
         default:
@@ -3709,8 +3704,11 @@ dfacomp (char const *s, size_t len, struct dfa *d, int 
searchflag)
   dfambcache (d);
   dfaparse (s, len, d);
   dfassbuild (d);
-  dfaoptimize (d);
-  dfaanalyze (d, searchflag);
+  if (!dfabackref (d))
+    {
+      dfaoptimize (d);
+      dfaanalyze (d, searchflag);
+    }
   if (d->superset)
     {
       d->fast = true;
-- 
2.2.0

Reply via email to