On Mon, Nov 24, 2014 at 5:15 PM, Norihiro Tanaka <nori...@kcn.ne.jp> wrote:
> DFA trys to find a long sequence of characters that must appear in any
> line containing the r.e. in dfamust()  However, if a pattern is long, it
> is very slow, as it processes all characters step by step.  This change
> makes a string concatenated some normal characters process at a time.
>
> Following test case is posted in bug#15191.  It speeds-up more than 60x.
>
> http://debbugs.gnu.org/cgi/bugreport.cgi?bug=15191#5
>
> << before changed >>
>   $ time -p grep -f regex.re input_lines.txt
>   real 1.28
>   user 1.27
>   sys 0.00
>
> << after changed >>
>   $ time -p grep -f regex.re input_lines.txt
>   real 0.02
>   user 0.01
>   sys 0.00

Thank you for that patch.
I have rebased it and made some small improvements:
I combined an if+do loop into a single for-loop and moved
some declarations "down". I constructed a reproducer that
does not require two large inputs to demonstrate the
performance improvement.

I've also begun to reword the commit log, but am out of time
for this evening, so will post this here, for now.

I am still trying to convince myself that this is a strict
improvement, i.e., that the O(N^2) strstr calls avoided
by this change served no purpose.
From 763af945ee2365ed57e4d0fdf1cf47f25db1a6ec Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Mon, 17 Nov 2014 08:26:53 +0900
Subject: [PATCH] dfa: speed up handling of long pattern

DFA tries to find a long sequence of characters that must appear
in any matching line.  However, when a pattern is long (length N),
it is very slow, because it makes O(N^2) strstr calls.
This change reduces that to O(N) by processing each sequence of
adjacent "regular" characters as a group.

* src/dfa.c (dfamust): Process a string concatenated normal
characters at a time.
* NEWS (Improvement): Mention it.
Prompted by a bug report and patch by Ivan Yanikov
in http://bugs.gnu.org/15191#5

Compare the run times of this command before and after this change:
(on a i7-4770S CPU @ 3.10GHz using rawhide (~fedora 22) and compiled
with gcc 6.0.0 20150627)
  env time -f %e grep -f <(seq -s '' 9999 ) <<<1
  Before: 0.85
   After: 0.02
---
 NEWS      |  2 ++
 src/dfa.c | 53 +++++++++++++++++++++++++++++++++++++++--------------
 2 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/NEWS b/NEWS
index 35c4aad..fb398c4 100644
--- a/NEWS
+++ b/NEWS
@@ -22,6 +22,8 @@ GNU grep NEWS                                    -*- outline -*-
   'grep -D skip PATTERN FILE' no longer hangs if FILE is a fifo.
   [bug introduced in grep-2.12]

+  Performance has improved for patterns containing very long strings.
+

 * Noteworthy changes in release 2.21 (2014-11-23) [stable]

diff --git a/src/dfa.c b/src/dfa.c
index 2f82282..381676d 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3967,13 +3967,13 @@ struct must
 };

 static must *
-allocmust (must *mp)
+allocmust (must *mp, size_t size)
 {
   must *new_mp = xmalloc (sizeof *new_mp);
   new_mp->in = xzalloc (sizeof *new_mp->in);
-  new_mp->left = xzalloc (2);
-  new_mp->right = xzalloc (2);
-  new_mp->is = xzalloc (2);
+  new_mp->left = xzalloc (size);
+  new_mp->right = xzalloc (size);
+  new_mp->is = xzalloc (size);
   new_mp->begline = false;
   new_mp->endline = false;
   new_mp->prev = mp;
@@ -4006,23 +4006,22 @@ dfamust (struct dfa const *d)
 {
   must *mp = NULL;
   char const *result = "";
-  size_t ri;
   size_t i;
   bool exact = false;
   bool begline = false;
   bool endline = false;

-  for (ri = 0; ri < d->tindex; ++ri)
+  for (size_t ri = 0; ri < d->tindex; ++ri)
     {
       token t = d->tokens[ri];
       switch (t)
         {
         case BEGLINE:
-          mp = allocmust (mp);
+          mp = allocmust (mp, 2);
           mp->begline = true;
           break;
         case ENDLINE:
-          mp = allocmust (mp);
+          mp = allocmust (mp, 2);
           mp->endline = true;
           break;
         case LPAREN:
@@ -4037,7 +4036,7 @@ dfamust (struct dfa const *d)
         case BACKREF:
         case ANYCHAR:
         case MBCSET:
-          mp = allocmust (mp);
+          mp = allocmust (mp, 2);
           break;

         case STAR:
@@ -4154,7 +4153,6 @@ dfamust (struct dfa const *d)
           goto done;

         default:
-          mp = allocmust (mp);
           if (CSET <= t)
             {
               /* If T is a singleton, or if case-folding in a unibyte
@@ -4167,7 +4165,10 @@ dfamust (struct dfa const *d)
                 if (tstbit (j, *ccl))
                   break;
               if (! (j < NOTCHAR))
-                break;
+                {
+                  mp = allocmust (mp, 2);
+                  break;
+                }
               t = j;
               while (++j < NOTCHAR)
                 if (tstbit (j, *ccl)
@@ -4175,12 +4176,36 @@ dfamust (struct dfa const *d)
                           && toupper (j) == toupper (t)))
                   break;
               if (j < NOTCHAR)
-                break;
+                {
+                  mp = allocmust (mp, 2);
+                  break;
+                }
             }
+
+          size_t rj = ri + 2;
+          if (d->tokens[ri + 1] == CAT)
+            {
+              for (; rj < d->tindex - 1; rj += 2)
+                {
+                  if ((rj != ri && (d->tokens[rj] <= 0
+                                    || NOTCHAR <= d->tokens[rj]))
+                      || d->tokens[rj + 1] != CAT)
+                    break;
+                }
+            }
+          mp = allocmust (mp, ((rj - ri) >> 1) + 1);
           mp->is[0] = mp->left[0] = mp->right[0]
             = case_fold && MB_CUR_MAX == 1 ? toupper (t) : t;
-          mp->is[1] = mp->left[1] = mp->right[1] = '\0';
-          mp->in = enlist (mp->in, mp->is, 1);
+
+          for (i = 1; ri + 2 < rj; i++)
+            {
+              ri += 2;
+              t = d->tokens[ri];
+              mp->is[i] = mp->left[i] = mp->right[i]
+                = case_fold && MB_CUR_MAX == 1 ? toupper (t) : t;
+            }
+          mp->is[i] = mp->left[i] = mp->right[i] = '\0';
+          mp->in = enlist (mp->in, mp->is, i - 1);
           break;
         }
     }
-- 
2.3.7

Reply via email to