On Mon, Nov 24, 2014 at 5:15 PM, Norihiro Tanaka <nori...@kcn.ne.jp> wrote: > DFA trys to find a long sequence of characters that must appear in any > line containing the r.e. in dfamust() However, if a pattern is long, it > is very slow, as it processes all characters step by step. This change > makes a string concatenated some normal characters process at a time. > > Following test case is posted in bug#15191. It speeds-up more than 60x. > > http://debbugs.gnu.org/cgi/bugreport.cgi?bug=15191#5 > > << before changed >> > $ time -p grep -f regex.re input_lines.txt > real 1.28 > user 1.27 > sys 0.00 > > << after changed >> > $ time -p grep -f regex.re input_lines.txt > real 0.02 > user 0.01 > sys 0.00
Thank you for that patch. I have rebased it and made some small improvements: I combined an if+do loop into a single for-loop and moved some declarations "down". I constructed a reproducer that does not require two large inputs to demonstrate the performance improvement. I've also begun to reword the commit log, but am out of time for this evening, so will post this here, for now. I am still trying to convince myself that this is a strict improvement, i.e., that the O(N^2) strstr calls avoided by this change served no purpose.
From 763af945ee2365ed57e4d0fdf1cf47f25db1a6ec Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka <nori...@kcn.ne.jp> Date: Mon, 17 Nov 2014 08:26:53 +0900 Subject: [PATCH] dfa: speed up handling of long pattern DFA tries to find a long sequence of characters that must appear in any matching line. However, when a pattern is long (length N), it is very slow, because it makes O(N^2) strstr calls. This change reduces that to O(N) by processing each sequence of adjacent "regular" characters as a group. * src/dfa.c (dfamust): Process a string concatenated normal characters at a time. * NEWS (Improvement): Mention it. Prompted by a bug report and patch by Ivan Yanikov in http://bugs.gnu.org/15191#5 Compare the run times of this command before and after this change: (on a i7-4770S CPU @ 3.10GHz using rawhide (~fedora 22) and compiled with gcc 6.0.0 20150627) env time -f %e grep -f <(seq -s '' 9999 ) <<<1 Before: 0.85 After: 0.02 --- NEWS | 2 ++ src/dfa.c | 53 +++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 41 insertions(+), 14 deletions(-) diff --git a/NEWS b/NEWS index 35c4aad..fb398c4 100644 --- a/NEWS +++ b/NEWS @@ -22,6 +22,8 @@ GNU grep NEWS -*- outline -*- 'grep -D skip PATTERN FILE' no longer hangs if FILE is a fifo. [bug introduced in grep-2.12] + Performance has improved for patterns containing very long strings. + * Noteworthy changes in release 2.21 (2014-11-23) [stable] diff --git a/src/dfa.c b/src/dfa.c index 2f82282..381676d 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -3967,13 +3967,13 @@ struct must }; static must * -allocmust (must *mp) +allocmust (must *mp, size_t size) { must *new_mp = xmalloc (sizeof *new_mp); new_mp->in = xzalloc (sizeof *new_mp->in); - new_mp->left = xzalloc (2); - new_mp->right = xzalloc (2); - new_mp->is = xzalloc (2); + new_mp->left = xzalloc (size); + new_mp->right = xzalloc (size); + new_mp->is = xzalloc (size); new_mp->begline = false; new_mp->endline = false; new_mp->prev = mp; @@ -4006,23 +4006,22 @@ dfamust (struct dfa const *d) { must *mp = NULL; char const *result = ""; - size_t ri; size_t i; bool exact = false; bool begline = false; bool endline = false; - for (ri = 0; ri < d->tindex; ++ri) + for (size_t ri = 0; ri < d->tindex; ++ri) { token t = d->tokens[ri]; switch (t) { case BEGLINE: - mp = allocmust (mp); + mp = allocmust (mp, 2); mp->begline = true; break; case ENDLINE: - mp = allocmust (mp); + mp = allocmust (mp, 2); mp->endline = true; break; case LPAREN: @@ -4037,7 +4036,7 @@ dfamust (struct dfa const *d) case BACKREF: case ANYCHAR: case MBCSET: - mp = allocmust (mp); + mp = allocmust (mp, 2); break; case STAR: @@ -4154,7 +4153,6 @@ dfamust (struct dfa const *d) goto done; default: - mp = allocmust (mp); if (CSET <= t) { /* If T is a singleton, or if case-folding in a unibyte @@ -4167,7 +4165,10 @@ dfamust (struct dfa const *d) if (tstbit (j, *ccl)) break; if (! (j < NOTCHAR)) - break; + { + mp = allocmust (mp, 2); + break; + } t = j; while (++j < NOTCHAR) if (tstbit (j, *ccl) @@ -4175,12 +4176,36 @@ dfamust (struct dfa const *d) && toupper (j) == toupper (t))) break; if (j < NOTCHAR) - break; + { + mp = allocmust (mp, 2); + break; + } } + + size_t rj = ri + 2; + if (d->tokens[ri + 1] == CAT) + { + for (; rj < d->tindex - 1; rj += 2) + { + if ((rj != ri && (d->tokens[rj] <= 0 + || NOTCHAR <= d->tokens[rj])) + || d->tokens[rj + 1] != CAT) + break; + } + } + mp = allocmust (mp, ((rj - ri) >> 1) + 1); mp->is[0] = mp->left[0] = mp->right[0] = case_fold && MB_CUR_MAX == 1 ? toupper (t) : t; - mp->is[1] = mp->left[1] = mp->right[1] = '\0'; - mp->in = enlist (mp->in, mp->is, 1); + + for (i = 1; ri + 2 < rj; i++) + { + ri += 2; + t = d->tokens[ri]; + mp->is[i] = mp->left[i] = mp->right[i] + = case_fold && MB_CUR_MAX == 1 ? toupper (t) : t; + } + mp->is[i] = mp->left[i] = mp->right[i] = '\0'; + mp->in = enlist (mp->in, mp->is, i - 1); break; } } -- 2.3.7