On Sat, 18 Apr 2020 00:22:26 +0900 Norihiro Tanaka <nori...@kcn.ne.jp> wrote:
> > On Fri, 17 Apr 2020 10:24:42 +0900 > Norihiro Tanaka <nori...@kcn.ne.jp> wrote: > > > > > On Fri, 17 Apr 2020 09:35:36 +0900 > > Norihiro Tanaka <nori...@kcn.ne.jp> wrote: > > > > > > > > On Thu, 16 Apr 2020 16:00:29 -0700 > > > Paul Eggert <egg...@cs.ucla.edu> wrote: > > > > > > > On 4/16/20 3:53 PM, Norihiro Tanaka wrote: > > > > > > > > > I have had no idea to solve the problem yet. If we revert it, > > > > > bug#33357 > > > > > will come back. > > > > > > > > Yes, I'd rather not revert if we can help it. > > > > > > > > My own thought was to not analyze the regular expression if we discover > > > > that the input is empty. :-) > > > > > > Now, I have a idea, it is that we build indexes of epsilon nodes > > > including in follows before remove epsilon nodes. > > > > > > I wrote fix for the bug, but it will be slower then at grep 2.27 yet. > > It was improved previous patch. Sorry, correct patch is here.
From 8357fa551c8a5a4f14540b250bc2485c4812a234 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka <nori...@kcn.ne.jp> Date: Fri, 17 Apr 2020 10:12:01 +0900 Subject: [PATCH] dfa: build auxiliary indexes before remove epsilon closure When remove epsilon closure, so far searched nodes including epsilon closure in all nodes sequentially, but it is slow for some cases. Now build auxiliary indexes before search. Problem reported in: https://debbugs.gnu.org/cgi/bugreport.cgi?bug=40634 * lib/dfa.c (epsclosure): build auxiliary indexes before remove epsilon closure. --- lib/dfa.c | 129 +++++++++++++++++++++++++++++++++++++++++++++---------------- 1 files changed, 95 insertions(+), 34 deletions(-) diff --git a/lib/dfa.c b/lib/dfa.c index 9939d22..5f90a92 100644 --- a/lib/dfa.c +++ b/lib/dfa.c @@ -2298,45 +2298,106 @@ static void epsclosure (struct dfa const *d) { position_set tmp; + idx_t *currs, *backs; + idx_t ncurr = 0; + position_set *prevs; + alloc_position_set (&tmp, d->nleaves); + currs = xnmalloc (d->nleaves, sizeof *currs); + backs = xnmalloc (d->tindex, sizeof *backs); + for (idx_t i = 0; i < d->tindex; i++) - if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR - && d->tokens[i] != BACKREF && d->tokens[i] != ANYCHAR - && d->tokens[i] != MBCSET && d->tokens[i] < CSET) - { - unsigned int constraint; - switch (d->tokens[i]) - { - case BEGLINE: - constraint = BEGLINE_CONSTRAINT; - break; - case ENDLINE: - constraint = ENDLINE_CONSTRAINT; - break; - case BEGWORD: - constraint = BEGWORD_CONSTRAINT; - break; - case ENDWORD: - constraint = ENDWORD_CONSTRAINT; - break; - case LIMWORD: - constraint = LIMWORD_CONSTRAINT; - break; - case NOTLIMWORD: - constraint = NOTLIMWORD_CONSTRAINT; - break; - default: - constraint = NO_CONSTRAINT; - break; - } + { + if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR + && d->tokens[i] != ANYCHAR && d->tokens[i] != BEG + && d->tokens[i] != BACKREF && d->tokens[i] != MBCSET + && d->tokens[i] < CSET) + { + currs[ncurr] = i; + backs[i] = ncurr++; + } + else + backs[i] = -1; + } - delete (i, &d->follows[i]); + prevs = xnmalloc (ncurr, sizeof *prevs); - for (idx_t j = 0; j < d->tindex; j++) - if (i != j && d->follows[j].nelem > 0) - replace (&d->follows[j], i, &d->follows[i], constraint, &tmp); - } + for (idx_t i = 0; i < ncurr; i++) + { + prevs[i].elems = NULL; + prevs[i].nelem = 0; + } + + for (idx_t i = 0; i < d->tindex; i++) + { + for (idx_t j = 0, k = 0; j < d->follows[i].nelem && k < ncurr;) + { + if (d->follows[i].elems[j].index < currs[k]) + j++; + else if (currs[k] < d->follows[i].elems[j].index) + k++; + else + { + if (currs[k] != i) + { + position p; + p.index = i; + p.constraint = NO_CONSTRAINT; + if (prevs[k].nelem == 0) + alloc_position_set (&prevs[k], 1); + insert (p, &prevs[k]); + } + j++; + k++; + } + } + } + + for (idx_t i = 0; i < ncurr; i++) + { + unsigned int constraint; + switch (d->tokens[currs[i]]) + { + case BEGLINE: + constraint = BEGLINE_CONSTRAINT; + break; + case ENDLINE: + constraint = ENDLINE_CONSTRAINT; + break; + case BEGWORD: + constraint = BEGWORD_CONSTRAINT; + break; + case ENDWORD: + constraint = ENDWORD_CONSTRAINT; + break; + case LIMWORD: + constraint = LIMWORD_CONSTRAINT; + break; + case NOTLIMWORD: + constraint = NOTLIMWORD_CONSTRAINT; + break; + default: + constraint = NO_CONSTRAINT; + break; + } + + delete (currs[i], &d->follows[currs[i]]); + + for (idx_t j = 0; j < prevs[i].nelem; j++) + replace (&d->follows[prevs[i].elems[j].index], currs[i], + &d->follows[currs[i]], constraint, &tmp); + for (idx_t j = 0; j < d->follows[currs[i]].nelem; j++) + if (backs[d->follows[currs[i]].elems[j].index] >= 0) + replace (&prevs[backs[d->follows[currs[i]].elems[j].index]], + currs[i], &prevs[i], NO_CONSTRAINT, &tmp); + } + + for (idx_t i = 0; i < ncurr; i++) + free (prevs[i].elems); + free (prevs); free (tmp.elems); + free (currs); + free (backs); } /* Returns the set of contexts for which there is at least one -- 1.7.1