In multibyte locales, if a pattern start with period expression, matching is still slow, as transition table is built at run time, even when next character is single byte in input text.
This patch changes it into as use algorithm for single byte character to any single byte character in input text always. If transition table has been built already and a next character in input text is single byte, transit to next state by reference of only pre-built transition table, even if from a state including ANYCHAR. $ yes "$(printf 'a%038db\n' 0)" | head -1000000 >in $ env LC_ALL=C gcc -v Reading specs from /usr/local/lib/gcc/x86_64-pc-linux-gnu/4.4.7/specs Target: x86_64-pc-linux-gnu Configured with: ./configure --with-as=/usr/local/bin/as --with-ld=/usr/local/bin/ld --with-system-zlib --enable-__cxa_atexit Thread model: posix gcc version 4.4.7 (GCC) patch#21486 is required before this patch. grep will speed up by this patch additionaly. [grep-2.25] $ time -p env LC_ALL=ja_JP.eucjp grep .a.b in real 4.78 user 4.42 sys 0.16 $ time -p env LC_ALL=ja_JP.eucjp grep '.\{41\}' in real 46.23 user 43.98 sys 0.21 [after patch#21486] $ time -p env LC_ALL=ja_JP.eucjp src/grep .a.b in real 1.26 user 1.08 sys 0.08 $ time -p env LC_ALL=ja_JP.eucjp src/grep '.\{41\}' in real 1.14 user 1.00 sys 0.10 [after this patch] $ time -p env LC_ALL=ja_JP.eucjp src/grep .a.b in real 0.47 user 0.36 sys 0.07 $ time -p env LC_ALL=ja_JP.eucjp src/grep '.\{41\}' in real 0.24 user 0.18 sys 0.05 [locale C (ref.)] $ time -p env LC_ALL=C src/grep .a.b in real 0.23 user 0.11 sys 0.09 $ time -p env LC_ALL=C src/grep '.\{41\}' in real 0.22 user 0.13 sys 0.06
From 3d0c130808c974f1271561c7433b2aa661c49507 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka <nori...@kcn.ne.jp> Date: Sun, 10 Jul 2016 10:26:39 +0900 Subject: [PATCH] dfa: use algorithm for single byte character to any single byte character in input text always Even in non-UTF8 locales, if a character at current position in input text is single byte, we can use CSET to match ANYCHAR. * src/dfa.c (charclass_index_anychar): New var. Cache index of CSET for ANYCHAR. (lex): Make CSET for ANYCHAR. (state_index): Simplify. (dfastate): Consider CSET for ANYCHAR. (transit_state_singlebyte, transit_state): Remove handling for eolbyte, as we assume that eolbyte does not appear at current position. (dfaexec_main): use algorithm for single byte character to any single byte character in input text always. --- src/dfa.c | 115 ++++++++++++++++++++++++------------------------------------ 1 files changed, 46 insertions(+), 69 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 3bd0c68..e932b2c 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -79,6 +79,8 @@ enum CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS }; +static size_t charclass_index_anychar = -1; + /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */ typedef charclass_word charclass[CHARCLASS_WORDS]; @@ -1429,21 +1431,25 @@ lex (void) case '.': if (backslash) goto normal_char; - if (dfa->multibyte) + if (charclass_index_anychar == (size_t) -1) { - /* In multibyte environment period must match with a single - character not a byte. So we use ANYCHAR. */ - laststart = false; - return lasttok = ANYCHAR; + zeroset (ccl); + notset (ccl); + if (!(syntax_bits & RE_DOT_NEWLINE)) + clrbit ('\n', ccl); + if (syntax_bits & RE_DOT_NOT_NULL) + clrbit ('\0', ccl); + if (dfa->multibyte) + { + for (c2 = 0; c2 < NOTCHAR; ++c2) + if (mbrtowc_cache[c2] == WEOF) + clrbit (c2, ccl); + } + charclass_index_anychar = charclass_index (ccl); } - zeroset (ccl); - notset (ccl); - if (!(syntax_bits & RE_DOT_NEWLINE)) - clrbit ('\n', ccl); - if (syntax_bits & RE_DOT_NOT_NULL) - clrbit ('\0', ccl); laststart = false; - return lasttok = CSET + charclass_index (ccl); + return lasttok = + dfa->multibyte ? ANYCHAR : CSET + charclass_index_anychar; case 's': case 'S': @@ -2125,7 +2131,7 @@ state_index (struct dfa *d, position_set const *s, int context) } else if (d->tokens[s->elems[j].index] == BACKREF) d->states[i].constraint = NO_CONSTRAINT; - if (d->multibyte && d->tokens[s->elems[j].index] == ANYCHAR) + else if (d->tokens[s->elems[j].index] == ANYCHAR) { int acceptable = 0; if (SUCCEEDS_IN_CONTEXT (constraint, context, CTX_NEWLINE)) @@ -2588,14 +2594,16 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) setbit (d->tokens[pos.index], matches); else if (d->tokens[pos.index] >= CSET) copyset (d->charclasses[d->tokens[pos.index] - CSET], matches); - else if (d->multibyte && d->tokens[pos.index] == ANYCHAR) - /* ANYCHAR must match with a single character, so we must put - it to D->states[s].mbps which contains the positions which - can match with a single character not a byte. If all - positions which has ANYCHAR does not depend on context of - next character, we put the follows instead of it to - D->states[s].mbps to optimize. */ + else if (d->tokens[pos.index] == ANYCHAR) { + copyset (d->charclasses[charclass_index_anychar], matches); + + /* ANYCHAR must match with a single character, so we must put + it to D->states[s].mbps which contains the positions which + can match with a single character not a byte. If all + positions which has ANYCHAR does not depend on context of + next character, we put the follows instead of it to + D->states[s].mbps to optimize. */ if (d->states[s].curr_dependent) { if (d->states[s].mbps.nelem == 0) @@ -2606,9 +2614,11 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) d->states[s].context, CTX_ANY)) { if (d->states[s].mbps.nelem == 0) - alloc_position_set (&d->states[s].mbps, 1); + alloc_position_set (&d->states[s].mbps, + d->follows[pos.index].nelem); for (j = 0; j < d->follows[pos.index].nelem; ++j) - insert (d->follows[pos.index].elems[j], &(d->states[s].mbps)); + insert (d->follows[pos.index].elems[j], + &(d->states[s].mbps)); } } else @@ -2984,16 +2994,6 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp) { state_num *t; - if (**pp == eolbyte) - { - /* S is always an initial state in transit_state, so the - transition table for the state must have been built already. */ - assert (d->trans[s] || d->fails[s]); - - ++*pp; - return d->newlines[s]; - } - if (d->trans[s]) t = d->trans[s]; else if (d->fails[s]) @@ -3020,15 +3020,12 @@ static state_num transit_state (struct dfa *d, state_num s, unsigned char const **pp, unsigned char const *end) { - state_num s1; + state_num s1, s2; wint_t wc; int separate_contexts; - state_num state, state_newline; size_t i, j; int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d); - int context = wc == eolbyte ? CTX_NEWLINE : CTX_NONE; - bool context_newline = context == CTX_NEWLINE; /* This state has some operators which can match a multibyte character. */ d->mb_follows.nelem = 0; @@ -3051,7 +3048,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, for (i = 0; i < d->states[s1].mbps.nelem; ++i) { if (!SUCCEEDS_IN_CONTEXT (d->states[s1].mbps.elems[i].constraint, - d->states[s1].context, context)) + d->states[s1].context, CTX_NONE)) continue; for (j = 0; j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++) @@ -3060,10 +3057,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, } separate_contexts = state_separate_contexts (&d->mb_follows); - if (context_newline && separate_contexts & CTX_NEWLINE) - s = state_index (d, &d->mb_follows, CTX_NEWLINE); - else - s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY); + s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY); realloc_trans_if_necessary (d, s); return s; @@ -3091,17 +3085,14 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, d->states[s1].mb_trindex = d->mb_trcount++; } - size_t mb_index = d->states[s1].mb_trindex << 1 | (context_newline - ? 1 : 0); - if (d->mb_trans[s] == NULL) { - d->mb_trans[s] = xnmalloc (2 * MAX_TRCOUNT, sizeof *d->mb_trans[s]); - for (i = 0; i < 2 * MAX_TRCOUNT; i++) + d->mb_trans[s] = xnmalloc (MAX_TRCOUNT, sizeof *d->mb_trans[s]); + for (i = 0; i < MAX_TRCOUNT; i++) d->mb_trans[s][i] = -1; } - else if (d->mb_trans[s][mb_index] >= 0) - return d->mb_trans[s][mb_index]; + else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0) + return d->mb_trans[s][d->states[s1].mb_trindex]; if (s < 0) copy (&d->states[s1].mbps, &d->mb_follows); @@ -3109,17 +3100,12 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows); separate_contexts = state_separate_contexts (&d->mb_follows); - state = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY); - if (separate_contexts & CTX_NEWLINE) - state_newline = state_index (d, &d->mb_follows, CTX_NEWLINE); - else - state_newline = state; - realloc_trans_if_necessary (d, state_newline); + s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY); + realloc_trans_if_necessary (d, s2); - d->mb_trans[s][mb_index & ~0] = state; - d->mb_trans[s][mb_index | 1] = state_newline; + d->mb_trans[s][d->states[s1].mb_trindex] = s2; - return context_newline ? state_newline : state; + return s2; } /* The initial state may encounter a byte which is not a single byte character @@ -3246,10 +3232,8 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, } } - if (d->states[s].mbps.nelem == 0 || (*p == eol && !allow_nl) - || (*p == '\n' && !(syntax_bits & RE_DOT_NEWLINE)) - || (*p == '\0' && (syntax_bits & RE_DOT_NOT_NULL)) - || (char *) p >= end) + if (d->states[s].mbps.nelem == 0 + || mbrtowc_cache[*p] != WEOF || (char *) p >= end) { /* If an input character does not match ANYCHAR, do it like a single-byte character. */ @@ -3258,8 +3242,6 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, else { s = transit_state (d, s, &p, (unsigned char *) end); - if (s >= 0 && p[-1] == eol) - nlcount++; mbp = p; trans = d->trans; } @@ -3311,10 +3293,7 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, s1 = s; if (!multibyte || d->states[s].mbps.nelem == 0 - || (*p == eol && !allow_nl) - || (*p == '\n' && !(syntax_bits & RE_DOT_NEWLINE)) - || (*p == '\0' && (syntax_bits & RE_DOT_NOT_NULL)) - || (char *) p >= end) + || mbrtowc_cache[*p] != WEOF || (char *) p >= end) { /* If a input character does not match ANYCHAR, do it like a single-byte character. */ @@ -3323,8 +3302,6 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, else { s = transit_state (d, s, &p, (unsigned char *) end); - if (s >= 0 && p[-1] == eol) - nlcount++; mbp = p; trans = d->trans; } -- 1.7.1