On Sun, 10 Jul 2016 18:51:43 +0900 Norihiro Tanaka <nori...@kcn.ne.jp> wrote:
> In multibyte locales, if a pattern start with period expression, > matching is still slow, as transition table is built at run time, > even when next character is single byte in input text. > > This patch changes it into as use algorithm for single byte character to > any single byte character in input text always. If transition table has > been built already and a next character in input text is single byte, > transit to next state by reference of only pre-built transition table, > even if from a state including ANYCHAR. > > $ yes "$(printf 'a%038db\n' 0)" | head -1000000 >in > $ env LC_ALL=C gcc -v > Reading specs from /usr/local/lib/gcc/x86_64-pc-linux-gnu/4.4.7/specs > Target: x86_64-pc-linux-gnu > Configured with: ./configure --with-as=/usr/local/bin/as > --with-ld=/usr/local/bin/ld --with-system-zlib --enable-__cxa_atexit > Thread model: posix > gcc version 4.4.7 (GCC) > > patch#21486 is required before this patch. grep will speed up by this > patch additionaly. I updated the patch due to change in bug#21486, and added a patch including a minor change.
From ff7d171868b98b7203137579958969a2e7771edd Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka <nori...@kcn.ne.jp> Date: Tue, 16 Aug 2016 18:50:03 +0900 Subject: [PATCH 1/2] dfa: use algorithm for single byte character to any single byte character in input text always Even in non-UTF8 locales, if a character at current position in input text is single byte, we can use CSET to match ANYCHAR. * src/dfa.c (charclass_index_anychar): New var. Cache index of CSET for ANYCHAR. (lex): Make CSET for ANYCHAR. (state_index): Simplify. (dfastate): Consider CSET for ANYCHAR. (transit_state_singlebyte, transit_state): Remove handling for eolbyte, as we assume that eolbyte does not appear at current position. (dfaexec_main): use algorithm for single byte character to any single byte character in input text always. --- src/dfa.c | 126 ++++++++++++++++++++++++------------------------------------- 1 files changed, 50 insertions(+), 76 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index b64a176..71a8ab7 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -79,6 +79,8 @@ enum CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS }; +static size_t charclass_index_anychar = -1; + /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */ typedef charclass_word charclass[CHARCLASS_WORDS]; @@ -1430,21 +1432,25 @@ lex (void) case '.': if (backslash) goto normal_char; - if (dfa->multibyte) + if (charclass_index_anychar == (size_t) -1) { - /* In multibyte environment period must match with a single - character not a byte. So we use ANYCHAR. */ - laststart = false; - return lasttok = ANYCHAR; + zeroset (ccl); + notset (ccl); + if (!(syntax_bits & RE_DOT_NEWLINE)) + clrbit ('\n', ccl); + if (syntax_bits & RE_DOT_NOT_NULL) + clrbit ('\0', ccl); + if (dfa->multibyte) + { + for (c2 = 0; c2 < NOTCHAR; ++c2) + if (mbrtowc_cache[c2] == WEOF) + clrbit (c2, ccl); + } + charclass_index_anychar = charclass_index (ccl); } - zeroset (ccl); - notset (ccl); - if (!(syntax_bits & RE_DOT_NEWLINE)) - clrbit ('\n', ccl); - if (syntax_bits & RE_DOT_NOT_NULL) - clrbit ('\0', ccl); laststart = false; - return lasttok = CSET + charclass_index (ccl); + return lasttok = + dfa->multibyte ? ANYCHAR : CSET + charclass_index_anychar; case 's': case 'S': @@ -2113,7 +2119,7 @@ state_index (struct dfa *d, position_set const *s, int context) } else if (d->tokens[s->elems[j].index] == BACKREF) constraint = NO_CONSTRAINT; - if (d->multibyte && d->tokens[s->elems[j].index] == ANYCHAR) + if (d->tokens[s->elems[j].index] == ANYCHAR) { int acceptable = ((SUCCEEDS_IN_CONTEXT (c, context, CTX_NEWLINE) @@ -2590,13 +2596,15 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) setbit (d->tokens[pos.index], matches); else if (d->tokens[pos.index] >= CSET) copyset (d->charclasses[d->tokens[pos.index] - CSET], matches); - else if (d->multibyte && d->tokens[pos.index] == ANYCHAR) + else if (d->tokens[pos.index] == ANYCHAR) { - /* ANYCHAR must match a single character, so put it to - D->states[s].mbps which contains the positions which can - match with a single character not a byte. If all - positions with ANYCHAR do not depend on the context of - the next character, put its follows instead to + copyset (d->charclasses[charclass_index_anychar], matches); + + /* ANYCHAR must match with a single character, so we must put + it to D->states[s].mbps which contains the positions which + can match with a single character not a byte. If all + positions which has ANYCHAR does not depend on context of + next character, we put the follows instead of it to D->states[s].mbps to optimize. */ if (d->states[s].curr_dependent) { @@ -2608,9 +2616,11 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) d->states[s].context, CTX_ANY)) { if (d->states[s].mbps.nelem == 0) - alloc_position_set (&d->states[s].mbps, 1); - for (j = 0; j < d->follows[pos.index].nelem; j++) - insert (d->follows[pos.index].elems[j], &d->states[s].mbps); + alloc_position_set (&d->states[s].mbps, + d->follows[pos.index].nelem); + for (j = 0; j < d->follows[pos.index].nelem; ++j) + insert (d->follows[pos.index].elems[j], + &(d->states[s].mbps)); } } else @@ -2986,16 +2996,6 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp) { state_num *t; - if (**pp == eolbyte) - { - /* S is always an initial state in transit_state, so the - transition table for the state must have been built already. */ - assert (d->trans[s] || d->fails[s]); - - ++*pp; - return d->newlines[s]; - } - if (d->trans[s]) t = d->trans[s]; else if (d->fails[s]) @@ -3022,15 +3022,12 @@ static state_num transit_state (struct dfa *d, state_num s, unsigned char const **pp, unsigned char const *end) { - state_num s1; + state_num s1, s2; wint_t wc; int separate_contexts; - state_num state, state_newline, mb_index; size_t i, j; int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d); - int context = wc == eolbyte ? CTX_NEWLINE : CTX_NONE; - bool context_newline = context == CTX_NEWLINE; /* This state has some operators which can match a multibyte character. */ d->mb_follows.nelem = 0; @@ -3052,7 +3049,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, for (i = 0; i < d->states[s1].mbps.nelem; i++) { if (!SUCCEEDS_IN_CONTEXT (d->states[s1].mbps.elems[i].constraint, - d->states[s1].context, context)) + d->states[s1].context, CTX_NONE)) continue; for (j = 0; j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++) @@ -3061,10 +3058,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, } separate_contexts = state_separate_contexts (&d->mb_follows); - if (context_newline && separate_contexts & CTX_NEWLINE) - s = state_index (d, &d->mb_follows, CTX_NEWLINE); - else - s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY); + s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY); realloc_trans_if_necessary (d, s); return s; @@ -3077,11 +3071,11 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, { if (MAX_TRCOUNT <= d->mb_trcount) { - state_num s2; - for (s2 = -1; s2 < d->tralloc; s2++) + state_num s3; + for (s3 = -1; s3 < d->tralloc; s3++) { - free (d->mb_trans[s2]); - d->mb_trans[s2] = NULL; + free (d->mb_trans[s3]); + d->mb_trans[s3] = NULL; } for (i = 0; i < d->sindex; i++) @@ -3091,22 +3085,16 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, d->states[s1].mb_trindex = d->mb_trcount++; } - mb_index = d->states[s1].mb_trindex * 2; - if (! d->mb_trans[s]) { enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] }; - enum { TRANSALLOC_SIZE = 2 * MAX_TRCOUNT * TRANSPTR_SIZE }; + enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE }; d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE); - for (i = 0; i < 2 * MAX_TRCOUNT; i++) + for (i = 0; i < MAX_TRCOUNT; i++) d->mb_trans[s][i] = -1; } - else - { - state = d->mb_trans[s][mb_index + context_newline]; - if (0 <= state) - return state; - } + else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0) + return d->mb_trans[s][d->states[s1].mb_trindex]; if (s < 0) copy (&d->states[s1].mbps, &d->mb_follows); @@ -3114,17 +3102,12 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows); separate_contexts = state_separate_contexts (&d->mb_follows); - state = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY); - if (separate_contexts & CTX_NEWLINE) - state_newline = state_index (d, &d->mb_follows, CTX_NEWLINE); - else - state_newline = state; - realloc_trans_if_necessary (d, state_newline); + s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY); + realloc_trans_if_necessary (d, s2); - d->mb_trans[s][mb_index] = state; - d->mb_trans[s][mb_index + 1] = state_newline; + d->mb_trans[s][d->states[s1].mb_trindex] = s2; - return context_newline ? state_newline : state; + return s2; } /* The initial state may encounter a byte which is not a single byte character @@ -3251,10 +3234,8 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, } } - if (d->states[s].mbps.nelem == 0 || (*p == eol && !allow_nl) - || (*p == '\n' && !(syntax_bits & RE_DOT_NEWLINE)) - || (*p == '\0' && (syntax_bits & RE_DOT_NOT_NULL)) - || (char *) p >= end) + if (d->states[s].mbps.nelem == 0 + || mbrtowc_cache[*p] != WEOF || (char *) p >= end) { /* If an input character does not match ANYCHAR, do it like a single-byte character. */ @@ -3263,8 +3244,6 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, else { s = transit_state (d, s, &p, (unsigned char *) end); - if (s >= 0 && p[-1] == eol) - nlcount++; mbp = p; trans = d->trans; } @@ -3321,10 +3300,7 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, s1 = s; if (!multibyte || d->states[s].mbps.nelem == 0 - || (*p == eol && !allow_nl) - || (*p == '\n' && !(syntax_bits & RE_DOT_NEWLINE)) - || (*p == '\0' && (syntax_bits & RE_DOT_NOT_NULL)) - || (char *) p >= end) + || mbrtowc_cache[*p] != WEOF || (char *) p >= end) { /* If a input character does not match ANYCHAR, do it like a single-byte character. */ @@ -3333,8 +3309,6 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, else { s = transit_state (d, s, &p, (unsigned char *) end); - if (s >= 0 && p[-1] == eol) - nlcount++; mbp = p; trans = d->trans; } -- 1.7.1
From e9286b7e92f094b7c0342c6a38c67d8d30c2a0de Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka <nori...@kcn.ne.jp> Date: Tue, 16 Aug 2016 23:23:19 +0900 Subject: [PATCH 2/2] dfa: avoid invalid character matches period * dfa.c (transit_state): Avoid invalid character matches period. --- src/dfa.c | 6 ++++++ 1 files changed, 6 insertions(+), 0 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 71a8ab7..f79d67e 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -3039,6 +3039,12 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, s = transit_state_singlebyte (d, s, pp); *pp += mbclen - i; + if (wc == WEOF) + { + /* It is invalid character. Then ANYCHAR is not accepted. */ + return s; + } + if (d->states[s1].curr_dependent) { if (s < 0) -- 1.7.1