Thanks for that set of patches too. I rebased it and tweaked NEWS and installed
the resulting patch set (attached) into Savannah master.
>From a95ba0203afcc43feea0fa99e4d1b8fb923b1aeb Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Thu, 1 Sep 2016 11:45:18 -0700
Subject: [PATCH 1/4] dfa: use single-byte algorithm even in non-UTF-8
Even in non-UTF8 locales, if the current input character
is single byte, we can use CSET to match ANYCHAR.
* src/dfa.c (struct dfa): New member canychar.
Cache index of CSET for ANYCHAR.
(lex): Make CSET for ANYCHAR.
(state_index): Simplify.
(dfastate): Consider CSET for ANYCHAR.
(transit_state_singlebyte, transit_state): Remove handling for eolbyte,
as we assume that eolbyte does not appear at current position.
(dfaexec_main): Use algorithm for single byte character to any single
byte character in input text always.
(dfasyntax): Initialize canychar.
---
src/dfa.c | 115 ++++++++++++++++++++++++--------------------------------------
1 file changed, 45 insertions(+), 70 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index 0f5109e..e39d82a 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -401,6 +401,7 @@ struct dfa
charclass *charclasses; /* Array of character sets for CSET tokens. */
size_t cindex; /* Index for adding new charclasses. */
size_t calloc; /* Number of charclasses allocated. */
+ size_t canychar; /* Index of anychar class, or (size_t) -1. */
/* Scanner state */
struct lexer_state lex;
@@ -1398,21 +1399,24 @@ lex (struct dfa *dfa)
case '.':
if (backslash)
goto normal_char;
- if (dfa->localeinfo.multibyte)
+ if (dfa->canychar == (size_t) -1)
{
- /* In multibyte environment period must match with a single
- character not a byte. So we use ANYCHAR. */
- dfa->lex.laststart = false;
- return dfa->lex.lasttok = ANYCHAR;
+ zeroset (ccl);
+ notset (ccl);
+ if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
+ clrbit ('\n', ccl);
+ if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
+ clrbit ('\0', ccl);
+ if (dfa->localeinfo.multibyte)
+ for (c2 = 0; c2 < NOTCHAR; c2++)
+ if (dfa->localeinfo.sbctowc[c2] == WEOF)
+ clrbit (c2, ccl);
+ dfa->canychar = charclass_index (dfa, ccl);
}
- zeroset (ccl);
- notset (ccl);
- if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
- clrbit ('\n', ccl);
- if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
- clrbit ('\0', ccl);
dfa->lex.laststart = false;
- return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
+ return dfa->lex.lasttok = (dfa->localeinfo.multibyte
+ ? ANYCHAR
+ : CSET + dfa->canychar);
case 's':
case 'S':
@@ -2077,7 +2081,7 @@ state_index (struct dfa *d, position_set const *s, int context)
}
else if (d->tokens[s->elems[j].index] == BACKREF)
constraint = NO_CONSTRAINT;
- if (d->localeinfo.multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
+ if (d->tokens[s->elems[j].index] == ANYCHAR)
{
int acceptable
= ((SUCCEEDS_IN_CONTEXT (c, context, CTX_NEWLINE)
@@ -2554,13 +2558,15 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
setbit (d->tokens[pos.index], matches);
else if (d->tokens[pos.index] >= CSET)
copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
- else if (d->localeinfo.multibyte && d->tokens[pos.index] == ANYCHAR)
+ else if (d->tokens[pos.index] == ANYCHAR)
{
- /* ANYCHAR must match a single character, so put it to
- D->states[s].mbps which contains the positions which can
- match with a single character not a byte. If all
- positions with ANYCHAR do not depend on the context of
- the next character, put its follows instead to
+ copyset (d->charclasses[d->canychar], matches);
+
+ /* ANYCHAR must match with a single character, so we must put
+ it to D->states[s].mbps which contains the positions which
+ can match with a single character not a byte. If all
+ positions which has ANYCHAR does not depend on context of
+ next character, we put the follows instead of it to
D->states[s].mbps to optimize. */
if (d->states[s].curr_dependent)
{
@@ -2572,7 +2578,8 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
d->states[s].context, CTX_ANY))
{
if (d->states[s].mbps.nelem == 0)
- alloc_position_set (&d->states[s].mbps, 1);
+ alloc_position_set (&d->states[s].mbps,
+ d->follows[pos.index].nelem);
for (j = 0; j < d->follows[pos.index].nelem; j++)
insert (d->follows[pos.index].elems[j], &d->states[s].mbps);
}
@@ -2950,16 +2957,6 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
{
state_num *t;
- if (**pp == d->syntax.eolbyte)
- {
- /* S is always an initial state in transit_state, so the
- transition table for the state must have been built already. */
- assert (d->trans[s] || d->fails[s]);
-
- ++*pp;
- return d->newlines[s];
- }
-
if (d->trans[s])
t = d->trans[s];
else if (d->fails[s])
@@ -2986,15 +2983,12 @@ static state_num
transit_state (struct dfa *d, state_num s, unsigned char const **pp,
unsigned char const *end)
{
- state_num s1;
+ state_num s1, s2;
wint_t wc;
int separate_contexts;
- state_num state, state_newline, mb_index;
size_t i, j;
int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
- int context = wc == d->syntax.eolbyte ? CTX_NEWLINE : CTX_NONE;
- bool context_newline = context == CTX_NEWLINE;
/* This state has some operators which can match a multibyte character. */
d->mb_follows.nelem = 0;
@@ -3016,7 +3010,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
for (i = 0; i < d->states[s1].mbps.nelem; i++)
{
if (!SUCCEEDS_IN_CONTEXT (d->states[s1].mbps.elems[i].constraint,
- d->states[s1].context, context))
+ d->states[s1].context, CTX_NONE))
continue;
for (j = 0; j < d->follows[d->states[s1].mbps.elems[i].index].nelem;
j++)
@@ -3025,10 +3019,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
}
separate_contexts = state_separate_contexts (&d->mb_follows);
- if (context_newline && separate_contexts & CTX_NEWLINE)
- s = state_index (d, &d->mb_follows, CTX_NEWLINE);
- else
- s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
+ s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
realloc_trans_if_necessary (d, s);
return s;
@@ -3041,11 +3032,11 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
{
if (MAX_TRCOUNT <= d->mb_trcount)
{
- state_num s2;
- for (s2 = -1; s2 < d->tralloc; s2++)
+ state_num s3;
+ for (s3 = -1; s3 < d->tralloc; s3++)
{
- free (d->mb_trans[s2]);
- d->mb_trans[s2] = NULL;
+ free (d->mb_trans[s3]);
+ d->mb_trans[s3] = NULL;
}
for (i = 0; i < d->sindex; i++)
@@ -3055,22 +3046,16 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
d->states[s1].mb_trindex = d->mb_trcount++;
}
- mb_index = d->states[s1].mb_trindex * 2;
-
if (! d->mb_trans[s])
{
enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
- enum { TRANSALLOC_SIZE = 2 * MAX_TRCOUNT * TRANSPTR_SIZE };
+ enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
- for (i = 0; i < 2 * MAX_TRCOUNT; i++)
+ for (i = 0; i < MAX_TRCOUNT; i++)
d->mb_trans[s][i] = -1;
}
- else
- {
- state = d->mb_trans[s][mb_index + context_newline];
- if (0 <= state)
- return state;
- }
+ else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
+ return d->mb_trans[s][d->states[s1].mb_trindex];
if (s < 0)
copy (&d->states[s1].mbps, &d->mb_follows);
@@ -3078,17 +3063,12 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
separate_contexts = state_separate_contexts (&d->mb_follows);
- state = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
- if (separate_contexts & CTX_NEWLINE)
- state_newline = state_index (d, &d->mb_follows, CTX_NEWLINE);
- else
- state_newline = state;
- realloc_trans_if_necessary (d, state_newline);
+ s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
+ realloc_trans_if_necessary (d, s2);
- d->mb_trans[s][mb_index] = state;
- d->mb_trans[s][mb_index + 1] = state_newline;
+ d->mb_trans[s][d->states[s1].mb_trindex] = s2;
- return context_newline ? state_newline : state;
+ return s2;
}
/* The initial state may encounter a byte which is not a single byte character
@@ -3215,10 +3195,8 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
}
}
- if (d->states[s].mbps.nelem == 0 || (*p == eol && !allow_nl)
- || (*p == '\n' && !(d->syntax.syntax_bits & RE_DOT_NEWLINE))
- || (*p == '\0' && (d->syntax.syntax_bits & RE_DOT_NOT_NULL))
- || (char *) p >= end)
+ if (d->states[s].mbps.nelem == 0
+ || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
{
/* If an input character does not match ANYCHAR, do it
like a single-byte character. */
@@ -3227,8 +3205,6 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
else
{
s = transit_state (d, s, &p, (unsigned char *) end);
- if (s >= 0 && p[-1] == eol)
- nlcount++;
mbp = p;
trans = d->trans;
}
@@ -3297,8 +3273,6 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
else
{
s = transit_state (d, s, &p, (unsigned char *) end);
- if (s >= 0 && p[-1] == eol)
- nlcount++;
mbp = p;
trans = d->trans;
}
@@ -4106,6 +4080,7 @@ dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
dfa->fast = !dfa->localeinfo.multibyte;
+ dfa->canychar = -1;
dfa->lex.cur_mb_len = 1;
dfa->syntax.syntax_bits_set = true;
dfa->syntax.syntax_bits = bits;
--
2.7.4
>From 5644d1c3884286e2a8a6f51b5d0ef00a8fbe4578 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Thu, 1 Sep 2016 11:45:18 -0700
Subject: [PATCH 2/4] dfa: avoid invalid character matching period
* dfa.c (transit_state): Avoid invalid character matching period.
---
src/dfa.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/src/dfa.c b/src/dfa.c
index e39d82a..075576c 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3000,6 +3000,12 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
s = transit_state_singlebyte (d, s, pp);
*pp += mbclen - i;
+ if (wc == WEOF)
+ {
+ /* It is an invalid character, so ANYCHAR is not accepted. */
+ return s;
+ }
+
if (d->states[s1].curr_dependent)
{
if (s < 0)
--
2.7.4
>From 83db3ea1a09cbff41aa2a9c7d1bd6dc01612c862 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Thu, 1 Sep 2016 11:45:18 -0700
Subject: [PATCH 3/4] dfa: document previous change
* NEWS: Adjust to match previous change.
---
NEWS | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/NEWS b/NEWS
index 91f1bfc..56c735f 100644
--- a/NEWS
+++ b/NEWS
@@ -13,8 +13,8 @@ GNU grep NEWS -*- outline -*-
grep -iF is typically much faster in a multibyte locale, if the
pattern and its case counterparts contain only single byte characters.
- In multibyte locales that are not UTF-8, grep now handles leading
- "." in patterns more efficiently.
+ In multibyte locales, grep now handles leading "." in patterns more
+ efficiently.
grep now prints a "FILENAME:LINENO: " prefix when diagnosing an
invalid regular expression that was read from an '-f'-specified file.
--
2.7.4
>From 980bcbff1046559d6a5299055f22ba6b8703ed8a Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Thu, 1 Sep 2016 11:45:18 -0700
Subject: [PATCH 4/4] dfa: remove separation by context in transition in
non-UTF8 multibyte locales
* src/dfa.c (struct dfa): Remove member curr_dependent. All uses
removed.
---
src/dfa.c | 53 +++--------------------------------------------------
1 file changed, 3 insertions(+), 50 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index 075576c..00562ea 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -303,9 +303,6 @@ typedef struct
size_t hash; /* Hash of the positions of this state. */
position_set elems; /* Positions this state could match. */
unsigned char context; /* Context from previous state. */
- bool curr_dependent; /* True if the follows of any positions with
- ANYCHAR depends on the next character's
- context. */
unsigned short constraint; /* Constraint for this state to accept. */
token first_end; /* Token value of the first END in elems. */
position_set mbps; /* Positions which can match multibyte
@@ -2027,7 +2024,6 @@ state_index (struct dfa *d, position_set const *s, int context)
size_t hash = 0;
int constraint = 0;
state_num i, j;
- bool curr_dependent = false;
token first_end = 0;
for (i = 0; i < s->nelem; ++i)
@@ -2081,17 +2077,6 @@ state_index (struct dfa *d, position_set const *s, int context)
}
else if (d->tokens[s->elems[j].index] == BACKREF)
constraint = NO_CONSTRAINT;
- if (d->tokens[s->elems[j].index] == ANYCHAR)
- {
- int acceptable
- = ((SUCCEEDS_IN_CONTEXT (c, context, CTX_NEWLINE)
- ? CTX_NEWLINE : 0)
- | (SUCCEEDS_IN_CONTEXT (c, context, CTX_LETTER)
- ? CTX_LETTER : 0)
- | (SUCCEEDS_IN_CONTEXT (c, context, CTX_NONE)
- ? CTX_NONE : 0));
- curr_dependent |= acceptable && (context & ~acceptable);
- }
}
@@ -2102,7 +2087,6 @@ state_index (struct dfa *d, position_set const *s, int context)
alloc_position_set (&d->states[i].elems, s->nelem);
copy (s, &d->states[i].elems);
d->states[i].context = context;
- d->states[i].curr_dependent = curr_dependent;
d->states[i].constraint = constraint;
d->states[i].first_end = first_end;
d->states[i].mbps.nelem = 0;
@@ -2568,14 +2552,8 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
positions which has ANYCHAR does not depend on context of
next character, we put the follows instead of it to
D->states[s].mbps to optimize. */
- if (d->states[s].curr_dependent)
- {
- if (d->states[s].mbps.nelem == 0)
- alloc_position_set (&d->states[s].mbps, 1);
- insert (pos, &d->states[s].mbps);
- }
- else if (SUCCEEDS_IN_CONTEXT (pos.constraint,
- d->states[s].context, CTX_ANY))
+ if (SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context,
+ CTX_NONE))
{
if (d->states[s].mbps.nelem == 0)
alloc_position_set (&d->states[s].mbps,
@@ -2986,7 +2964,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
state_num s1, s2;
wint_t wc;
int separate_contexts;
- size_t i, j;
+ size_t i;
int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
@@ -3006,31 +2984,6 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
return s;
}
- if (d->states[s1].curr_dependent)
- {
- if (s < 0)
- d->mb_follows.nelem = 0;
- else
- copy (&d->states[s].elems, &d->mb_follows);
-
- for (i = 0; i < d->states[s1].mbps.nelem; i++)
- {
- if (!SUCCEEDS_IN_CONTEXT (d->states[s1].mbps.elems[i].constraint,
- d->states[s1].context, CTX_NONE))
- continue;
- for (j = 0; j < d->follows[d->states[s1].mbps.elems[i].index].nelem;
- j++)
- insert (d->follows[d->states[s1].mbps.elems[i].index].elems[j],
- &d->mb_follows);
- }
-
- separate_contexts = state_separate_contexts (&d->mb_follows);
- s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
- realloc_trans_if_necessary (d, s);
-
- return s;
- }
-
/* If all positions which have ANYCHAR do not depend on the context
of the next character, calculate the next state with
pre-calculated follows and cache the result. */
--
2.7.4