Thanks for writing that patch. I installed it in grep master (after tweaking the
commit message a bit) and am marking this bug report as done.
I noticed what appears to be a problem in the patch, in the code:
d->mb_trans[s][mb_index & ~0] = state;
I expect the "0" was intended to be a "1". I attempted to fix this by installing
the attached patch 1, using + rather than & and |, as this was easier for me to
follow and is likely a tiny bit faster anyway.
I also installed the attached patch 2, which ports the resulting dfa.c to C90;
as I understand it Gawk still needs this.
I also installed the attached patch 3, which does some minor refactoring and
cleanup and commentary fixes. As you can see, I am a fan of Leibniz-style
comparison (preferring < to > and <= to >=).
Thanks again for the patch.
From 4bdec4bc3fcc19e0fb199845713b7df01e97d2c0 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Tue, 16 Aug 2016 00:37:27 -0700
Subject: [PATCH 1/3] dfa: fix context newline confusion
* src/dfa.c (transit_state): Fix "... & ~0" that was evidently
intended to be "... & ~1". Do index calculation in a simpler way,
that uses just addition (Bug#21486).
---
src/dfa.c | 15 +++++++++------
1 file changed, 9 insertions(+), 6 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index 65d0975..8dd2536 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3091,8 +3091,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
d->states[s1].mb_trindex = d->mb_trcount++;
}
- size_t mb_index = d->states[s1].mb_trindex << 1 | (context_newline
- ? 1 : 0);
+ size_t mb_index = d->states[s1].mb_trindex * 2;
if (d->mb_trans[s] == NULL)
{
@@ -3100,8 +3099,12 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
for (i = 0; i < 2 * MAX_TRCOUNT; i++)
d->mb_trans[s][i] = -1;
}
- else if (d->mb_trans[s][mb_index] >= 0)
- return d->mb_trans[s][mb_index];
+ else
+ {
+ state = d->mb_trans[s][mb_index + context_newline];
+ if (0 <= state)
+ return state;
+ }
if (s < 0)
copy (&d->states[s1].mbps, &d->mb_follows);
@@ -3116,8 +3119,8 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
state_newline = state;
realloc_trans_if_necessary (d, state_newline);
- d->mb_trans[s][mb_index & ~0] = state;
- d->mb_trans[s][mb_index | 1] = state_newline;
+ d->mb_trans[s][mb_index] = state;
+ d->mb_trans[s][mb_index + 1] = state_newline;
return context_newline ? state_newline : state;
}
--
2.5.5
From fe4b6367026585eaa5011c594e388c4856e6d8f0 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Tue, 16 Aug 2016 00:37:27 -0700
Subject: [PATCH 2/3] dfa: port to C90
* src/dfa.c (transit_state, dfa_supported, dfamust):
Don't use declarations after statements.
If I recall correctly, gawk still wants to port to C90.
---
src/dfa.c | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index 8dd2536..a2ddc9c 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3023,7 +3023,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
state_num s1;
wint_t wc;
int separate_contexts;
- state_num state, state_newline;
+ state_num state, state_newline, mb_index;
size_t i, j;
int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
@@ -3091,7 +3091,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
d->states[s1].mb_trindex = d->mb_trcount++;
}
- size_t mb_index = d->states[s1].mb_trindex * 2;
+ mb_index = d->states[s1].mb_trindex * 2;
if (d->mb_trans[s] == NULL)
{
@@ -3436,7 +3436,8 @@ dfainit (struct dfa *d)
static bool _GL_ATTRIBUTE_PURE
dfa_supported (struct dfa const *d)
{
- for (size_t i = 0; i < d->tindex; i++)
+ size_t i;
+ for (i = 0; i < d->tindex; i++)
{
switch (d->tokens[i])
{
@@ -3891,7 +3892,7 @@ dfamust (struct dfa const *d)
{
must *mp = NULL;
char const *result = "";
- size_t i;
+ size_t i, ri;
bool exact = false;
bool begline = false;
bool endline = false;
@@ -3899,7 +3900,7 @@ dfamust (struct dfa const *d)
bool need_endline = false;
bool case_fold_unibyte = case_fold && MB_CUR_MAX == 1;
- for (size_t ri = 0; ri < d->tindex; ++ri)
+ for (ri = 0; ri < d->tindex; ++ri)
{
token t = d->tokens[ri];
switch (t)
--
2.5.5
From 60975f9b0fe8c788f248dffec055ccee0173662f Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Tue, 16 Aug 2016 00:37:27 -0700
Subject: [PATCH 3/3] dfa: minor refactoring and doc fixes
* NEWS: Improve description of recent change.
* src/dfa.c: Improve commentary. Indent new code (and some
long-existing howlers) more in GNU style.
(dfa_state): Reorder members to make struct smaller on x86.
mb_trindex member is now state_num, not size_t, so that -1 is more
natural; all uses changed.
(struct dfa): Similarly for mb_trcount member.
(state_index): Compute values for new state components before
allocating the state, to make the code easier to understand.
(state_index, dfastate): Prefer A & ~B to other forms like (A & B)
!= A.
(dfastate, build_state, transit_state): In new code, prefer i++ to
++i in for-loop control.
(build_state, transit_state): In new code, prefer < to >.
(transit_state): Add to *PP in one assignment, rather than in a
loop. Prefer !x to x == NULL. Use xmalloc instead of xnmalloc,
since the size is a constant. Do the size calculation as a signed
integer constant expression, so that the compiler diagnoses any
overflow.
(transit_state, free_mbdata): Tune by looping from -1 to N - 1,
rather than from 0 to N - 1 with a separate instance for -1.
(dfaexec_main): Rewrite to avoid side effects in if-part.
(free_mbdata): Simplify.
---
NEWS | 6 +-
src/dfa.c | 192 ++++++++++++++++++++++++++++++++------------------------------
2 files changed, 101 insertions(+), 97 deletions(-)
diff --git a/NEWS b/NEWS
index 413c29a..21db87a 100644
--- a/NEWS
+++ b/NEWS
@@ -4,15 +4,15 @@ GNU grep NEWS -*- outline -*-
** Improvements
- grep now makes cache transition from a state with dot expression to
- speed up matches in non-UTF8 multibyte locales.
-
grep can be much faster now when standard output is /dev/null.
grep -F is now typically much faster when many patterns are given,
as it now uses the Aho-Corasick algorithm instead of the
Commentz-Walter algorithm in that case.
+ In multibyte locales that are not UTF-8, grep now handles leading
+ "." in patterns more efficiently.
+
grep now prints a "FILENAME:LINENO: " prefix when diagnosing an
invalid regular expression that was read from an '-f'-specified file.
diff --git a/src/dfa.c b/src/dfa.c
index a2ddc9c..b64a176 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -111,7 +111,7 @@ to_uchar (char ch)
/* Sometimes characters can only be matched depending on the surrounding
context. Such context decisions depend on what the previous character
was, and the value of the current (lookahead) character. Context
- dependent constraints are encoded as 8 bit integers. Each bit that
+ dependent constraints are encoded as 12-bit integers. Each bit that
is set indicates that the constraint succeeds in the corresponding
context.
@@ -130,7 +130,8 @@ to_uchar (char ch)
#define SUCCEEDS_IN_CONTEXT(constraint, prev, curr) \
((((curr) & CTX_NONE ? OTHER_CONSTRAINT (constraint) : 0) \
| ((curr) & CTX_LETTER ? LETTER_CONSTRAINT (constraint) : 0) \
- | ((curr) & CTX_NEWLINE ? NEWLINE_CONSTRAINT (constraint) : 0)) & (prev))
+ | ((curr) & CTX_NEWLINE ? NEWLINE_CONSTRAINT (constraint) : 0)) \
+ & (prev))
/* The following macros describe what a constraint depends on. */
#define PREV_NEWLINE_CONSTRAINT(constraint) (((constraint) >> 2) & 0x111)
@@ -160,6 +161,10 @@ to_uchar (char ch)
typedef ptrdiff_t token;
+/* States are indexed by state_num values. These are normally
+ nonnegative but -1 is used as a special value. */
+typedef ptrdiff_t state_num;
+
/* Predefined token values. */
enum
{
@@ -282,24 +287,20 @@ typedef struct
size_t hash; /* Hash of the positions of this state. */
position_set elems; /* Positions this state could match. */
unsigned char context; /* Context from previous state. */
+ bool curr_dependent; /* True if the follows of any positions with
+ ANYCHAR depends on the next character's
+ context. */
unsigned short constraint; /* Constraint for this state to accept. */
- bool curr_dependent; /* True if follows of any positions which
- has ANYCHAR depends on context of next
- character. */
token first_end; /* Token value of the first END in elems. */
position_set mbps; /* Positions which can match multibyte
- characters, or the follows, e.g., period.
+ characters or the follows, e.g., period.
Used only if MB_CUR_MAX > 1. */
- size_t mb_trindex; /* Index of this state in MB_TRANS. If
- the state does not have ANYCHAR, this
- value is -1. */
+ state_num mb_trindex; /* Index of this state in MB_TRANS, or
+ negative if the state does not have
+ ANYCHAR. */
} dfa_state;
-/* States are indexed by state_num values. These are normally
- nonnegative but -1 is used as a special value. */
-typedef ptrdiff_t state_num;
-
-/* Maximum of number of transition tables. */
+/* Maximum for any transition table count that exceeds min_trcount. */
enum { MAX_TRCOUNT = 1024 };
/* A bracket operator.
@@ -422,7 +423,7 @@ struct dfa
as not supported word. */
position_set mb_follows; /* Follow set added by ANYCHAR on demand. */
state_num **mb_trans; /* Transition tables for states with ANYCHAR. */
- size_t mb_trcount; /* Number of transition tables for states with
+ state_num mb_trcount; /* Number of transition tables for states with
ANYCHAR that have actually been built. */
};
@@ -991,7 +992,7 @@ parse_bracket_exp (void)
decide the index in dfa->tokens[]. */
/* Initialize work area. */
- work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
+ work_mbc = &dfa->mbcsets[dfa->nmbcsets++];
memset (work_mbc, 0, sizeof *work_mbc);
}
else
@@ -2056,8 +2057,10 @@ static state_num
state_index (struct dfa *d, position_set const *s, int context)
{
size_t hash = 0;
- int constraint;
+ int constraint = 0;
state_num i, j;
+ bool curr_dependent = false;
+ token first_end = 0;
for (i = 0; i < s->nelem; ++i)
hash ^= s->elems[i].index + s->elems[i].constraint;
@@ -2069,8 +2072,7 @@ state_index (struct dfa *d, position_set const *s, int context)
|| context != d->states[i].context)
continue;
for (j = 0; j < s->nelem; ++j)
- if (s->elems[j].constraint
- != d->states[i].elems.elems[j].constraint
+ if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
|| s->elems[j].index != d->states[i].elems.elems[j].index)
break;
if (j == s->nelem)
@@ -2099,46 +2101,46 @@ state_index (struct dfa *d, position_set const *s, int context)
fprintf (stderr, "\n");
#endif
- /* We'll have to create a new state. */
- d->states = maybe_realloc (d->states, d->sindex, &d->salloc,
- sizeof *d->states);
- d->states[i].hash = hash;
- alloc_position_set (&d->states[i].elems, s->nelem);
- copy (s, &d->states[i].elems);
- d->states[i].context = context;
- d->states[i].constraint = 0;
- d->states[i].curr_dependent = false;
- d->states[i].first_end = 0;
- d->states[i].mbps.nelem = 0;
- d->states[i].mbps.elems = NULL;
- d->states[i].mb_trindex = -1;
-
for (j = 0; j < s->nelem; ++j)
{
- constraint = s->elems[j].constraint;
+ int c = s->elems[j].constraint;
if (d->tokens[s->elems[j].index] < 0)
{
- if (SUCCEEDS_IN_CONTEXT (constraint, context, CTX_ANY))
- d->states[i].constraint |= constraint;
- if (!d->states[i].first_end)
- d->states[i].first_end = d->tokens[s->elems[j].index];
+ if (SUCCEEDS_IN_CONTEXT (c, context, CTX_ANY))
+ constraint |= c;
+ if (!first_end)
+ first_end = d->tokens[s->elems[j].index];
}
else if (d->tokens[s->elems[j].index] == BACKREF)
- d->states[i].constraint = NO_CONSTRAINT;
+ constraint = NO_CONSTRAINT;
if (d->multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
{
- int acceptable = 0;
- if (SUCCEEDS_IN_CONTEXT (constraint, context, CTX_NEWLINE))
- acceptable |= CTX_NEWLINE;
- if (SUCCEEDS_IN_CONTEXT (constraint, context, CTX_LETTER))
- acceptable |= CTX_LETTER;
- if (SUCCEEDS_IN_CONTEXT (constraint, context, CTX_NONE))
- acceptable |= CTX_NONE;
- if (acceptable != 0 && (acceptable & context) != context)
- d->states[i].curr_dependent = true;
+ int acceptable
+ = ((SUCCEEDS_IN_CONTEXT (c, context, CTX_NEWLINE)
+ ? CTX_NEWLINE : 0)
+ | (SUCCEEDS_IN_CONTEXT (c, context, CTX_LETTER)
+ ? CTX_LETTER : 0)
+ | (SUCCEEDS_IN_CONTEXT (c, context, CTX_NONE)
+ ? CTX_NONE : 0));
+ curr_dependent |= acceptable && (context & ~acceptable);
}
}
+
+ /* Create a new state. */
+ d->states = maybe_realloc (d->states, d->sindex, &d->salloc,
+ sizeof *d->states);
+ d->states[i].hash = hash;
+ alloc_position_set (&d->states[i].elems, s->nelem);
+ copy (s, &d->states[i].elems);
+ d->states[i].context = context;
+ d->states[i].curr_dependent = curr_dependent;
+ d->states[i].constraint = constraint;
+ d->states[i].first_end = first_end;
+ d->states[i].mbps.nelem = 0;
+ d->states[i].mbps.elems = NULL;
+ d->states[i].mb_trindex = -1;
+
++d->sindex;
return i;
@@ -2589,26 +2591,26 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
else if (d->tokens[pos.index] >= CSET)
copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
else if (d->multibyte && d->tokens[pos.index] == ANYCHAR)
- /* ANYCHAR must match with a single character, so we must put
- it to D->states[s].mbps which contains the positions which
- can match with a single character not a byte. If all
- positions which has ANYCHAR does not depend on context of
- next character, we put the follows instead of it to
- D->states[s].mbps to optimize. */
{
+ /* ANYCHAR must match a single character, so put it to
+ D->states[s].mbps which contains the positions which can
+ match with a single character not a byte. If all
+ positions with ANYCHAR do not depend on the context of
+ the next character, put its follows instead to
+ D->states[s].mbps to optimize. */
if (d->states[s].curr_dependent)
{
if (d->states[s].mbps.nelem == 0)
alloc_position_set (&d->states[s].mbps, 1);
- insert (pos, &(d->states[s].mbps));
+ insert (pos, &d->states[s].mbps);
}
else if (SUCCEEDS_IN_CONTEXT (pos.constraint,
d->states[s].context, CTX_ANY))
{
if (d->states[s].mbps.nelem == 0)
alloc_position_set (&d->states[s].mbps, 1);
- for (j = 0; j < d->follows[pos.index].nelem; ++j)
- insert (d->follows[pos.index].elems[j], &(d->states[s].mbps));
+ for (j = 0; j < d->follows[pos.index].nelem; j++)
+ insert (d->follows[pos.index].elems[j], &d->states[s].mbps);
}
}
else
@@ -2672,7 +2674,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
/* Even an optimizing compiler can't know this for sure. */
charclass_word match = matches[k], label = labels[j][k];
- leftoversf |= leftovers[k] = ~match & label;
+ leftoversf |= leftovers[k] = label & ~match;
matchesf |= matches[k] = match & ~label;
}
@@ -2795,7 +2797,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
separate_contexts = state_separate_contexts (&follows);
/* Find the state(s) corresponding to the union of the follows. */
- if ((separate_contexts & possible_contexts) != possible_contexts)
+ if (possible_contexts & ~separate_contexts)
state = state_index (d, &follows, separate_contexts ^ CTX_ANY);
else
state = -1;
@@ -2917,7 +2919,7 @@ build_state (state_num s, struct dfa *d)
used transition tables will be quickly rebuilt, whereas the ones that
were only needed once or twice will be cleared away. However, do not
clear the initial D->min_trcount states, since they are always used. */
- if (d->trcount >= MAX_TRCOUNT)
+ if (MAX_TRCOUNT <= d->trcount)
{
for (i = d->min_trcount; i < d->tralloc; ++i)
{
@@ -2929,7 +2931,7 @@ build_state (state_num s, struct dfa *d)
if (d->multibyte)
{
- for (i = d->min_trcount; i < d->tralloc; ++i)
+ for (i = d->min_trcount; i < d->tralloc; i++)
{
free (d->mb_trans[i]);
d->mb_trans[i] = NULL;
@@ -3036,19 +3038,18 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
/* Calculate the state which can be reached from the state 's' by
consuming 'mbclen' single bytes from the buffer. */
s1 = s;
- for (i = 0; i < mbclen && s >= 0; i++)
+ for (i = 0; i < mbclen && 0 <= s; i++)
s = transit_state_singlebyte (d, s, pp);
- for (; i < mbclen; i++)
- (*pp)++;
+ *pp += mbclen - i;
if (d->states[s1].curr_dependent)
{
- if (s >= 0)
- copy (&d->states[s].elems, &d->mb_follows);
- else
+ if (s < 0)
d->mb_follows.nelem = 0;
+ else
+ copy (&d->states[s].elems, &d->mb_follows);
- for (i = 0; i < d->states[s1].mbps.nelem; ++i)
+ for (i = 0; i < d->states[s1].mbps.nelem; i++)
{
if (!SUCCEEDS_IN_CONTEXT (d->states[s1].mbps.elems[i].constraint,
d->states[s1].context, context))
@@ -3069,22 +3070,21 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
return s;
}
- /* If all positions which has ANYCHAR does not depend on context of
- next character, calculate next state with pre-calculated follows
- and cache the result. */
- if (d->states[s1].mb_trindex == (size_t) -1)
+ /* If all positions which have ANYCHAR do not depend on the context
+ of the next character, calculate the next state with
+ pre-calculated follows and cache the result. */
+ if (d->states[s1].mb_trindex < 0)
{
- if (d->mb_trcount >= MAX_TRCOUNT)
+ if (MAX_TRCOUNT <= d->mb_trcount)
{
- for (i = 0; i < d->tralloc; ++i)
+ state_num s2;
+ for (s2 = -1; s2 < d->tralloc; s2++)
{
- free (d->mb_trans[i]);
- d->mb_trans[i] = NULL;
+ free (d->mb_trans[s2]);
+ d->mb_trans[s2] = NULL;
}
- free (d->mb_trans[-1]);
- d->mb_trans[-1] = NULL;
- for (i = 0; i < d->sindex; ++i)
+ for (i = 0; i < d->sindex; i++)
d->states[i].mb_trindex = -1;
d->mb_trcount = 0;
}
@@ -3093,9 +3093,11 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
mb_index = d->states[s1].mb_trindex * 2;
- if (d->mb_trans[s] == NULL)
+ if (! d->mb_trans[s])
{
- d->mb_trans[s] = xnmalloc (2 * MAX_TRCOUNT, sizeof *d->mb_trans[s]);
+ enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
+ enum { TRANSALLOC_SIZE = 2 * MAX_TRCOUNT * TRANSPTR_SIZE };
+ d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
for (i = 0; i < 2 * MAX_TRCOUNT; i++)
d->mb_trans[s][i] = -1;
}
@@ -3270,18 +3272,23 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
}
else
{
- if (s == 0 && (t = trans[s]) != NULL)
+ if (s == 0)
{
- while (t[*p] == 0)
- p++;
- s1 = 0;
- s = t[*p++];
+ t = trans[s];
+ if (t)
+ {
+ while (t[*p] == 0)
+ p++;
+ s1 = 0;
+ s = t[*p++];
+ }
}
while ((t = trans[s]) != NULL)
{
s1 = t[*p++];
- if ((t = trans[s1]) == NULL)
+ t = trans[s1];
+ if (! t)
{
state_num tmp = s;
s = s1;
@@ -3404,19 +3411,16 @@ free_mbdata (struct dfa *d)
free (d->multibyte_prop);
for (i = 0; i < d->nmbcsets; ++i)
- {
- struct mb_char_classes *p = &(d->mbcsets[i]);
- free (p->chars);
- }
+ free (d->mbcsets[i].chars);
free (d->mbcsets);
free (d->mb_follows.elems);
if (d->mb_trans)
{
- for (i = 0; i < d->tralloc; ++i)
- free (d->mb_trans[i]);
- free (d->mb_trans[-1]);
+ state_num s;
+ for (s = -1; s < d->tralloc; s++)
+ free (d->mb_trans[s]);
free (d->mb_trans - 1);
}
}
--
2.5.5