Following up further on this, I installed the attached patch into the grep
master on Savannah. This patch shouldn't affect grep's behavior, or
significantly affect its efficiency. The idea is to make the DFA code usable in
multilocale apps, plus it should make the code a bit cleaner even in a
single-locale environment.
From 27eb891acc00d81782ce876520f3d221144facf1 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Wed, 31 Aug 2016 20:16:32 -0700
Subject: [PATCH] dfa: make dfa.c fully thread-safe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This follows up on Zev Weiss’s recent patches to make the DFA code
thread-safe (Bug#24249). It removes the remaining static
variables used by dfa.c. These variables are locale-dependent, so
they would cause problems in multithreaded code where different
threads are in different locales (e.g., via uselocale). I
abstracted most of the variables into a new localeinfo module.
* src/Makefile.am (grep_SOURCES): Add localeinfo.c.
(noinst_HEADERS): Add localeinfo.h.
* src/dfa.c: Include localeinfo.h.
(struct dfa): Remove multibyte member, as it is now part of
localeinfo. New members simple_locale and localeinfo.
Put locale-related members at the end.
(mbrtowc_cache): Remove; now part of dfa->localeinfo.
(charclass_index): Rename back from dfa_charclass_index,
since it's private.
(unibyte_word_constituent): New arg DFA; use its sbctowc member.
(using_utf8, dfa_using_utf8, init_mbrtowc_cache, check_utf8):
Remove; now done by localeinfo members. All uses changed.
(dfasyntax): New localeinfo arg. Move to end to avoid forward decls.
Initialize the entire DFA.
(unibyte_c, check_unibyte_c): Remove; now in simple_locale member.
(using_simple_locale): Now takes bool instead of DFA.
Do the locale check here, rather than in the caller,
as the result is now cached in dfa->simple_locale.
(dfaalloc): Just allocate the DFA. dfasyntax now initializes it.
* src/dfa.h: Add forward decl of struct localeinfo.
Adjust to new dfa.c API.
* src/dfasearch.c (localeinfo): New var, replacing former static
vars like mbrtowc_cache.
* src/localeinfo.c, src/localeinfo.h: New files.
* src/search.h: Include localeinfo.h.
(localeinfo): New decl.
* src/searchutils.c (mbclen_cache, build_mbclen_cache):
Remove. All uses changed to localeinfo.
* tests/Makefile.am (dfa_match_aux_LDADD): Add localeinfo.o.
* tests/dfa-match-aux.c: Include localeinfo.h.
(main): Adjust to changes in DFA API.
---
src/Makefile.am | 4 +-
src/dfa.c | 269 ++++++++++++++++++++++----------------------------
src/dfa.h | 22 +++--
src/dfasearch.c | 6 +-
src/grep.c | 5 +-
src/kwsearch.c | 2 +-
src/localeinfo.c | 66 +++++++++++++
src/localeinfo.h | 47 +++++++++
src/pcresearch.c | 4 +-
src/search.h | 7 +-
src/searchutils.c | 18 ----
tests/Makefile.am | 2 +-
tests/dfa-match-aux.c | 7 +-
13 files changed, 262 insertions(+), 197 deletions(-)
create mode 100644 src/localeinfo.c
create mode 100644 src/localeinfo.h
diff --git a/src/Makefile.am b/src/Makefile.am
index 941384e..2b0ba0f 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -25,9 +25,9 @@ bin_PROGRAMS = grep
bin_SCRIPTS = egrep fgrep
grep_SOURCES = grep.c searchutils.c \
dfa.c dfasearch.c \
- kwset.c kwsearch.c \
+ kwset.c kwsearch.c localeinfo.c \
pcresearch.c
-noinst_HEADERS = grep.h dfa.h kwset.h search.h system.h
+noinst_HEADERS = grep.h dfa.h kwset.h localeinfo.h search.h system.h
# Sometimes, the expansion of $(LIBINTL) includes -lc which may
# include modules defining variables like 'optind', so libgreputils.a
diff --git a/src/dfa.c b/src/dfa.c
index 8451c81..bf8c546 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -24,6 +24,8 @@
#include "dfa.h"
+#include "localeinfo.h"
+
#include <assert.h>
#include <ctype.h>
#include <stdio.h>
@@ -418,14 +420,9 @@ struct dfa
size_t nregexps; /* Count of parallel regexps being built
with dfaparse. */
bool fast; /* The DFA is fast. */
- bool multibyte; /* MB_CUR_MAX > 1. */
token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
mbstate_t mbs; /* Multibyte conversion state. */
- /* dfaexec implementation. */
- char *(*dfaexec) (struct dfa *, char const *, char *,
- bool, size_t *, bool *);
-
/* The following are valid only if MB_CUR_MAX > 1. */
/* The value of multibyte_prop[i] is defined by following rule.
@@ -511,6 +508,21 @@ struct dfa
state_num **mb_trans; /* Transition tables for states with ANYCHAR. */
state_num mb_trcount; /* Number of transition tables for states with
ANYCHAR that have actually been built. */
+
+ /* Information derived from the locale. This is at the end so that
+ a quick memset need not clear it specially. */
+
+ /* dfaexec implementation. */
+ char *(*dfaexec) (struct dfa *, char const *, char *,
+ bool, size_t *, bool *);
+
+ /* The locale is simple, like the C locale. These locales can be
+ processed more efficiently, e.g., the relationship between lower-
+ and upper-case letters is 1-1. */
+ bool simple_locale;
+
+ /* Other cached information derived from the locale. */
+ struct localeinfo localeinfo;
};
/* Some macros for user access to dfa internals. */
@@ -524,13 +536,8 @@ struct dfa
static void regexp (struct dfa *dfa);
-/* A table indexed by byte values that contains the corresponding wide
- character (if any) for that byte. WEOF means the byte is not a
- valid single-byte character. */
-static wint_t mbrtowc_cache[NOTCHAR];
-
/* Store into *PWC the result of converting the leading bytes of the
- multibyte buffer S of length N bytes, using the mbrtowc_cache in *D
+ multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
and updating the conversion state in *D. On conversion error,
convert just a single byte, to WEOF. Return the number of bytes
converted.
@@ -539,7 +546,7 @@ static wint_t mbrtowc_cache[NOTCHAR];
* PWC points to wint_t, not to wchar_t.
* The last arg is a dfa *D instead of merely a multibyte conversion
- state D->mbs. D also contains an mbrtowc_cache for speed.
+ state D->mbs.
* N must be at least 1.
* S[N - 1] must be a sentinel byte.
* Shift encodings are not supported.
@@ -550,7 +557,7 @@ static size_t
mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
{
unsigned char uc = s[0];
- wint_t wc = mbrtowc_cache[uc];
+ wint_t wc = d->localeinfo.sbctowc[uc];
if (wc == WEOF)
{
@@ -727,7 +734,7 @@ maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize)
/* In DFA D, find the index of charclass S, or allocate a new one. */
static size_t
-dfa_charclass_index (struct dfa *d, charclass const s)
+charclass_index (struct dfa *d, charclass const s)
{
size_t i;
@@ -742,9 +749,9 @@ dfa_charclass_index (struct dfa *d, charclass const s)
}
static bool
-unibyte_word_constituent (unsigned char c)
+unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
{
- return mbrtowc_cache[c] != WEOF && (isalnum (c) || (c) == '_');
+ return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
}
static int
@@ -752,68 +759,11 @@ char_context (struct dfa const *dfa, unsigned char c)
{
if (c == dfa->syntax.eolbyte)
return CTX_NEWLINE;
- if (unibyte_word_constituent (c))
+ if (unibyte_word_constituent (dfa, c))
return CTX_LETTER;
return CTX_NONE;
}
-/* UTF-8 encoding allows some optimizations that we can't otherwise
- assume in a multibyte encoding. */
-static bool using_utf8;
-
-bool
-dfa_using_utf8 (void)
-{
- return using_utf8;
-}
-
-static void
-init_mbrtowc_cache (void)
-{
- int i;
- for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
- {
- char c = i;
- unsigned char uc = i;
- mbstate_t s = { 0 };
- wchar_t wc;
- mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF;
- }
-}
-
-/* Entry point to set syntax options. */
-void
-dfasyntax (struct dfa *dfa, reg_syntax_t bits, bool fold, unsigned char eol)
-{
- int i;
- dfa->syntax.syntax_bits_set = true;
- dfa->syntax.syntax_bits = bits;
- dfa->syntax.case_fold = fold;
- dfa->syntax.eolbyte = eol;
-
- for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
- {
- unsigned char uc = i;
-
- /* Use mbrtowc_cache to calculate sbit. */
- dfa->syntax.sbit[uc] = char_context (dfa, uc);
- switch (dfa->syntax.sbit[uc])
- {
- case CTX_LETTER:
- setbit (uc, dfa->syntax.letters);
- break;
- case CTX_NEWLINE:
- setbit (uc, dfa->syntax.newline);
- break;
- }
-
- /* POSIX requires that the five bytes in "\n\r./" (including the
- terminating NUL) cannot occur inside a multibyte character. */
- dfa->syntax.never_trail[uc] = (using_utf8 ? (uc & 0xc0) != 0x80
- : strchr ("\n\r./", uc) != NULL);
- }
-}
-
/* Set a bit in the charclass for the given wchar_t. Do nothing if WC
is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
this may happen when folding case in weird Turkish locales where
@@ -842,30 +792,10 @@ setbit_case_fold_c (int b, charclass c)
setbit (i, c);
}
-static void check_utf8 (void)
-{
- wchar_t wc;
- mbstate_t mbs = { 0 };
- using_utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
-}
-
-static bool unibyte_c;
-
-static void check_unibyte_c (void)
-{
- char const *locale = setlocale (LC_ALL, NULL);
- unibyte_c = (!locale
- || STREQ (locale, "C")
- || STREQ (locale, "POSIX"));
-}
-
-/* The current locale is known to be a unibyte locale
- without multicharacter collating sequences and where range
- comparisons simply use the native encoding. These locales can be
- processed more efficiently. */
+/* Return true if the locale is compatible with the C locale. */
static bool
-using_simple_locale (struct dfa const *dfa)
+using_simple_locale (bool multibyte)
{
/* The native character set is known to be compatible with
the C locale. The following test isn't perfect, but it's good
@@ -883,7 +813,15 @@ using_simple_locale (struct dfa const *dfa)
&& '}' == 125 && '~' == 126)
};
- return (native_c_charset & !dfa->multibyte) | unibyte_c;
+ if (native_c_charset && !multibyte)
+ return true;
+ else
+ {
+ /* Treat C and POSIX locales as being compatible. Also, treat
+ errors as compatible, as these are invariably from stubs. */
+ char const *loc = setlocale (LC_ALL, NULL);
+ return !loc || strcmp (loc, "C") == 0 || strcmp (loc, "POSIX") == 0;
+ }
}
/* Fetch the next lexical input character. Set C (of type int) to the
@@ -1034,7 +972,7 @@ parse_bracket_exp (struct dfa *dfa)
size_t chars_al;
chars_al = 0;
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
{
dfa->mbcsets = maybe_realloc (dfa->mbcsets, dfa->nmbcsets,
&dfa->mbcsets_alloc,
@@ -1057,7 +995,7 @@ parse_bracket_exp (struct dfa *dfa)
{
FETCH_WC (dfa, c, wc, _("unbalanced ["));
invert = true;
- known_bracket_exp = using_simple_locale (dfa);
+ known_bracket_exp = dfa->simple_locale;
}
else
invert = false;
@@ -1112,7 +1050,7 @@ parse_bracket_exp (struct dfa *dfa)
if (!pred)
dfaerror (_("invalid character class"));
- if (dfa->multibyte && !pred->single_byte_only)
+ if (dfa->localeinfo.multibyte && !pred->single_byte_only)
known_bracket_exp = false;
else
for (c2 = 0; c2 < NOTCHAR; ++c2)
@@ -1172,9 +1110,9 @@ parse_bracket_exp (struct dfa *dfa)
/* Treat [x-y] as a range if x != y. */
if (wc != wc2 || wc == WEOF)
{
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
known_bracket_exp = false;
- else if (using_simple_locale (dfa))
+ else if (dfa->simple_locale)
{
int ci;
for (ci = c; ci <= c2; ci++)
@@ -1201,7 +1139,7 @@ parse_bracket_exp (struct dfa *dfa)
colon_warning_state |= (c == ':') ? 2 : 4;
- if (!dfa->multibyte)
+ if (!dfa->localeinfo.multibyte)
{
if (dfa->syntax.case_fold)
setbit_case_fold_c (c, ccl);
@@ -1238,22 +1176,22 @@ parse_bracket_exp (struct dfa *dfa)
if (! known_bracket_exp)
return BACKREF;
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
{
work_mbc->invert = invert;
- work_mbc->cset = emptyset (ccl) ? -1 : dfa_charclass_index (dfa, ccl);
+ work_mbc->cset = emptyset (ccl) ? -1 : charclass_index (dfa, ccl);
return MBCSET;
}
if (invert)
{
- assert (!dfa->multibyte);
+ assert (!dfa->localeinfo.multibyte);
notset (ccl);
if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
clrbit ('\n', ccl);
}
- return CSET + dfa_charclass_index (dfa, ccl);
+ return CSET + charclass_index (dfa, ccl);
}
struct lexptr
@@ -1508,7 +1446,7 @@ lex (struct dfa *dfa)
case '.':
if (backslash)
goto normal_char;
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
{
/* In multibyte environment period must match with a single
character not a byte. So we use ANYCHAR. */
@@ -1522,13 +1460,13 @@ lex (struct dfa *dfa)
if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
clrbit ('\0', ccl);
dfa->lex.laststart = false;
- return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+ return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
case 's':
case 'S':
if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
goto normal_char;
- if (!dfa->multibyte)
+ if (!dfa->localeinfo.multibyte)
{
zeroset (ccl);
for (c2 = 0; c2 < NOTCHAR; ++c2)
@@ -1537,7 +1475,7 @@ lex (struct dfa *dfa)
if (c == 'S')
notset (ccl);
dfa->lex.laststart = false;
- return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+ return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
}
/* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1561,16 +1499,16 @@ lex (struct dfa *dfa)
if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
goto normal_char;
- if (!dfa->multibyte)
+ if (!dfa->localeinfo.multibyte)
{
zeroset (ccl);
for (c2 = 0; c2 < NOTCHAR; ++c2)
- if (unibyte_word_constituent (c2))
+ if (unibyte_word_constituent (dfa, c2))
setbit (c2, ccl);
if (c == 'W')
notset (ccl);
dfa->lex.laststart = false;
- return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+ return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
}
/* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1600,14 +1538,14 @@ lex (struct dfa *dfa)
dfa->lex.laststart = false;
/* For multibyte character sets, folding is done in atom. Always
return WCHAR. */
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
return dfa->lex.lasttok = WCHAR;
if (dfa->syntax.case_fold && isalpha (c))
{
zeroset (ccl);
setbit_case_fold_c (c, ccl);
- return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+ return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
}
return dfa->lex.lasttok = c;
@@ -1627,11 +1565,11 @@ addtok_mb (struct dfa *dfa, token t, int mbprop)
{
dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc,
sizeof *dfa->tokens);
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
sizeof *dfa->multibyte_prop);
}
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
dfa->multibyte_prop[dfa->tindex] = mbprop;
dfa->tokens[dfa->tindex++] = t;
@@ -1668,7 +1606,7 @@ static void addtok_wc (struct dfa *dfa, wint_t wc);
static void
addtok (struct dfa *dfa, token t)
{
- if (dfa->multibyte && t == MBCSET)
+ if (dfa->localeinfo.multibyte && t == MBCSET)
{
bool need_or = false;
struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
@@ -1767,7 +1705,7 @@ add_utf8_anychar (struct dfa *dfa)
if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
clrbit ('\0', c);
}
- dfa->utf8_anychar_classes[i] = CSET + dfa_charclass_index (dfa, c);
+ dfa->utf8_anychar_classes[i] = CSET + charclass_index (dfa, c);
}
/* A valid UTF-8 character is
@@ -1851,7 +1789,7 @@ atom (struct dfa *dfa)
dfa->parse.tok = lex (dfa);
}
- else if (dfa->parse.tok == ANYCHAR && using_utf8)
+ else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
{
/* For UTF-8 expand the period to a series of CSETs that define a valid
UTF-8 character. This avoids using the slow multibyte path. I'm
@@ -1912,7 +1850,7 @@ copytoks (struct dfa *dfa, size_t tindex, size_t ntokens)
{
size_t i;
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
for (i = 0; i < ntokens; ++i)
addtok_mb (dfa, dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]);
else
@@ -1998,7 +1936,7 @@ dfaparse (char const *s, size_t len, struct dfa *d)
d->lex.lasttok = END;
d->lex.laststart = true;
d->lex.parens = 0;
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
{
d->lex.cur_mb_len = 0;
memset (&d->mbs, 0, sizeof d->mbs);
@@ -2187,7 +2125,7 @@ state_index (struct dfa *d, position_set const *s, int context)
}
else if (d->tokens[s->elems[j].index] == BACKREF)
constraint = NO_CONSTRAINT;
- if (d->multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
+ if (d->localeinfo.multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
{
int acceptable
= ((SUCCEEDS_IN_CONTEXT (c, context, CTX_NEWLINE)
@@ -2664,7 +2602,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
setbit (d->tokens[pos.index], matches);
else if (d->tokens[pos.index] >= CSET)
copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
- else if (d->multibyte && d->tokens[pos.index] == ANYCHAR)
+ else if (d->localeinfo.multibyte && d->tokens[pos.index] == ANYCHAR)
{
/* ANYCHAR must match a single character, so put it to
D->states[s].mbps which contains the positions which can
@@ -2810,7 +2748,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
state_letter = state;
for (i = 0; i < NOTCHAR; ++i)
- trans[i] = unibyte_word_constituent (i) ? state_letter : state;
+ trans[i] = unibyte_word_constituent (d, i) ? state_letter : state;
trans[d->syntax.eolbyte] = state_newline;
}
else
@@ -2827,7 +2765,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
insert (d->follows[grps[i].elems[j]].elems[k], &follows);
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
{
/* If a token in follows.elems is not 1st byte of a multibyte
character, or the states of follows must accept the bytes
@@ -2860,7 +2798,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
/* If we are building a searching matcher, throw in the positions
of state 0 as well. */
- if (d->searchflag && (!d->multibyte || !next_isnt_1st_byte))
+ if (d->searchflag && (!d->localeinfo.multibyte || !next_isnt_1st_byte))
{
merge (&d->states[0].elems, &follows, &tmp);
copy (&tmp, &follows);
@@ -2916,7 +2854,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
if (c == d->syntax.eolbyte)
trans[c] = state_newline;
- else if (unibyte_word_constituent (c))
+ else if (unibyte_word_constituent (d, c))
trans[c] = state_letter;
else if (c < NOTCHAR)
trans[c] = state;
@@ -2957,7 +2895,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state)
d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
{
realtrans = d->mb_trans ? d->mb_trans - 1 : NULL;
realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
@@ -2969,7 +2907,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state)
{
d->trans[oldalloc] = NULL;
d->fails[oldalloc] = NULL;
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
d->mb_trans[oldalloc] = NULL;
}
}
@@ -3003,7 +2941,7 @@ build_state (state_num s, struct dfa *d)
}
d->trcount = d->min_trcount;
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
{
for (i = d->min_trcount; i < d->tralloc; i++)
{
@@ -3454,7 +3392,7 @@ dfaexec_noop (struct dfa *d, char const *begin, char *end,
return (char *) begin;
}
-/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->multibyte),
+/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
but faster and set *BACKREF if the DFA code does not support this
regexp usage. */
@@ -3512,7 +3450,7 @@ dfa_supported (struct dfa const *d)
case ENDWORD:
case LIMWORD:
case NOTLIMWORD:
- if (!d->multibyte)
+ if (!d->localeinfo.multibyte)
continue;
/* fallthrough */
@@ -3530,7 +3468,7 @@ dfaoptimize (struct dfa *d)
size_t i;
bool have_backref = false;
- if (!using_utf8)
+ if (!d->localeinfo.using_utf8)
return;
for (i = 0; i < d->tindex; ++i)
@@ -3560,7 +3498,7 @@ dfaoptimize (struct dfa *d)
}
free_mbdata (d);
- d->multibyte = false;
+ d->localeinfo.multibyte = false;
d->dfaexec = dfaexec_sb;
d->fast = true;
}
@@ -3575,7 +3513,7 @@ dfassbuild (struct dfa *d)
struct dfa *sup = dfaalloc ();
*sup = *d;
- sup->multibyte = false;
+ sup->localeinfo.multibyte = false;
sup->dfaexec = dfaexec_sb;
sup->multibyte_prop = NULL;
sup->mbcsets = NULL;
@@ -3608,7 +3546,7 @@ dfassbuild (struct dfa *d)
case BACKREF:
zeroset (ccl);
notset (ccl);
- sup->tokens[j++] = CSET + dfa_charclass_index (sup, ccl);
+ sup->tokens[j++] = CSET + charclass_index (sup, ccl);
sup->tokens[j++] = STAR;
if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
|| d->tokens[i + 1] == PLUS)
@@ -3619,7 +3557,7 @@ dfassbuild (struct dfa *d)
case ENDWORD:
case LIMWORD:
case NOTLIMWORD:
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
{
/* These constraints aren't supported in a multibyte locale.
Ignore them in the superset DFA. */
@@ -3636,7 +3574,7 @@ dfassbuild (struct dfa *d)
}
sup->tindex = j;
- if (have_nchar && (have_achar || d->multibyte))
+ if (have_nchar && (have_achar || d->localeinfo.multibyte))
d->superset = sup;
else
{
@@ -3678,7 +3616,7 @@ dfafree (struct dfa *d)
free (d->charclasses);
free (d->tokens);
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
free_mbdata (d);
for (i = 0; i < d->sindex; ++i)
@@ -4200,20 +4138,49 @@ dfamustfree (struct dfamust *dm)
struct dfa *
dfaalloc (void)
{
- struct dfa *d = xzalloc (sizeof *d);
- d->multibyte = MB_CUR_MAX > 1;
- d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb;
- d->fast = !d->multibyte;
- d->lex.cur_mb_len = 1;
- return d;
+ return xmalloc (sizeof (struct dfa));
}
+/* Initialize DFA. */
void
-dfa_init (void)
+dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
+ reg_syntax_t bits, bool fold, unsigned char eol)
{
- check_utf8 ();
- check_unibyte_c ();
- init_mbrtowc_cache ();
+ int i;
+ memset (dfa, 0, offsetof (struct dfa, dfaexec));
+ dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
+ dfa->simple_locale = using_simple_locale (linfo->multibyte);
+ dfa->localeinfo = *linfo;
+
+ dfa->fast = !dfa->localeinfo.multibyte;
+
+ dfa->lex.cur_mb_len = 1;
+ dfa->syntax.syntax_bits_set = true;
+ dfa->syntax.syntax_bits = bits;
+ dfa->syntax.case_fold = fold;
+ dfa->syntax.eolbyte = eol;
+
+ for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
+ {
+ unsigned char uc = i;
+
+ dfa->syntax.sbit[uc] = char_context (dfa, uc);
+ switch (dfa->syntax.sbit[uc])
+ {
+ case CTX_LETTER:
+ setbit (uc, dfa->syntax.letters);
+ break;
+ case CTX_NEWLINE:
+ setbit (uc, dfa->syntax.newline);
+ break;
+ }
+
+ /* POSIX requires that the five bytes in "\n\r./" (including the
+ terminating NUL) cannot occur inside a multibyte character. */
+ dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
+ ? (uc & 0xc0) != 0x80
+ : strchr ("\n\r./", uc) != NULL);
+ }
}
/* vim:set shiftwidth=2: */
diff --git a/src/dfa.h b/src/dfa.h
index 585390a..31baf7a 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -24,6 +24,8 @@
#include "xalloc.h" /* for _GL_ATTRIBUTE_MALLOC */
+struct localeinfo; /* See localeinfo.h. */
+
/* Element of a list of strings, at least one of which is known to
appear in any R.E. matching the DFA. */
struct dfamust
@@ -44,17 +46,22 @@ struct dfa;
calling dfafree() on it. */
extern struct dfa *dfaalloc (void) _GL_ATTRIBUTE_MALLOC;
+/* Initialize or reinitialize a DFA. This must be called before
+ any of the routines below. The arguments are:
+ 1. The DFA to operate on.
+ 2. Information about the current locale.
+ 3. The syntax bits described earlier in this file.
+ 4. The case-folding flag.
+ 5. The line terminator. */
+extern void dfasyntax (struct dfa *, struct localeinfo const *,
+ reg_syntax_t, bool, unsigned char);
+
/* Build and return the struct dfamust from the given struct dfa. */
extern struct dfamust *dfamust (struct dfa const *);
/* Free the storage held by the components of a struct dfamust. */
extern void dfamustfree (struct dfamust *);
-/* dfasyntax() takes four arguments; the first is the dfa to operate on, the
- second sets the syntax bits described earlier in this file, the third sets
- the case-folding flag, and the fourth specifies the line terminator. */
-extern void dfasyntax (struct dfa *, reg_syntax_t, bool, unsigned char);
-
/* Compile the given string of the given length into the given struct dfa.
Final argument is a flag specifying whether to build a searching or an
exact matcher. */
@@ -99,8 +106,3 @@ extern void dfawarn (const char *);
takes a single argument, a NUL-terminated string describing the error.
The user must supply a dfaerror. */
extern _Noreturn void dfaerror (const char *);
-
-extern bool dfa_using_utf8 (void) _GL_ATTRIBUTE_PURE;
-
-/* This must be called before calling any of the above dfa*() functions. */
-extern void dfa_init (void);
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 10c4f51..c2e0177 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -22,6 +22,8 @@
#include "intprops.h"
#include "search.h"
+struct localeinfo localeinfo;
+
/* Whether -w considers WC to be a word constituent. */
static bool
wordchar (wint_t wc)
@@ -128,7 +130,7 @@ GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
if (match_icase)
syntax_bits |= RE_ICASE;
re_set_syntax (syntax_bits);
- dfasyntax (dfa, syntax_bits, match_icase, eolbyte);
+ dfasyntax (dfa, &localeinfo, syntax_bits, match_icase, eolbyte);
/* For GNU regex, pass the patterns separately to detect errors like
"[\nallo\n]\n", where the patterns are "[", "allo" and "]", and
@@ -277,7 +279,7 @@ EGexecute (char *buf, size_t size, size_t *match_size,
if (exact_kwset_match)
{
- if (MB_CUR_MAX == 1 || dfa_using_utf8 ())
+ if (MB_CUR_MAX == 1 || localeinfo.using_utf8)
goto success;
if (mb_start < beg)
mb_start = beg;
diff --git a/src/grep.c b/src/grep.c
index 0c84b2a..fc22c7b 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -642,7 +642,7 @@ initialize_unibyte_mask (void)
unsigned char mask = 0;
int ms1b = 1;
for (int i = 1; i <= UCHAR_MAX; i++)
- if ((mbclen_cache[i] != 1) & ! (mask & i))
+ if ((localeinfo.sbclen[i] != 1) & ! (mask & i))
{
while (ms1b * 2 <= i)
ms1b *= 2;
@@ -2344,7 +2344,7 @@ main (int argc, char **argv)
textdomain (PACKAGE);
#endif
- dfa_init ();
+ init_localeinfo (&localeinfo);
atexit (clean_up_stdout);
@@ -2726,7 +2726,6 @@ main (int argc, char **argv)
else
usage (EXIT_TROUBLE);
- build_mbclen_cache ();
initialize_unibyte_mask ();
/* In a unibyte locale, switch from fgrep to grep if
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 57fd4d7..508ebc5 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -93,7 +93,7 @@ Fexecute (char *buf, size_t size, size_t *match_size,
mb_check = longest = false;
else
{
- mb_check = MB_CUR_MAX > 1 && !dfa_using_utf8 ();
+ mb_check = MB_CUR_MAX > 1 && !localeinfo.using_utf8;
longest = mb_check | !!start_ptr | match_words;
}
diff --git a/src/localeinfo.c b/src/localeinfo.c
new file mode 100644
index 0000000..329d431
--- /dev/null
+++ b/src/localeinfo.c
@@ -0,0 +1,66 @@
+/* locale information
+
+ Copyright 2016 Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+ 02110-1301, USA. */
+
+/* Written by Paul Eggert. */
+
+#include <config.h>
+
+#include <localeinfo.h>
+
+#include <verify.h>
+
+#include <limits.h>
+#include <locale.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* The sbclen implementation relies on this. */
+verify (MB_LEN_MAX <= SCHAR_MAX);
+
+/* Return true if the locale uses UTF-8. */
+
+static bool
+is_using_utf8 (void)
+{
+ wchar_t wc;
+ mbstate_t mbs = {0};
+ return mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
+}
+
+/* Initialize *LOCALEINFO from the current locale. */
+
+void
+init_localeinfo (struct localeinfo *localeinfo)
+{
+ int i;
+
+ localeinfo->multibyte = MB_CUR_MAX > 1;
+ localeinfo->using_utf8 = is_using_utf8 ();
+
+ for (i = CHAR_MIN; i <= CHAR_MAX; i++)
+ {
+ char c = i;
+ unsigned char uc = i;
+ mbstate_t s = {0};
+ wchar_t wc;
+ size_t len = mbrtowc (&wc, &c, 1, &s);
+ localeinfo->sbclen[uc] = len <= 1 ? 1 : - (int) - len;
+ localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF;
+ }
+}
diff --git a/src/localeinfo.h b/src/localeinfo.h
new file mode 100644
index 0000000..70b55a8
--- /dev/null
+++ b/src/localeinfo.h
@@ -0,0 +1,47 @@
+/* locale information
+
+ Copyright 2016 Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+ 02110-1301, USA. */
+
+/* Written by Paul Eggert. */
+
+#include <limits.h>
+#include <stdbool.h>
+#include <wchar.h>
+
+struct localeinfo
+{
+ /* MB_CUR_MAX > 1. */
+ bool multibyte;
+
+ /* The locale uses UTF-8. */
+ bool using_utf8;
+
+ /* An array indexed by byte values B that contains 1 if B is a
+ single-byte character, -1 if B is an encoding error, and -2 if B
+ is the leading byte of a multibyte character that contains more
+ than one byte. */
+ signed char sbclen[UCHAR_MAX + 1];
+
+ /* An array indexed by byte values B that contains the corresponding
+ wide character (if any) for B if sbclen[B] == 1. WEOF means the
+ byte is not a valid single-byte character, i.e., sbclen[B] == -1
+ or -2. */
+ wint_t sbctowc[UCHAR_MAX + 1];
+};
+
+extern void init_localeinfo (struct localeinfo *);
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 3f76603..9ffa22a 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -114,7 +114,7 @@ Pcompile (char const *pattern, size_t size)
if (1 < MB_CUR_MAX)
{
- if (! dfa_using_utf8 ())
+ if (! localeinfo.using_utf8)
error (EXIT_TROUBLE, 0,
_("-P supports only unibyte and UTF-8 locales"));
multibyte_locale = true;
@@ -254,7 +254,7 @@ Pexecute (char *buf, size_t size, size_t *match_size,
/* Skip past bytes that are easily determined to be encoding
errors, treating them as data that cannot match. This is
faster than having pcre_exec check them. */
- while (mbclen_cache[to_uchar (*p)] == (size_t) -1)
+ while (localeinfo.sbclen[to_uchar (*p)] == -1)
{
p++;
subject = p;
diff --git a/src/search.h b/src/search.h
index 7dc1940..431a67d 100644
--- a/src/search.h
+++ b/src/search.h
@@ -33,6 +33,7 @@
#include "dfa.h"
#include "kwset.h"
#include "xalloc.h"
+#include "localeinfo.h"
_GL_INLINE_HEADER_BEGIN
#ifndef SEARCH_INLINE
@@ -47,14 +48,12 @@ typedef signed char mb_len_map_t;
/* searchutils.c */
extern void kwsinit (kwset_t *);
-
-extern void build_mbclen_cache (void);
-extern size_t mbclen_cache[];
extern ptrdiff_t mb_goback (char const **, char const *, char const *);
extern wint_t mb_prev_wc (char const *, char const *, char const *);
extern wint_t mb_next_wc (char const *, char const *);
/* dfasearch.c */
+extern struct localeinfo localeinfo;
extern void GEAcompile (char const *, size_t, reg_syntax_t);
extern size_t EGexecute (char *, size_t, size_t *, char const *);
@@ -73,7 +72,7 @@ extern size_t Pexecute (char *, size_t, size_t *, char const *);
SEARCH_INLINE size_t
mb_clen (char const *s, size_t n, mbstate_t *mbs)
{
- size_t len = mbclen_cache[to_uchar (*s)];
+ size_t len = localeinfo.sbclen[to_uchar (*s)];
return len == (size_t) -2 ? mbrlen (s, n, mbs) : len;
}
diff --git a/src/searchutils.c b/src/searchutils.c
index d25e5f8..8081d41 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -24,8 +24,6 @@
#define NCHAR (UCHAR_MAX + 1)
-size_t mbclen_cache[NCHAR];
-
void
kwsinit (kwset_t *kwset)
{
@@ -46,22 +44,6 @@ kwsinit (kwset_t *kwset)
xalloc_die ();
}
-/* Initialize a cache of mbrlen values for each of its 1-byte inputs. */
-void
-build_mbclen_cache (void)
-{
- int i;
-
- for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
- {
- char c = i;
- unsigned char uc = i;
- mbstate_t mbs = { 0 };
- size_t len = mbrlen (&c, 1, &mbs);
- mbclen_cache[uc] = len ? len : 1;
- }
-}
-
/* In the buffer *MB_START, return the number of bytes needed to go
back from CUR to the previous boundary, where a "boundary" is the
start of a multibyte character or is an error-encoding byte. The
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 77502ca..355f44e 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -42,7 +42,7 @@ AM_CFLAGS = $(WARN_CFLAGS) $(WERROR_CFLAGS)
# Tell the linker to omit references to unused shared libraries.
AM_LDFLAGS = $(IGNORE_UNUSED_LIBRARIES_CFLAGS)
LDADD = ../lib/libgreputils.a $(LIBINTL) ../lib/libgreputils.a
-dfa_match_aux_LDADD = ../src/dfa.$(OBJEXT) $(LDADD)
+dfa_match_aux_LDADD = ../src/dfa.$(OBJEXT) ../src/localeinfo.$(OBJEXT) $(LDADD)
# The triple-backref test is expected to fail with both the system
# matcher (i.e., with glibc) and with the included matcher.
diff --git a/tests/dfa-match-aux.c b/tests/dfa-match-aux.c
index e651735..e001b7d 100644
--- a/tests/dfa-match-aux.c
+++ b/tests/dfa-match-aux.c
@@ -24,6 +24,7 @@
#include <string.h>
#include <regex.h>
#include <dfa.h>
+#include <localeinfo.h>
#include "progname.h"
@@ -47,17 +48,17 @@ main (int argc, char **argv)
struct dfa *dfa;
char *beg, *end, *p;
int allow_nl;
+ struct localeinfo localeinfo;
set_program_name (argv[0]);
if (argc < 3)
exit (EXIT_FAILURE);
setlocale (LC_ALL, "");
-
- dfa_init ();
+ init_localeinfo (&localeinfo);
dfa = dfaalloc ();
- dfasyntax (dfa, RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0, '\n');
+ dfasyntax (dfa, &localeinfo, RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0, '\n');
dfacomp (argv[1], strlen (argv[1]), dfa, 0);
beg = argv[2];
--
2.7.4