* src/dfa.c: Replace the utf8 and unibyte_c function-local static variables with file-scope static variables initialized by a new function, dfa_init(), which must be called after the locale has been set and before any other dfa*() function. (dfa_using_utf8): Rename using_utf8() to dfa_using_utf8() for consistency with other exported functions. * src/dfa.h (dfa_using_utf8): Rename using_utf8() to dfa_using_utf8(); also add _GL_ATTRIBUTE_PURE. (dfa_init): New function. * src/grep.c (main), tests/dfa-match-aux.c (main): Call dfa_init(). * src/dfasearch.c (EGexecute), src/kwsearch.c (Fexecute), src/pcresearch.c (Pcompile): Replace using_utf8() with dfa_using_utf8(). --- src/dfa.c | 62 +++++++++++++++++++++++++++------------------------ src/dfa.h | 5 ++++- src/dfasearch.c | 2 +- src/grep.c | 2 ++ src/kwsearch.c | 2 +- src/pcresearch.c | 2 +- tests/dfa-match-aux.c | 2 ++ 7 files changed, 44 insertions(+), 33 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c index ae1b340..970b51f 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -756,6 +756,16 @@ char_context (struct dfa *dfa, unsigned char c) return CTX_NONE; } +/* UTF-8 encoding allows some optimizations that we can't otherwise + assume in a multibyte encoding. */ +static bool using_utf8; + +bool +dfa_using_utf8 (void) +{ + return using_utf8; +} + /* Entry point to set syntax options. */ void dfasyntax (struct dfa *dfa, reg_syntax_t bits, bool fold, unsigned char eol) @@ -788,7 +798,7 @@ dfasyntax (struct dfa *dfa, reg_syntax_t bits, bool fold, unsigned char eol) /* POSIX requires that the five bytes in "\n\r./" (including the terminating NUL) cannot occur inside a multibyte character. */ - dfa->syntax.never_trail[uc] = (using_utf8 () ? (uc & 0xc0) != 0x80 + dfa->syntax.never_trail[uc] = (using_utf8 ? (uc & 0xc0) != 0x80 : strchr ("\n\r./", uc) != NULL); } } @@ -821,21 +831,21 @@ setbit_case_fold_c (int b, charclass c) setbit (i, c); } +static void check_utf8 (void) +{ + wchar_t wc; + mbstate_t mbs = { 0 }; + using_utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100; +} +static bool unibyte_c; -/* UTF-8 encoding allows some optimizations that we can't otherwise - assume in a multibyte encoding. */ -bool -using_utf8 (void) +static void check_unibyte_c (void) { - static int utf8 = -1; - if (utf8 < 0) - { - wchar_t wc; - mbstate_t mbs = { 0 }; - utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100; - } - return utf8; + char const *locale = setlocale (LC_ALL, NULL); + unibyte_c = (!locale + || STREQ (locale, "C") + || STREQ (locale, "POSIX")); } /* The current locale is known to be a unibyte locale @@ -862,20 +872,7 @@ using_simple_locale (struct dfa *dfa) && '}' == 125 && '~' == 126) }; - if (! 
native_c_charset || dfa->multibyte) - return false; - else - { - static int unibyte_c = -1; - if (unibyte_c < 0) - { - char const *locale = setlocale (LC_ALL, NULL); - unibyte_c = (!locale - || STREQ (locale, "C") - || STREQ (locale, "POSIX")); - } - return unibyte_c; - } + return (!native_c_charset || dfa->multibyte) ? false : unibyte_c; } /* Fetch the next lexical input character. Set C (of type int) to the @@ -1842,7 +1839,7 @@ atom (struct dfa *dfa) dfa->parsestate.tok = lex (dfa); } - else if (dfa->parsestate.tok == ANYCHAR && using_utf8 ()) + else if (dfa->parsestate.tok == ANYCHAR && using_utf8) { /* For UTF-8 expand the period to a series of CSETs that define a valid UTF-8 character. This avoids using the slow multibyte path. I'm @@ -3523,7 +3520,7 @@ dfaoptimize (struct dfa *d) size_t i; bool have_backref = false; - if (!using_utf8 ()) + if (!using_utf8) return; for (i = 0; i < d->tindex; ++i) @@ -4201,4 +4198,11 @@ dfaalloc (void) return d; } +void +dfa_init (void) +{ + check_utf8 (); + check_unibyte_c (); +} + /* vim:set shiftwidth=2: */ diff --git a/src/dfa.h b/src/dfa.h index 014ae96..585390a 100644 --- a/src/dfa.h +++ b/src/dfa.h @@ -100,4 +100,7 @@ extern void dfawarn (const char *); The user must supply a dfaerror. */ extern _Noreturn void dfaerror (const char *); -extern bool using_utf8 (void); +extern bool dfa_using_utf8 (void) _GL_ATTRIBUTE_PURE; + +/* This must be called before calling any of the above dfa*() functions. 
*/ +extern void dfa_init (void); diff --git a/src/dfasearch.c b/src/dfasearch.c index 3dbf76b..10c4f51 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -277,7 +277,7 @@ EGexecute (char *buf, size_t size, size_t *match_size, if (exact_kwset_match) { - if (MB_CUR_MAX == 1 || using_utf8 ()) + if (MB_CUR_MAX == 1 || dfa_using_utf8 ()) goto success; if (mb_start < beg) mb_start = beg; diff --git a/src/grep.c b/src/grep.c index a82da61..bd1c5cc 100644 --- a/src/grep.c +++ b/src/grep.c @@ -2351,6 +2351,8 @@ main (int argc, char **argv) textdomain (PACKAGE); #endif + dfa_init (); + atexit (clean_up_stdout); last_recursive = 0; diff --git a/src/kwsearch.c b/src/kwsearch.c index d2afa40..fb77280 100644 --- a/src/kwsearch.c +++ b/src/kwsearch.c @@ -93,7 +93,7 @@ Fexecute (char *buf, size_t size, size_t *match_size, mb_check = longest = false; else { - mb_check = MB_CUR_MAX > 1 && !using_utf8 (); + mb_check = MB_CUR_MAX > 1 && !dfa_using_utf8 (); longest = mb_check || start_ptr || match_words; } diff --git a/src/pcresearch.c b/src/pcresearch.c index f6e72b0..3f76603 100644 --- a/src/pcresearch.c +++ b/src/pcresearch.c @@ -114,7 +114,7 @@ Pcompile (char const *pattern, size_t size) if (1 < MB_CUR_MAX) { - if (! using_utf8 ()) + if (! dfa_using_utf8 ()) error (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales")); multibyte_locale = true; diff --git a/tests/dfa-match-aux.c b/tests/dfa-match-aux.c index 25b0535..e651735 100644 --- a/tests/dfa-match-aux.c +++ b/tests/dfa-match-aux.c @@ -54,6 +54,8 @@ main (int argc, char **argv) setlocale (LC_ALL, ""); + dfa_init (); + dfa = dfaalloc (); dfasyntax (dfa, RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0, '\n'); dfacomp (argv[1], strlen (argv[1]), dfa, 0); -- 2.8.1