On 04/18/2016 08:05 AM, Paul Eggert wrote:
I'm afraid someone with access to MirBSD will need to debug it.
On second thought there is a simpler fix: stop using btowc. I installed the attached patch, which is a good idea anyway. By using only mbrtowc (which we need to use anyway), it avoids problems on misconfigured systems like MirOS BSD where btowc disagrees with mbrtowc.
After writing and debugging this patch, I looked at Gawk and noticed that it already has its own equivalent of this patch's new mbrtowc_cache variable. Gawk obtains its cache via btowc; although this doesn't work on MirOS BSD due to its buggy btowc, Arnold says he's not worried about MirOS BSD any more, which is quite understandable. Still, it's a bit odd to have two caches in Gawk that do the same thing; perhaps we can unify them at some point.
From c1db4a618b21fd6e3cedb3f6817ade5f47a62d26 Mon Sep 17 00:00:00 2001 From: Paul Eggert <eggert@cs.ucla.edu> Date: Tue, 19 Apr 2016 08:54:32 -0700 Subject: [PATCH] dfa: remove dependency on btowc MirOS BSD btowc is a macro that (when GCC is being used) hardcodes btowc (0x80) == WEOF regardless of locale, which contradicts future POSIX in the C locale. Instead of bothering to develop a Gnulib workaround for the btowc incompatibility, use mbrtowc, which we are using elsewhere and fixing anyway, and are caching so it is fast here. Problem reported by Nelson H. F. Beebe via Jim Meyering in: http://bugs.gnu.org/23269#14 * bootstrap.conf (gnulib_modules): Remove btowc. * src/dfa.c (struct dfa): Remove mbrtowc_cache member, replacing with ... (mbrtowc_cache): ... this new static var. All uses changed. (dfambcache): Remove; now done by setsyntax. Call removed. (is_valid_unibyte_character): Remove. (IS_WORD_CONSTITUENT): Remove this macro, replacing it with ... (unibyte_word_constituent): ... this new function. It uses mbrtowc_cache rather than btowc. (dfasyntax): Initialize mbrtowc_cache before using it. --- bootstrap.conf | 1 - src/dfa.c | 70 ++++++++++++++++++++++------------------------------------ 2 files changed, 27 insertions(+), 44 deletions(-) diff --git a/bootstrap.conf b/bootstrap.conf index 3bff7c3..9e76131 100644 --- a/bootstrap.conf +++ b/bootstrap.conf @@ -27,7 +27,6 @@ alloca announce-gen argmatch binary-io -btowc c-ctype closeout do-release-commit-and-tag diff --git a/src/dfa.c b/src/dfa.c index adc5de3..98ee4ac 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -350,11 +350,6 @@ struct dfa */ int *multibyte_prop; - /* A table indexed by byte values that contains the corresponding wide - character (if any) for that byte. WEOF means the byte is not a - valid single-byte character. */ - wint_t mbrtowc_cache[NOTCHAR]; - /* Array of the bracket expression in the DFA. 
*/ struct mb_char_classes *mbcsets; size_t nmbcsets; @@ -431,19 +426,10 @@ struct dfa static void regexp (void); -static void -dfambcache (struct dfa *d) -{ - int i; - for (i = CHAR_MIN; i <= CHAR_MAX; ++i) - { - char c = i; - unsigned char uc = i; - mbstate_t s = { 0 }; - wchar_t wc; - d->mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF; - } -} +/* A table indexed by byte values that contains the corresponding wide + character (if any) for that byte. WEOF means the byte is not a + valid single-byte character. */ +static wint_t mbrtowc_cache[NOTCHAR]; /* Store into *PWC the result of converting the leading bytes of the multibyte buffer S of length N bytes, using the mbrtowc_cache in *D @@ -466,7 +452,7 @@ static size_t mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d) { unsigned char uc = s[0]; - wint_t wc = d->mbrtowc_cache[uc]; + wint_t wc = mbrtowc_cache[uc]; if (wc == WEOF) { @@ -671,25 +657,18 @@ static charclass letters; /* Set of characters that are newline. */ static charclass newline; -/* Add this to the test for whether a byte is word-constituent, since on - BSD-based systems, many values in the 128..255 range are classified as - alphabetic, while on glibc-based systems, they are not. */ -#ifdef __GLIBC__ -# define is_valid_unibyte_character(c) 1 -#else -# define is_valid_unibyte_character(c) (btowc (c) != WEOF) -#endif - -/* C is a "word-constituent" byte. 
*/ -#define IS_WORD_CONSTITUENT(C) \ - (is_valid_unibyte_character (C) && (isalnum (C) || (C) == '_')) +static bool +unibyte_word_constituent (unsigned char c) +{ + return mbrtowc_cache[c] != WEOF && (isalnum (c) || (c) == '_'); +} static int char_context (unsigned char c) { if (c == eolbyte) return CTX_NEWLINE; - if (IS_WORD_CONSTITUENT (c)) + if (unibyte_word_constituent (c)) return CTX_LETTER; return CTX_NONE; } @@ -708,23 +687,29 @@ wchar_context (wint_t wc) void dfasyntax (reg_syntax_t bits, int fold, unsigned char eol) { - unsigned int i; - + int i; syntax_bits_set = 1; syntax_bits = bits; case_fold = fold != 0; eolbyte = eol; - for (i = 0; i < NOTCHAR; ++i) + for (i = CHAR_MIN; i <= CHAR_MAX; ++i) { - sbit[i] = char_context (i); - switch (sbit[i]) + char c = i; + unsigned char uc = i; + mbstate_t s = { 0 }; + wchar_t wc; + mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF; + + /* Now that mbrtowc_cache[uc] is set, use it to calculate sbit. */ + sbit[uc] = char_context (uc); + switch (sbit[uc]) { case CTX_LETTER: - setbit (i, letters); + setbit (uc, letters); break; case CTX_NEWLINE: - setbit (i, newline); + setbit (uc, newline); break; } } @@ -1489,7 +1474,7 @@ lex (void) { zeroset (ccl); for (c2 = 0; c2 < NOTCHAR; ++c2) - if (IS_WORD_CONSTITUENT (c2)) + if (unibyte_word_constituent (c2)) setbit (c2, ccl); if (c == 'W') notset (ccl); @@ -2714,7 +2699,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) state_letter = state; for (i = 0; i < NOTCHAR; ++i) - trans[i] = (IS_WORD_CONSTITUENT (i)) ? state_letter : state; + trans[i] = unibyte_word_constituent (i) ? 
state_letter : state; trans[eolbyte] = state_newline; } else @@ -2820,7 +2805,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) if (c == eolbyte) trans[c] = state_newline; - else if (IS_WORD_CONSTITUENT (c)) + else if (unibyte_word_constituent (c)) trans[c] = state_letter; else if (c < NOTCHAR) trans[c] = state; @@ -3626,7 +3611,6 @@ void dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) { dfainit (d); - dfambcache (d); dfaparse (s, len, d); dfassbuild (d); -- 2.5.5