On 04/18/2016 08:05 AM, Paul Eggert wrote:
I'm afraid someone with access to MirBSD will need to debug it.

On second thought there is a simpler fix: stop using btowc. I installed the attached patch, which is a good idea anyway. By using only mbrtowc (which we need to use anyway), it avoids problems on misconfigured systems like MirOS BSD where btowc disagrees with mbrtowc.

After writing and debugging this patch I looked at Gawk and noticed that it already has its own equivalent of this patch's new mbrtowc_cache variable. Gawk obtains its cache via btowc; although this doesn't work on MirOS BSD due to its buggy btowc, Arnold says he's not worried about MirOS BSD any more, which is quite understandable. Still, it's a bit odd to have two caches in Gawk that do the same thing; perhaps we can unify them at some point.
From c1db4a618b21fd6e3cedb3f6817ade5f47a62d26 Mon Sep 17 00:00:00 2001
From: Paul Eggert <eggert@cs.ucla.edu>
Date: Tue, 19 Apr 2016 08:54:32 -0700
Subject: [PATCH] dfa: remove dependency on btowc

MirOS BSD btowc is a macro that (when GCC is being used) hardcodes
btowc (0x80) == WEOF regardless of locale, which contradicts
future POSIX in the C locale.  Instead of bothering to develop a
Gnulib workaround for the btowc incompatibility, use mbrtowc,
which we are using elsewhere and fixing anyway, and are caching so
it is fast here.  Problem reported by Nelson H. F. Beebe via Jim
Meyering in: http://bugs.gnu.org/23269#14
* bootstrap.conf (gnulib_modules): Remove btowc.
* src/dfa.c (struct dfa): Remove mbrtowc_cache member, replacing with ...
(mbrtowc_cache): ... this new static var.  All uses changed.
(dfambcache): Remove; now done by setsyntax.  Call removed.
(is_valid_unibyte_character): Remove.
(IS_WORD_CONSTITUENT): Remove this macro, replacing it with ...
(unibyte_word_constituent): ... this new function.  It uses
mbrtowc_cache rather than btowc.
(dfasyntax): Initialize mbrtowc_cache before using it.
---
 bootstrap.conf |  1 -
 src/dfa.c      | 70 ++++++++++++++++++++++------------------------------------
 2 files changed, 27 insertions(+), 44 deletions(-)

diff --git a/bootstrap.conf b/bootstrap.conf
index 3bff7c3..9e76131 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -27,7 +27,6 @@ alloca
 announce-gen
 argmatch
 binary-io
-btowc
 c-ctype
 closeout
 do-release-commit-and-tag
diff --git a/src/dfa.c b/src/dfa.c
index adc5de3..98ee4ac 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -350,11 +350,6 @@ struct dfa
    */
   int *multibyte_prop;
 
-  /* A table indexed by byte values that contains the corresponding wide
-     character (if any) for that byte.  WEOF means the byte is not a
-     valid single-byte character.  */
-  wint_t mbrtowc_cache[NOTCHAR];
-
   /* Array of the bracket expression in the DFA.  */
   struct mb_char_classes *mbcsets;
   size_t nmbcsets;
@@ -431,19 +426,10 @@ struct dfa
 
 static void regexp (void);
 
-static void
-dfambcache (struct dfa *d)
-{
-  int i;
-  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
-    {
-      char c = i;
-      unsigned char uc = i;
-      mbstate_t s = { 0 };
-      wchar_t wc;
-      d->mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF;
-    }
-}
+/* A table indexed by byte values that contains the corresponding wide
+   character (if any) for that byte.  WEOF means the byte is not a
+   valid single-byte character.  */
+static wint_t mbrtowc_cache[NOTCHAR];
 
 /* Store into *PWC the result of converting the leading bytes of the
    multibyte buffer S of length N bytes, using the mbrtowc_cache in *D
@@ -466,7 +452,7 @@ static size_t
 mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
 {
   unsigned char uc = s[0];
-  wint_t wc = d->mbrtowc_cache[uc];
+  wint_t wc = mbrtowc_cache[uc];
 
   if (wc == WEOF)
     {
@@ -671,25 +657,18 @@ static charclass letters;
 /* Set of characters that are newline.  */
 static charclass newline;
 
-/* Add this to the test for whether a byte is word-constituent, since on
-   BSD-based systems, many values in the 128..255 range are classified as
-   alphabetic, while on glibc-based systems, they are not.  */
-#ifdef __GLIBC__
-# define is_valid_unibyte_character(c) 1
-#else
-# define is_valid_unibyte_character(c) (btowc (c) != WEOF)
-#endif
-
-/* C is a "word-constituent" byte.  */
-#define IS_WORD_CONSTITUENT(C) \
-  (is_valid_unibyte_character (C) && (isalnum (C) || (C) == '_'))
+static bool
+unibyte_word_constituent (unsigned char c)
+{
+  return mbrtowc_cache[c] != WEOF && (isalnum (c) || (c) == '_');
+}
 
 static int
 char_context (unsigned char c)
 {
   if (c == eolbyte)
     return CTX_NEWLINE;
-  if (IS_WORD_CONSTITUENT (c))
+  if (unibyte_word_constituent (c))
     return CTX_LETTER;
   return CTX_NONE;
 }
@@ -708,23 +687,29 @@ wchar_context (wint_t wc)
 void
 dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
 {
-  unsigned int i;
-
+  int i;
   syntax_bits_set = 1;
   syntax_bits = bits;
   case_fold = fold != 0;
   eolbyte = eol;
 
-  for (i = 0; i < NOTCHAR; ++i)
+  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
     {
-      sbit[i] = char_context (i);
-      switch (sbit[i])
+      char c = i;
+      unsigned char uc = i;
+      mbstate_t s = { 0 };
+      wchar_t wc;
+      mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF;
+
+      /* Now that mbrtowc_cache[uc] is set, use it to calculate sbit.  */
+      sbit[uc] = char_context (uc);
+      switch (sbit[uc])
         {
         case CTX_LETTER:
-          setbit (i, letters);
+          setbit (uc, letters);
           break;
         case CTX_NEWLINE:
-          setbit (i, newline);
+          setbit (uc, newline);
           break;
         }
     }
@@ -1489,7 +1474,7 @@ lex (void)
             {
               zeroset (ccl);
               for (c2 = 0; c2 < NOTCHAR; ++c2)
-                if (IS_WORD_CONSTITUENT (c2))
+                if (unibyte_word_constituent (c2))
                   setbit (c2, ccl);
               if (c == 'W')
                 notset (ccl);
@@ -2714,7 +2699,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         state_letter = state;
 
       for (i = 0; i < NOTCHAR; ++i)
-        trans[i] = (IS_WORD_CONSTITUENT (i)) ? state_letter : state;
+        trans[i] = unibyte_word_constituent (i) ? state_letter : state;
       trans[eolbyte] = state_newline;
     }
   else
@@ -2820,7 +2805,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
 
               if (c == eolbyte)
                 trans[c] = state_newline;
-              else if (IS_WORD_CONSTITUENT (c))
+              else if (unibyte_word_constituent (c))
                 trans[c] = state_letter;
               else if (c < NOTCHAR)
                 trans[c] = state;
@@ -3626,7 +3611,6 @@ void
 dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
 {
   dfainit (d);
-  dfambcache (d);
   dfaparse (s, len, d);
   dfassbuild (d);
 
-- 
2.5.5

Reply via email to