On Sun, Apr 10, 2016 at 1:43 AM, Paul Eggert <egg...@cs.ucla.edu> wrote:
> Paul Eggert wrote:
>>
>> I plan to change GNU grep to use this new facility, and to add some grep
>> test cases for this issue.
>
>
> I did that by installing the attached patches into the grep master. This
> fixes the bug for me, so I'm closing the bug report.
>
> These patches mostly just report the fix and add test cases. The actual fix
> was in gnulib, here:
>
> http://git.savannah.gnu.org/cgit/gnulib.git/commit/?id=b7bc3c1a4e78add4cbad39ae1a0c4fb0747b483f
>
> This gnulib fix works around the underlying glibc facility which caused the
> problem, for which I've filed a bug report here:
>
> https://sourceware.org/bugzilla/show_bug.cgi?id=19932
>
> It's not clear when the glibc bug will be fixed. Until it is, one should
> expect similar problems to crop up in applications other than 'grep'.

Thanks for the fine work, Paul.
With this fix, I would like to make yet another grep release.
Does anyone have any pending changes we should consider?

Incidentally, looking at mbrtowc uses, I found an unused
function and removed it with this patch:
From 704de8725fa9df80b0cb74305273acf5dde0b1d7 Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyer...@fb.com>
Date: Sun, 10 Apr 2016 12:36:18 -0700
Subject: [PATCH] maint: remove unused mbtoupper function

* src/searchutils.c (mbtoupper): Remove now-unused function.
Also remove inclusion of <assert.h>, since this change removed
the final use of assert.
* src/search.h (mbtoupper): Remove declaration.
---
 src/search.h      |   1 -
 src/searchutils.c | 165 ------------------------------------------------------
 2 files changed, 166 deletions(-)

diff --git a/src/search.h b/src/search.h
index a69bf19..7dc1940 100644
--- a/src/search.h
+++ b/src/search.h
@@ -48,7 +48,6 @@ typedef signed char mb_len_map_t;
 /* searchutils.c */
 extern void kwsinit (kwset_t *);

-extern char *mbtoupper (char const *, size_t *, mb_len_map_t **);
 extern void build_mbclen_cache (void);
 extern size_t mbclen_cache[];
 extern ptrdiff_t mb_goback (char const **, char const *, char const *);
diff --git a/src/searchutils.c b/src/searchutils.c
index 2eab3dc..1f21a0e 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -22,8 +22,6 @@
 #define SYSTEM_INLINE _GL_EXTERN_INLINE
 #include "search.h"

-#include <assert.h>
-
 #define NCHAR (UCHAR_MAX + 1)

 size_t mbclen_cache[NCHAR];
@@ -48,169 +46,6 @@ kwsinit (kwset_t *kwset)
     xalloc_die ();
 }

-/* Convert BEG, an *N-byte string, to uppercase, and write the
-   NUL-terminated result into malloc'd storage.  Upon success, set *N
-   to the length (in bytes) of the resulting string (not including the
-   trailing NUL byte), and return a pointer to the uppercase string.
-   Upon memory allocation failure, exit.  *N must be positive.
-
-   Although this function returns a pointer to malloc'd storage,
-   the caller must not free it, since this function retains a pointer
-   to the buffer and reuses it on any subsequent call.  As a consequence,
-   this function is not thread-safe.
-
-   When each character in the uppercase result string has the same length
-   as the corresponding character in the input string, set *LEN_MAP_P
-   to NULL.  Otherwise, set it to a malloc'd buffer (like the returned
-   buffer, this must not be freed by caller) of the same length as the
-   result string.  (*LEN_MAP_P)[J] is the change in byte-length of the
-   character in BEG that formed byte J of the result as it was converted to
-   uppercase.  It is usually zero.  For lowercase Turkish dotless I it
-   is -1, since the lowercase input occupies two bytes, while the
-   uppercase output occupies only one byte.  For lowercase I in the
-   tr_TR.utf8 locale, it is 1 because the uppercase Turkish dotted I
-   is one byte longer than the original.  When that happens, we have two
-   or more slots in *LEN_MAP_P for each such character.  We store the
-   difference in the first one and 0's in any remaining slots.
-
-   This map is used by the caller to convert offset,length pairs that
-   reference the uppercase result to numbers that refer to the matched
-   part of the original buffer.  */
-
-char *
-mbtoupper (const char *beg, size_t *n, mb_len_map_t **len_map_p)
-{
-  static char *out;
-  static mb_len_map_t *len_map;
-  static size_t outalloc;
-  size_t outlen, mb_cur_max;
-  mbstate_t is, os;
-  const char *end;
-  char *p;
-  mb_len_map_t *m;
-  bool lengths_differ = false;
-
-  if (*n > outalloc || outalloc == 0)
-    {
-      outalloc = MAX (1, *n);
-      out = xrealloc (out, outalloc);
-      len_map = xrealloc (len_map, outalloc);
-    }
-
-  /* appease clang-2.6 */
-  assert (out);
-  assert (len_map);
-  if (*n == 0)
-    return out;
-
-  memset (&is, 0, sizeof (is));
-  memset (&os, 0, sizeof (os));
-  end = beg + *n;
-
-  mb_cur_max = MB_CUR_MAX;
-  p = out;
-  m = len_map;
-  outlen = 0;
-  while (beg < end)
-    {
-      wchar_t wc;
-      size_t mbclen = mbrtowc (&wc, beg, end - beg, &is);
-#ifdef __CYGWIN__
-      /* Handle a UTF-8 sequence for a character beyond the base plane.
-         Cygwin's wchar_t is UTF-16, as in the underlying OS.  This
-         results in surrogate pairs which need some extra attention.  */
-      wint_t wci = 0;
-      if (mbclen == 3 && (wc & 0xdc00) == 0xd800)
-        {
-          /* We got the start of a 4 byte UTF-8 sequence.  This is returned
-             as a UTF-16 surrogate pair.  The first call to mbrtowc returned 3
-             and wc has been set to a high surrogate value, now we're going
-             to fetch the matching low surrogate.  This second call to mbrtowc
-             is supposed to return 1 to complete the 4 byte UTF-8 sequence.  */
-          wchar_t wc_2;
-          size_t mbclen_2 = mbrtowc (&wc_2, beg + mbclen, end - beg - mbclen,
-                                     &is);
-          if (mbclen_2 == 1 && (wc_2 & 0xdc00) == 0xdc00)
-            {
-              /* Match.  Convert this to a 4 byte wint_t which constitutes
-                 a 32-bit UTF-32 value.  */
-              wci = ( (((wint_t) (wc - 0xd800)) << 10)
-                     | ((wint_t) (wc_2 - 0xdc00)))
-                    + 0x10000;
-              ++mbclen;
-            }
-          else
-            {
-              /* Invalid UTF-8 sequence.  */
-              mbclen = (size_t) -1;
-            }
-        }
-#endif
-      if (outlen + mb_cur_max >= outalloc)
-        {
-          size_t dm = m - len_map;
-          out = x2nrealloc (out, &outalloc, 1);
-          len_map = xrealloc (len_map, outalloc);
-          p = out + outlen;
-          m = len_map + dm;
-        }
-
-      if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
-        {
-          /* An invalid sequence, or a truncated multi-octet character.
-             We treat it as a single-octet character.  */
-          *m++ = 0;
-          *p++ = *beg++;
-          outlen++;
-          memset (&is, 0, sizeof (is));
-          memset (&os, 0, sizeof (os));
-        }
-      else
-        {
-          size_t ombclen;
-          beg += mbclen;
-#ifdef __CYGWIN__
-          /* Handle Unicode characters beyond the base plane.  */
-          if (mbclen == 4)
-            {
-              /* towupper, taking wint_t (4 bytes), handles UCS-4 values.  */
-              wci = towupper (wci);
-              if (wci >= 0x10000)
-                {
-                  wci -= 0x10000;
-                  wc = (wci >> 10) | 0xd800;
-                  /* No need to check the return value.  When reading the
-                     high surrogate, the return value will be 0 and only the
-                     mbstate indicates that we're in the middle of reading a
-                     surrogate pair.  The next wcrtomb call reading the low
-                     surrogate will then return 4 and reset the mbstate.  */
-                  wcrtomb (p, wc, &os);
-                  wc = (wci & 0x3ff) | 0xdc00;
-                }
-              else
-                {
-                  wc = (wchar_t) wci;
-                }
-              ombclen = wcrtomb (p, wc, &os);
-            }
-          else
-#endif
-          ombclen = wcrtomb (p, towupper (wc), &os);
-          *m = mbclen - ombclen;
-          memset (m + 1, 0, ombclen - 1);
-          m += ombclen;
-          p += ombclen;
-          outlen += ombclen;
-          lengths_differ |= (mbclen != ombclen);
-        }
-    }
-
-  *len_map_p = lengths_differ ? len_map : NULL;
-  *n = p - out;
-  *p = 0;
-  return out;
-}
-
 /* Initialize a cache of mbrlen values for each of its 1-byte inputs.  */
 void
 build_mbclen_cache (void)
-- 
2.5.5

Reply via email to