The mbrtoc32 function, newly added in Cygwin 3.5.0, is buggy.

These two patches provide a workaround (at the cost of deactivating
the GB18030 locale support in Cygwin), and add two test cases.


2024-05-23  Bruno Haible  <br...@clisp.org>

        mbrtoc32: Strengthen tests.
        * tests/test-mbrtoc32.c (main): Add tests for one-by-one input in the
        UTF-8 and GB18030 encodings.

        mbrtoc32: Work around bug in Cygwin 3.5.3.
        * m4/mbrtoc32.m4 (gl_MBRTOC32_UTF8_LOCALE): New macro.
        (gl_FUNC_MBRTOC32): Invoke it. If mbrtoc32 has this bug, define
        MBRTOC32_MULTIBYTE_LOCALE_BUG and reset LOCALE_ZH_CN to 'none'.
        * lib/mbrtoc32.c (mbrtoc32): Test MBRTOC32_MULTIBYTE_LOCALE_BUG.
        * doc/posix-functions/mbrtoc32.texi: Mention the Cygwin bug.

>From 2b8bee901a7d508a6eb2c7e3d173cae6ee4b60b2 Mon Sep 17 00:00:00 2001
From: Bruno Haible <br...@clisp.org>
Date: Thu, 23 May 2024 23:46:52 +0200
Subject: [PATCH 1/2] mbrtoc32: Work around bug in Cygwin 3.5.3.

* m4/mbrtoc32.m4 (gl_MBRTOC32_UTF8_LOCALE): New macro.
(gl_FUNC_MBRTOC32): Invoke it. If mbrtoc32 has this bug, define
MBRTOC32_MULTIBYTE_LOCALE_BUG and reset LOCALE_ZH_CN to 'none'.
* lib/mbrtoc32.c (mbrtoc32): Test MBRTOC32_MULTIBYTE_LOCALE_BUG.
* doc/posix-functions/mbrtoc32.texi: Mention the Cygwin bug.
---
 ChangeLog                         |  9 +++++
 doc/posix-functions/mbrtoc32.texi |  6 ++++
 lib/mbrtoc32.c                    |  2 +-
 m4/mbrtoc32.m4                    | 59 ++++++++++++++++++++++++++++++-
 4 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index ea0e3fa766..cfe2da79ae 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2024-05-23  Bruno Haible  <br...@clisp.org>
+
+	mbrtoc32: Work around bug in Cygwin 3.5.3.
+	* m4/mbrtoc32.m4 (gl_MBRTOC32_UTF8_LOCALE): New macro.
+	(gl_FUNC_MBRTOC32): Invoke it. If mbrtoc32 has this bug, define
+	MBRTOC32_MULTIBYTE_LOCALE_BUG and reset LOCALE_ZH_CN to 'none'.
+	* lib/mbrtoc32.c (mbrtoc32): Test MBRTOC32_MULTIBYTE_LOCALE_BUG.
+	* doc/posix-functions/mbrtoc32.texi: Mention the Cygwin bug.
+
 2024-05-23  Bruno Haible  <br...@clisp.org>
 
 	sethostname tests: Avoid test failure on Cygwin.
diff --git a/doc/posix-functions/mbrtoc32.texi b/doc/posix-functions/mbrtoc32.texi
index 0e95457eaa..bc1e89fc77 100644
--- a/doc/posix-functions/mbrtoc32.texi
+++ b/doc/posix-functions/mbrtoc32.texi
@@ -33,6 +33,12 @@
 Solaris 11.4, mingw, MSVC 14.
 @c For MSVC this is because it assumes that the input is always UTF-8 encoded.
 @c See https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/mbrtoc16-mbrtoc323
+@item
+This function does not work when it's fed the input bytes one-by-one
+on some platforms:
+@c https://cygwin.com/pipermail/cygwin/2024-May/255989.html
+@c https://cygwin.com/pipermail/cygwin/2024-May/255990.html
+Cygwin 3.5.3.
 @end itemize
 
 Portability problems fixed by Gnulib module @code{mbrtoc32-regular}:
diff --git a/lib/mbrtoc32.c b/lib/mbrtoc32.c
index 56e4a86011..d97912876c 100644
--- a/lib/mbrtoc32.c
+++ b/lib/mbrtoc32.c
@@ -117,7 +117,7 @@ mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps)
   if (ps == NULL)
     ps = &internal_state;
 
-# if HAVE_WORKING_MBRTOC32
+# if HAVE_WORKING_MBRTOC32 && !MBRTOC32_MULTIBYTE_LOCALE_BUG
   /* mbrtoc32() may produce different values for wc than mbrtowc().  Therefore
      use mbrtoc32().  */
 
diff --git a/m4/mbrtoc32.m4 b/m4/mbrtoc32.m4
index 6303ec4601..1f1d91cd72 100644
--- a/m4/mbrtoc32.m4
+++ b/m4/mbrtoc32.m4
@@ -1,5 +1,5 @@
 # mbrtoc32.m4
-# serial 18
+# serial 19
 dnl Copyright (C) 2014-2024 Free Software Foundation, Inc.
 dnl This file is free software; the Free Software Foundation
 dnl gives unlimited permission to copy and/or distribute it,
@@ -27,6 +27,7 @@ AC_DEFUN([gl_FUNC_MBRTOC32]
     else
       gl_MBRTOC32_EMPTY_INPUT
       gl_MBRTOC32_C_LOCALE
+      gl_MBRTOC32_UTF8_LOCALE
       case "$gl_cv_func_mbrtoc32_empty_input" in
         *yes) ;;
         *) AC_DEFINE([MBRTOC32_EMPTY_INPUT_BUG], [1],
@@ -41,6 +42,15 @@ AC_DEFUN([gl_FUNC_MBRTOC32]
            REPLACE_MBRTOC32=1
            ;;
       esac
+      case "$gl_cv_func_mbrtoc32_utf8_locale_works" in
+        *yes) ;;
+        *) AC_DEFINE([MBRTOC32_MULTIBYTE_LOCALE_BUG], [1],
+             [Define if the mbrtoc32 function does not accept the input bytes one-by-one.])
+           REPLACE_MBRTOC32=1
+           dnl Our replacement mbrtoc32 can handle UTF-8, but not GB18030.
+           LOCALE_ZH_CN=none
+           ;;
+      esac
     fi
     if test $HAVE_WORKING_MBRTOC32 = 0; then
       REPLACE_MBRTOC32=1
@@ -163,6 +173,53 @@ AC_DEFUN([gl_MBRTOC32_C_LOCALE]
     ])
 ])
 
+dnl Test whether mbrtoc32 works when it's fed the bytes one-by-one in an UTF-8
+dnl locale.
+
+AC_DEFUN([gl_MBRTOC32_UTF8_LOCALE],
+[
+  AC_REQUIRE([AC_CANONICAL_HOST]) dnl for cross-compiles
+  AC_CACHE_CHECK([whether mbrtoc32 works in an UTF-8 locale],
+    [gl_cv_func_mbrtoc32_utf8_locale_works],
+    [AC_RUN_IFELSE(
+       [AC_LANG_PROGRAM(
+          [[#include <locale.h>
+            #ifdef __HAIKU__
+             #include <stdint.h>
+            #endif
+            #include <uchar.h>
+          ]], [[
+            char *locale = setlocale (LC_ALL, "en_US.UTF-8");
+            if (locale)
+              {
+                /* This test fails on Cygwin 3.5.3.  */
+                mbstate_t state = { 0, };
+                char32_t uc = 0xDEADBEEF;
+                /* \360\237\220\203 = U+0001F403 */
+                if (mbrtoc32 (&uc, "\360", 1, &state) != (size_t)-2)
+                  return 1;
+                if (mbrtoc32 (&uc, "\237", 1, &state) != (size_t)-2)
+                  return 2;
+                if (mbrtoc32 (&uc, "\220", 1, &state) != (size_t)-2)
+                  return 3;
+                if (mbrtoc32 (&uc, "\203", 1, &state) != 1)
+                  return 4;
+                if (uc != 0x0001F403)
+                  return 5;
+              }
+            return 0;
+          ]])],
+       [gl_cv_func_mbrtoc32_utf8_locale_works=yes],
+       [gl_cv_func_mbrtoc32_utf8_locale_works=no],
+       [case "$host_os" in
+                   # Guess no on Cygwin.
+          cygwin*) gl_cv_func_mbrtoc32_utf8_locale_works="guessing no" ;;
+          *)       gl_cv_func_mbrtoc32_utf8_locale_works="$gl_cross_guess_normal" ;;
+        esac
+       ])
+    ])
+])
+
 dnl Test whether mbrtoc32 works not worse than mbrtowc.
 dnl Result is HAVE_WORKING_MBRTOC32.
 
-- 
2.34.1

>From 765ab42d8baeb2bf9b6ff242809df4c064a576bf Mon Sep 17 00:00:00 2001
From: Bruno Haible <br...@clisp.org>
Date: Thu, 23 May 2024 23:58:06 +0200
Subject: [PATCH 2/2] mbrtoc32: Strengthen tests.

* tests/test-mbrtoc32.c (main): Add tests for one-by-one input in the
UTF-8 and GB18030 encodings.
---
 ChangeLog             |  4 ++++
 tests/test-mbrtoc32.c | 54 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index cfe2da79ae..25e150a59a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
 2024-05-23  Bruno Haible  <br...@clisp.org>
 
+	mbrtoc32: Strengthen tests.
+	* tests/test-mbrtoc32.c (main): Add tests for one-by-one input in the
+	UTF-8 and GB18030 encodings.
+
 	mbrtoc32: Work around bug in Cygwin 3.5.3.
 	* m4/mbrtoc32.m4 (gl_MBRTOC32_UTF8_LOCALE): New macro.
 	(gl_FUNC_MBRTOC32): Invoke it. If mbrtoc32 has this bug, define
diff --git a/tests/test-mbrtoc32.c b/tests/test-mbrtoc32.c
index 83312ee4f7..68c601e2e5 100644
--- a/tests/test-mbrtoc32.c
+++ b/tests/test-mbrtoc32.c
@@ -291,6 +291,33 @@ main (int argc, char *argv[])
           ASSERT (wc == '!');
           ASSERT (mbsinit (&state));
         }
+        { /* \360\237\220\203 = U+0001F403 */
+          memset (&state, '\0', sizeof (mbstate_t));
+
+          wc = (char32_t) 0xBADFACE;
+          ret = mbrtoc32 (&wc, "\360", 1, &state);
+          ASSERT (ret == (size_t)(-2));
+          ASSERT (wc == (char32_t) 0xBADFACE);
+          ASSERT (!mbsinit (&state));
+
+          wc = (char32_t) 0xBADFACE;
+          ret = mbrtoc32 (&wc, "\237", 1, &state);
+          ASSERT (ret == (size_t)(-2));
+          ASSERT (wc == (char32_t) 0xBADFACE);
+          ASSERT (!mbsinit (&state));
+
+          wc = (char32_t) 0xBADFACE;
+          ret = mbrtoc32 (&wc, "\220", 1, &state);
+          ASSERT (ret == (size_t)(-2));
+          ASSERT (wc == (char32_t) 0xBADFACE);
+          ASSERT (!mbsinit (&state));
+
+          wc = (char32_t) 0xBADFACE;
+          ret = mbrtoc32 (&wc, "\203", 1, &state);
+          ASSERT (ret == 1);
+          ASSERT (wc == 0x0001F403);
+          ASSERT (mbsinit (&state));
+        }
         return test_exit_status;
 
       case '4':
@@ -434,6 +461,33 @@ main (int argc, char *argv[])
           ASSERT (wc == '!');
           ASSERT (mbsinit (&state));
         }
+        { /* \224\071\311\067 = U+0001F403 */
+          memset (&state, '\0', sizeof (mbstate_t));
+
+          wc = (char32_t) 0xBADFACE;
+          ret = mbrtoc32 (&wc, "\224", 1, &state);
+          ASSERT (ret == (size_t)(-2));
+          ASSERT (wc == (char32_t) 0xBADFACE);
+          ASSERT (!mbsinit (&state));
+
+          wc = (char32_t) 0xBADFACE;
+          ret = mbrtoc32 (&wc, "\071", 1, &state);
+          ASSERT (ret == (size_t)(-2));
+          ASSERT (wc == (char32_t) 0xBADFACE);
+          ASSERT (!mbsinit (&state));
+
+          wc = (char32_t) 0xBADFACE;
+          ret = mbrtoc32 (&wc, "\311", 1, &state);
+          ASSERT (ret == (size_t)(-2));
+          ASSERT (wc == (char32_t) 0xBADFACE);
+          ASSERT (!mbsinit (&state));
+
+          wc = (char32_t) 0xBADFACE;
+          ret = mbrtoc32 (&wc, "\067", 1, &state);
+          ASSERT (ret == 1);
+          ASSERT (wc == 0x0001F403);
+          ASSERT (mbsinit (&state));
+        }
         return test_exit_status;
       }
 
-- 
2.34.1

Reply via email to