The mbrtoc32 function, newly added in Cygwin 3.5.0, is buggy. These two patches provide a workaround (at the cost of deactivating the GB18030 locale support in Cygwin), and add two test cases.
2024-05-23 Bruno Haible <br...@clisp.org> mbrtoc32: Strengthen tests. * tests/test-mbrtoc32.c (main): Add tests for one-by-one input in the UTF-8 and GB18030 encodings. mbrtoc32: Work around bug in Cygwin 3.5.3. * m4/mbrtoc32.m4 (gl_MBRTOC32_UTF8_LOCALE): New macro. (gl_FUNC_MBRTOC32): Invoke it. If mbrtoc32 has this bug, define MBRTOC32_MULTIBYTE_LOCALE_BUG and reset LOCALE_ZH_CN to 'none'. * lib/mbrtoc32.c (mbrtoc32): Test MBRTOC32_MULTIBYTE_LOCALE_BUG. * doc/posix-functions/mbrtoc32.texi: Mention the Cygwin bug.
>From 2b8bee901a7d508a6eb2c7e3d173cae6ee4b60b2 Mon Sep 17 00:00:00 2001 From: Bruno Haible <br...@clisp.org> Date: Thu, 23 May 2024 23:46:52 +0200 Subject: [PATCH 1/2] mbrtoc32: Work around bug in Cygwin 3.5.3. * m4/mbrtoc32.m4 (gl_MBRTOC32_UTF8_LOCALE): New macro. (gl_FUNC_MBRTOC32): Invoke it. If mbrtoc32 has this bug, define MBRTOC32_MULTIBYTE_LOCALE_BUG and reset LOCALE_ZH_CN to 'none'. * lib/mbrtoc32.c (mbrtoc32): Test MBRTOC32_MULTIBYTE_LOCALE_BUG. * doc/posix-functions/mbrtoc32.texi: Mention the Cygwin bug. --- ChangeLog | 9 +++++ doc/posix-functions/mbrtoc32.texi | 6 ++++ lib/mbrtoc32.c | 2 +- m4/mbrtoc32.m4 | 59 ++++++++++++++++++++++++++++++- 4 files changed, 74 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index ea0e3fa766..cfe2da79ae 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +2024-05-23 Bruno Haible <br...@clisp.org> + + mbrtoc32: Work around bug in Cygwin 3.5.3. + * m4/mbrtoc32.m4 (gl_MBRTOC32_UTF8_LOCALE): New macro. + (gl_FUNC_MBRTOC32): Invoke it. If mbrtoc32 has this bug, define + MBRTOC32_MULTIBYTE_LOCALE_BUG and reset LOCALE_ZH_CN to 'none'. + * lib/mbrtoc32.c (mbrtoc32): Test MBRTOC32_MULTIBYTE_LOCALE_BUG. + * doc/posix-functions/mbrtoc32.texi: Mention the Cygwin bug. + 2024-05-23 Bruno Haible <br...@clisp.org> sethostname tests: Avoid test failure on Cygwin. diff --git a/doc/posix-functions/mbrtoc32.texi b/doc/posix-functions/mbrtoc32.texi index 0e95457eaa..bc1e89fc77 100644 --- a/doc/posix-functions/mbrtoc32.texi +++ b/doc/posix-functions/mbrtoc32.texi @@ -33,6 +33,12 @@ Solaris 11.4, mingw, MSVC 14. @c For MSVC this is because it assumes that the input is always UTF-8 encoded. @c See https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/mbrtoc16-mbrtoc323 +@item +This function does not work when it's fed the input bytes one-by-one +on some platforms: +@c https://cygwin.com/pipermail/cygwin/2024-May/255989.html +@c https://cygwin.com/pipermail/cygwin/2024-May/255990.html +Cygwin 3.5.3. 
@end itemize Portability problems fixed by Gnulib module @code{mbrtoc32-regular}: diff --git a/lib/mbrtoc32.c b/lib/mbrtoc32.c index 56e4a86011..d97912876c 100644 --- a/lib/mbrtoc32.c +++ b/lib/mbrtoc32.c @@ -117,7 +117,7 @@ mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps) if (ps == NULL) ps = &internal_state; -# if HAVE_WORKING_MBRTOC32 +# if HAVE_WORKING_MBRTOC32 && !MBRTOC32_MULTIBYTE_LOCALE_BUG /* mbrtoc32() may produce different values for wc than mbrtowc(). Therefore use mbrtoc32(). */ diff --git a/m4/mbrtoc32.m4 b/m4/mbrtoc32.m4 index 6303ec4601..1f1d91cd72 100644 --- a/m4/mbrtoc32.m4 +++ b/m4/mbrtoc32.m4 @@ -1,5 +1,5 @@ # mbrtoc32.m4 -# serial 18 +# serial 19 dnl Copyright (C) 2014-2024 Free Software Foundation, Inc. dnl This file is free software; the Free Software Foundation dnl gives unlimited permission to copy and/or distribute it, @@ -27,6 +27,7 @@ AC_DEFUN([gl_FUNC_MBRTOC32] else gl_MBRTOC32_EMPTY_INPUT gl_MBRTOC32_C_LOCALE + gl_MBRTOC32_UTF8_LOCALE case "$gl_cv_func_mbrtoc32_empty_input" in *yes) ;; *) AC_DEFINE([MBRTOC32_EMPTY_INPUT_BUG], [1], @@ -41,6 +42,15 @@ AC_DEFUN([gl_FUNC_MBRTOC32] REPLACE_MBRTOC32=1 ;; esac + case "$gl_cv_func_mbrtoc32_utf8_locale_works" in + *yes) ;; + *) AC_DEFINE([MBRTOC32_MULTIBYTE_LOCALE_BUG], [1], + [Define if the mbrtoc32 function does not accept the input bytes one-by-one.]) + REPLACE_MBRTOC32=1 + dnl Our replacement mbrtoc32 can handle UTF-8, but not GB18030. + LOCALE_ZH_CN=none + ;; + esac fi if test $HAVE_WORKING_MBRTOC32 = 0; then REPLACE_MBRTOC32=1 @@ -163,6 +173,53 @@ AC_DEFUN([gl_MBRTOC32_C_LOCALE] ]) ]) +dnl Test whether mbrtoc32 works when it's fed the bytes one-by-one in an UTF-8 +dnl locale. 
+ +AC_DEFUN([gl_MBRTOC32_UTF8_LOCALE], +[ + AC_REQUIRE([AC_CANONICAL_HOST]) dnl for cross-compiles + AC_CACHE_CHECK([whether mbrtoc32 works in an UTF-8 locale], + [gl_cv_func_mbrtoc32_utf8_locale_works], + [AC_RUN_IFELSE( + [AC_LANG_PROGRAM( + [[#include <locale.h> + #ifdef __HAIKU__ + #include <stdint.h> + #endif + #include <uchar.h> + ]], [[ + char *locale = setlocale (LC_ALL, "en_US.UTF-8"); + if (locale) + { + /* This test fails on Cygwin 3.5.3. */ + mbstate_t state = { 0, }; + char32_t uc = 0xDEADBEEF; + /* \360\237\220\203 = U+0001F403 */ + if (mbrtoc32 (&uc, "\360", 1, &state) != (size_t)-2) + return 1; + if (mbrtoc32 (&uc, "\237", 1, &state) != (size_t)-2) + return 2; + if (mbrtoc32 (&uc, "\220", 1, &state) != (size_t)-2) + return 3; + if (mbrtoc32 (&uc, "\203", 1, &state) != 1) + return 4; + if (uc != 0x0001F403) + return 5; + } + return 0; + ]])], + [gl_cv_func_mbrtoc32_utf8_locale_works=yes], + [gl_cv_func_mbrtoc32_utf8_locale_works=no], + [case "$host_os" in + # Guess no on Cygwin. + cygwin*) gl_cv_func_mbrtoc32_utf8_locale_works="guessing no" ;; + *) gl_cv_func_mbrtoc32_utf8_locale_works="$gl_cross_guess_normal" ;; + esac + ]) + ]) +]) + dnl Test whether mbrtoc32 works not worse than mbrtowc. dnl Result is HAVE_WORKING_MBRTOC32. -- 2.34.1
>From 765ab42d8baeb2bf9b6ff242809df4c064a576bf Mon Sep 17 00:00:00 2001 From: Bruno Haible <br...@clisp.org> Date: Thu, 23 May 2024 23:58:06 +0200 Subject: [PATCH 2/2] mbrtoc32: Strengthen tests. * tests/test-mbrtoc32.c (main): Add tests for one-by-one input in the UTF-8 and GB18030 encodings. --- ChangeLog | 4 ++++ tests/test-mbrtoc32.c | 54 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/ChangeLog b/ChangeLog index cfe2da79ae..25e150a59a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,9 @@ 2024-05-23 Bruno Haible <br...@clisp.org> + mbrtoc32: Strengthen tests. + * tests/test-mbrtoc32.c (main): Add tests for one-by-one input in the + UTF-8 and GB18030 encodings. + mbrtoc32: Work around bug in Cygwin 3.5.3. * m4/mbrtoc32.m4 (gl_MBRTOC32_UTF8_LOCALE): New macro. (gl_FUNC_MBRTOC32): Invoke it. If mbrtoc32 has this bug, define diff --git a/tests/test-mbrtoc32.c b/tests/test-mbrtoc32.c index 83312ee4f7..68c601e2e5 100644 --- a/tests/test-mbrtoc32.c +++ b/tests/test-mbrtoc32.c @@ -291,6 +291,33 @@ main (int argc, char *argv[]) ASSERT (wc == '!'); ASSERT (mbsinit (&state)); } + { /* \360\237\220\203 = U+0001F403 */ + memset (&state, '\0', sizeof (mbstate_t)); + + wc = (char32_t) 0xBADFACE; + ret = mbrtoc32 (&wc, "\360", 1, &state); + ASSERT (ret == (size_t)(-2)); + ASSERT (wc == (char32_t) 0xBADFACE); + ASSERT (!mbsinit (&state)); + + wc = (char32_t) 0xBADFACE; + ret = mbrtoc32 (&wc, "\237", 1, &state); + ASSERT (ret == (size_t)(-2)); + ASSERT (wc == (char32_t) 0xBADFACE); + ASSERT (!mbsinit (&state)); + + wc = (char32_t) 0xBADFACE; + ret = mbrtoc32 (&wc, "\220", 1, &state); + ASSERT (ret == (size_t)(-2)); + ASSERT (wc == (char32_t) 0xBADFACE); + ASSERT (!mbsinit (&state)); + + wc = (char32_t) 0xBADFACE; + ret = mbrtoc32 (&wc, "\203", 1, &state); + ASSERT (ret == 1); + ASSERT (wc == 0x0001F403); + ASSERT (mbsinit (&state)); + } return test_exit_status; case '4': @@ -434,6 +461,33 @@ main (int argc, char *argv[]) ASSERT (wc == 
'!'); ASSERT (mbsinit (&state)); } + { /* \224\071\311\067 = U+0001F403 */ + memset (&state, '\0', sizeof (mbstate_t)); + + wc = (char32_t) 0xBADFACE; + ret = mbrtoc32 (&wc, "\224", 1, &state); + ASSERT (ret == (size_t)(-2)); + ASSERT (wc == (char32_t) 0xBADFACE); + ASSERT (!mbsinit (&state)); + + wc = (char32_t) 0xBADFACE; + ret = mbrtoc32 (&wc, "\071", 1, &state); + ASSERT (ret == (size_t)(-2)); + ASSERT (wc == (char32_t) 0xBADFACE); + ASSERT (!mbsinit (&state)); + + wc = (char32_t) 0xBADFACE; + ret = mbrtoc32 (&wc, "\311", 1, &state); + ASSERT (ret == (size_t)(-2)); + ASSERT (wc == (char32_t) 0xBADFACE); + ASSERT (!mbsinit (&state)); + + wc = (char32_t) 0xBADFACE; + ret = mbrtoc32 (&wc, "\067", 1, &state); + ASSERT (ret == 1); + ASSERT (wc == 0x0001F403); + ASSERT (mbsinit (&state)); + } return test_exit_status; } -- 2.34.1