On native Windows, in the UTF-8 locale, towlower and towupper don't even support case mappings between ISO-8859-1 characters.
How to reproduce: ======================= foo.c ======================= #include <stdio.h> #include <locale.h> #include <wchar.h> #include <wctype.h> int main (int argc, char *argv[]) { char *loc = setlocale (LC_ALL, "French_France.65001"); if (loc != NULL) printf ("-> %s\n", loc); { wchar_t wc = towlower (0x00C9); printf ("towlower(0x00C9) = 0x%04X\n", (unsigned int) wc); } { wchar_t wc = towlower (0x00E9); printf ("towlower(0x00E9) = 0x%04X\n", (unsigned int) wc); } { wchar_t wc = towupper (0x00C9); printf ("towupper(0x00C9) = 0x%04X\n", (unsigned int) wc); } { wchar_t wc = towupper (0x00E9); printf ("towupper(0x00E9) = 0x%04X\n", (unsigned int) wc); } } ===================================================== Output: -> French_France.utf8 towlower(0x00C9) = 0x00C9 towlower(0x00E9) = 0x00E9 towupper(0x00C9) = 0x00C9 towupper(0x00E9) = 0x00E9 Whereas in an 8-bit locale, it works as expected. Output with "French_France.1252": -> French_France.1252 towlower(0x00C9) = 0x00E9 towlower(0x00E9) = 0x00E9 towupper(0x00C9) = 0x00C9 towupper(0x00E9) = 0x00C9 This is the cause of a test failure that I see with MSVC: FAIL: test-mbscasestr2.sh ========================= C:\cygwin64\home\bruno\testdir-all-for-mingw\gltests\test-mbscasestr2.c:56: assertion 'result == input + 19' failed This patch fixes it. 2024-09-01 Bruno Haible <br...@clisp.org> Fix mbscasestr test failure on native Windows with MSVC. * lib/c32to-impl.h (FUNC): On native Windows, ignore the system's towlower/towupper function entirely. * tests/test-c32tolower.c (main): On native Windows, reenable test that previously failed. * tests/test-c32toupper.c (main): Likewise. Disable two other tests on native Windows. * doc/posix-functions/towlower.texi: Mention bug in the native Windows UTF-8 locale. * doc/posix-functions/towupper.texi: Likewise. 
diff --git a/doc/posix-functions/towlower.texi b/doc/posix-functions/towlower.texi index 3ac7336ed0..500c8fff1c 100644 --- a/doc/posix-functions/towlower.texi +++ b/doc/posix-functions/towlower.texi @@ -27,6 +27,9 @@ @code{c32tolower}, operates on 32-bit wide characters and therefore does not have this limitation. @item +On native Windows, in an UTF-8 locale, this function does not even do +the simple expected mappings, such as from 0x00C9 to 0x00E9. +@item This function returns wrong values even for the ASCII characters in a zh_CN.GB18030 locale on some platforms: @c https://gnats.netbsd.org/cgi-bin/query-pr-single.pl?number=57339 diff --git a/doc/posix-functions/towupper.texi b/doc/posix-functions/towupper.texi index 4ce05b946e..860b7ae438 100644 --- a/doc/posix-functions/towupper.texi +++ b/doc/posix-functions/towupper.texi @@ -27,6 +27,9 @@ @code{c32toupper}, operates on 32-bit wide characters and therefore does not have this limitation. @item +On native Windows, in an UTF-8 locale, this function does not even do +the simple expected mappings, such as from 0x00E9 to 0x00C9. +@item This function returns wrong values even for the ASCII characters in a zh_CN.GB18030 locale on some platforms: @c https://gnats.netbsd.org/cgi-bin/query-pr-single.pl?number=57339 diff --git a/lib/c32to-impl.h b/lib/c32to-impl.h index 32039c612d..2299ab75ba 100644 --- a/lib/c32to-impl.h +++ b/lib/c32to-impl.h @@ -73,11 +73,22 @@ FUNC (wint_t wc) /* The wchar_t encoding is UTF-16. The char32_t encoding is UCS-4. */ +# if defined _WIN32 && !defined __CYGWIN__ + /* On native Windows, in the UTF-8 locale, towlower and towupper are + lacking (at least) the mappings for ISO-8859-1 characters, such as + 0x00C9 <-> 0x00E9. Since it is expensive to test whether the locale + encoding is UTF-8, ignore the system's WCHAR_FUNC altogether. */ + if (wc != WEOF) + return UCS_FUNC (wc); + else + return wc; +# else if (wc == WEOF || wc == (wchar_t) wc) /* wc is in the range for the tow* functions. 
*/ return WCHAR_FUNC (wc); else return UCS_FUNC (wc); +# endif #else /* macOS, FreeBSD, NetBSD, OpenBSD, HP-UX, Solaris, Minix, Android */ /* char32_t and wchar_t are equivalent. */ diff --git a/tests/test-c32tolower.c b/tests/test-c32tolower.c index eb956b5009..072338bde1 100644 --- a/tests/test-c32tolower.c +++ b/tests/test-c32tolower.c @@ -255,12 +255,10 @@ main (int argc, char *argv[]) mb = for_character ("\302\265", 2); ASSERT (mb.nbytes == 2); ASSERT (memcmp (mb.buf, "\302\265", 2) == 0); - #if !(defined _WIN32 && !defined __CYGWIN__) /* U+00C9 LATIN CAPITAL LETTER E WITH ACUTE */ mb = for_character ("\303\211", 2); ASSERT (mb.nbytes == 2); ASSERT (memcmp (mb.buf, "\303\251", 2) == 0); - #endif /* U+00DF LATIN SMALL LETTER SHARP S */ mb = for_character ("\303\237", 2); ASSERT (mb.nbytes == 2); diff --git a/tests/test-c32toupper.c b/tests/test-c32toupper.c index 18c3ffddb5..eb9668afff 100644 --- a/tests/test-c32toupper.c +++ b/tests/test-c32toupper.c @@ -163,7 +163,7 @@ main (int argc, char *argv[]) mb = for_character ("\262", 1); ASSERT (mb.nbytes == 1); ASSERT (memcmp (mb.buf, "\262", 1) == 0); - #if !(defined __GLIBC__ || (defined __APPLE__ && defined __MACH__) || defined __FreeBSD__ || defined __NetBSD__ || defined __sun || defined __CYGWIN__) + #if !(defined __GLIBC__ || (defined __APPLE__ && defined __MACH__) || defined __FreeBSD__ || defined __NetBSD__ || defined __sun || defined __CYGWIN__ || (defined _WIN32 && !defined __CYGWIN__)) /* U+00B5 MICRO SIGN */ mb = for_character ("\265", 1); ASSERT (mb.nbytes == 1); @@ -259,7 +259,7 @@ main (int argc, char *argv[]) mb = for_character ("\302\262", 2); ASSERT (mb.nbytes == 2); ASSERT (memcmp (mb.buf, "\302\262", 2) == 0); - #if !(defined __GLIBC__ || defined MUSL_LIBC || (defined __APPLE__ && defined __MACH__) || defined __FreeBSD__ || defined __DragonFly__ || defined __NetBSD__ || defined __OpenBSD__ || defined _AIX || defined __sun || defined __CYGWIN__ || defined __ANDROID__) + #if !(defined __GLIBC__ || 
defined MUSL_LIBC || (defined __APPLE__ && defined __MACH__) || defined __FreeBSD__ || defined __DragonFly__ || defined __NetBSD__ || defined __OpenBSD__ || defined _AIX || defined __sun || defined __CYGWIN__ || (defined _WIN32 && !defined __CYGWIN__) || defined __ANDROID__) /* U+00B5 MICRO SIGN */ mb = for_character ("\302\265", 2); ASSERT (mb.nbytes == 2); @@ -275,7 +275,6 @@ main (int argc, char *argv[]) ASSERT (mb.nbytes == 2); ASSERT (memcmp (mb.buf, "\303\237", 2) == 0); #endif - #if !(defined _WIN32 && !defined __CYGWIN__) /* U+00E9 LATIN SMALL LETTER E WITH ACUTE */ mb = for_character ("\303\251", 2); ASSERT (mb.nbytes == 2); @@ -284,7 +283,6 @@ main (int argc, char *argv[]) mb = for_character ("\303\277", 2); ASSERT (mb.nbytes == 2); ASSERT (memcmp (mb.buf, "\305\270", 2) == 0); - #endif /* U+0141 LATIN CAPITAL LETTER L WITH STROKE */ mb = for_character ("\305\201", 2); ASSERT (mb.nbytes == 2);