Lasse Collin wrote: > (1) > In 9f7ff4f423cd ("localename-unsafe: Support the UTF-8 environment on > native Windows."), the N(name) macro is used with strings that include > @modifier. For example, N("az_AZ@cyrillic") can expand to > "az_AZ@cyrillic.UTF-8". Similarly in 00211fc69c92 ("setlocale: Support > the UTF-8 environment on native Windows."), ".65001" is appended after > the @modifier. However, the typical order would be az_AZ.UTF-8@cyrillic.
Good point. Fixed through the patch below. > I suppose you had a reason to use .65001 instead of .UTF-8 or .utf8. > I expect identical behavior from those. Yes: There was some period (ca. 5 years ago) when Windows supported the .65001 suffix but not the .utf8 suffix. The ability to use .utf8 to denote code page 65001 was added a bit later. > The MS setlocale() docs use variants of .UTF8: > > > https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/setlocale-wsetlocale?view=msvc-170#utf-8-support Yeah, the docs don't tell you everything (as usual with Microsoft). > (2) > In 2f4391fde862 ("setlocale tests: Test in the UTF-8 environment on > native Windows."), the condition > > (strlen (name) > 6 && strcmp (name + strlen (name) - 6, ".UTF-8") == 0) > > matches the two long strings below it too, making those two extra > strcmp calls redundant. Correct. Still, it's useful to have a written record of what the output in the legacy mode is. (Unit test code is not optimized for performance.) > (3) > When a manifest is added via a resource file, a possible default > manifest from the toolchain is replaced; they aren't merged. For > example, on MSYS2, the mingw-w64-ucrt-x86_64-gcc package depends on > mingw-w64-ucrt-x86_64-windows-default-manifest. The manifest comes from > Cygwin: > > > https://sourceware.org/git/?p=cygwin-apps/windows-default-manifest.git;a=blob;f=default-manifest.rc > > Omitting the <compatibility> section makes the application run with > Vista as the Operating System Context. Omitting the <trustInfo> section > makes Windows treat the application as not UAC compliant, that is, a > pre-Vista app that needs compatibility tricks. > > Probably these don't matter with the current tests. I suggest changing > it still because it's still an odd combination to have UTF-8 without > marking the app compatible with recent Windows versions. I picked the smallest XML file that has the desired effect. 
Also, I don't like enumerating Windows versions in this way because it's not future-proof: Since Windows 11, 12, 13, etc. are not listed, it would only be a matter of time until the test breaks in a new Windows version. > (4) > The output from windres goes to a file with the .res suffix but the > format is overridden with --output-format=coff. This looks weird > because windres defaults to --output-format=res for files that use the > .res suffix. For coff, the .o suffix would be logical, and > --output-format option wouldn't be needed. Maybe it looks weird, but IIRC it's the best way that I found that does not run into 32-bit / 64-bit problems. For example, if 'windres' is 64-bit but I'm compiling with a gcc that targets 32-bit. (Some toolchain installations have a <triple>-windres program, but some other toolchain installations have only one windres program for all targets.) > If native setlocale(LC_ALL, "") can indeed result in "en_US" or > "en_US.UTF-8", I wonder if it can result in "az-Cyrl_AZ.UTF-8" too. I > don't see how Gnulib or Gettext would map such a locale name to > az_AZ.UTF-8@cyrillic. (az_AZ@cyrillic was the first one with @ in > localename-unsafe.c, thus I looked at that in MS docs too.) That script part is not something I worry about for the rarely used locales. But for Chinese locales, it can be an issue that users will notice (zh-Hans vs. zh-Hant); therefore, if someone has time to dig into these cases, Chinese would be the first thing to test. Bruno 2024-12-24 Bruno Haible <br...@clisp.org> localename-unsafe: Improve the Windows UTF-8 environment support. Reported by Lasse Collin <lasse.col...@tukaani.org> in <https://lists.gnu.org/archive/html/bug-gnulib/2024-12/msg00165.html>. * lib/localename-unsafe.c (gl_locale_name_from_win32_LANGID): In locale names with a modifier, insert the codeset part ".UTF-8" before the modifier. 
diff --git a/lib/localename-unsafe.c b/lib/localename-unsafe.c index 7088616892..dc0a7844aa 100644 --- a/lib/localename-unsafe.c +++ b/lib/localename-unsafe.c @@ -1529,7 +1529,8 @@ gl_locale_name_from_win32_LANGID (LANGID langid) Windows base (e.g. they have different character conversion facilities that produce different results). */ /* Use our own table. */ - #define N(name) (is_utf8 ? name ".UTF-8" : name) + #define N(name) (is_utf8 ? name ".UTF-8" : name) + #define NM(name,modifier) (is_utf8 ? name ".UTF-8" modifier : name modifier) { int primary, sub; @@ -1604,8 +1605,8 @@ gl_locale_name_from_win32_LANGID (LANGID langid) { case 0x1e: return N("az"); case SUBLANG_AZERI_LATIN: return N("az_AZ"); - case 0x1d: return N("az@cyrillic"); - case SUBLANG_AZERI_CYRILLIC: return N("az_AZ@cyrillic"); + case 0x1d: return NM("az","@cyrillic"); + case SUBLANG_AZERI_CYRILLIC: return NM("az_AZ","@cyrillic"); } return N("az"); case LANG_BASHKIR: @@ -1706,17 +1707,17 @@ gl_locale_name_from_win32_LANGID (LANGID langid) case 0x09: return N("sr_RS"); /* latin */ case 0x0b: return N("sr_ME"); /* latin */ case 0x06: return N("sr_BA"); /* latin */ - case 0x1b: return N("sr@cyrillic"); - case SUBLANG_SERBIAN_CYRILLIC: return N("sr_CS@cyrillic"); - case 0x0a: return N("sr_RS@cyrillic"); - case 0x0c: return N("sr_ME@cyrillic"); - case 0x07: return N("sr_BA@cyrillic"); + case 0x1b: return NM("sr","@cyrillic"); + case SUBLANG_SERBIAN_CYRILLIC: return NM("sr_CS","@cyrillic"); + case 0x0a: return NM("sr_RS","@cyrillic"); + case 0x0c: return NM("sr_ME","@cyrillic"); + case 0x07: return NM("sr_BA","@cyrillic"); /* Bosnian */ case 0x1e: return N("bs"); case 0x1a: return N("bs"); /* latin */ case SUBLANG_BOSNIAN_BOSNIA_HERZEGOVINA_LATIN: return N("bs_BA"); /* latin */ - case 0x19: return N("bs@cyrillic"); - case SUBLANG_BOSNIAN_BOSNIA_HERZEGOVINA_CYRILLIC: return N("bs_BA@cyrillic"); + case 0x19: return NM("bs","@cyrillic"); + case SUBLANG_BOSNIAN_BOSNIA_HERZEGOVINA_CYRILLIC: return 
NM("bs_BA","@cyrillic"); } return N("hr"); case LANG_CZECH: @@ -1959,8 +1960,8 @@ gl_locale_name_from_win32_LANGID (LANGID langid) { case 0x1e: return N("iu"); /* syllabic */ case SUBLANG_INUKTITUT_CANADA: return N("iu_CA"); /* syllabic */ - case 0x1f: return N("iu@latin"); - case SUBLANG_INUKTITUT_CANADA_LATIN: return N("iu_CA@latin"); + case 0x1f: return NM("iu","@latin"); + case SUBLANG_INUKTITUT_CANADA_LATIN: return NM("iu_CA","@latin"); } return N("iu"); case LANG_ITALIAN: @@ -2317,7 +2318,7 @@ gl_locale_name_from_win32_LANGID (LANGID langid) case SUBLANG_SPANISH: return N("es_ES"); case SUBLANG_SPANISH_MEXICAN: return N("es_MX"); case SUBLANG_SPANISH_MODERN: - return N("es_ES@modern"); /* not seen on Unix */ + return NM("es_ES","@modern"); /* not seen on Unix */ case SUBLANG_SPANISH_GUATEMALA: return N("es_GT"); case SUBLANG_SPANISH_COSTA_RICA: return N("es_CR"); case SUBLANG_SPANISH_PANAMA: return N("es_PA"); @@ -2381,7 +2382,7 @@ gl_locale_name_from_win32_LANGID (LANGID langid) switch (sub) { case SUBLANG_TAMAZIGHT_ARABIC: return N("ber_MA"); - case 0x1f: return N("ber@latin"); + case 0x1f: return NM("ber","@latin"); case SUBLANG_TAMAZIGHT_ALGERIA_LATIN: return N("ber_DZ"); } return N("ber"); @@ -2475,8 +2476,8 @@ gl_locale_name_from_win32_LANGID (LANGID langid) { case 0x1f: return N("uz"); case SUBLANG_UZBEK_LATIN: return N("uz_UZ"); - case 0x1e: return N("uz@cyrillic"); - case SUBLANG_UZBEK_CYRILLIC: return N("uz_UZ@cyrillic"); + case 0x1e: return NM("uz","@cyrillic"); + case SUBLANG_UZBEK_CYRILLIC: return NM("uz_UZ","@cyrillic"); } return N("uz"); case LANG_VENDA: @@ -2542,6 +2543,7 @@ gl_locale_name_from_win32_LANGID (LANGID langid) default: return N("C"); } } + #undef NM #undef N }