https://gcc.gnu.org/g:ad1b71fc2882c14271ebf2bbaf216cceaa88c76a
commit r14-11523-gad1b71fc2882c14271ebf2bbaf216cceaa88c76a Author: Tomasz Kamiński <tkami...@redhat.com> Date: Thu Apr 3 10:23:45 2025 +0200 libstdc++: Fix handling of field width for wide strings and characters [PR119593] This patch corrects handling of UTF-32LE and UTF-32BE in __unicode::__literal_encoding_is_unicode<_CharT>, so they are recognized as Unicode and the function produces the correct result for wchar_t. Use `__unicode::__field_width` to compute the estimated width of the character for a Unicode wide encoding. PR libstdc++/119593 libstdc++-v3/ChangeLog: * include/bits/unicode.h (__unicode::__literal_encoding_is_unicode<_CharT>): Corrected handling for UTF-16 and UTF-32 with "LE" or "BE" suffix. * include/std/format (__formatter_str::_S_character_width): Define. (__formatter_str::_M_format_character): Updated passed char length. * testsuite/std/format/functions/format.cc: Test for wchar_t. Reviewed-by: Jonathan Wakely <jwak...@redhat.com> Signed-off-by: Tomasz Kamiński <tkami...@redhat.com> Diff: --- libstdc++-v3/include/bits/unicode.h | 2 ++ libstdc++-v3/include/std/format | 16 +++++++++++++++- libstdc++-v3/testsuite/std/format/functions/format.cc | 8 ++++++-- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/libstdc++-v3/include/bits/unicode.h b/libstdc++-v3/include/bits/unicode.h index 4b408948d722..eee3b7e37609 100644 --- a/libstdc++-v3/include/bits/unicode.h +++ b/libstdc++-v3/include/bits/unicode.h @@ -1039,6 +1039,8 @@ inline namespace __v15_1_0 string_view __s(__enc); if (__s.ends_with("//")) __s.remove_suffix(2); + if (__s.ends_with("LE") || __s.ends_with("BE")) + __s.remove_suffix(2); return __s == "16" || __s == "32"; } } diff --git a/libstdc++-v3/include/std/format b/libstdc++-v3/include/std/format index f64947a0e293..15bded87c9cd 100644 --- a/libstdc++-v3/include/std/format +++ b/libstdc++-v3/include/std/format @@ -1184,12 +1184,26 @@ namespace __format _M_spec); } + [[__gnu__::__always_inline__]] + static size_t + 
_S_character_width(_CharT __c) + { + // N.B. single byte cannot encode charcter of width greater than 1 + if constexpr (sizeof(_CharT) > 1u && + __unicode::__literal_encoding_is_unicode<_CharT>()) + return __unicode::__field_width(__c); + else + return 1u; + } + template<typename _Out> typename basic_format_context<_Out, _CharT>::iterator _M_format_character(_CharT __c, basic_format_context<_Out, _CharT>& __fc) const { - return __format::__write_padded_as_spec({&__c, 1u}, 1, __fc, _M_spec); + return __format::__write_padded_as_spec({&__c, 1u}, + _S_character_width(__c), + __fc, _M_spec); } template<typename _Int> diff --git a/libstdc++-v3/testsuite/std/format/functions/format.cc b/libstdc++-v3/testsuite/std/format/functions/format.cc index 78cc1ab482ad..97eb0957e5e1 100644 --- a/libstdc++-v3/testsuite/std/format/functions/format.cc +++ b/libstdc++-v3/testsuite/std/format/functions/format.cc @@ -497,9 +497,14 @@ test_unicode() { // Similar to sC example in test_std_examples, but not from the standard. // Verify that the character "🤡" has estimated field width 2, - // rather than estimated field width equal to strlen("🤡"), which would be 4. + // rather than estimated field width equal to strlen("🤡"), which would be 4, + // or just width 1 for single character. std::string sC = std::format("{:*<3}", "🤡"); VERIFY( sC == "🤡*" ); + std::wstring wsC = std::format(L"{:*<3}", L"🤡"); + VERIFY( wsC == L"🤡*" ); + wsC = std::format(L"{:*<3}", L'🤡'); + VERIFY( wsC == L"🤡*" ); // Verify that "£" has estimated field width 1, not strlen("£") == 2. std::string sL = std::format("{:*<3}", "£"); @@ -513,7 +518,6 @@ test_unicode() std::string sP = std::format("{:1.1} {:*<1.1}", "£", "🤡"); VERIFY( sP == "£ *" ); sP = std::format("{:*<2.1} {:*<2.1}", "£", "🤡"); - VERIFY( sP == "£* **" ); // Verify field width handling for extended grapheme clusters, // and that a cluster gets output as a single item, not truncated.