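libstdc++-v3/ChangeLog: * src/c++20/format.cc (__encoding): Do not open an iconv descriptor for ISO-8859-1 and ISO-8859-15. (__encoding::conv): Convert ISO-8859-1 and ISO-8859-15 to UTF-8 directly without using iconv; likewise Windows-1252 input that has no characters in the range [0x80,0xa0). For some other extended ASCII encodings, avoid iconv when the input contains only 7-bit characters. ---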
Tested x86_64-linux. libstdc++-v3/src/c++20/format.cc | 119 +++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/libstdc++-v3/src/c++20/format.cc b/libstdc++-v3/src/c++20/format.cc index 1a24fcab7f7..ee91c291d38 100644 --- a/libstdc++-v3/src/c++20/format.cc +++ b/libstdc++-v3/src/c++20/format.cc @@ -74,6 +74,8 @@ struct __encoding : locale::facet { case UTF8: case ASCII: + case ISOLatin1: + case ISO885915: break; default: _M_cd = ::iconv_open("UTF-8", _M_enc.name()); @@ -102,6 +104,123 @@ struct __encoding : locale::facet if (input.empty()) [[unlikely]] return codecvt_base::noconv; + using enum text_encoding::id; + switch (_M_enc.mib()) + { + case UTF8: + case ASCII: + return codecvt_base::noconv; + case ISOLatin1: + case ISO885915: + case windows1252: + { + auto next = input.begin(); + const auto end = input.end(); + do + { + if ((unsigned char)*next & 0x80) + break; + } + while (++next != end); + + if (next == end) // No 8-bit chars that need conversion to UTF-8. + return codecvt_base::noconv; + + out.assign(input.begin(), next); + do + { + if (uint16_t c = (unsigned char)*next; c & 0x80) // 8-bit char + { + if ((c & 0xe0) == 0xa0 && _M_enc.mib() == ISO885915) + { + // For ISO-8859-15 some characters do not map directly + // to the Unicode code point with the same value. + switch (c & 0xbf) + { + case 0xa4: + // Euro symbol requires three UTF-8 code units, + // so deal with it differently: + out += "\u20AC"; + continue; + case 0xbc: + c = 0x0152; + break; + case 0xbd: + c = 0x0153; + break; + case 0xa6: + c = 0x0160; + break; + case 0xa8: + c = 0x0161; + break; + case 0xbe: + c = 0x0178; + break; + case 0xb4: + c = 0x017d; + break; + case 0xb8: + c = 0x017e; + break; + default: + // Everything else is the same as ISO-8859-1 + break; + } + } + else if (c < 0xa0 && _M_enc.mib() == windows1252) + { + // For Windows-1252 some chars in range [0x80,0xa0) + // do not map directly to a single UTF-8 code unit. 
+ // We could handle them here, but just use iconv for now. + goto use_iconv; + } + + // Convert code point to two UTF-8 code units: + char units[2]; + units[0] = 0xc0 | (c >> 6); + units[1] = 0x80 | (c & 0x3f); + out.append(units, 2); + } + else // 7-bit chars map directly to a single UTF-8 code point: + out += c; + } + while (++next < input.end()); + + return codecvt_base::ok; + } + + case ISOLatin2: + case ISOLatin3: + case ISOLatin4: + case ISOLatin5: + case ISOLatinCyrillic: + case ISOLatinGreek: + case windows1250: + case windows1251: + case windows1253: + case windows1254: + case windows1255: + case windows1256: + case windows1257: + case windows1258: + { + bool ascii = true; + for (unsigned char c : input) + if (c & 0x80) + { + ascii = false; + break; + } + if (ascii) + return codecvt_base::noconv; + break; + } + default: + break; + } + +use_iconv: #ifdef _GLIBCXX_HAVE_ICONV if (_M_cd == (::iconv_t)-1) return codecvt_base::error; -- 2.47.0