[PATCH 1/2] libstdc++: Avoid iconv for chrono formatting where possible

Jonathan Wakely Sat, 16 Nov 2024 15:33:06 -0800

libstdc++-v3/ChangeLog:

        * src/c++20/format.cc (__encoding::conv): Convert ISO-8859-1 and
        ISO-8859-15 directly without using iconv, and likewise Windows-1252
        when the input has no chars in [0x80,0xa0). Check if iconv can be
        avoided for some extended ASCII encodings.
---


Tested x86_64-linux.

 libstdc++-v3/src/c++20/format.cc | 119 +++++++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)

diff --git a/libstdc++-v3/src/c++20/format.cc b/libstdc++-v3/src/c++20/format.cc
index 1a24fcab7f7..ee91c291d38 100644
--- a/libstdc++-v3/src/c++20/format.cc
+++ b/libstdc++-v3/src/c++20/format.cc
@@ -74,6 +74,8 @@ struct __encoding : locale::facet
       {
       case UTF8:
       case ASCII:
+      case ISOLatin1:
+      case ISO885915:
        break;
       default:
        _M_cd = ::iconv_open("UTF-8", _M_enc.name());
@@ -102,6 +104,123 @@ struct __encoding : locale::facet
     if (input.empty()) [[unlikely]]
       return codecvt_base::noconv;
 
+    using enum text_encoding::id;
+    switch (_M_enc.mib())
+    {
+      case UTF8:
+      case ASCII:
+       return codecvt_base::noconv;
+      case ISOLatin1:
+      case ISO885915:
+      case windows1252:
+      {
+       auto next = input.begin();
+       const auto end = input.end();
+       do
+         {
+           if ((unsigned char)*next & 0x80)
+             break;
+         }
+       while (++next != end);
+
+       if (next == end) // No 8-bit chars that need conversion to UTF-8.
+         return codecvt_base::noconv;
+
+       out.assign(input.begin(), next);
+       do
+         {
+           if (uint16_t c = (unsigned char)*next; c & 0x80) // 8-bit char
+             {
+               if ((c & 0xe0) == 0xa0 && _M_enc.mib() == ISO885915)
+                 {
+                   // For ISO-8859-15 some characters do not map directly
+                   // to the Unicode code point with the same value.
+                   switch (c & 0xbf)
+                   {
+                     case 0xa4:
+                       // Euro symbol requires three UTF-8 code units,
+                       // so deal with it differently:
+                       out += "\u20AC";
+                       continue;
+                     case 0xbc:
+                       c = 0x0152;
+                       break;
+                     case 0xbd:
+                       c = 0x0153;
+                       break;
+                     case 0xa6:
+                       c = 0x0160;
+                       break;
+                     case 0xa8:
+                       c = 0x0161;
+                       break;
+                     case 0xbe:
+                       c = 0x0178;
+                       break;
+                     case 0xb4:
+                       c = 0x017d;
+                       break;
+                     case 0xb8:
+                       c = 0x017e;
+                       break;
+                     default:
+                       // Everything else is the same as ISO-8859-1
+                       break;
+                     }
+                 }
+               else if (c < 0xa0 && _M_enc.mib() == windows1252)
+                 {
+                   // For Windows-1252 chars in range [0x80,0xa0) do not map
+                   // directly to the Unicode code point with the same value.
+                   // We could handle them here, but just use iconv for now.
+                   goto use_iconv;
+                 }
+
+               // Convert code point to two UTF-8 code units:
+               char units[2];
+               units[0] = 0xc0 | (c >> 6);
+               units[1] = 0x80 | (c & 0x3f);
+               out.append(units, 2);
+             }
+           else // 7-bit chars map directly to a single UTF-8 code unit:
+             out += c;
+         }
+       while (++next < input.end());
+
+       return codecvt_base::ok;
+      }
+
+      case ISOLatin2:
+      case ISOLatin3:
+      case ISOLatin4:
+      case ISOLatin5:
+      case ISOLatinCyrillic:
+      case ISOLatinGreek:
+      case windows1250:
+      case windows1251:
+      case windows1253:
+      case windows1254:
+      case windows1255:
+      case windows1256:
+      case windows1257:
+      case windows1258:
+      {
+       bool ascii = true;
+       for (unsigned char c : input)
+         if (c & 0x80)
+           {
+             ascii = false;
+             break;
+           }
+       if (ascii)
+         return codecvt_base::noconv;
+       break;
+      }
+      default:
+       break;
+    }
+
+use_iconv:
 #ifdef _GLIBCXX_HAVE_ICONV
     if (_M_cd == (::iconv_t)-1)
       return codecvt_base::error;
-- 
2.47.0

[PATCH 1/2] libstdc++: Avoid iconv for chrono formatting where possible

Reply via email to