https://gcc.gnu.org/g:b0f022f93a710c8143badedf8110a47227e17f62

commit r14-10204-gb0f022f93a710c8143badedf8110a47227e17f62
Author: Jonathan Wakely <jwak...@redhat.com>
Date:   Wed May 1 17:09:39 2024 +0100

    libstdc++: Fix handling of incomplete UTF-8 sequences in _Unicode_view
    
    Eddie Nolan reported to me that _Unicode_view was not correctly
    implementing the substitution of ill-formed subsequences with U+FFFD,
    due to failing to increment the counter when the iterator reaches the
    end of the sequence before a multibyte sequence is complete.  As a
    result, the incomplete sequence was not completely consumed, and then
    the remaining character was treated as another ill-formed sequence,
    giving two U+FFFD characters instead of one.
    
    To avoid similar mistakes in future, this change introduces a lambda
    that increments the iterator and the counter together. This ensures the
    counter is always incremented when the iterator is incremented, so that
    we always know how many characters have been consumed.
    
    libstdc++-v3/ChangeLog:
    
            * include/bits/unicode.h (_Unicode_view::_M_read_utf8): Ensure
            count of characters consumed is correct when the end of the
            input is reached unexpectedly.
            * testsuite/ext/unicode/view.cc: Test incomplete UTF-8
            sequences.
    
    (cherry picked from commit 3f04f3939ea0ac8fdd766a60655d29de2ffb44e5)

Diff:
---
 libstdc++-v3/include/bits/unicode.h        | 24 +++++++++++-------------
 libstdc++-v3/testsuite/ext/unicode/view.cc |  7 +++++++
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/libstdc++-v3/include/bits/unicode.h b/libstdc++-v3/include/bits/unicode.h
index 29813b743dc1..46238143fb61 100644
--- a/libstdc++-v3/include/bits/unicode.h
+++ b/libstdc++-v3/include/bits/unicode.h
@@ -261,9 +261,13 @@ namespace __unicode
       {
        _Guard<_Iter> __g{this, _M_curr()};
        char32_t __c{};
-       uint8_t __u = *_M_curr()++;
        const uint8_t __lo_bound = 0x80, __hi_bound = 0xBF;
+       uint8_t __u = *_M_curr()++;
        uint8_t __to_incr = 1;
+       auto __incr = [&, this] {
+         ++__to_incr;
+         return ++_M_curr();
+       };
 
        if (__u <= 0x7F) [[likely]]      // 0x00 to 0x7F
          __c = __u;
@@ -281,8 +285,7 @@ namespace __unicode
            else
              {
                __c = (__c << 6) | (__u & 0x3F);
-               ++_M_curr();
-               ++__to_incr;
+               __incr();
              }
          }
        else if (__u <= 0xEF) // 0xE0 to 0xEF
@@ -295,11 +298,10 @@ namespace __unicode
 
            if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
              __c = _S_error();
-           else if (++_M_curr() == _M_last) [[unlikely]]
+           else if (__incr() == _M_last) [[unlikely]]
              __c = _S_error();
            else
              {
-               ++__to_incr;
                __c = (__c << 6) | (__u & 0x3F);
                __u = *_M_curr();
 
@@ -308,8 +310,7 @@ namespace __unicode
                else
                  {
                    __c = (__c << 6) | (__u & 0x3F);
-                   ++_M_curr();
-                   ++__to_incr;
+                   __incr();
                  }
              }
          }
@@ -323,21 +324,19 @@ namespace __unicode
 
            if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
              __c = _S_error();
-           else if (++_M_curr() == _M_last) [[unlikely]]
+           else if (__incr() == _M_last) [[unlikely]]
              __c = _S_error();
            else
              {
-               ++__to_incr;
                __c = (__c << 6) | (__u & 0x3F);
                __u = *_M_curr();
 
                if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
                  __c = _S_error();
-               else if (++_M_curr() == _M_last) [[unlikely]]
+               else if (__incr() == _M_last) [[unlikely]]
                  __c = _S_error();
                else
                  {
-                   ++__to_incr;
                    __c = (__c << 6) | (__u & 0x3F);
                    __u = *_M_curr();
 
@@ -346,8 +345,7 @@ namespace __unicode
                    else
                      {
                        __c = (__c << 6) | (__u & 0x3F);
-                       ++_M_curr();
-                       ++__to_incr;
+                       __incr();
                      }
                  }
              }
diff --git a/libstdc++-v3/testsuite/ext/unicode/view.cc b/libstdc++-v3/testsuite/ext/unicode/view.cc
index ee23b0b1d8a3..6f3c099bd84a 100644
--- a/libstdc++-v3/testsuite/ext/unicode/view.cc
+++ b/libstdc++-v3/testsuite/ext/unicode/view.cc
@@ -55,6 +55,13 @@ test_illformed_utf8()
   VERIFY( std::ranges::equal(v5, u8"\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x41\uFFFD\uFFFD\x42"sv) );
   uc::_Utf8_view v6("\xe1\x80\xe2\xf0\x91\x92\xf1\xbf\x41"sv); // Table 3-11
   VERIFY( std::ranges::equal(v6, u8"\uFFFD\uFFFD\uFFFD\uFFFD\x41"sv) );
+
+  uc::_Utf32_view v7("\xe1\x80"sv);
+  VERIFY( std::ranges::equal(v7, U"\uFFFD"sv) );
+  uc::_Utf32_view v8("\xf1\x80"sv);
+  VERIFY( std::ranges::equal(v8, U"\uFFFD"sv) );
+  uc::_Utf32_view v9("\xf1\x80\x80"sv);
+  VERIFY( std::ranges::equal(v9, U"\uFFFD"sv) );
 }
 
 constexpr void

Reply via email to