This is an automated email from the ASF dual-hosted git repository.

swebb2066 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/logging-log4cxx.git


The following commit(s) were added to refs/heads/master by this push:
     new bb2563c9 Fix UTF-8 decoder rejecting valid U+0800 three-byte sequence (#664)
bb2563c9 is described below

commit bb2563c9fa18701971091f90c63732d5aa95514c
Author: metsw24-max <[email protected]>
AuthorDate: Thu May 14 08:23:16 2026 +0530

    Fix UTF-8 decoder rejecting valid U+0800 three-byte sequence (#664)
---
 src/main/cpp/transcoder.cpp                 |  2 +-
 src/test/cpp/helpers/transcodertestcase.cpp | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/main/cpp/transcoder.cpp b/src/main/cpp/transcoder.cpp
index 02eb520b..e5002fbf 100644
--- a/src/main/cpp/transcoder.cpp
+++ b/src/main/cpp/transcoder.cpp
@@ -265,7 +265,7 @@ unsigned int Transcoder::decode(const std::string& src,
                                        + ((ch2 & 0x3F) << 6)
                                        + (ch3 & 0x3F);
 
-                               if (rv <= 0x800)
+                               if (rv < 0x800)
                                {
                                        iter = start;
                                        return 0xFFFF;
diff --git a/src/test/cpp/helpers/transcodertestcase.cpp b/src/test/cpp/helpers/transcodertestcase.cpp
index 47904456..a94dab5d 100644
--- a/src/test/cpp/helpers/transcodertestcase.cpp
+++ b/src/test/cpp/helpers/transcodertestcase.cpp
@@ -64,6 +64,7 @@ LOGUNIT_CLASS(TranscoderTestCase)
        LOGUNIT_TEST(testDecodeUTF8_2);
        LOGUNIT_TEST(testDecodeUTF8_3);
        LOGUNIT_TEST(testDecodeUTF8_4);
+       LOGUNIT_TEST(testDecodeUTF8_U0800);
        LOGUNIT_TEST(testEncodeUTF16BE_BMP);
        LOGUNIT_TEST(testEncodeUTF16BE_Supplementary);
        LOGUNIT_TEST(testEncodeUTF16LE_Supplementary);
@@ -316,6 +317,27 @@ public:
                LOGUNIT_ASSERT_EQUAL(true, iter == out.end());
        }
 
+       /**
+        * U+0800 (SAMARITAN LETTER ALAF) is the smallest code point that
+        * legitimately requires a three-byte UTF-8 sequence (E0 A0 80).
+        * The overlong check in the three-byte branch of Transcoder::decode
+        * previously used `rv <= 0x800` instead of `rv < 0x800`, so this exact
+        * code point was rejected as if it were an overlong encoding and the
+        * caller substituted Transcoder::LOSSCHAR. Any UTF-8 input containing
+        * the bytes E0 A0 80 was therefore silently corrupted on decode.
+        */
+       void testDecodeUTF8_U0800()
+       {
+               std::string src("\xE0\xA0\x80");
+               LogString out;
+               Transcoder::decodeUTF8(src, out);
+
+               LogString expected;
+               Transcoder::encode(0x0800, expected);
+               LOGUNIT_ASSERT_EQUAL(expected, out);
+               LOGUNIT_ASSERT(out.find(Transcoder::LOSSCHAR) == LogString::npos);
+       }
+
        void testEncodeUTF16BE_BMP()
        {
                char raw[4] = { 0, 0, 0, 0 };

Reply via email to