This is an automated email from the ASF dual-hosted git repository.
swebb2066 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/logging-log4cxx.git
The following commit(s) were added to refs/heads/master by this push:
new bb2563c9 Fix UTF-8 decoder rejecting valid U+0800 three-byte sequence (#664)
bb2563c9 is described below
commit bb2563c9fa18701971091f90c63732d5aa95514c
Author: metsw24-max <[email protected]>
AuthorDate: Thu May 14 08:23:16 2026 +0530
Fix UTF-8 decoder rejecting valid U+0800 three-byte sequence (#664)
---
src/main/cpp/transcoder.cpp | 2 +-
src/test/cpp/helpers/transcodertestcase.cpp | 22 ++++++++++++++++++++++
2 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/src/main/cpp/transcoder.cpp b/src/main/cpp/transcoder.cpp
index 02eb520b..e5002fbf 100644
--- a/src/main/cpp/transcoder.cpp
+++ b/src/main/cpp/transcoder.cpp
@@ -265,7 +265,7 @@ unsigned int Transcoder::decode(const std::string& src,
+ ((ch2 & 0x3F) << 6)
+ (ch3 & 0x3F);
- if (rv <= 0x800)
+ if (rv < 0x800)
{
iter = start;
return 0xFFFF;
diff --git a/src/test/cpp/helpers/transcodertestcase.cpp b/src/test/cpp/helpers/transcodertestcase.cpp
index 47904456..a94dab5d 100644
--- a/src/test/cpp/helpers/transcodertestcase.cpp
+++ b/src/test/cpp/helpers/transcodertestcase.cpp
@@ -64,6 +64,7 @@ LOGUNIT_CLASS(TranscoderTestCase)
LOGUNIT_TEST(testDecodeUTF8_2);
LOGUNIT_TEST(testDecodeUTF8_3);
LOGUNIT_TEST(testDecodeUTF8_4);
+ LOGUNIT_TEST(testDecodeUTF8_U0800);
LOGUNIT_TEST(testEncodeUTF16BE_BMP);
LOGUNIT_TEST(testEncodeUTF16BE_Supplementary);
LOGUNIT_TEST(testEncodeUTF16LE_Supplementary);
@@ -316,6 +317,27 @@ public:
LOGUNIT_ASSERT_EQUAL(true, iter == out.end());
}
+ /**
+ * U+0800 (SAMARITAN LETTER ALAF) is the smallest code point that
+ * legitimately requires a three-byte UTF-8 sequence (E0 A0 80).
+ * The overlong check in the three-byte branch of Transcoder::decode
+ * previously used `rv <= 0x800` instead of `rv < 0x800`, so this exact
+ * code point was rejected as if it were an overlong encoding and the
+ * caller substituted Transcoder::LOSSCHAR. Any UTF-8 input containing
+ * the bytes E0 A0 80 was therefore silently corrupted on decode.
+ */
+ void testDecodeUTF8_U0800()
+ {
+ std::string src("\xE0\xA0\x80");
+ LogString out;
+ Transcoder::decodeUTF8(src, out);
+
+ LogString expected;
+ Transcoder::encode(0x0800, expected);
+ LOGUNIT_ASSERT_EQUAL(expected, out);
+ LOGUNIT_ASSERT(out.find(Transcoder::LOSSCHAR) == LogString::npos);
+ }
+
void testEncodeUTF16BE_BMP()
{
char raw[4] = { 0, 0, 0, 0 };