This is an automated email from the ASF dual-hosted git repository.
swebb2066 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/logging-log4cxx.git
The following commit(s) were added to refs/heads/master by this push:
new 43360b5a Reject UTF-16 surrogate-half encodings in UTF-8 (#669)
43360b5a is described below
commit 43360b5a199b21bc08a6e8d155145df3d395a74c
Author: metsw24-max <[email protected]>
AuthorDate: Thu May 21 07:50:28 2026 +0530
Reject UTF-16 surrogate-half encodings in UTF-8 (#669)
* Invalid Unicode characters become the Unicode replacement character in XML
---
src/main/cpp/transcoder.cpp | 12 +++-
src/main/cpp/transform.cpp | 5 ++
src/site/markdown/change-report-gh.md | 1 +
src/test/cpp/helpers/transcodertestcase.cpp | 98 +++++++++++++++++++++++++++++
src/test/cpp/xml/xmllayouttest.cpp | 5 +-
5 files changed, 115 insertions(+), 6 deletions(-)
diff --git a/src/main/cpp/transcoder.cpp b/src/main/cpp/transcoder.cpp
index e5002fbf..de0deadf 100644
--- a/src/main/cpp/transcoder.cpp
+++ b/src/main/cpp/transcoder.cpp
@@ -265,7 +265,10 @@ unsigned int Transcoder::decode(const std::string& src,
+ ((ch2 & 0x3F) << 6)
+ (ch3 & 0x3F);
- if (rv < 0x800)
+ // RFC 3629 §3 prohibits UTF-8 encodings of the UTF-16 surrogate
+ // halves (U+D800..U+DFFF); accepting them lets malformed Unicode
+ // cross the decode boundary into LogString and downstream output.
+ if (rv < 0x800 || (0xD800 <= rv && rv <= 0xDFFF))
{
iter = start;
return 0xFFFF;
@@ -289,7 +292,12 @@ unsigned int Transcoder::decode(const std::string& src,
+ ((ch3 & 0x3F) << 6)
+ (ch4 & 0x3F);
- if (rv > 0xFFFF)
+ // RFC 3629 §3 caps UTF-8 at U+10FFFF; lead bytes F5..F7 (and
+ // F4 with an over-high trailer) produce rv > 0x10FFFF, which
+ // is not a Unicode code point. Without this bound, encodeUTF16
+ // later silently aliases the bogus value to a valid in-range
+ // code point — a substitution-collision filter-bypass primitive.
+ if (rv > 0xFFFF && rv <= 0x10FFFF)
{
return rv;
}
diff --git a/src/main/cpp/transform.cpp b/src/main/cpp/transform.cpp
index ec7c68c4..41206c83 100644
--- a/src/main/cpp/transform.cpp
+++ b/src/main/cpp/transform.cpp
@@ -46,6 +46,11 @@ void appendValidCharacters(LogString& buf, const LogString& input, CharProcessor
auto ch = Transcoder::decode(input, nextCodePoint);
if (nextCodePoint == lastCodePoint) // failed to decode input?
nextCodePoint = input.end();
+ else if (0xD800 <= ch && ch <= 0xDFFF)
+ {
+ // RFC 3629 §3 explicitly forbids surrogate-half values in UTF-8
+ ch = 0xFFFF;
+ }
else if (((0x20 <= ch && ch <= 0xD7FF) &&
specials[0] != ch &&
specials[1] != ch &&
diff --git a/src/site/markdown/change-report-gh.md b/src/site/markdown/change-report-gh.md
index 2f4e1e6e..51b5cef4 100644
--- a/src/site/markdown/change-report-gh.md
+++ b/src/site/markdown/change-report-gh.md
@@ -71,6 +71,7 @@ The following issues have been addressed:
, [#659](https://github.com/apache/logging-log4cxx/pull/659)
, [#660](https://github.com/apache/logging-log4cxx/pull/660)
, [#664](https://github.com/apache/logging-log4cxx/pull/664)
+ , [#669](https://github.com/apache/logging-log4cxx/pull/669)
, [#670](https://github.com/apache/logging-log4cxx/pull/670)
* A lack of robustness dealing with values near numeric limits
diff --git a/src/test/cpp/helpers/transcodertestcase.cpp b/src/test/cpp/helpers/transcodertestcase.cpp
index a94dab5d..1bc1431e 100644
--- a/src/test/cpp/helpers/transcodertestcase.cpp
+++ b/src/test/cpp/helpers/transcodertestcase.cpp
@@ -64,7 +64,11 @@ LOGUNIT_CLASS(TranscoderTestCase)
LOGUNIT_TEST(testDecodeUTF8_2);
LOGUNIT_TEST(testDecodeUTF8_3);
LOGUNIT_TEST(testDecodeUTF8_4);
+ LOGUNIT_TEST(testDecodeUTF8_RejectSurrogate);
+ LOGUNIT_TEST(testDecodeUTF8_SurrogateBoundaries);
LOGUNIT_TEST(testDecodeUTF8_U0800);
+ LOGUNIT_TEST(testDecodeUTF8_RejectAboveMax);
+ LOGUNIT_TEST(testDecodeUTF8_MaxBoundary);
LOGUNIT_TEST(testEncodeUTF16BE_BMP);
LOGUNIT_TEST(testEncodeUTF16BE_Supplementary);
LOGUNIT_TEST(testEncodeUTF16LE_Supplementary);
@@ -317,6 +321,28 @@ public:
LOGUNIT_ASSERT_EQUAL(true, iter == out.end());
}
+ /**
+ * RFC 3629 §3 prohibits UTF-8 encodings of the UTF-16 surrogate halves
+ * (U+D800..U+DFFF). The three-byte sequences ED A0 80 .. ED BF BF must
+ * not decode to the corresponding surrogate code points: doing so lets
+ * lone surrogates enter LogString and be re-emitted by JSON/XML layouts,
+ * propagating malformed Unicode past the parsing boundary. Each byte of
+ * the invalid sequence is replaced with Transcoder::LOSSCHAR.
+ */
+ void testDecodeUTF8_RejectSurrogate()
+ {
+ // ED A0 80 would encode U+D800 (the smallest high-surrogate).
+ std::string src("\xED\xA0\x80");
+ LogString out;
+ Transcoder::decodeUTF8(src, out);
+
+ LogString expected;
+ expected.append(1, Transcoder::LOSSCHAR);
+ expected.append(1, Transcoder::LOSSCHAR);
+ expected.append(1, Transcoder::LOSSCHAR);
+ LOGUNIT_ASSERT_EQUAL(expected, out);
+ }
+
/**
* U+0800 (SAMARITAN LETTER ALAF) is the smallest code point that
* legitimately requires a three-byte UTF-8 sequence (E0 A0 80).
@@ -338,6 +364,78 @@ public:
LOGUNIT_ASSERT(out.find(Transcoder::LOSSCHAR) == LogString::npos);
}
+ /**
+ * Confirm the surrogate-rejection range is exactly U+D800..U+DFFF:
+ * U+D7FF (ED 9F BF) and U+E000 (EE 80 80) bracket the range and must
+ * still decode cleanly. The four interior values are each rejected.
+ */
+ void testDecodeUTF8_SurrogateBoundaries()
+ {
+ struct { const char* bytes; size_t len; bool reject; } cases[] =
+ {
+ { "\xED\x9F\xBF", 3, false }, // U+D7FF — last valid before surrogates
+ { "\xED\xA0\x80", 3, true }, // U+D800 — high-surrogate min (reject)
+ { "\xED\xAF\xBF", 3, true }, // U+DBFF — high-surrogate max (reject)
+ { "\xED\xB0\x80", 3, true }, // U+DC00 — low-surrogate min (reject)
+ { "\xED\xBF\xBF", 3, true }, // U+DFFF — low-surrogate max (reject)
+ { "\xEE\x80\x80", 3, false }, // U+E000 — first valid after surrogates
+ };
+ for (auto& c : cases)
+ {
+ std::string src(c.bytes, c.len);
+ LogString out;
+ Transcoder::decodeUTF8(src, out);
+ bool hasLoss = out.find(Transcoder::LOSSCHAR) != LogString::npos;
+ LOGUNIT_ASSERT_EQUAL(c.reject, hasLoss);
+ }
+ }
+
+ /**
+ * RFC 3629 §3 caps UTF-8 at U+10FFFF. Four-byte sequences with lead F5,
+ * F6, F7 (and F4 with an over-high trailer) decode to values above the
+ * Unicode maximum. Without bounds-rejection here, Transcoder::encodeUTF16
+ * later silently aliases the bogus value to a valid in-range code point
+ * (e.g. U+110000 collides with U+10000) — a substitution-collision
+ * filter-bypass primitive in wchar builds.
+ */
+ void testDecodeUTF8_RejectAboveMax()
+ {
+ // F4 90 80 80 would encode U+110000 (one past the maximum).
+ std::string src("\xF4\x90\x80\x80");
+ LogString out;
+ Transcoder::decodeUTF8(src, out);
+
+ LogString expected;
+ for (int i = 0; i < 4; ++i)
+ expected.append(1, Transcoder::LOSSCHAR);
+ LOGUNIT_ASSERT_EQUAL(expected, out);
+ }
+
+ /**
+ * Boundary check around U+10FFFF: the canonical encoding of the
+ * maximum legal code point (F4 8F BF BF) must decode cleanly; one past
+ * (F4 90 80 80) and the F5/F6/F7 lead bytes must all be rejected.
+ */
+ void testDecodeUTF8_MaxBoundary()
+ {
+ struct { const char* bytes; size_t len; bool reject; } cases[] =
+ {
+ { "\xF4\x8F\xBF\xBF", 4, false }, // U+10FFFF — maximum legal code point
+ { "\xF4\x90\x80\x80", 4, true }, // U+110000 — one past max (reject)
+ { "\xF5\x80\x80\x80", 4, true }, // F5 lead: rv = 0x140000 (reject)
+ { "\xF6\x80\x80\x80", 4, true }, // F6 lead: rv = 0x180000 (reject)
+ { "\xF7\xBF\xBF\xBF", 4, true }, // F7 lead: rv = 0x1FFFFF (reject)
+ };
+ for (auto& c : cases)
+ {
+ std::string src(c.bytes, c.len);
+ LogString out;
+ Transcoder::decodeUTF8(src, out);
+ bool hasLoss = out.find(Transcoder::LOSSCHAR) != LogString::npos;
+ LOGUNIT_ASSERT_EQUAL(c.reject, hasLoss);
+ }
+ }
+
void testEncodeUTF16BE_BMP()
{
char raw[4] = { 0, 0, 0, 0 };
diff --git a/src/test/cpp/xml/xmllayouttest.cpp b/src/test/cpp/xml/xmllayouttest.cpp
index 2ffbb2f0..0399b94b 100644
--- a/src/test/cpp/xml/xmllayouttest.cpp
+++ b/src/test/cpp/xml/xmllayouttest.cpp
@@ -377,11 +377,8 @@ public:
Transcoder::encode(0xD822, problemNameLS); // Add an invalid character that should be stripped from attribute values
auto keyLS = problemNameLS;
auto expectedKeyValue = problemName;
-#if LOG4CXX_LOGCHAR_IS_WCHAR && !defined(__STDC_ISO_10646__)
- // encodeUTF16 adds 0xD822, but decodeUTF16 cannot convert 0xD822
- // Expat translates the Unicode replacement character to the following
+ // UTF-8 encodes the Unicode replacement character (0xFFFD) as the following:
expectedKeyValue += "\xEF\xBF\xBD";
-#endif
std::string problemMessage = "'\001\"<Hello >\"\004'";
std::string expectedCdataValue = "'\"<Hello >\"'";
std::string expectedAttributeValue = "'\"<Hello >\"'"; // Invalid characters stripped