This is an automated email from the ASF dual-hosted git repository.
freeandnil pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/logging-log4net.git
The following commit(s) were added to refs/heads/master by this push:
new 7fcc7f42 preserve UTF-16 surrogate pairs in MaskXmlInvalidCharacters -
fixes #290 (#291)
7fcc7f42 is described below
commit 7fcc7f425b81b9df7c8a8719ed90953b938cd10e
Author: Jan Friedrich <[email protected]>
AuthorDate: Thu Apr 16 18:14:27 2026 +0200
preserve UTF-16 surrogate pairs in MaskXmlInvalidCharacters - fixes #290
(#291)
The regex [^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD] operated on individual
UTF-16 char units, causing both halves of a valid surrogate pair to be
replaced, silently corrupting supplementary characters (U+10000–U+10FFFF)
such as emoji in XML log output.
Fix by prepending a surrogate-pair alternative to the regex so valid pairs
are matched and preserved as a unit; only lone surrogates and other
XML-illegal code units are replaced with the mask string.
Also optimise CountSubstrings: use a char loop for single-character
substrings (all current callers) and StringComparison.Ordinal for the
multi-character CDATA token path.
Add unit tests covering surrogate pair preservation, lone surrogates,
and CountSubstrings edge cases.
---
...rogate-pairs-in-mask-xml-invalid-characters.xml | 13 ++
src/log4net.Tests/Util/TransformTest.cs | 145 ++++++++++++++++++++-
src/log4net/Util/Transform.cs | 57 +++++---
3 files changed, 195 insertions(+), 20 deletions(-)
diff --git
a/src/changelog/3.3.1/291-fix-surrogate-pairs-in-mask-xml-invalid-characters.xml
b/src/changelog/3.3.1/291-fix-surrogate-pairs-in-mask-xml-invalid-characters.xml
new file mode 100644
index 00000000..9ecba009
--- /dev/null
+++
b/src/changelog/3.3.1/291-fix-surrogate-pairs-in-mask-xml-invalid-characters.xml
@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<entry xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xmlns="https://logging.apache.org/xml/ns"
+ xsi:schemaLocation="https://logging.apache.org/xml/ns
https://logging.apache.org/xml/ns/log4j-changelog-0.xsd"
+ type="fixed">
+ <issue id="290" link="https://github.com/apache/logging-log4net/issues/290"/>
+ <issue id="291" link="https://github.com/apache/logging-log4net/pull/291"/>
+ <description format="asciidoc">
+ fix silent corruption of supplementary Unicode characters
(U+10000–U+10FFFF) in `MaskXmlInvalidCharacters`:
+ valid UTF-16 surrogate pairs are now preserved instead of being replaced
with the mask string
+ (reported by @N0tre3l, fixed by @freeandnil in
https://github.com/apache/logging-log4net/pull/291[#291])
+ </description>
+</entry>
\ No newline at end of file
diff --git a/src/log4net.Tests/Util/TransformTest.cs
b/src/log4net.Tests/Util/TransformTest.cs
index b0e932ac..2b9b15f6 100644
--- a/src/log4net.Tests/Util/TransformTest.cs
+++ b/src/log4net.Tests/Util/TransformTest.cs
@@ -17,16 +17,20 @@
//
#endregion
+using System;
+using System.Reflection;
using log4net.Util;
-
using NUnit.Framework;
namespace log4net.Tests.Util;
-
[TestFixture]
public class TransformTest
{
+ /// <summary>
+ /// Verifies that a valid BMP character in the allowed XML 1.0 range (U+203B
※)
+ /// is passed through unchanged.
+ /// </summary>
[Test]
public void MaskXmlInvalidCharactersAllowsJapaneseCharacters()
{
@@ -34,10 +38,147 @@ public void
MaskXmlInvalidCharactersAllowsJapaneseCharacters()
Assert.That(Transform.MaskXmlInvalidCharacters(kome, "?"),
Is.EqualTo(kome));
}
+ /// <summary>
+ /// Verifies that the NUL character (U+0000), which is illegal in XML 1.0,
+ /// is replaced with the supplied mask string.
+ /// </summary>
[Test]
public void MaskXmlInvalidCharactersMasks0Char()
{
const string c = "\0";
Assert.That(Transform.MaskXmlInvalidCharacters(c, "?"), Is.EqualTo("?"));
}
+
+ /// <summary>
+ /// Verifies that a well-formed UTF-16 surrogate pair representing a
supplementary
+ /// character (U+1F511 🔑) is treated as a valid XML character and preserved
intact.
+ /// </summary>
+ [Test]
+ public void MaskXmlInvalidCharactersPreservesValidSurrogatePair()
+ {
+ // U+1F511 KEY (🔑) — encoded as surrogate pair \uD83D\uDD11
+ const string key = "\uD83D\uDD11";
+ Assert.That(Transform.MaskXmlInvalidCharacters(key, "?"), Is.EqualTo(key));
+ }
+
+ /// <summary>
+ /// Verifies that a supplementary character embedded within plain ASCII text
+ /// is preserved without corrupting the surrounding characters.
+ /// </summary>
+ [Test]
+ public void
MaskXmlInvalidCharactersPreservesSupplementaryCharacterInContext()
+ {
+ // "admin🔑" — only the emoji is a surrogate pair; all other chars are BMP
+ const string input = "admin\uD83D\uDD11";
+ Assert.That(Transform.MaskXmlInvalidCharacters(input, "?"),
Is.EqualTo(input));
+ }
+
+ /// <summary>
+ /// Verifies that a lone high surrogate (U+D800–U+DBFF) not followed by a
+ /// low surrogate is malformed UTF-16 and is replaced with the mask string.
+ /// </summary>
+ [Test]
+ public void MaskXmlInvalidCharactersMasksLoneHighSurrogate()
+ {
+ // A lone high surrogate with no following low surrogate is illegal
+ const string loneSurrogate = "\uD83D";
+ Assert.That(Transform.MaskXmlInvalidCharacters(loneSurrogate, "?"),
Is.EqualTo("?"));
+ }
+
+ /// <summary>
+ /// Verifies that a lone low surrogate (U+DC00–U+DFFF) not preceded by a
+ /// high surrogate is malformed UTF-16 and is replaced with the mask string.
+ /// </summary>
+ [Test]
+ public void MaskXmlInvalidCharactersMasksLoneLowSurrogate()
+ {
+ const string loneSurrogate = "\uDD11";
+ Assert.That(Transform.MaskXmlInvalidCharacters(loneSurrogate, "?"),
Is.EqualTo("?"));
+ }
+
+ /// <summary>
+ /// Verifies current behaviour: a null input throws <see
cref="ArgumentNullException"/>
+ /// (delegated to the regex engine). Add a null-guard to <see
cref="Transform.MaskXmlInvalidCharacters"/>
+ /// if a graceful fallback is preferred.
+ /// </summary>
+ [Test]
+ public void MaskXmlInvalidCharactersNullInputThrowsArgumentNullException()
+ => Assert.That(() => Transform.MaskXmlInvalidCharacters(null!, "?"),
+ Throws.ArgumentNullException);
+
+ #region CountSubstrings tests (private method accessed via reflection)
+
+ /// <summary>
+ /// Resolves the private static <see cref="CountSubstrings"/> method once
for all tests
+ /// </summary>
+ private static Func<string, string, int> CountSubstrings { get; } =
(Func<string, string, int>)Delegate.CreateDelegate(
+ typeof(Func<string, string, int>),
typeof(Transform).GetMethod(nameof(CountSubstrings), BindingFlags.NonPublic |
BindingFlags.Static)
+ ?? throw new MissingMethodException(nameof(Transform),
nameof(CountSubstrings)));
+
+ /// <summary>
+ /// Verifies that an empty text returns zero regardless of the substring.
+ /// </summary>
+ [Test]
+ public void CountSubstringsEmptyTextReturnsZero()
+ => Assert.That(CountSubstrings("", "a"), Is.Zero);
+
+ /// <summary>
+ /// Verifies that an empty substring returns zero regardless of the text.
+ /// </summary>
+ [Test]
+ public void CountSubstringsEmptySubstringReturnsZero()
+ => Assert.That(CountSubstrings("abc", ""), Is.Zero);
+
+ /// <summary>
+ /// Verifies that a substring not present in the text returns zero.
+ /// </summary>
+ [Test]
+ public void CountSubstringsNoMatchReturnsZero()
+ => Assert.That(CountSubstrings("hello", "x"), Is.Zero);
+
+ /// <summary>
+ /// Verifies that a single-character substring occurring once is counted
correctly.
+ /// </summary>
+ [Test]
+ public void CountSubstringsSingleCharSingleMatchReturnsOne()
+ => Assert.That(CountSubstrings("hello", "h"), Is.EqualTo(1));
+
+ /// <summary>
+ /// Verifies that a single-character substring occurring multiple times is
counted correctly.
+ /// </summary>
+ [Test]
+ public void CountSubstringsSingleCharMultipleMatchesReturnsCorrectCount()
+ => Assert.That(CountSubstrings("banana", "a"), Is.EqualTo(3));
+
+ /// <summary>
+ /// Verifies counting of XML special characters as used by
<c>WriteEscapedXmlString</c>.
+ /// </summary>
+ [TestCase("<hello><world>", "<", 2)]
+ [TestCase("<hello><world>", ">", 2)]
+ [TestCase("a&&b", "&", 2)]
+ public void CountSubstringsXmlSpecialCharsReturnsCorrectCount(string text,
string substring, int expected)
+ => Assert.That(CountSubstrings(text, substring), Is.EqualTo(expected));
+
+ /// <summary>
+ /// Verifies that the CDATA end token <c>]]></c> is counted correctly as
a multi-character substring.
+ /// </summary>
+ [Test]
+ public void CountSubstringsCdataEndReturnsCorrectCount()
+ => Assert.That(CountSubstrings("foo]]>bar]]>baz", "]]>"), Is.EqualTo(2));
+
+ /// <summary>
+ /// Verifies that non-overlapping occurrences are counted and overlapping
ones are not,
+ /// consistent with the method's documented assumption.
+ /// </summary>
+ [Test]
+ public void CountSubstringsNonOverlappingReturnsCorrectCount()
+ => Assert.That(CountSubstrings("aaaa", "aa"), Is.EqualTo(2));
+
+ /// <summary>
+ /// Verifies that a substring equal to the entire text counts as one match.
+ /// </summary>
+ [Test]
+ public void CountSubstringsSubstringEqualsTextReturnsOne()
+ => Assert.That(CountSubstrings("]]>", "]]>"), Is.EqualTo(1));
+ #endregion
}
\ No newline at end of file
diff --git a/src/log4net/Util/Transform.cs b/src/log4net/Util/Transform.cs
index b646974b..843716e6 100644
--- a/src/log4net/Util/Transform.cs
+++ b/src/log4net/Util/Transform.cs
@@ -17,8 +17,9 @@
//
#endregion
-using System.Xml;
+using System;
using System.Text.RegularExpressions;
+using System.Xml;
namespace log4net.Util;
@@ -110,9 +111,15 @@ public static void WriteEscapedXmlString(this XmlWriter
writer, string textData,
/// This method replaces any illegal characters in the input string
/// with the mask string specified.
/// </para>
+ /// <para>
+ /// Supplementary characters (U+10000�U+10FFFF) encoded as valid UTF-16
surrogate
+ /// pairs are preserved; only lone surrogates and other illegal code units
are masked.
+ /// </para>
/// </remarks>
- public static string MaskXmlInvalidCharacters(string textData, string mask)
- => _invalidchars.Replace(textData, mask);
+ public static string MaskXmlInvalidCharacters(string textData, string mask)
+ => _invalidChars.Replace(textData, m =>
+ // A valid surrogate pair represents a legal supplementary character �
preserve it.
+ m.Value.Length == 2 ? m.Value : mask);
/// <summary>
/// Count the number of times that the substring occurs in the text
@@ -127,29 +134,38 @@ public static string MaskXmlInvalidCharacters(string
textData, string mask)
/// </remarks>
private static int CountSubstrings(string text, string substring)
{
- int count = 0;
- int offset = 0;
- int length = text.Length;
- int substringLength = substring.Length;
-
- if (length == 0)
+ if (text.Length == 0 || substring.Length == 0)
{
return 0;
}
- if (substringLength == 0)
+
+ // Use the char overload for single-character substrings � avoids string
+ // comparison overhead; all current callers pass single ASCII characters.
+ if (substring.Length == 1)
{
- return 0;
+ int charCount = 0;
+ char target = substring[0];
+ foreach (char c in text)
+ {
+ if (c == target)
+ {
+ charCount++;
+ }
+ }
+ return charCount;
}
- while (offset < length)
+ int count = 0;
+ int offset = 0;
+ int substringLength = substring.Length;
+ while (offset < text.Length)
{
- int index = text.IndexOf(substring, offset);
-
+ // Ordinal avoids culture-sensitive comparison for these ASCII-only
tokens.
+ int index = text.IndexOf(substring, offset, StringComparison.Ordinal);
if (index == -1)
{
break;
}
-
count++;
offset = index + substringLength;
}
@@ -160,7 +176,12 @@ private static int CountSubstrings(string text, string
substring)
private const string CdataUnescapableToken = "]]";
/// <summary>
- /// Characters illegal in XML 1.0
+ /// Matches either a valid UTF-16 surrogate pair (preserved) or a single
character
+ /// that is illegal in XML 1.0 (replaced). The surrogate-pair alternative
must come
+ /// first so the engine consumes both code units together before the
single-char
+ /// alternative can match them individually.
/// </summary>
- private static readonly Regex _invalidchars =
new(@"[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD]", RegexOptions.Compiled);
-}
+ private static readonly Regex _invalidChars = new(
+ @"[\uD800-\uDBFF][\uDC00-\uDFFF]|[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD]",
+ RegexOptions.Compiled);
+}
\ No newline at end of file