This is an automated email from the ASF dual-hosted git repository.

freeandnil pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/logging-log4net.git


The following commit(s) were added to refs/heads/master by this push:
     new 7fcc7f42 preserve UTF-16 surrogate pairs in MaskXmlInvalidCharacters - 
fixes #290 (#291)
7fcc7f42 is described below

commit 7fcc7f425b81b9df7c8a8719ed90953b938cd10e
Author: Jan Friedrich <[email protected]>
AuthorDate: Thu Apr 16 18:14:27 2026 +0200

    preserve UTF-16 surrogate pairs in MaskXmlInvalidCharacters - fixes #290 
(#291)
    
    The regex [^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD] operated on individual
    UTF-16 char units, causing both halves of a valid surrogate pair to be
    replaced, silently corrupting supplementary characters (U+10000–U+10FFFF)
    such as emoji in XML log output.
    
    Fix by prepending a surrogate-pair alternative to the regex so valid pairs
    are matched and preserved as a unit; only lone surrogates and other
    XML-illegal code units are replaced with the mask string.
    
    Also optimise CountSubstrings: use a char loop for single-character
    substrings (all current callers) and StringComparison.Ordinal for the
    multi-character CDATA token path.
    
    Add unit tests covering surrogate pair preservation, lone surrogates,
    and CountSubstrings edge cases.
---
 ...rogate-pairs-in-mask-xml-invalid-characters.xml |  13 ++
 src/log4net.Tests/Util/TransformTest.cs            | 145 ++++++++++++++++++++-
 src/log4net/Util/Transform.cs                      |  57 +++++---
 3 files changed, 195 insertions(+), 20 deletions(-)

diff --git 
a/src/changelog/3.3.1/291-fix-surrogate-pairs-in-mask-xml-invalid-characters.xml
 
b/src/changelog/3.3.1/291-fix-surrogate-pairs-in-mask-xml-invalid-characters.xml
new file mode 100644
index 00000000..9ecba009
--- /dev/null
+++ 
b/src/changelog/3.3.1/291-fix-surrogate-pairs-in-mask-xml-invalid-characters.xml
@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<entry xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance";
+       xmlns="https://logging.apache.org/xml/ns";
+       xsi:schemaLocation="https://logging.apache.org/xml/ns 
https://logging.apache.org/xml/ns/log4j-changelog-0.xsd";
+       type="fixed">
+  <issue id="290" link="https://github.com/apache/logging-log4net/issues/290"/>
+  <issue id="291" link="https://github.com/apache/logging-log4net/pull/291"/>
+  <description format="asciidoc">
+    fix silent corruption of supplementary Unicode characters 
(U+10000–U+10FFFF) in `MaskXmlInvalidCharacters`:
+    valid UTF-16 surrogate pairs are now preserved instead of being replaced 
with the mask string
+    (reported by @N0tre3l, fixed by @freeandnil in 
https://github.com/apache/logging-log4net/pull/291[#291])
+  </description>
+</entry>
\ No newline at end of file
diff --git a/src/log4net.Tests/Util/TransformTest.cs 
b/src/log4net.Tests/Util/TransformTest.cs
index b0e932ac..2b9b15f6 100644
--- a/src/log4net.Tests/Util/TransformTest.cs
+++ b/src/log4net.Tests/Util/TransformTest.cs
@@ -17,16 +17,20 @@
 //
 #endregion
 
+using System;
+using System.Reflection;
 using log4net.Util;
-
 using NUnit.Framework;
 
 namespace log4net.Tests.Util;
 
-
 [TestFixture]
 public class TransformTest
 {
+  /// <summary>
+  /// Verifies that a valid BMP character in the allowed XML 1.0 range (U+203B 
※)
+  /// is passed through unchanged.
+  /// </summary>
   [Test]
   public void MaskXmlInvalidCharactersAllowsJapaneseCharacters()
   {
@@ -34,10 +38,147 @@ public void 
MaskXmlInvalidCharactersAllowsJapaneseCharacters()
     Assert.That(Transform.MaskXmlInvalidCharacters(kome, "?"), 
Is.EqualTo(kome));
   }
 
+  /// <summary>
+  /// Verifies that the NUL character (U+0000), which is illegal in XML 1.0,
+  /// is replaced with the supplied mask string.
+  /// </summary>
   [Test]
   public void MaskXmlInvalidCharactersMasks0Char()
   {
     const string c = "\0";
     Assert.That(Transform.MaskXmlInvalidCharacters(c, "?"), Is.EqualTo("?"));
   }
+
+  /// <summary>
+  /// Verifies that a well-formed UTF-16 surrogate pair representing a 
supplementary
+  /// character (U+1F511 🔑) is treated as a valid XML character and preserved 
intact.
+  /// </summary>
+  [Test]
+  public void MaskXmlInvalidCharactersPreservesValidSurrogatePair()
+  {
+    // U+1F511 KEY (🔑) — encoded as surrogate pair \uD83D\uDD11
+    const string key = "\uD83D\uDD11";
+    Assert.That(Transform.MaskXmlInvalidCharacters(key, "?"), Is.EqualTo(key));
+  }
+
+  /// <summary>
+  /// Verifies that a supplementary character embedded within plain ASCII text
+  /// is preserved without corrupting the surrounding characters.
+  /// </summary>
+  [Test]
+  public void 
MaskXmlInvalidCharactersPreservesSupplementaryCharacterInContext()
+  {
+    // "admin🔑" — only the emoji is a surrogate pair; all other chars are BMP
+    const string input = "admin\uD83D\uDD11";
+    Assert.That(Transform.MaskXmlInvalidCharacters(input, "?"), 
Is.EqualTo(input));
+  }
+
+  /// <summary>
+  /// Verifies that a lone high surrogate (U+D800–U+DBFF) not followed by a
+  /// low surrogate is malformed UTF-16 and is replaced with the mask string.
+  /// </summary>
+  [Test]
+  public void MaskXmlInvalidCharactersMasksLoneHighSurrogate()
+  {
+    // A lone high surrogate with no following low surrogate is illegal
+    const string loneSurrogate = "\uD83D";
+    Assert.That(Transform.MaskXmlInvalidCharacters(loneSurrogate, "?"), 
Is.EqualTo("?"));
+  }
+
+  /// <summary>
+  /// Verifies that a lone low surrogate (U+DC00–U+DFFF) not preceded by a
+  /// high surrogate is malformed UTF-16 and is replaced with the mask string.
+  /// </summary>
+  [Test]
+  public void MaskXmlInvalidCharactersMasksLoneLowSurrogate()
+  {
+    const string loneSurrogate = "\uDD11";
+    Assert.That(Transform.MaskXmlInvalidCharacters(loneSurrogate, "?"), 
Is.EqualTo("?"));
+  }
+
+  /// <summary>
+  /// Verifies current behaviour: a null input throws <see 
cref="ArgumentNullException"/>
+  /// (delegated to the regex engine). Add a null-guard to <see 
cref="Transform.MaskXmlInvalidCharacters"/>
+  /// if a graceful fallback is preferred.
+  /// </summary>
+  [Test]
+  public void MaskXmlInvalidCharactersNullInputThrowsArgumentNullException()
+    => Assert.That(() => Transform.MaskXmlInvalidCharacters(null!, "?"),
+        Throws.ArgumentNullException);
+
+  #region CountSubstrings tests (private method accessed via reflection)
+
+  /// <summary>
+  /// Resolves the private static <see cref="CountSubstrings"/> method once 
for all tests
+  /// </summary>
+  private static Func<string, string, int> CountSubstrings { get; } = 
(Func<string, string, int>)Delegate.CreateDelegate(
+    typeof(Func<string, string, int>), 
typeof(Transform).GetMethod(nameof(CountSubstrings), BindingFlags.NonPublic | 
BindingFlags.Static)
+    ?? throw new MissingMethodException(nameof(Transform), 
nameof(CountSubstrings)));
+
+  /// <summary>
+  /// Verifies that an empty text returns zero regardless of the substring.
+  /// </summary>
+  [Test]
+  public void CountSubstringsEmptyTextReturnsZero()
+    => Assert.That(CountSubstrings("", "a"), Is.Zero);
+
+  /// <summary>
+  /// Verifies that an empty substring returns zero regardless of the text.
+  /// </summary>
+  [Test]
+  public void CountSubstringsEmptySubstringReturnsZero()
+    => Assert.That(CountSubstrings("abc", ""), Is.Zero);
+
+  /// <summary>
+  /// Verifies that a substring not present in the text returns zero.
+  /// </summary>
+  [Test]
+  public void CountSubstringsNoMatchReturnsZero()
+    => Assert.That(CountSubstrings("hello", "x"), Is.Zero);
+
+  /// <summary>
+  /// Verifies that a single-character substring occurring once is counted 
correctly.
+  /// </summary>
+  [Test]
+  public void CountSubstringsSingleCharSingleMatchReturnsOne()
+    => Assert.That(CountSubstrings("hello", "h"), Is.EqualTo(1));
+
+  /// <summary>
+  /// Verifies that a single-character substring occurring multiple times is 
counted correctly.
+  /// </summary>
+  [Test]
+  public void CountSubstringsSingleCharMultipleMatchesReturnsCorrectCount()
+    => Assert.That(CountSubstrings("banana", "a"), Is.EqualTo(3));
+
+  /// <summary>
+  /// Verifies counting of XML special characters as used by 
<c>WriteEscapedXmlString</c>.
+  /// </summary>
+  [TestCase("<hello><world>", "<", 2)]
+  [TestCase("<hello><world>", ">", 2)]
+  [TestCase("a&amp;&amp;b", "&", 2)]
+  public void CountSubstringsXmlSpecialCharsReturnsCorrectCount(string text, 
string substring, int expected)
+    => Assert.That(CountSubstrings(text, substring), Is.EqualTo(expected));
+
+  /// <summary>
+  /// Verifies that the CDATA end token <c>]]&gt;</c> is counted correctly as 
a multi-character substring.
+  /// </summary>
+  [Test]
+  public void CountSubstringsCdataEndReturnsCorrectCount()
+    => Assert.That(CountSubstrings("foo]]>bar]]>baz", "]]>"), Is.EqualTo(2));
+
+  /// <summary>
+  /// Verifies that non-overlapping occurrences are counted and overlapping 
ones are not,
+  /// consistent with the method's documented assumption.
+  /// </summary>
+  [Test]
+  public void CountSubstringsNonOverlappingReturnsCorrectCount()
+    => Assert.That(CountSubstrings("aaaa", "aa"), Is.EqualTo(2));
+
+  /// <summary>
+  /// Verifies that a substring equal to the entire text counts as one match.
+  /// </summary>
+  [Test]
+  public void CountSubstringsSubstringEqualsTextReturnsOne()
+    => Assert.That(CountSubstrings("]]>", "]]>"), Is.EqualTo(1));
+  #endregion
 }
\ No newline at end of file
diff --git a/src/log4net/Util/Transform.cs b/src/log4net/Util/Transform.cs
index b646974b..843716e6 100644
--- a/src/log4net/Util/Transform.cs
+++ b/src/log4net/Util/Transform.cs
@@ -17,8 +17,9 @@
 //
 #endregion
 
-using System.Xml;
+using System;
 using System.Text.RegularExpressions;
+using System.Xml;
 
 namespace log4net.Util;
 
@@ -110,9 +111,15 @@ public static void WriteEscapedXmlString(this XmlWriter 
writer, string textData,
   /// This method replaces any illegal characters in the input string
   /// with the mask string specified.
   /// </para>
+  /// <para>
+  /// Supplementary characters (U+10000�U+10FFFF) encoded as valid UTF-16 
surrogate
+  /// pairs are preserved; only lone surrogates and other illegal code units 
are masked.
+  /// </para>
   /// </remarks>
-  public static string MaskXmlInvalidCharacters(string textData, string mask) 
-    => _invalidchars.Replace(textData, mask);
+  public static string MaskXmlInvalidCharacters(string textData, string mask)
+    => _invalidChars.Replace(textData, m =>
+        // A valid surrogate pair represents a legal supplementary character � 
preserve it.
+        m.Value.Length == 2 ? m.Value : mask);
 
   /// <summary>
   /// Count the number of times that the substring occurs in the text
@@ -127,29 +134,38 @@ public static string MaskXmlInvalidCharacters(string 
textData, string mask)
   /// </remarks>
   private static int CountSubstrings(string text, string substring)
   {
-    int count = 0;
-    int offset = 0;
-    int length = text.Length;
-    int substringLength = substring.Length;
-
-    if (length == 0)
+    if (text.Length == 0 || substring.Length == 0)
     {
       return 0;
     }
-    if (substringLength == 0)
+
+    // Use the char overload for single-character substrings � avoids string
+    // comparison overhead; all current callers pass single ASCII characters.
+    if (substring.Length == 1)
     {
-      return 0;
+      int charCount = 0;
+      char target = substring[0];
+      foreach (char c in text)
+      {
+        if (c == target)
+        {
+          charCount++;
+        }
+      }
+      return charCount;
     }
 
-    while (offset < length)
+    int count = 0;
+    int offset = 0;
+    int substringLength = substring.Length;
+    while (offset < text.Length)
     {
-      int index = text.IndexOf(substring, offset);
-
+      // Ordinal avoids culture-sensitive comparison for these ASCII-only 
tokens.
+      int index = text.IndexOf(substring, offset, StringComparison.Ordinal);
       if (index == -1)
       {
         break;
       }
-
       count++;
       offset = index + substringLength;
     }
@@ -160,7 +176,12 @@ private static int CountSubstrings(string text, string 
substring)
   private const string CdataUnescapableToken = "]]";
 
   /// <summary>
-  /// Characters illegal in XML 1.0
+  /// Matches either a valid UTF-16 surrogate pair (preserved) or a single 
character
+  /// that is illegal in XML 1.0 (replaced). The surrogate-pair alternative 
must come
+  /// first so the engine consumes both code units together before the 
single-char
+  /// alternative can match them individually.
   /// </summary>
-  private static readonly Regex _invalidchars = 
new(@"[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD]", RegexOptions.Compiled);
-}
+  private static readonly Regex _invalidChars = new(
+    @"[\uD800-\uDBFF][\uDC00-\uDFFF]|[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD]",
+    RegexOptions.Compiled);
+}
\ No newline at end of file

Reply via email to