(opennlp) 02/05: OPENNLP-1850 - Add robust character sequence normalization utilities and tests

kristian Thu, 18 Jun 2026 22:14:15 -0700

This is an automated email from the ASF dual-hosted git repository.

krickert pushed a commit to branch OPENNLP-1850_Whitespace-UTF-Normalizae
in repository https://gitbox.apache.org/repos/asf/opennlp.git


commit 0d53e31bb3f6774ad4eb2ad3c8171b64ef5bfdce
Author: Kristian Rickert <[email protected]>
AuthorDate: Thu Jun 18 22:28:13 2026 -0400

    OPENNLP-1850 - Add robust character sequence normalization utilities and 
tests
    
    Co-authored-by: Junie <[email protected]>
    Signed-off-by: Kristian Rickert <[email protected]>
---
 opennlp-api/pom.xml                                |   6 +
 .../opennlp/tools/util/normalizer/CharClass.java   | 383 +++++++++++++++++++++
 .../tools/util/normalizer/CodePointSet.java        | 245 +++++++++++++
 .../tools/util/normalizer/NormalizedText.java      |  51 +++
 .../opennlp/tools/util/normalizer/OffsetMap.java   | 135 ++++++++
 .../opennlp/tools/util/normalizer/UnicodeDash.java | 189 ++++++++++
 .../tools/util/normalizer/UnicodeWhitespace.java   | 242 +++++++++++++
 .../tools/util/normalizer/CharClassTest.java       | 292 ++++++++++++++++
 .../tools/util/normalizer/CodePointSetTest.java    | 241 +++++++++++++
 .../tools/util/normalizer/UnicodeDashTest.java     | 170 +++++++++
 .../util/normalizer/UnicodeWhitespaceTest.java     | 239 +++++++++++++
 .../src/main/java/opennlp/dl/AbstractDL.java       |  33 ++
 .../opennlp/dl/doccat/DocumentCategorizerDL.java   |  13 +-
 .../java/opennlp/dl/namefinder/NameFinderDL.java   |  90 +++--
 .../java/opennlp/dl/AbstractDLChunkingTest.java    |  61 ++++
 .../opennlp/dl/namefinder/NameFinderDLTest.java    |  33 +-
 .../AccentFoldCharSequenceNormalizer.java          | 133 +++++++
 .../normalizer/CaseFoldCharSequenceNormalizer.java |  47 +++
 .../normalizer/DashCharSequenceNormalizer.java     |  45 +++
 .../util/normalizer/NfcCharSequenceNormalizer.java |  45 +++
 .../normalizer/NfkcCharSequenceNormalizer.java     |  46 +++
 .../WhitespaceCharSequenceNormalizer.java          |  46 +++
 .../AccentFoldCharSequenceNormalizerTest.java      | 115 +++++++
 .../UnicodeCharSequenceNormalizerTest.java         |  97 ++++++
 24 files changed, 2960 insertions(+), 37 deletions(-)

diff --git a/opennlp-api/pom.xml b/opennlp-api/pom.xml
index 05404d154..516d9baec 100644
--- a/opennlp-api/pom.xml
+++ b/opennlp-api/pom.xml
@@ -49,6 +49,12 @@
       <artifactId>junit-jupiter-engine</artifactId>
       <scope>test</scope>
     </dependency>
+
+    <dependency>
+      <groupId>org.junit.jupiter</groupId>
+      <artifactId>junit-jupiter-params</artifactId>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
   
 </project>
\ No newline at end of file
diff --git 
a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/CharClass.java 
b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/CharClass.java
new file mode 100644
index 000000000..766f3324e
--- /dev/null
+++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/CharClass.java
@@ -0,0 +1,383 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+import opennlp.tools.util.Span;
+
+/**
+ * A configurable class of Unicode code points, bundled with the cursor-driven text operations
+ * that act on it.
+ *
+ * <p>Each {@code CharClass} couples a {@link CodePointSet} holding its member code points with
+ * one canonical {@code replacement} code point. Two presets ship with the class,
+ * {@link #whitespace()} and {@link #dashes()}; every other class is simply another instance
+ * built through {@link #of(CodePointSet, int)}, with no additional engine code.</p>
+ *
+ * <p>The operations scan the text by code point: decode with
+ * {@link Character#codePointAt(CharSequence, int)}, test membership, act, and step forward by
+ * {@link Character#charCount(int)} ({@link #trim(CharSequence)} additionally walks inward from
+ * the end). No regular expression and no {@link java.util.regex} object is involved, and
+ * membership never depends on {@link Character#isWhitespace(int)} or
+ * {@link Character#isSpaceChar(int)}, both of which disagree with the Unicode standard.</p>
+ *
+ * <p>Instances are immutable and thread-safe.</p>
+ */
+public final class CharClass {
+
+  private static final CharClass WHITESPACE =
+      new CharClass(CodePointSet.of(UnicodeWhitespace.codePoints()), 0x0020);
+  private static final CharClass DASHES =
+      new CharClass(CodePointSet.of(UnicodeDash.defaultDashCodePoints()), UnicodeDash.HYPHEN_MINUS);
+
+  private final CodePointSet members;
+  private final int replacement;
+
+  // Callers are expected to have validated both arguments already.
+  private CharClass(CodePointSet members, int replacement) {
+    this.members = members;
+    this.replacement = replacement;
+  }
+
+  /**
+   * Builds a class from its member set and its replacement code point.
+   *
+   * @param members The member code points.
+   * @param replacement The canonical code point emitted by {@link #normalize(CharSequence)} and
+   *     {@link #collapse(CharSequence)}.
+   * @return The class.
+   * @throws NullPointerException Thrown if {@code members} is {@code null}.
+   * @throws IllegalArgumentException Thrown if {@code replacement} is not a valid code point.
+   */
+  public static CharClass of(CodePointSet members, int replacement) {
+    Objects.requireNonNull(members, "members");
+    requireValidCodePoint(replacement);
+    return new CharClass(members, replacement);
+  }
+
+  /**
+   * Returns the shared whitespace preset.
+   *
+   * @return The preset whose members come from {@link UnicodeWhitespace#codePoints()} and whose
+   *     replacement is {@code U+0020}.
+   */
+  public static CharClass whitespace() {
+    return WHITESPACE;
+  }
+
+  /**
+   * Returns the shared dash preset.
+   *
+   * @return The preset whose members come from {@link UnicodeDash#defaultDashCodePoints()} and
+   *     whose replacement is {@link UnicodeDash#HYPHEN_MINUS}.
+   */
+  public static CharClass dashes() {
+    return DASHES;
+  }
+
+  /**
+   * Derives a class with the same replacement whose member set also contains {@code extra}
+   * (for example, user-defined code points loaded from {@link CodePointSet#fromFile}).
+   *
+   * @param extra The additional member code points.
+   * @return A new {@code CharClass}; this instance is left untouched.
+   */
+  public CharClass withAdditional(CodePointSet extra) {
+    Objects.requireNonNull(extra, "extra");
+    final CodePointSet widened = members.union(extra);
+    return new CharClass(widened, replacement);
+  }
+
+  /**
+   * Returns the member set.
+   *
+   * @return The member code points of this class.
+   */
+  public CodePointSet members() {
+    return members;
+  }
+
+  /**
+   * Returns the replacement.
+   *
+   * @return The canonical replacement code point.
+   */
+  public int replacement() {
+    return replacement;
+  }
+
+  /**
+   * Checks whether a code point belongs to this class.
+   *
+   * @param codePoint The code point to test.
+   * @return {@code true} if the code point is a member of this class.
+   */
+  public boolean contains(int codePoint) {
+    return members.contains(codePoint);
+  }
+
+  /**
+   * Locates the maximal runs of non-member code points and reports each as a character span
+   * into the original text. Member runs act purely as delimiters, so no empty span is produced.
+   *
+   * @param text The text to split.
+   * @return The token spans, in order.
+   */
+  public List<Span> splitSpans(CharSequence text) {
+    Objects.requireNonNull(text, "text");
+    final int end = text.length();
+    final List<Span> result = new ArrayList<>();
+    // Start of the token currently being scanned, or -1 while between tokens.
+    int open = -1;
+    for (int pos = 0; pos < end; ) {
+      final int cp = Character.codePointAt(text, pos);
+      final boolean delimiter = members.contains(cp);
+      if (delimiter && open >= 0) {
+        result.add(new Span(open, pos));
+        open = -1;
+      } else if (!delimiter && open < 0) {
+        open = pos;
+      }
+      pos += Character.charCount(cp);
+    }
+    // A token that runs to the end of the text is closed here.
+    if (open >= 0) {
+      result.add(new Span(open, end));
+    }
+    return result;
+  }
+
+  /**
+   * Splits text into the maximal runs of non-member code points, materialized as strings.
+   *
+   * @param text The text to split.
+   * @return The tokens, in order, with no empty entries.
+   */
+  public String[] split(CharSequence text) {
+    final List<Span> spans = splitSpans(text);
+    final String[] tokens = new String[spans.size()];
+    int next = 0;
+    for (final Span span : spans) {
+      tokens[next++] = text.subSequence(span.getStart(), span.getEnd()).toString();
+    }
+    return tokens;
+  }
+
+  /**
+   * Substitutes the replacement for every member code point, one for one.
+   *
+   * @param text The text to normalize.
+   * @return The normalized text.
+   */
+  public String normalize(CharSequence text) {
+    Objects.requireNonNull(text, "text");
+    final int end = text.length();
+    final StringBuilder buffer = new StringBuilder(end);
+    for (int pos = 0; pos < end; ) {
+      final int cp = Character.codePointAt(text, pos);
+      if (members.contains(cp)) {
+        buffer.appendCodePoint(replacement);
+      } else {
+        buffer.appendCodePoint(cp);
+      }
+      pos += Character.charCount(cp);
+    }
+    return buffer.toString();
+  }
+
+  /**
+   * Shrinks every maximal run of member code points to one replacement.
+   *
+   * @param text The text to collapse.
+   * @return The collapsed text.
+   */
+  public String collapse(CharSequence text) {
+    Objects.requireNonNull(text, "text");
+    final int end = text.length();
+    final StringBuilder buffer = new StringBuilder(end);
+    for (int pos = 0; pos < end; ) {
+      final int cp = Character.codePointAt(text, pos);
+      if (!members.contains(cp)) {
+        buffer.appendCodePoint(cp);
+        pos += Character.charCount(cp);
+        continue;
+      }
+      // One replacement stands in for the whole run; jump straight past it.
+      buffer.appendCodePoint(replacement);
+      pos = skipRun(text, pos);
+    }
+    return buffer.toString();
+  }
+
+  /**
+   * Collapses member runs like {@link #collapse(CharSequence)}, except that a run containing at
+   * least one code point from {@code keep} is emitted as {@code keepReplacement} rather than the
+   * usual replacement. The whitespace "squish" that preserves a line break uses this with the
+   * line-break code points as {@code keep} and {@code '\n'} as {@code keepReplacement}.
+   *
+   * @param text The text to collapse.
+   * @param keep The member code points whose presence in a run preserves structure.
+   * @param keepReplacement The replacement emitted for a run that contains a {@code keep} member.
+   * @return The collapsed text.
+   * @throws IllegalArgumentException Thrown if {@code keepReplacement} is not a valid code point.
+   */
+  public String collapsePreserving(CharSequence text, CodePointSet keep, int keepReplacement) {
+    Objects.requireNonNull(text, "text");
+    Objects.requireNonNull(keep, "keep");
+    requireValidCodePoint(keepReplacement);
+    final int end = text.length();
+    final StringBuilder buffer = new StringBuilder(end);
+    for (int pos = 0; pos < end; ) {
+      final int cp = Character.codePointAt(text, pos);
+      if (!members.contains(cp)) {
+        buffer.appendCodePoint(cp);
+        pos += Character.charCount(cp);
+        continue;
+      }
+      // Walk the whole run (starting with cp itself), noting whether any member is in keep.
+      boolean sawKeep = false;
+      int scan = pos;
+      while (scan < end) {
+        final int member = Character.codePointAt(text, scan);
+        if (!members.contains(member)) {
+          break;
+        }
+        if (keep.contains(member)) {
+          sawKeep = true;
+        }
+        scan += Character.charCount(member);
+      }
+      buffer.appendCodePoint(sawKeep ? keepReplacement : replacement);
+      pos = scan;
+    }
+    return buffer.toString();
+  }
+
+  /**
+   * Strips member code points from both ends of the text.
+   *
+   * @param text The text to trim.
+   * @return The trimmed text.
+   */
+  public String trim(CharSequence text) {
+    Objects.requireNonNull(text, "text");
+    // The leading run of members is exactly what skipRun measures from offset 0.
+    final int head = skipRun(text, 0);
+    int tail = text.length();
+    while (tail > head) {
+      final int last = Character.codePointBefore(text, tail);
+      if (!members.contains(last)) {
+        break;
+      }
+      tail -= Character.charCount(last);
+    }
+    return text.subSequence(head, tail).toString();
+  }
+
+  /**
+   * Drops every member code point from the text.
+   *
+   * @param text The text to filter.
+   * @return The text with all members removed.
+   */
+  public String removeAll(CharSequence text) {
+    Objects.requireNonNull(text, "text");
+    final int end = text.length();
+    final StringBuilder buffer = new StringBuilder(end);
+    for (int pos = 0; pos < end; ) {
+      final int cp = Character.codePointAt(text, pos);
+      if (!members.contains(cp)) {
+        buffer.appendCodePoint(cp);
+      }
+      pos += Character.charCount(cp);
+    }
+    return buffer.toString();
+  }
+
+  /**
+   * Performs {@link #normalize(CharSequence)} and additionally records the {@link OffsetMap}
+   * that leads back to the original text.
+   *
+   * @param text The text to normalize.
+   * @return The normalized text and its offset map.
+   */
+  public NormalizedText normalizeMapped(CharSequence text) {
+    Objects.requireNonNull(text, "text");
+    final int end = text.length();
+    final StringBuilder buffer = new StringBuilder(end);
+    final OffsetMap.Builder map = new OffsetMap.Builder();
+    for (int pos = 0; pos < end; ) {
+      final int cp = Character.codePointAt(text, pos);
+      if (members.contains(cp)) {
+        // Every output char of the replacement points at the start of the replaced member.
+        appendMapped(buffer, replacement, map, pos, pos);
+      } else {
+        // A copied supplementary code point maps its low surrogate to the next original char.
+        appendMapped(buffer, cp, map, pos, pos + 1);
+      }
+      pos += Character.charCount(cp);
+    }
+    return new NormalizedText(text, buffer.toString(), map.build(end));
+  }
+
+  /**
+   * Performs {@link #collapse(CharSequence)} and additionally records the {@link OffsetMap}
+   * that leads back to the original text. Each collapsed run maps to the run's start offset.
+   *
+   * @param text The text to collapse.
+   * @return The collapsed text and its offset map.
+   */
+  public NormalizedText collapseMapped(CharSequence text) {
+    Objects.requireNonNull(text, "text");
+    final int end = text.length();
+    final StringBuilder buffer = new StringBuilder(end);
+    final OffsetMap.Builder map = new OffsetMap.Builder();
+    for (int pos = 0; pos < end; ) {
+      final int cp = Character.codePointAt(text, pos);
+      if (!members.contains(cp)) {
+        appendMapped(buffer, cp, map, pos, pos + 1);
+        pos += Character.charCount(cp);
+        continue;
+      }
+      appendMapped(buffer, replacement, map, pos, pos);
+      pos = skipRun(text, pos);
+    }
+    return new NormalizedText(text, buffer.toString(), map.build(end));
+  }
+
+  // Writes one code point to the output and records one original offset per output char:
+  // firstOffset for the first (or only) char, secondOffset for the low surrogate when the
+  // code point is supplementary.
+  private static void appendMapped(StringBuilder out, int codePoint, OffsetMap.Builder offsets,
+                                   int firstOffset, int secondOffset) {
+    if (!Character.isBmpCodePoint(codePoint)) {
+      out.append(Character.highSurrogate(codePoint));
+      offsets.map(firstOffset);
+      out.append(Character.lowSurrogate(codePoint));
+      offsets.map(secondOffset);
+      return;
+    }
+    out.append((char) codePoint);
+    offsets.map(firstOffset);
+  }
+
+  // Returns the offset just past the maximal run of members beginning at runStart
+  // (runStart itself when the code point there is not a member).
+  private int skipRun(CharSequence text, int runStart) {
+    final int end = text.length();
+    int pos = runStart;
+    while (pos < end) {
+      final int cp = Character.codePointAt(text, pos);
+      if (!members.contains(cp)) {
+        return pos;
+      }
+      pos += Character.charCount(cp);
+    }
+    return pos;
+  }
+
+  // Rejects anything outside [0, U+10FFFF].
+  private static void requireValidCodePoint(int codePoint) {
+    if (!Character.isValidCodePoint(codePoint)) {
+      throw new IllegalArgumentException("Not a Unicode code point: " + codePoint);
+    }
+  }
+}
diff --git 
a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/CodePointSet.java 
b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/CodePointSet.java
new file mode 100644
index 000000000..a15b005b0
--- /dev/null
+++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/CodePointSet.java
@@ -0,0 +1,245 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.BitSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Objects;
+
+/**
+ * An immutable set of Unicode code points with O(1) membership.
+ *
+ * <p>Backed by a {@link BitSet} keyed directly by code point, so {@link #contains(int)} is a
+ * range check followed by a bit lookup, with no boxing or hashing. Memory is bounded by the
+ * largest member code point (the whole of Unicode would cost about {@code 136 KiB}, and the
+ * standard whitespace and dash sets are entirely or almost entirely in the Basic Multilingual
+ * Plane, so a few kilobytes in practice).</p>
+ *
+ * <p>This type carries no opinion about what the code points mean. It is the explicit,
+ * standards-sourced data layer that {@link CharClass} and the reference tables
+ * ({@link UnicodeWhitespace}, {@link UnicodeDash}) are built from, and that users extend or
+ * override through {@link #fromFile(Path, String)}.</p>
+ */
+public final class CodePointSet {
+
+  // Never mutated after construction; every factory hands over a freshly built BitSet.
+  private final BitSet members;
+
+  private CodePointSet(BitSet members) {
+    this.members = members;
+  }
+
+  /**
+   * Creates a set from explicit code points.
+   *
+   * @param codePoints The code points to include.
+   * @return The set.
+   * @throws NullPointerException Thrown if {@code codePoints} is {@code null}.
+   * @throws IllegalArgumentException Thrown if any value is not a valid Unicode code point
+   *     (outside {@code [0, U+10FFFF]}).
+   */
+  public static CodePointSet of(int... codePoints) {
+    Objects.requireNonNull(codePoints, "codePoints");
+    final BitSet members = new BitSet();
+    for (final int codePoint : codePoints) {
+      requireValid(codePoint);
+      members.set(codePoint);
+    }
+    return new CodePointSet(members);
+  }
+
+  /**
+   * Creates a set covering an inclusive code point range.
+   *
+   * @param firstInclusive The first code point in the range.
+   * @param lastInclusive The last code point in the range.
+   * @return The set.
+   * @throws IllegalArgumentException Thrown if either bound is invalid or {@code firstInclusive}
+   *     is greater than {@code lastInclusive}.
+   */
+  public static CodePointSet ofRange(int firstInclusive, int lastInclusive) {
+    requireValid(firstInclusive);
+    requireValid(lastInclusive);
+    if (firstInclusive > lastInclusive) {
+      throw new IllegalArgumentException("Range start " + firstInclusive
+          + " must not exceed range end " + lastInclusive + ".");
+    }
+    final BitSet members = new BitSet();
+    // BitSet.set(from, to) takes an exclusive upper bound.
+    members.set(firstInclusive, lastInclusive + 1);
+    return new CodePointSet(members);
+  }
+
+  /**
+   * Loads the code points declared under one section of a user definitions file.
+   *
+   * <p>The format is line oriented and parsed with simple cursor scanning, not a regular
+   * expression: a {@code [name]} line opens a section; a {@code #} begins a comment that runs to
+   * end of line; each remaining line is a single hex code point ({@code U+00A0}, {@code 0x00A0},
+   * or {@code 00A0}) or an inclusive range ({@code U+2000-U+200A}). Section names match case
+   * insensitively. Only entries under the requested section are returned, so one file can carry,
+   * for example, both {@code [whitespace]} and {@code [dash]} sections. Entries that belong to
+   * other sections are skipped without being parsed, so a malformed entry there is not
+   * reported.</p>
+   *
+   * @param definitions The file to read (UTF-8).
+   * @param section The section whose entries should be loaded.
+   * @return The code points declared under {@code section}, or an empty set if the section is
+   *     absent.
+   * @throws NullPointerException Thrown if {@code definitions} or {@code section} is
+   *     {@code null}; both are checked before the file is opened.
+   * @throws IOException Thrown if the file cannot be read.
+   * @throws IllegalArgumentException Thrown if a section header, or an entry of the requested
+   *     section, is malformed; the message names the offending line.
+   */
+  public static CodePointSet fromFile(Path definitions, String section) throws IOException {
+    Objects.requireNonNull(definitions, "definitions");
+    // Checked up front so a bad argument is never masked by an IOException and never costs a read.
+    Objects.requireNonNull(section, "section");
+    return parse(Files.readAllLines(definitions, StandardCharsets.UTF_8), section);
+  }
+
+  // Package visible so the parser can be exercised directly, without a temporary file.
+  static CodePointSet parse(List<String> lines, String section) {
+    Objects.requireNonNull(section, "section");
+    Objects.requireNonNull(lines, "lines");
+    final String wanted = section.trim().toLowerCase(Locale.ROOT);
+    final BitSet members = new BitSet();
+    // Lower-cased name of the section the cursor is in; null until the first header is seen.
+    String current = null;
+
+    for (int i = 0; i < lines.size(); i++) {
+      final String raw = lines.get(i);
+      final int lineNumber = i + 1;
+      final String line = stripComment(raw).strip();
+      if (line.isEmpty()) {
+        continue;
+      }
+      if (line.charAt(0) == '[') {
+        if (line.length() < 3 || line.charAt(line.length() - 1) != ']') {
+          throw malformed("section header", lineNumber, raw);
+        }
+        final String name = line.substring(1, line.length() - 1).strip();
+        // "[ ]" is as nameless as "[]" and is rejected the same way.
+        if (name.isEmpty()) {
+          throw malformed("section header", lineNumber, raw);
+        }
+        current = name.toLowerCase(Locale.ROOT);
+        continue;
+      }
+      if (current == null) {
+        throw new IllegalArgumentException("Code point entry before any [section] header on line "
+            + lineNumber + ": " + raw);
+      }
+      if (wanted.equals(current)) {
+        addEntry(members, line, lineNumber, raw);
+      }
+    }
+
+    return new CodePointSet(members);
+  }
+
+  // Adds a single code point, or an inclusive "low-high" range, to the accumulator.
+  private static void addEntry(BitSet members, String line, int lineNumber, String raw) {
+    final int separator = line.indexOf('-');
+    if (separator < 0) {
+      members.set(parseCodePoint(line, lineNumber, raw));
+      return;
+    }
+    final int low = parseCodePoint(line.substring(0, separator).strip(), lineNumber, raw);
+    final int high = parseCodePoint(line.substring(separator + 1).strip(), lineNumber, raw);
+    if (low > high) {
+      throw new IllegalArgumentException("Descending code point range on line "
+          + lineNumber + ": " + raw);
+    }
+    members.set(low, high + 1);
+  }
+
+  // Parses one token of the form U+XXXX, 0xXXXX or XXXX (prefix case-insensitive).
+  private static int parseCodePoint(String token, int lineNumber, String raw) {
+    String hex = token;
+    if (hex.length() >= 2) {
+      final String prefix = hex.substring(0, 2).toLowerCase(Locale.ROOT);
+      if (prefix.equals("u+") || prefix.equals("0x")) {
+        hex = hex.substring(2);
+      }
+    }
+    if (hex.isEmpty()) {
+      throw malformed("code point", lineNumber, raw);
+    }
+    // Integer.parseInt tolerates a leading sign and non-ASCII digits; the file format does not,
+    // so insist on plain ASCII hex digits before handing the token over.
+    for (int k = 0; k < hex.length(); k++) {
+      if (!isAsciiHexDigit(hex.charAt(k))) {
+        throw new IllegalArgumentException("Invalid hex code point '" + token + "' on line "
+            + lineNumber + ": " + raw);
+      }
+    }
+    final int codePoint;
+    try {
+      codePoint = Integer.parseInt(hex, 16);
+    } catch (NumberFormatException e) {
+      // Still reachable for values that overflow an int, e.g. "FFFFFFFFF".
+      throw new IllegalArgumentException("Invalid hex code point '" + token + "' on line "
+          + lineNumber + ": " + raw, e);
+    }
+    if (codePoint < 0 || codePoint > Character.MAX_CODE_POINT) {
+      throw new IllegalArgumentException("Code point out of range on line "
+          + lineNumber + ": " + raw);
+    }
+    return codePoint;
+  }
+
+  private static boolean isAsciiHexDigit(char c) {
+    return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
+  }
+
+  // Drops everything from the first '#' to the end of the line.
+  private static String stripComment(String raw) {
+    final int hash = raw.indexOf('#');
+    return hash < 0 ? raw : raw.substring(0, hash);
+  }
+
+  private static IllegalArgumentException malformed(String what, int lineNumber, String raw) {
+    return new IllegalArgumentException("Malformed " + what + " on line " + lineNumber + ": " + raw);
+  }
+
+  private static void requireValid(int codePoint) {
+    if (codePoint < 0 || codePoint > Character.MAX_CODE_POINT) {
+      throw new IllegalArgumentException("Not a Unicode code point: " + codePoint);
+    }
+  }
+
+  /**
+   * Tests membership.
+   *
+   * @param codePoint The code point to test. Out-of-range values return {@code false}.
+   * @return {@code true} if the code point is in this set.
+   */
+  public boolean contains(int codePoint) {
+    return codePoint >= 0 && codePoint <= Character.MAX_CODE_POINT && members.get(codePoint);
+  }
+
+  /**
+   * Returns a new set containing every code point in this set or {@code other}.
+   *
+   * @param other The set to union with.
+   * @return The union, a new set; neither input is modified.
+   */
+  public CodePointSet union(CodePointSet other) {
+    Objects.requireNonNull(other, "other");
+    // Work on a copy so this set's backing bits are never touched.
+    final BitSet merged = (BitSet) members.clone();
+    merged.or(other.members);
+    return new CodePointSet(merged);
+  }
+
+  /** {@return the number of code points in this set} */
+  public int size() {
+    return members.cardinality();
+  }
+
+  /** {@return whether this set is empty} */
+  public boolean isEmpty() {
+    return members.isEmpty();
+  }
+
+  /** {@return the member code points, in ascending order} */
+  public int[] toArray() {
+    return members.stream().toArray();
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    return o instanceof CodePointSet other && members.equals(other.members);
+  }
+
+  @Override
+  public int hashCode() {
+    return members.hashCode();
+  }
+}
diff --git 
a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/NormalizedText.java 
b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/NormalizedText.java
new file mode 100644
index 000000000..87678d741
--- /dev/null
+++ 
b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/NormalizedText.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+/**
+ * Carries a source text together with its normalized form and the offset mapping between them.
+ *
+ * <p>The original remains the source of truth (display, offsets, language-specific analysis),
+ * while the normalized form is a derived view tuned for matching and search. The
+ * {@link OffsetMap} links the two, so a position found in the normalized text can be reported
+ * against the original.</p>
+ *
+ * @param original The untouched source text.
+ * @param normalized The normalized text.
+ * @param offsets The mapping between normalized and original character offsets.
+ */
+public record NormalizedText(CharSequence original, String normalized, OffsetMap offsets) {
+
+  /**
+   * Translates a character offset in the normalized text into the original text, by delegating
+   * to {@link OffsetMap#toOriginalOffset(int)}.
+   *
+   * @param normalizedOffset An offset in {@code [0, normalized().length()]}.
+   * @return The corresponding original character offset.
+   */
+  public int toOriginalOffset(int normalizedOffset) {
+    final OffsetMap map = offsets();
+    return map.toOriginalOffset(normalizedOffset);
+  }
+
+  /**
+   * Translates a character offset in the original text into the normalized text, by delegating
+   * to {@link OffsetMap#toNormalizedOffset(int)}.
+   *
+   * @param originalOffset An offset in {@code [0, original().length()]}.
+   * @return The corresponding normalized character offset.
+   */
+  public int toNormalizedOffset(int originalOffset) {
+    final OffsetMap map = offsets();
+    return map.toNormalizedOffset(originalOffset);
+  }
+}
diff --git 
a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/OffsetMap.java 
b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/OffsetMap.java
new file mode 100644
index 000000000..24fa558cf
--- /dev/null
+++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/OffsetMap.java
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.Arrays;
+
+/**
+ * A mapping between character offsets in a normalized string and the original text it came from.
+ *
+ * <p>Normalization that collapses runs or substitutes supplementary characters changes string
+ * length, so an offset into the normalized form no longer lines up with the original. This map
+ * records, for every normalized character, the original character offset it was produced from,
+ * which lets a match found in the normalized form be reported in original coordinates.</p>
+ *
+ * <p>The internal mapping is non-decreasing, so {@link #toOriginalOffset(int)} is a direct array
+ * read (O(1)) and {@link #toNormalizedOffset(int)} is a binary search (O(log n)). The map is
+ * built in the same single cursor pass that produces the normalized text, via {@link Builder}.</p>
+ */
+public final class OffsetMap {
+
+  // Slot k holds the original char offset that produced normalized char k; one extra trailing
+  // slot maps the normalized end to the original end, so [0, normalizedLength] is all valid.
+  private final int[] normalizedToOriginal;
+  private final int originalLength;
+
+  private OffsetMap(int[] normalizedToOriginal, int originalLength) {
+    this.normalizedToOriginal = normalizedToOriginal;
+    this.originalLength = originalLength;
+  }
+
+  /**
+   * Translates an offset in the normalized text into the matching original offset.
+   *
+   * @param normalizedOffset An offset in {@code [0, normalizedLength]}.
+   * @return The original character offset recorded for that position.
+   * @throws IndexOutOfBoundsException Thrown if {@code normalizedOffset} is out of range.
+   */
+  public int toOriginalOffset(int normalizedOffset) {
+    if (normalizedOffset >= 0 && normalizedOffset < normalizedToOriginal.length) {
+      return normalizedToOriginal[normalizedOffset];
+    }
+    throw new IndexOutOfBoundsException("normalized offset " + normalizedOffset
+        + " is outside [0, " + normalizedLength() + "]");
+  }
+
+  /**
+   * Translates an offset in the original text into the matching normalized offset.
+   *
+   * <p>Yields the first normalized offset whose source lies at or after {@code originalOffset};
+   * original characters collapsed into one normalized character all share that offset.</p>
+   *
+   * @param originalOffset An offset in {@code [0, originalLength]}.
+   * @return The corresponding normalized character offset.
+   * @throws IndexOutOfBoundsException Thrown if {@code originalOffset} is out of range.
+   */
+  public int toNormalizedOffset(int originalOffset) {
+    if (!(originalOffset >= 0 && originalOffset <= originalLength)) {
+      throw new IndexOutOfBoundsException("original offset " + originalOffset
+          + " is outside [0, " + originalLength + "]");
+    }
+    // Lower-bound search; the trailing sentinel (originalLength) always qualifies as a match.
+    int first = 0;
+    int upper = normalizedToOriginal.length - 1;
+    int found = upper;
+    while (first <= upper) {
+      final int probe = (first + upper) >>> 1;
+      if (normalizedToOriginal[probe] < originalOffset) {
+        first = probe + 1;
+      } else {
+        found = probe;
+        upper = probe - 1;
+      }
+    }
+    return found;
+  }
+
+  /** {@return the number of characters in the normalized text behind this map} */
+  public int normalizedLength() {
+    return normalizedToOriginal.length - 1;
+  }
+
+  /** {@return the number of characters in the original text behind this map} */
+  public int originalLength() {
+    return originalLength;
+  }
+
+  /**
+   * Incremental builder for an {@link OffsetMap}, fed during a normalization pass: invoke
+   * {@link #map(int)} per character written to the output, then {@link #build(int)} once.
+   */
+  public static final class Builder {
+
+    private int[] buffer = new int[16];
+    private int length;
+
+    /**
+     * Appends the original character offset behind the next normalized character.
+     *
+     * @param originalOffset The source offset in the original text.
+     */
+    public void map(int originalOffset) {
+      if (buffer.length == length) {
+        buffer = Arrays.copyOf(buffer, length << 1);
+      }
+      buffer[length] = originalOffset;
+      length++;
+    }
+
+    /**
+     * Produces the finished map.
+     *
+     * @param originalLength The length of the original text (stored as the trailing sentinel).
+     * @return The immutable {@link OffsetMap}.
+     */
+    public OffsetMap build(int originalLength) {
+      final int[] withSentinel = Arrays.copyOf(buffer, length + 1);
+      withSentinel[length] = originalLength;
+      return new OffsetMap(withSentinel, originalLength);
+    }
+  }
+}
diff --git a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/UnicodeDash.java b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/UnicodeDash.java
new file mode 100644
index 000000000..7ac3ea829
--- /dev/null
+++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/UnicodeDash.java
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+
+/**
+ * Reference data for Unicode dashes, plus O(1) membership lookups.
+ *
+ * <p>This is a static, immutable table of every code point that carries the Unicode {@code Dash}
+ * property (Unicode Character Database, {@code PropList.txt}). The set is broader than the
+ * {@code Pd} (dash punctuation) general category: it also includes the swung dash ({@code Po})
+ * and the mathematical minus signs ({@code Sm}). Java offers no {@code Dash} predicate and
+ * {@code \p{Pd}} would miss the {@code Sm} and {@code Po} members, which is why the set is kept
+ * here explicitly.</p>
+ *
+ * <p>Two distinctions matter for normalization:</p>
+ * <ul>
+ *   <li>The three mathematical minus signs ({@code U+207B}, {@code U+208B}, {@code U+2212}, all
+ *   category {@code Sm}) are excluded from {@link #defaultDashCodePoints()} because flattening
+ *   them to {@code U+002D} can change mathematical meaning. They remain available through
+ *   {@link #codePoints()} for callers that opt in.</li>
+ *   <li>{@code U+00AD} SOFT HYPHEN is deliberately absent: it is a format character
+ *   ({@code White_Space=no}, {@code Dash=no}), an invisible line-break hint, and must not be
+ *   turned into a visible hyphen.</li>
+ * </ul>
+ */
+public final class UnicodeDash {
+
+  /** The canonical ASCII dash that dashes are normalized to: {@code U+002D} HYPHEN-MINUS. */
+  public static final int HYPHEN_MINUS = 0x002D;
+
+  /** The Unicode general category of a dash code point. */
+  public enum Category {
+    /** {@code Pd} - dash punctuation. */
+    Pd,
+    /** {@code Po} - other punctuation (the swung dash). */
+    Po,
+    /** {@code Sm} - math symbol (the minus signs). */
+    Sm
+  }
+
+  /**
+   * One Unicode dash code point and its reference attributes.
+   *
+   * @param codePoint The Unicode code point.
+   * @param name The Unicode character name, lower cased.
+   * @param category The Unicode general {@link Category category}.
+   */
+  public record DashCharacter(int codePoint, String name, Category category) {
+
+    /** {@return whether this is a mathematical minus sign (category {@code Sm})} */
+    public boolean isMathematical() {
+      return category == Category.Sm;
+    }
+
+    /** {@return whether this code point is outside the Basic Multilingual Plane} */
+    public boolean isSupplementary() {
+      return codePoint > 0xFFFF;
+    }
+
+    /** {@return the {@code U+XXXX} notation for this code point} */
+    public String toUnicodeNotation() {
+      return String.format("U+%04X", codePoint);
+    }
+  }
+
+  private static final List<DashCharacter> DASHES = List.of(
+      new DashCharacter(0x002D, "hyphen-minus", Category.Pd),
+      new DashCharacter(0x058A, "armenian hyphen", Category.Pd),
+      new DashCharacter(0x05BE, "hebrew punctuation maqaf", Category.Pd),
+      new DashCharacter(0x1400, "canadian syllabics hyphen", Category.Pd),
+      new DashCharacter(0x1806, "mongolian todo soft hyphen", Category.Pd),
+      new DashCharacter(0x2010, "hyphen", Category.Pd),
+      new DashCharacter(0x2011, "non-breaking hyphen", Category.Pd),
+      new DashCharacter(0x2012, "figure dash", Category.Pd),
+      new DashCharacter(0x2013, "en dash", Category.Pd),
+      new DashCharacter(0x2014, "em dash", Category.Pd),
+      new DashCharacter(0x2015, "horizontal bar", Category.Pd),
+      new DashCharacter(0x2053, "swung dash", Category.Po),
+      new DashCharacter(0x207B, "superscript minus", Category.Sm),
+      new DashCharacter(0x208B, "subscript minus", Category.Sm),
+      new DashCharacter(0x2212, "minus sign", Category.Sm),
+      new DashCharacter(0x2E17, "double oblique hyphen", Category.Pd),
+      new DashCharacter(0x2E1A, "hyphen with diaeresis", Category.Pd),
+      new DashCharacter(0x2E3A, "two-em dash", Category.Pd),
+      new DashCharacter(0x2E3B, "three-em dash", Category.Pd),
+      new DashCharacter(0x2E40, "double hyphen", Category.Pd),
+      new DashCharacter(0x2E5D, "oblique hyphen", Category.Pd),
+      new DashCharacter(0x301C, "wave dash", Category.Pd),
+      new DashCharacter(0x3030, "wavy dash", Category.Pd),
+      new DashCharacter(0x30A0, "katakana-hiragana double hyphen", Category.Pd),
+      new DashCharacter(0xFE31, "presentation form for vertical em dash", Category.Pd),
+      new DashCharacter(0xFE32, "presentation form for vertical en dash", Category.Pd),
+      new DashCharacter(0xFE58, "small em dash", Category.Pd),
+      new DashCharacter(0xFE63, "small hyphen-minus", Category.Pd),
+      new DashCharacter(0xFF0D, "fullwidth hyphen-minus", Category.Pd),
+      new DashCharacter(0x10D6E, "garay hyphen", Category.Pd),
+      new DashCharacter(0x10EAD, "yezidi hyphenation mark", Category.Pd));
+
+  private static final Map<Integer, DashCharacter> BY_CODE_POINT;
+  private static final BitSet MEMBERSHIP = new BitSet();
+  private static final List<DashCharacter> MATHEMATICAL;
+  private static final int[] CODE_POINTS =
+      DASHES.stream().mapToInt(DashCharacter::codePoint).toArray();
+  private static final int[] DEFAULT_CODE_POINTS;
+
+  static {
+    final Map<Integer, DashCharacter> index = new HashMap<>();
+    final List<DashCharacter> minusSigns = new ArrayList<>();
+    for (final DashCharacter dash : DASHES) {
+      index.put(dash.codePoint(), dash);
+      MEMBERSHIP.set(dash.codePoint());
+      if (dash.isMathematical()) {
+        minusSigns.add(dash);
+      }
+    }
+    BY_CODE_POINT = Map.copyOf(index);
+    MATHEMATICAL = List.copyOf(minusSigns);
+    DEFAULT_CODE_POINTS = DASHES.stream().filter(dash -> !dash.isMathematical())
+        .mapToInt(DashCharacter::codePoint).toArray();
+  }
+
+  private UnicodeDash() {
+  }
+
+  /**
+   * Tests whether a code point carries the Unicode {@code Dash} property.
+   *
+   * @param codePoint The code point to test. Out-of-range values return {@code false}.
+   * @return {@code true} if the code point is one of the Unicode dash characters.
+   */
+  public static boolean isDash(int codePoint) {
+    return codePoint >= 0 && codePoint <= Character.MAX_CODE_POINT && MEMBERSHIP.get(codePoint);
+  }
+
+  /**
+   * Looks up the reference entry for a dash code point.
+   *
+   * @param codePoint The code point.
+   * @return The {@link DashCharacter}, or {@link Optional#empty()} if it is not a dash.
+   */
+  public static Optional<DashCharacter> byCodePoint(int codePoint) {
+    return Optional.ofNullable(BY_CODE_POINT.get(codePoint));
+  }
+
+  /** {@return all Unicode dash characters, in ascending code point order} */
+  public static List<DashCharacter> all() {
+    return DASHES;
+  }
+
+  /** {@return the mathematical minus signs (outside the default set), as an unmodifiable list} */
+  public static List<DashCharacter> mathematical() {
+    return MATHEMATICAL;
+  }
+
+  /** {@return all dash code points, in ascending order, including the mathematical minus signs} */
+  public static int[] codePoints() {
+    return CODE_POINTS.clone();
+  }
+
+  /**
+   * {@return the dash code points used for normalization by default, in ascending order}
+   * This is every dash except the mathematical minus signs, so flattening to
+   * {@link #HYPHEN_MINUS} does not silently rewrite mathematics.
+   */
+  public static int[] defaultDashCodePoints() {
+    return DEFAULT_CODE_POINTS.clone();
+  }
+}
diff --git a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/UnicodeWhitespace.java b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/UnicodeWhitespace.java
new file mode 100644
index 000000000..3712f0906
--- /dev/null
+++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/UnicodeWhitespace.java
@@ -0,0 +1,242 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+
+/**
+ * Reference data for Unicode whitespace, plus O(1) membership lookups.
+ *
+ * <p>This is a static, immutable table of the {@code 25} code points that carry the Unicode
+ * {@code White_Space} property, and the related {@code 6} code points that are commonly mistaken
+ * for whitespace but carry {@code White_Space=no} (zero-width and other format characters).
+ * The data mirrors the tables in
+ * <a href="https://en.wikipedia.org/wiki/Whitespace_character">Whitespace character</a>
+ * and the Unicode Character Database ({@code PropList.txt}).</p>
+ *
+ * <p>The membership test is deliberately built from this explicit table rather than from
+ * {@link Character#isWhitespace(int)} or {@link Character#isSpaceChar(int)}, both of which
+ * disagree with the Unicode {@code White_Space} property. {@code Character.isWhitespace}
+ * excludes the non-breaking spaces and {@code NEL} but includes the information-separator
+ * controls {@code U+001C}-{@code U+001F}; {@code Character.isSpaceChar} excludes tab, newline,
+ * and the other line breaks. {@link #isWhitespace(int)} matches the standard exactly.</p>
+ */
+public final class UnicodeWhitespace {
+
+  /** Unicode general category for a whitespace or related code point. */
+  public enum Category {
+    /** {@code Cc} - control. */
+    Cc,
+    /** {@code Zs} - space separator. */
+    Zs,
+    /** {@code Zl} - line separator. */
+    Zl,
+    /** {@code Zp} - paragraph separator. */
+    Zp,
+    /** {@code Cf} - format (the related, non-whitespace code points). */
+    Cf
+  }
+
+  /** Line-breaking behavior, mirroring the "Notes" column of the reference table. */
+  public enum Breaking {
+    /** A break opportunity, but not a forced line break (e.g. {@code SPACE}). */
+    MAY_BREAK,
+    /** A forced line or paragraph break (e.g. {@code LF}, {@code LINE SEPARATOR}). */
+    LINE_BREAK,
+    /** A space that suppresses line breaking (e.g. {@code NO-BREAK SPACE}). */
+    NON_BREAKING
+  }
+
+  /**
+   * One Unicode whitespace code point and its reference attributes.
+   *
+   * @param codePoint The Unicode code point.
+   * @param name The Unicode character name, lower cased as in the reference table.
+   * @param abbreviation The common abbreviation (for example {@code NBSP}), or {@code ""} if none.
+   * @param category The Unicode general {@link Category category}.
+   * @param breaking The line-{@link Breaking breaking} behavior.
+   */
+  public record WhitespaceCharacter(int codePoint, String name, String abbreviation,
+                                    Category category, Breaking breaking) {
+
+    /** {@return whether this code point forces a line or paragraph break} */
+    public boolean isLineBreak() {
+      return breaking == Breaking.LINE_BREAK;
+    }
+
+    /** {@return whether this is a non-breaking space} */
+    public boolean isNonBreaking() {
+      return breaking == Breaking.NON_BREAKING;
+    }
+
+    /** {@return the {@code U+XXXX} notation for this code point} */
+    public String toUnicodeNotation() {
+      return String.format("U+%04X", codePoint);
+    }
+  }
+
+  /**
+   * One related code point commonly confused with whitespace that is not ({@code White_Space=no}).
+   * These are format characters and must not be treated as, or normalized like, whitespace.
+   *
+   * @param codePoint The Unicode code point.
+   * @param name The Unicode character name, lower cased as in the reference table.
+   * @param abbreviation The common abbreviation (for example {@code BOM}), or {@code ""} if none.
+   * @param note A short description of what the character actually does.
+   */
+  public record RelatedCharacter(int codePoint, String name, String abbreviation, String note) {
+
+    /** {@return the {@code U+XXXX} notation for this code point} */
+    public String toUnicodeNotation() {
+      return String.format("U+%04X", codePoint);
+    }
+  }
+
+  private static final List<WhitespaceCharacter> WHITESPACE = List.of(
+      new WhitespaceCharacter(0x0009, "character tabulation", "HT", Category.Cc, Breaking.MAY_BREAK),
+      new WhitespaceCharacter(0x000A, "line feed", "LF", Category.Cc, Breaking.LINE_BREAK),
+      new WhitespaceCharacter(0x000B, "line tabulation", "VT", Category.Cc, Breaking.LINE_BREAK),
+      new WhitespaceCharacter(0x000C, "form feed", "FF", Category.Cc, Breaking.LINE_BREAK),
+      new WhitespaceCharacter(0x000D, "carriage return", "CR", Category.Cc, Breaking.LINE_BREAK),
+      new WhitespaceCharacter(0x0020, "space", "", Category.Zs, Breaking.MAY_BREAK),
+      new WhitespaceCharacter(0x0085, "next line", "NEL", Category.Cc, Breaking.LINE_BREAK),
+      new WhitespaceCharacter(0x00A0, "no-break space", "NBSP", Category.Zs, Breaking.NON_BREAKING),
+      new WhitespaceCharacter(0x1680, "ogham space mark", "", Category.Zs, Breaking.MAY_BREAK),
+      new WhitespaceCharacter(0x2000, "en quad", "", Category.Zs, Breaking.MAY_BREAK),
+      new WhitespaceCharacter(0x2001, "em quad", "", Category.Zs, Breaking.MAY_BREAK),
+      new WhitespaceCharacter(0x2002, "en space", "", Category.Zs, Breaking.MAY_BREAK),
+      new WhitespaceCharacter(0x2003, "em space", "", Category.Zs, Breaking.MAY_BREAK),
+      new WhitespaceCharacter(0x2004, "three-per-em space", "", Category.Zs, Breaking.MAY_BREAK),
+      new WhitespaceCharacter(0x2005, "four-per-em space", "", Category.Zs, Breaking.MAY_BREAK),
+      new WhitespaceCharacter(0x2006, "six-per-em space", "", Category.Zs, Breaking.MAY_BREAK),
+      new WhitespaceCharacter(0x2007, "figure space", "", Category.Zs, Breaking.NON_BREAKING),
+      new WhitespaceCharacter(0x2008, "punctuation space", "", Category.Zs, Breaking.MAY_BREAK),
+      new WhitespaceCharacter(0x2009, "thin space", "", Category.Zs, Breaking.MAY_BREAK),
+      new WhitespaceCharacter(0x200A, "hair space", "", Category.Zs, Breaking.MAY_BREAK),
+      new WhitespaceCharacter(0x2028, "line separator", "", Category.Zl, Breaking.LINE_BREAK),
+      new WhitespaceCharacter(0x2029, "paragraph separator", "", Category.Zp, Breaking.LINE_BREAK),
+      new WhitespaceCharacter(0x202F, "narrow no-break space", "NNBSP", Category.Zs,
+          Breaking.NON_BREAKING),
+      new WhitespaceCharacter(0x205F, "medium mathematical space", "MMSP", Category.Zs,
+          Breaking.MAY_BREAK),
+      new WhitespaceCharacter(0x3000, "ideographic space", "", Category.Zs, Breaking.MAY_BREAK));
+
+  private static final List<RelatedCharacter> LOOKALIKES = List.of(
+      new RelatedCharacter(0x180E, "mongolian vowel separator", "MVS",
+          "format character; narrow space for Mongolian"),
+      new RelatedCharacter(0x200B, "zero width space", "ZWSP",
+          "format; word boundary indicator, no visible width"),
+      new RelatedCharacter(0x200C, "zero width non-joiner", "ZWNJ",
+          "format; prevents character connection"),
+      new RelatedCharacter(0x200D, "zero width joiner", "ZWJ",
+          "format; enables character connection"),
+      new RelatedCharacter(0x2060, "word joiner", "WJ",
+          "format; non-breaking, no line break point"),
+      new RelatedCharacter(0xFEFF, "zero width no-break space", "BOM",
+          "format; byte order mark"));
+
+  private static final Map<Integer, WhitespaceCharacter> BY_CODE_POINT;
+  private static final BitSet MEMBERSHIP = new BitSet();
+  private static final BitSet LOOKALIKE_MEMBERSHIP = new BitSet();
+  private static final int[] CODE_POINTS =
+      WHITESPACE.stream().mapToInt(WhitespaceCharacter::codePoint).toArray();
+  private static final List<WhitespaceCharacter> LINE_BREAKS;
+  private static final List<WhitespaceCharacter> NON_BREAKING;
+
+  static {
+    final Map<Integer, WhitespaceCharacter> index = new HashMap<>();
+    final List<WhitespaceCharacter> lineBreaks = new ArrayList<>();
+    final List<WhitespaceCharacter> nonBreaking = new ArrayList<>();
+    for (final WhitespaceCharacter ws : WHITESPACE) {
+      index.put(ws.codePoint(), ws);
+      MEMBERSHIP.set(ws.codePoint());
+      if (ws.isLineBreak()) {
+        lineBreaks.add(ws);
+      } else if (ws.isNonBreaking()) {
+        nonBreaking.add(ws);
+      }
+    }
+    LOOKALIKES.forEach(related -> LOOKALIKE_MEMBERSHIP.set(related.codePoint()));
+    BY_CODE_POINT = Map.copyOf(index);
+    LINE_BREAKS = List.copyOf(lineBreaks);
+    NON_BREAKING = List.copyOf(nonBreaking);
+  }
+
+  private UnicodeWhitespace() {
+  }
+
+  /**
+   * Tests whether a code point carries the Unicode {@code White_Space} property.
+   *
+   * @param codePoint The code point to test; values outside the Unicode range return {@code false}.
+   * @return {@code true} if the code point is one of the {@code 25} Unicode whitespace characters.
+   */
+  public static boolean isWhitespace(int codePoint) {
+    return codePoint >= 0 && codePoint <= Character.MAX_CODE_POINT && MEMBERSHIP.get(codePoint);
+  }
+
+  /**
+   * Tests whether a code point is one of the related, non-whitespace look-alike format characters.
+   *
+   * @param codePoint The code point to test.
+   * @return {@code true} if the code point is in the {@link #lookalikes() look-alike} set.
+   */
+  public static boolean isLookalike(int codePoint) {
+    return codePoint >= 0 && codePoint <= Character.MAX_CODE_POINT
+        && LOOKALIKE_MEMBERSHIP.get(codePoint);
+  }
+
+  /**
+   * Looks up the reference entry for a whitespace code point.
+   *
+   * @param codePoint The code point.
+   * @return The {@link WhitespaceCharacter}, or {@link Optional#empty()} if it is not whitespace.
+   */
+  public static Optional<WhitespaceCharacter> byCodePoint(int codePoint) {
+    return Optional.ofNullable(BY_CODE_POINT.get(codePoint));
+  }
+
+  /** {@return the {@code 25} Unicode whitespace characters, in ascending code point order} */
+  public static List<WhitespaceCharacter> all() {
+    return WHITESPACE;
+  }
+
+  /** {@return the related, non-whitespace look-alike format characters} */
+  public static List<RelatedCharacter> lookalikes() {
+    return LOOKALIKES;
+  }
+
+  /** {@return the whitespace characters that force a line or paragraph break (unmodifiable)} */
+  public static List<WhitespaceCharacter> lineBreaks() {
+    return LINE_BREAKS;
+  }
+
+  /** {@return the non-breaking whitespace characters, as an unmodifiable list} */
+  public static List<WhitespaceCharacter> nonBreaking() {
+    return NON_BREAKING;
+  }
+
+  /** {@return the whitespace code points, in ascending order} */
+  public static int[] codePoints() {
+    return CODE_POINTS.clone();
+  }
+}
diff --git a/opennlp-api/src/test/java/opennlp/tools/util/normalizer/CharClassTest.java b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/CharClassTest.java
new file mode 100644
index 000000000..5e2a42ba6
--- /dev/null
+++ b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/CharClassTest.java
@@ -0,0 +1,292 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.util.Span;
+import opennlp.tools.util.normalizer.UnicodeWhitespace.WhitespaceCharacter;
+
+import static org.junit.jupiter.api.Assertions.assertArrayEquals;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class CharClassTest {
+
+  private static final CharClass WS = CharClass.whitespace();
+  private static final CharClass DASH = CharClass.dashes();
+
+  // Non-ASCII test characters are built from code points (no literal glyphs, 
no Unicode escapes)
+  // so the source stays pure ASCII and the intent is explicit. Tab and 
newline use \t and \n.
+  private static final String NBSP = cp(0x00A0);
+  private static final String IDEOGRAPHIC = cp(0x3000);
+  private static final String EM_DASH = cp(0x2014);
+  private static final String EN_DASH = cp(0x2013);
+  private static final String FIGURE_DASH = cp(0x2012);
+  private static final String MINUS_SIGN = cp(0x2212);
+  private static final String YEZIDI_HYPHEN = cp(0x10EAD);
+  private static final String GRINNING_FACE = cp(0x1F600);
+
+  private static String cp(int codePoint) {
+    return new String(Character.toChars(codePoint));
+  }
+
+  private static CodePointSet lineBreaks() {
+    return CodePointSet.of(UnicodeWhitespace.lineBreaks().stream()
+        .mapToInt(WhitespaceCharacter::codePoint).toArray());
+  }
+
+  // --- membership 
--------------------------------------------------------------------------
+
+  @Test
+  void testWhitespacePresetMembership() {
+    assertTrue(WS.contains(0x0020));
+    assertTrue(WS.contains(0x0009));
+    assertTrue(WS.contains(0x00A0));
+    assertTrue(WS.contains(0x3000));
+    assertTrue(WS.contains(0x2028));
+    assertFalse(WS.contains('a'));
+    assertFalse(WS.contains(0x200B), "zero width space is not whitespace");
+  }
+
+  @Test
+  void testDashPresetMembershipExcludesMathMinus() {
+    assertTrue(DASH.contains(0x2014));
+    assertTrue(DASH.contains(0x2013));
+    assertTrue(DASH.contains(0xFF0D));
+    assertFalse(DASH.contains(0x2212), "math minus is excluded by default");
+    assertFalse(DASH.contains('a'));
+  }
+
+  // --- normalize / collapse 
----------------------------------------------------------------
+
+  @Test
+  void testNormalizeReplacesEachMemberOneForOne() {
+    assertEquals("a  b", WS.normalize("a" + NBSP + IDEOGRAPHIC + "b"));
+    assertEquals("well-known", DASH.normalize("well" + EM_DASH + "known"));
+    assertEquals("a-b-c", DASH.normalize("a" + EN_DASH + "b" + FIGURE_DASH + 
"c"));
+  }
+
+  @Test
+  void testNormalizeLeavesMathMinusUntouched() {
+    assertEquals("5" + MINUS_SIGN + "3", DASH.normalize("5" + MINUS_SIGN + 
"3"));
+  }
+
+  @Test
+  void testCollapseMergesRuns() {
+    assertEquals("a b", WS.collapse("a" + NBSP + IDEOGRAPHIC + "b"));
+    assertEquals(" a b ", WS.collapse("  a\t\tb  "));
+    assertEquals("a-b", DASH.collapse("a" + EM_DASH + EN_DASH + EM_DASH + 
"b"));
+  }
+
+  @Test
+  void testNormalizeAndCollapseHandleSupplementaryMembers() {
+    assertEquals("x-y", DASH.normalize("x" + YEZIDI_HYPHEN + "y"));
+    assertEquals("x-y", DASH.collapse("x" + YEZIDI_HYPHEN + YEZIDI_HYPHEN + 
"y"));
+  }
+
+  @Test
+  void testEmptyAndAllMemberInputs() {
+    assertEquals("", WS.normalize(""));
+    assertEquals("", WS.collapse(""));
+    assertEquals("", WS.trim(""));
+    assertEquals("", WS.removeAll(""));
+    assertArrayEquals(new String[0], WS.split(""));
+    assertArrayEquals(new String[0], WS.split(" " + IDEOGRAPHIC));
+  }
+
+  // --- squish (collapsePreserving) 
---------------------------------------------------------
+
+  @Test
+  void testCollapsePreservingKeepsLineBreaks() {
+    final CodePointSet keep = lineBreaks();
+    assertEquals("a\nb", WS.collapsePreserving("a\n\n\t\tb", keep, '\n'));
+    assertEquals("a b", WS.collapsePreserving("a \t b", keep, '\n'));
+    assertEquals("a\nb\nc", WS.collapsePreserving("a\n \tb \nc", keep, '\n'));
+  }
+
+  // --- trim / removeAll 
--------------------------------------------------------------------
+
+  @Test
+  void testTrim() {
+    assertEquals("hello", WS.trim("\t hello" + IDEOGRAPHIC + IDEOGRAPHIC));
+    assertEquals("noedge", WS.trim("noedge"));
+    assertEquals("", WS.trim("  "));
+    assertEquals("a b", WS.trim("  a b  "), "interior whitespace is 
preserved");
+  }
+
+  @Test
+  void testRemoveAll() {
+    assertEquals("abcd", WS.removeAll("a b\tc d"));
+  }
+
+  // --- split / splitSpans 
------------------------------------------------------------------
+
+  @Test
+  void testSplitOnUnicodeWhitespace() {
+    assertArrayEquals(new String[] {"one", "two", "three"},
+        WS.split("one two" + IDEOGRAPHIC + IDEOGRAPHIC + "three"));
+    assertArrayEquals(new String[] {"a", "b"}, WS.split("  a b  "));
+  }
+
+  @Test
+  void testSplitSpansCarryOriginalOffsets() {
+    final String text = "one two";
+    final List<Span> spans = WS.splitSpans(text);
+    assertEquals(2, spans.size());
+    assertEquals(0, spans.get(0).getStart());
+    assertEquals(3, spans.get(0).getEnd());
+    assertEquals("one", spans.get(0).getCoveredText(text).toString());
+    assertEquals(4, spans.get(1).getStart());
+    assertEquals(7, spans.get(1).getEnd());
+    assertEquals("two", spans.get(1).getCoveredText(text).toString());
+  }
+
+  @Test
+  void testSplitSpansWithSupplementaryToken() {
+    final String text = "a " + GRINNING_FACE + " b";
+    final List<Span> spans = WS.splitSpans(text);
+    assertEquals(3, spans.size());
+    assertEquals("a", spans.get(0).getCoveredText(text).toString());
+    assertEquals(GRINNING_FACE, spans.get(1).getCoveredText(text).toString());
+    assertEquals("b", spans.get(2).getCoveredText(text).toString());
+  }
+
+  // --- custom classes 
----------------------------------------------------------------------
+
+  @Test
+  void testCustomClass() {
+    // A one-member class that rewrites 'o' to '0'.
+    final CodePointSet members = CodePointSet.of('o');
+    final CharClass vowelO = CharClass.of(members, '0');
+
+    // normalize substitutes member by member; collapse folds the "oo" run into one '0'.
+    assertEquals("f00 bar", vowelO.normalize("foo bar"));
+    assertEquals("f0", vowelO.collapse("foo"));
+  }
+
+  @Test
+  void testWithAdditionalExtendsWithoutMutatingOriginal() {
+    final CodePointSet underscore = CodePointSet.of('_');
+    final CharClass extended = WS.withAdditional(underscore);
+
+    // The derived class recognizes the added member as well as the inherited ones.
+    assertTrue(extended.contains('_'));
+    assertTrue(extended.contains(0x0020));
+    assertEquals("a b c", extended.normalize("a_b c"));
+
+    assertFalse(WS.contains('_'), "the preset must be unchanged");
+  }
+
+  @Test
+  void testOfRejectsInvalidReplacement() {
+    // The replacement must be a valid code point: neither negative nor above MAX_CODE_POINT.
+    assertThrows(IllegalArgumentException.class,
+        () -> CharClass.of(CodePointSet.of(0x20), -1));
+    assertThrows(IllegalArgumentException.class,
+        () -> CharClass.of(CodePointSet.of(0x20), Character.MAX_CODE_POINT + 1));
+  }
+
+  // --- offset-mapped variants --------------------------------------------------------------
+
+  @Test
+  void testCollapseMappedOffsets() {
+    final NormalizedText nt = WS.collapseMapped("a  b");
+    assertEquals("a b", nt.normalized());
+    assertEquals(3, nt.offsets().normalizedLength());
+    assertEquals(4, nt.offsets().originalLength());
+
+    // {normalized offset, expected original offset}; the end offset is included.
+    final int[][] toOriginal = {{0, 0}, {1, 1}, {2, 3}, {3, 4}};
+    for (final int[] pair : toOriginal) {
+      assertEquals(pair[1], nt.toOriginalOffset(pair[0]));
+    }
+
+    // {original offset, expected normalized offset}; original offset 2 is not probed here.
+    final int[][] toNormalized = {{0, 0}, {1, 1}, {3, 2}, {4, 3}};
+    for (final int[] pair : toNormalized) {
+      assertEquals(pair[1], nt.toNormalizedOffset(pair[0]));
+    }
+  }
+
+  @Test
+  void testNormalizeMappedIsIdentityWhenNothingMatches() {
+    final String text = "abc";
+    final NormalizedText nt = WS.normalizeMapped(text);
+    assertEquals(text, nt.normalized());
+
+    // Offsets 0..3 (the end offset included) must map to themselves.
+    int offset = 0;
+    while (offset <= 3) {
+      assertEquals(offset, nt.toOriginalOffset(offset));
+      offset++;
+    }
+  }
+
+  @Test
+  void testNormalizeMappedPreservesSupplementaryCopyOffsets() {
+    final String text = "a" + GRINNING_FACE + "b";
+    final NormalizedText nt = WS.normalizeMapped(text);
+    assertEquals(text, nt.normalized());
+
+    // Nothing is replaced, so every char offset up to and including the end maps to itself.
+    final int end = text.length();
+    for (int offset = 0; offset <= end; offset++) {
+      assertEquals(offset, nt.toOriginalOffset(offset));
+    }
+  }
+
+  @Test
+  void testNormalizeMappedCollapsesSupplementaryMemberToOneChar() {
+    final String text = "x" + YEZIDI_HYPHEN + "y";
+    final NormalizedText nt = DASH.normalizeMapped(text);
+    assertEquals("x-y", nt.normalized());
+
+    // Index = normalized offset, value = expected original offset. The jump from 1 to 3
+    // reflects the member spanning two chars in the original but one in the output.
+    final int[] expectedOriginal = {0, 1, 3, 4};
+    for (int i = 0; i < expectedOriginal.length; i++) {
+      assertEquals(expectedOriginal[i], nt.toOriginalOffset(i));
+    }
+  }
+
+  @Test
+  void testOffsetMapRejectsOutOfRange() {
+    final OffsetMap map = WS.collapseMapped("ab").offsets();
+    // One step outside either end of the offset range must throw, in both directions.
+    assertThrows(IndexOutOfBoundsException.class, () -> map.toOriginalOffset(-1));
+    assertThrows(IndexOutOfBoundsException.class,
+        () -> map.toOriginalOffset(map.normalizedLength() + 1));
+    assertThrows(IndexOutOfBoundsException.class, () -> map.toNormalizedOffset(-1));
+    assertThrows(IndexOutOfBoundsException.class,
+        () -> map.toNormalizedOffset(map.originalLength() + 1));
+  }
+
+  @Test
+  void testAccessorsExposeMembersAndReplacement() {
+    assertEquals(0x0020, WS.replacement());
+    assertEquals('-', DASH.replacement());
+    assertTrue(WS.members().contains(0x00A0));
+    assertFalse(WS.members().contains('a'));
+  }
+
+  @Test
+  void testOffsetMapBuilderGrowsBeyondInitialCapacity() {
+    // 26 output characters force the OffsetMap builder past its initial 16-entry buffer.
+    final String text = "abcdefghijklmnopqrstuvwxyz";
+    final NormalizedText nt = WS.normalizeMapped(text);
+    assertEquals(text, nt.normalized());
+    assertEquals(26, nt.offsets().normalizedLength());
+    // The input has no whitespace, so every offset (end offset included) maps to itself.
+    for (int i = 0; i <= text.length(); i++) {
+      assertEquals(i, nt.toOriginalOffset(i));
+    }
+  }
+
+  @Test
+  void testNormalizeMappedWithSupplementaryReplacement() {
+    // A supplementary replacement exercises the two-char substitution path of the offset map.
+    final int penguin = 0x1F427;
+    final CharClass toPenguin = CharClass.of(CodePointSet.of(' '), penguin);
+    final NormalizedText nt = toPenguin.normalizeMapped("a b");
+    assertEquals("a" + new String(Character.toChars(penguin)) + "b", nt.normalized());
+    assertEquals(0, nt.toOriginalOffset(0));
+    // Both chars of the replacement's surrogate pair map back to the one replaced space.
+    assertEquals(1, nt.toOriginalOffset(1));
+    assertEquals(1, nt.toOriginalOffset(2));
+    assertEquals(2, nt.toOriginalOffset(3));
+  }
+}
diff --git a/opennlp-api/src/test/java/opennlp/tools/util/normalizer/CodePointSetTest.java b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/CodePointSetTest.java
new file mode 100644
index 000000000..769cea71f
--- /dev/null
+++ b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/CodePointSetTest.java
@@ -0,0 +1,241 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
+
+import static org.junit.jupiter.api.Assertions.assertArrayEquals;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Tests {@link CodePointSet}: the {@code of} / {@code ofRange} factories, membership and value
+ * semantics, {@code union}, and the sectioned text format read by {@code parse} and
+ * {@code fromFile}.
+ */
+public class CodePointSetTest {
+
+  // --- factories, membership and value semantics
+
+  @Test
+  void testOfContainsExactlyTheGivenCodePoints() {
+    // Covers a BMP letter, a Latin-1 code point and a supplementary code point.
+    final CodePointSet set = CodePointSet.of(0x0041, 0x00A0, 0x1F600);
+    assertTrue(set.contains(0x0041));
+    assertTrue(set.contains(0x00A0));
+    assertTrue(set.contains(0x1F600));
+    assertFalse(set.contains(0x0042));
+    assertEquals(3, set.size());
+    assertFalse(set.isEmpty());
+  }
+
+  @Test
+  void testToArrayIsAscending() {
+    // Input order is deliberately unsorted; the array view must come back sorted.
+    final CodePointSet set = CodePointSet.of(0x3000, 0x0009, 0x00A0);
+    assertArrayEquals(new int[] {0x0009, 0x00A0, 0x3000}, set.toArray());
+  }
+
+  @Test
+  void testOfRangeIsInclusive() {
+    final CodePointSet set = CodePointSet.ofRange(0x2000, 0x200A);
+    assertTrue(set.contains(0x2000));
+    assertTrue(set.contains(0x2005));
+    assertTrue(set.contains(0x200A));
+    // The code points just outside either bound are not members.
+    assertFalse(set.contains(0x1FFF));
+    assertFalse(set.contains(0x200B));
+    assertEquals(11, set.size());
+  }
+
+  @Test
+  void testOfRangeRejectsDescending() {
+    assertThrows(IllegalArgumentException.class, () -> CodePointSet.ofRange(0x200A, 0x2000));
+  }
+
+  @ParameterizedTest
+  @ValueSource(ints = {-1, Integer.MIN_VALUE, Character.MAX_CODE_POINT + 1, Integer.MAX_VALUE})
+  void testOfRejectsInvalidCodePoints(int codePoint) {
+    assertThrows(IllegalArgumentException.class, () -> CodePointSet.of(codePoint));
+  }
+
+  @ParameterizedTest
+  @ValueSource(ints = {-1, Integer.MIN_VALUE, Character.MAX_CODE_POINT + 1, Integer.MAX_VALUE})
+  void testContainsIsRangeSafe(int codePoint) {
+    // Unlike the factory, a lookup with an invalid code point answers false instead of throwing.
+    assertFalse(CodePointSet.of(0x0020).contains(codePoint));
+  }
+
+  @Test
+  void testUnionIsNonDestructive() {
+    final CodePointSet a = CodePointSet.of(0x0041);
+    final CodePointSet b = CodePointSet.of(0x0042);
+    final CodePointSet union = a.union(b);
+
+    assertTrue(union.contains(0x0041));
+    assertTrue(union.contains(0x0042));
+    assertEquals(2, union.size());
+    assertFalse(a.contains(0x0042), "left operand must be unchanged");
+    assertFalse(b.contains(0x0041), "right operand must be unchanged");
+  }
+
+  @Test
+  void testEqualsAndHashCode() {
+    // Equality is by content, independent of the order the code points were given in.
+    assertEquals(CodePointSet.of(0x01, 0x02), CodePointSet.of(0x02, 0x01));
+    assertEquals(CodePointSet.of(0x01, 0x02).hashCode(), CodePointSet.of(0x02, 0x01).hashCode());
+    assertFalse(CodePointSet.of(0x01).equals(CodePointSet.of(0x02)));
+  }
+
+  @Test
+  void testEqualsAgainstOtherTypesAndNull() {
+    final CodePointSet set = CodePointSet.of(0x20);
+    assertFalse(set.equals(null));
+    assertFalse(set.equals("not a code point set"));
+  }
+
+  // --- parse: "[section]" headers followed by hexadecimal code points or ranges
+
+  @Test
+  void testParseAcceptsSingleHexDigit() {
+    assertTrue(CodePointSet.parse(List.of("[s]", "9"), "s").contains(0x9));
+  }
+
+  @Test
+  void testParseRejectsEmptyCodePointAfterPrefix() {
+    assertThrows(IllegalArgumentException.class,
+        () -> CodePointSet.parse(List.of("[s]", "U+"), "s"));
+  }
+
+  @Test
+  void testParseRejectsTooShortSectionHeader() {
+    assertThrows(IllegalArgumentException.class,
+        () -> CodePointSet.parse(List.of("[]", "U+0041"), "s"));
+  }
+
+  @Test
+  void testParseSingleCodePointsRangesCommentsAndBlankLines() {
+    final List<String> lines = List.of(
+        "# a whitespace overlay",
+        "[whitespace]",
+        "U+00A0          # no-break space",
+        "0x2028",
+        "2029",
+        "",
+        "U+2000-U+200A   # typographic spaces");
+
+    final CodePointSet set = CodePointSet.parse(lines, "whitespace");
+
+    assertTrue(set.contains(0x00A0));
+    assertTrue(set.contains(0x2028));
+    assertTrue(set.contains(0x2029));
+    assertTrue(set.contains(0x2000));
+    assertTrue(set.contains(0x2007));
+    assertTrue(set.contains(0x200A));
+    assertFalse(set.contains(0x200B));
+    // Three single entries plus the eleven code points of the inclusive range.
+    assertEquals(3 + 11, set.size());
+  }
+
+  @Test
+  void testParseReturnsOnlyRequestedSection() {
+    final List<String> lines = List.of(
+        "[whitespace]",
+        "U+00A0",
+        "[dash]",
+        "U+2212",
+        "U+2014");
+
+    final CodePointSet whitespace = CodePointSet.parse(lines, "whitespace");
+    assertTrue(whitespace.contains(0x00A0));
+    assertFalse(whitespace.contains(0x2212));
+    assertFalse(whitespace.contains(0x2014));
+
+    final CodePointSet dash = CodePointSet.parse(lines, "dash");
+    assertTrue(dash.contains(0x2212));
+    assertTrue(dash.contains(0x2014));
+    assertFalse(dash.contains(0x00A0));
+  }
+
+  @Test
+  void testParseSectionNameIsCaseInsensitive() {
+    final List<String> lines = List.of("[WhiteSpace]", "U+00A0");
+    assertTrue(CodePointSet.parse(lines, "whitespace").contains(0x00A0));
+    assertTrue(CodePointSet.parse(lines, "WHITESPACE").contains(0x00A0));
+  }
+
+  @Test
+  void testParseMissingSectionIsEmpty() {
+    final List<String> lines = List.of("[whitespace]", "U+00A0");
+    assertTrue(CodePointSet.parse(lines, "dash").isEmpty());
+  }
+
+  @Test
+  void testParseRejectsMalformedSectionHeader() {
+    final List<String> lines = List.of("[whitespace", "U+00A0");
+    final IllegalArgumentException e = assertThrows(IllegalArgumentException.class,
+        () -> CodePointSet.parse(lines, "whitespace"));
+    // The message must point at the offending line, counted from one.
+    assertTrue(e.getMessage().contains("line 1"), e.getMessage());
+  }
+
+  @Test
+  void testParseRejectsInvalidHex() {
+    final List<String> lines = List.of("[whitespace]", "U+ZZZZ");
+    final IllegalArgumentException e = assertThrows(IllegalArgumentException.class,
+        () -> CodePointSet.parse(lines, "whitespace"));
+    assertTrue(e.getMessage().contains("line 2"), e.getMessage());
+  }
+
+  @Test
+  void testParseRejectsDescendingRange() {
+    final List<String> lines = List.of("[whitespace]", "U+200A-U+2000");
+    assertThrows(IllegalArgumentException.class, () -> CodePointSet.parse(lines, "whitespace"));
+  }
+
+  @Test
+  void testParseRejectsOutOfRangeCodePoint() {
+    // U+110000 is one past Character.MAX_CODE_POINT.
+    final List<String> lines = List.of("[whitespace]", "U+110000");
+    assertThrows(IllegalArgumentException.class, () -> CodePointSet.parse(lines, "whitespace"));
+  }
+
+  @Test
+  void testParseRejectsEntryBeforeAnySection() {
+    final List<String> lines = List.of("U+00A0");
+    final IllegalArgumentException e = assertThrows(IllegalArgumentException.class,
+        () -> CodePointSet.parse(lines, "whitespace"));
+    assertTrue(e.getMessage().contains("before any [section]"), e.getMessage());
+  }
+
+  @Test
+  void testParseAcceptsAllThreeHexPrefixes() {
+    // "U+", "0x" and a bare hexadecimal number are all accepted spellings.
+    final List<String> lines = List.of("[s]", "U+0041", "0x0042", "0043");
+    final CodePointSet set = CodePointSet.parse(lines, "s");
+    assertTrue(set.contains(0x41));
+    assertTrue(set.contains(0x42));
+    assertTrue(set.contains(0x43));
+  }
+
+  // --- fromFile
+
+  @Test
+  void testFromFileReadsTheNamedSection(@TempDir Path dir) throws IOException {
+    final Path file = dir.resolve("delimiters.txt");
+    Files.writeString(file, String.join("\n",
+        "[whitespace]",
+        "U+00A0",
+        "[dash]",
+        "U+2E5D"), StandardCharsets.UTF_8);
+
+    assertTrue(CodePointSet.fromFile(file, "whitespace").contains(0x00A0));
+    assertTrue(CodePointSet.fromFile(file, "dash").contains(0x2E5D));
+    assertFalse(CodePointSet.fromFile(file, "dash").contains(0x00A0));
+  }
+}
diff --git a/opennlp-api/src/test/java/opennlp/tools/util/normalizer/UnicodeDashTest.java b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/UnicodeDashTest.java
new file mode 100644
index 000000000..9d547a980
--- /dev/null
+++ b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/UnicodeDashTest.java
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.MethodSource;
+import org.junit.jupiter.params.provider.ValueSource;
+
+import opennlp.tools.util.normalizer.UnicodeDash.Category;
+import opennlp.tools.util.normalizer.UnicodeDash.DashCharacter;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Tests the {@link UnicodeDash} reference table: its size and ordering, the per-character
+ * attributes, the mathematical subset, and the immutability of what the accessors return.
+ */
+public class UnicodeDashTest {
+
+  // Argument source for the parameterized tests: every dash in the reference table.
+  private static List<DashCharacter> dashes() {
+    return UnicodeDash.all();
+  }
+
+  // Maps the running JDK's general category to our enum, or null if it cannot be expressed (which
+  // includes code points the JDK's Unicode version does not yet assign).
+  private static Category jdkCategory(int codePoint) {
+    return switch (Character.getType(codePoint)) {
+      case Character.DASH_PUNCTUATION -> Category.Pd;
+      case Character.MATH_SYMBOL -> Category.Sm;
+      case Character.OTHER_PUNCTUATION -> Category.Po;
+      default -> null;
+    };
+  }
+
+  @Test
+  void testDashSetHasExactly31() {
+    assertEquals(31, UnicodeDash.all().size());
+  }
+
+  @ParameterizedTest
+  @MethodSource("dashes")
+  void testEachDashIsSelfConsistent(DashCharacter dash) {
+    // Looking a record up by its own code point must find that same record again.
+    assertTrue(UnicodeDash.isDash(dash.codePoint()), dash::toUnicodeNotation);
+    assertEquals(dash, UnicodeDash.byCodePoint(dash.codePoint()).orElseThrow());
+    assertNotNull(dash.category());
+    assertFalse(dash.name().isBlank());
+  }
+
+  @ParameterizedTest
+  @MethodSource("dashes")
+  void testCategoryMatchesJdkUnicodeDataWhenAssigned(DashCharacter dash) {
+    final Category jdk = jdkCategory(dash.codePoint());
+    // Skip code points the running JVM's Unicode version does not assign yet (e.g. newer dashes).
+    if (Character.getType(dash.codePoint()) != Character.UNASSIGNED) {
+      assertEquals(jdk, dash.category(), dash::toUnicodeNotation);
+    }
+  }
+
+  @Test
+  void testCodePointsAreUniqueAndStrictlyAscending() {
+    final int[] cps = UnicodeDash.codePoints();
+    for (int i = 1; i < cps.length; i++) {
+      assertTrue(cps[i] > cps[i - 1], "dash code points must be unique and ascending");
+    }
+  }
+
+  @Test
+  void testMathematicalAreExactlyTheThreeMinusSigns() {
+    final Set<Integer> math = UnicodeDash.mathematical().stream()
+        .map(DashCharacter::codePoint).collect(Collectors.toSet());
+    assertEquals(Set.of(0x207B, 0x208B, 0x2212), math);
+    UnicodeDash.mathematical().forEach(d -> {
+      assertTrue(d.isMathematical());
+      assertEquals(Category.Sm, d.category());
+    });
+  }
+
+  @Test
+  void testDefaultDashSetExcludesMathematicalMinusSigns() {
+    final int[] defaults = UnicodeDash.defaultDashCodePoints();
+    // The default set is the full table minus the three mathematical minus signs.
+    assertEquals(UnicodeDash.all().size() - 3, defaults.length);
+    for (final int codePoint : defaults) {
+      assertFalse(UnicodeDash.byCodePoint(codePoint).orElseThrow().isMathematical(),
+          () -> String.format("U+%04X must not be a math minus", codePoint));
+    }
+    assertFalse(Arrays.stream(defaults).anyMatch(cp -> cp == 0x2212));
+  }
+
+  @Test
+  void testHyphenMinusIsTheCanonicalTarget() {
+    assertEquals(0x002D, UnicodeDash.HYPHEN_MINUS);
+    assertTrue(UnicodeDash.isDash(0x002D));
+    assertEquals(Category.Pd, UnicodeDash.byCodePoint(0x002D).orElseThrow().category());
+  }
+
+  @Test
+  void testSupplementaryDashesArePresent() {
+    for (final int codePoint : new int[] {0x10D6E, 0x10EAD}) {
+      assertTrue(UnicodeDash.isDash(codePoint));
+      assertTrue(UnicodeDash.byCodePoint(codePoint).orElseThrow().isSupplementary());
+    }
+  }
+
+  @Test
+  void testBmpDashIsNotSupplementary() {
+    assertFalse(UnicodeDash.byCodePoint(0x2014).orElseThrow().isSupplementary());
+  }
+
+  @Test
+  void testDashToUnicodeNotation() {
+    // Four hex digits for a BMP code point, five for a supplementary one.
+    assertEquals("U+2014", UnicodeDash.byCodePoint(0x2014).orElseThrow().toUnicodeNotation());
+    assertEquals("U+10EAD", UnicodeDash.byCodePoint(0x10EAD).orElseThrow().toUnicodeNotation());
+  }
+
+  @ParameterizedTest
+  @ValueSource(ints = {0x00AD, 0x002E, 0x0041, 0x0020, 0x007E, 0x1F600})
+  void testNonDashesAreNotDashes(int codePoint) {
+    // Notably U+00AD SOFT HYPHEN is a format character, not a dash, and must not be treated as one.
+    assertFalse(UnicodeDash.isDash(codePoint));
+  }
+
+  @ParameterizedTest
+  @ValueSource(ints = {-1, Integer.MIN_VALUE, Character.MAX_CODE_POINT + 1, Integer.MAX_VALUE})
+  void testIsDashIsRangeSafe(int codePoint) {
+    // Values that are not code points at all must answer false rather than throw.
+    assertFalse(UnicodeDash.isDash(codePoint));
+  }
+
+  @Test
+  void testByCodePointUnknownIsEmpty() {
+    assertTrue(UnicodeDash.byCodePoint('A').isEmpty());
+    assertTrue(UnicodeDash.byCodePoint(0x00AD).isEmpty());
+  }
+
+  @Test
+  void testReferenceListIsImmutable() {
+    assertThrows(UnsupportedOperationException.class, () -> UnicodeDash.all().add(null));
+    assertThrows(UnsupportedOperationException.class, () -> UnicodeDash.mathematical().add(null));
+  }
+
+  @Test
+  void testArrayAccessorsReturnDefensiveCopies() {
+    // Writing into a returned array must not show up in the next array handed out.
+    final int[] all = UnicodeDash.codePoints();
+    all[0] = -1;
+    assertEquals(0x002D, UnicodeDash.codePoints()[0]);
+
+    final int[] defaults = UnicodeDash.defaultDashCodePoints();
+    defaults[0] = -1;
+    assertEquals(0x002D, UnicodeDash.defaultDashCodePoints()[0]);
+  }
+}
diff --git a/opennlp-api/src/test/java/opennlp/tools/util/normalizer/UnicodeWhitespaceTest.java b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/UnicodeWhitespaceTest.java
new file mode 100644
index 000000000..bd040efc0
--- /dev/null
+++ b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/UnicodeWhitespaceTest.java
@@ -0,0 +1,239 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.MethodSource;
+import org.junit.jupiter.params.provider.ValueSource;
+
+import opennlp.tools.util.normalizer.UnicodeWhitespace.Category;
+import opennlp.tools.util.normalizer.UnicodeWhitespace.RelatedCharacter;
+import opennlp.tools.util.normalizer.UnicodeWhitespace.WhitespaceCharacter;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Tests the {@link UnicodeWhitespace} reference tables: the whitespace set, the separate
+ * look-alike set, the line-break and non-breaking subsets, and the documented differences from
+ * {@link Character#isWhitespace(int)} and {@link Character#isSpaceChar(int)}.
+ */
+public class UnicodeWhitespaceTest {
+
+  // Argument source for the parameterized tests: every whitespace record.
+  private static List<WhitespaceCharacter> whitespace() {
+    return UnicodeWhitespace.all();
+  }
+
+  // Argument source for the parameterized tests: every look-alike record.
+  private static List<RelatedCharacter> lookalikes() {
+    return UnicodeWhitespace.lookalikes();
+  }
+
+  // Maps the JDK's Unicode general category to our enum, used as an independent oracle.
+  private static Category jdkCategory(int codePoint) {
+    return switch (Character.getType(codePoint)) {
+      case Character.CONTROL -> Category.Cc;
+      case Character.SPACE_SEPARATOR -> Category.Zs;
+      case Character.LINE_SEPARATOR -> Category.Zl;
+      case Character.PARAGRAPH_SEPARATOR -> Category.Zp;
+      case Character.FORMAT -> Category.Cf;
+      default -> null;
+    };
+  }
+
+  @Test
+  void testWhitespaceSetHasExactly25() {
+    assertEquals(25, UnicodeWhitespace.all().size());
+  }
+
+  @Test
+  void testLookalikeSetHasExactly6() {
+    assertEquals(6, UnicodeWhitespace.lookalikes().size());
+  }
+
+  @Test
+  void testRelatedCharacterExposesAttributes() {
+    // U+FEFF serves as the sample record for the look-alike attributes.
+    final var bom = UnicodeWhitespace.lookalikes().stream()
+        .filter(r -> r.codePoint() == 0xFEFF).findFirst().orElseThrow();
+    assertEquals("zero width no-break space", bom.name());
+    assertEquals("BOM", bom.abbreviation());
+    assertFalse(bom.note().isBlank());
+    assertEquals("U+FEFF", bom.toUnicodeNotation());
+  }
+
+  @ParameterizedTest
+  @MethodSource("whitespace")
+  void testEachWhitespaceCharIsSelfConsistent(WhitespaceCharacter ws) {
+    assertTrue(UnicodeWhitespace.isWhitespace(ws.codePoint()),
+        () -> ws.toUnicodeNotation() + " should be whitespace");
+    assertEquals(ws, UnicodeWhitespace.byCodePoint(ws.codePoint()).orElseThrow());
+    assertFalse(UnicodeWhitespace.isLookalike(ws.codePoint()),
+        () -> ws.toUnicodeNotation() + " must not also be a look-alike");
+    assertNotNull(ws.category());
+    assertNotNull(ws.breaking());
+    assertNotNull(ws.abbreviation());
+    assertFalse(ws.name().isBlank());
+  }
+
+  @ParameterizedTest
+  @MethodSource("whitespace")
+  void testAllWhitespaceIsInTheBmp(WhitespaceCharacter ws) {
+    // Every Unicode White_Space code point is in the Basic Multilingual Plane (one char).
+    assertTrue(ws.codePoint() <= 0xFFFF, ws::toUnicodeNotation);
+    assertEquals(1, Character.charCount(ws.codePoint()));
+  }
+
+  @ParameterizedTest
+  @MethodSource("whitespace")
+  void testCategoryMatchesJdkUnicodeData(WhitespaceCharacter ws) {
+    // Independent cross-check: our hand-entered category must agree with the JDK's UCD.
+    assertEquals(jdkCategory(ws.codePoint()), ws.category(), ws::toUnicodeNotation);
+  }
+
+  @Test
+  void testCodePointsAreUniqueAndStrictlyAscending() {
+    final int[] cps = UnicodeWhitespace.codePoints();
+    for (int i = 1; i < cps.length; i++) {
+      assertTrue(cps[i] > cps[i - 1],
+          "code points must be unique and ascending at index " + i);
+    }
+  }
+
+  @Test
+  void testCodePointsMatchAllOrder() {
+    // The primitive array must list the same code points, in the same order, as the records.
+    final int[] fromRecords = whitespace().stream().mapToInt(WhitespaceCharacter::codePoint).toArray();
+    assertArrayEqualsInt(fromRecords, UnicodeWhitespace.codePoints());
+  }
+
+  @Test
+  void testCodePointsReturnsDefensiveCopy() {
+    final int[] first = UnicodeWhitespace.codePoints();
+    first[0] = -999;
+    assertEquals(0x0009, UnicodeWhitespace.codePoints()[0]);
+  }
+
+  @ParameterizedTest
+  @MethodSource("lookalikes")
+  void testLookalikesAreNotWhitespace(RelatedCharacter related) {
+    assertFalse(UnicodeWhitespace.isWhitespace(related.codePoint()),
+        () -> related.toUnicodeNotation() + " is White_Space=no");
+    assertTrue(UnicodeWhitespace.byCodePoint(related.codePoint()).isEmpty());
+    assertTrue(UnicodeWhitespace.isLookalike(related.codePoint()));
+    // Every look-alike is a format character in the UCD.
+    assertEquals(Category.Cf, jdkCategory(related.codePoint()), related::toUnicodeNotation);
+  }
+
+  @Test
+  void testLineBreaksAreExactlyTheSeven() {
+    final Set<Integer> expected = Set.of(0x000A, 0x000B, 0x000C, 0x000D, 0x0085, 0x2028, 0x2029);
+    assertEquals(expected, UnicodeWhitespace.lineBreaks().stream()
+        .map(WhitespaceCharacter::codePoint).collect(Collectors.toSet()));
+  }
+
+  @Test
+  void testNonBreakingAreExactlyTheThree() {
+    final Set<Integer> expected = Set.of(0x00A0, 0x2007, 0x202F);
+    assertEquals(expected, UnicodeWhitespace.nonBreaking().stream()
+        .map(WhitespaceCharacter::codePoint).collect(Collectors.toSet()));
+  }
+
+  @ParameterizedTest
+  @ValueSource(ints = {0x0008, 0x000E, 0x001F, 0x0021, 0x1FFF, 0x200B, 0x202A, 0x2FFF, 0x3001})
+  void testNeighboringCodePointsAreNotWhitespace(int codePoint) {
+    assertFalse(UnicodeWhitespace.isWhitespace(codePoint),
+        () -> String.format("U+%04X must not be whitespace", codePoint));
+  }
+
+  @Test
+  void testIncludesNbspAndNelThatJavaIsWhitespaceOmits() {
+    // Documents the deliberate divergence from Character.isWhitespace.
+    assertTrue(UnicodeWhitespace.isWhitespace(0x00A0));
+    assertFalse(Character.isWhitespace(0x00A0));
+    assertTrue(UnicodeWhitespace.isWhitespace(0x0085));
+    assertFalse(Character.isWhitespace(0x0085));
+  }
+
+  @ParameterizedTest
+  @ValueSource(ints = {0x001C, 0x001D, 0x001E, 0x001F})
+  void testExcludesInfoSeparatorsThatJavaIsWhitespaceIncludes(int codePoint) {
+    // The opposite divergence: Java accepts these four code points, this table does not.
+    assertFalse(UnicodeWhitespace.isWhitespace(codePoint));
+    assertTrue(Character.isWhitespace(codePoint));
+  }
+
+  @Test
+  void testIncludesTabThatIsSpaceCharOmits() {
+    // Character.isSpaceChar excludes the control whitespace; ours includes it.
+    assertTrue(UnicodeWhitespace.isWhitespace(0x0009));
+    assertFalse(Character.isSpaceChar(0x0009));
+  }
+
+  @Test
+  void testByCodePointUnknownIsEmpty() {
+    assertTrue(UnicodeWhitespace.byCodePoint('A').isEmpty());
+    assertTrue(UnicodeWhitespace.byCodePoint(0x200B).isEmpty(), "a look-alike is not whitespace");
+  }
+
+  @ParameterizedTest
+  @ValueSource(ints = {Integer.MIN_VALUE, -1, Character.MAX_CODE_POINT + 1, Integer.MAX_VALUE})
+  void testIsWhitespaceHandlesOutOfRangeSafely(int codePoint) {
+    // Values that are not code points at all must answer false rather than throw.
+    assertFalse(UnicodeWhitespace.isWhitespace(codePoint));
+    assertFalse(UnicodeWhitespace.isLookalike(codePoint));
+  }
+
+  @Test
+  void testReferenceListsAreImmutable() {
+    assertThrows(UnsupportedOperationException.class,
+        () -> UnicodeWhitespace.all().add(null));
+    assertThrows(UnsupportedOperationException.class,
+        () -> UnicodeWhitespace.lookalikes().add(null));
+    assertThrows(UnsupportedOperationException.class,
+        () -> UnicodeWhitespace.lineBreaks().add(null));
+    assertThrows(UnsupportedOperationException.class,
+        () -> UnicodeWhitespace.nonBreaking().add(null));
+  }
+
+  @Test
+  void testToUnicodeNotationIsZeroPadded() {
+    assertEquals("U+0009", UnicodeWhitespace.byCodePoint(0x0009).orElseThrow().toUnicodeNotation());
+    assertEquals("U+00A0", UnicodeWhitespace.byCodePoint(0x00A0).orElseThrow().toUnicodeNotation());
+    assertEquals("U+3000", UnicodeWhitespace.byCodePoint(0x3000).orElseThrow().toUnicodeNotation());
+  }
+
+  @Test
+  void testLineBreakAndNonBreakingFlagsAgreeWithBreaking() {
+    // One sample per case: a line break, a non-breaking space, and the plain space.
+    final WhitespaceCharacter lf = UnicodeWhitespace.byCodePoint(0x000A).orElseThrow();
+    assertTrue(lf.isLineBreak());
+    assertFalse(lf.isNonBreaking());
+
+    final WhitespaceCharacter nbsp = UnicodeWhitespace.byCodePoint(0x00A0).orElseThrow();
+    assertTrue(nbsp.isNonBreaking());
+    assertFalse(nbsp.isLineBreak());
+
+    final WhitespaceCharacter space = UnicodeWhitespace.byCodePoint(0x0020).orElseThrow();
+    assertFalse(space.isLineBreak());
+    assertFalse(space.isNonBreaking());
+  }
+
+  // Compares the printed forms first, so a mismatch reports both arrays in full, and then
+  // re-checks the contents element by element.
+  private static void assertArrayEqualsInt(int[] expected, int[] actual) {
+    assertEquals(Arrays.toString(expected), Arrays.toString(actual));
+    assertTrue(IntStream.range(0, expected.length).allMatch(i -> expected[i] == actual[i]));
+  }
+}
diff --git a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/AbstractDL.java b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/AbstractDL.java
index 6e6e54767..5b0a14f88 100644
--- a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/AbstractDL.java
+++ b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/AbstractDL.java
@@ -23,6 +23,7 @@ import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -40,6 +41,7 @@ import ai.onnxruntime.OrtSession;
 import opennlp.tools.tokenize.BertTokenizer;
 import opennlp.tools.tokenize.Tokenizer;
 import opennlp.tools.tokenize.WordpieceTokenizer;
+import opennlp.tools.util.normalizer.CharClass;
 
 /**
  * Base class for OpenNLP deep-learning classes using ONNX Runtime.
@@ -327,6 +329,37 @@ public abstract class AbstractDL implements AutoCloseable {
     }
   }
 
+  /**
+   * Unicode-aware whitespace. Input is tokenized on the full Unicode {@code 
White_Space} set
+   * rather than the six ASCII characters Java's {@code \s} recognizes, and 
the same class is
+   * reused by subclasses that need to match against whitespace in the source 
text.
+   */
+  protected static final CharClass WHITESPACE = CharClass.whitespace();
+
+  /**
+   * Tokenizes {@code text} with {@link #WHITESPACE} and assembles the tokens into overlapping
+   * chunks that are ready for WordPiece tokenization. The tokens of a chunk are joined with a
+   * single ASCII space. Because the split works on the Unicode {@code White_Space} set, a
+   * no-break space or an ideographic space separates tokens too, and leading, trailing, or
+   * repeated whitespace produces no empty tokens.
+   *
+   * @param text The input text.
+   * @param documentSplitSize The maximum number of whitespace tokens per chunk.
+   * @param splitOverlapSize The number of tokens shared between consecutive chunks.
+   * @return The chunk strings, in order.
+   */
+  protected static List<String> whitespaceChunks(final String text, final int documentSplitSize,
+                                                 final int splitOverlapSize) {
+    final String[] tokens = WHITESPACE.split(text);
+    final List<String> chunks = new ArrayList<>();
+    for (final ChunkRange range : chunkRanges(
+        tokens.length, documentSplitSize, splitOverlapSize)) {
+      // Each range selects the window of tokens that forms one chunk.
+      final String[] window = Arrays.copyOfRange(tokens, range.start(), range.end());
+      chunks.add(String.join(" ", window));
+    }
+    return chunks;
+  }
+
+
   /**
    * Splits a token sequence into overlapping chunk ranges.
    *
diff --git 
a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java
 
b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java
index 7aa36e494..c7293fc8b 100644
--- 
a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java
+++ 
b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java
@@ -331,17 +331,10 @@ public class DocumentCategorizerDL extends AbstractDL 
implements DocumentCategor
 
     final List<Tokens> t = new LinkedList<>();
 
-    // Segment long input text into overlapping chunks configured by 
InferenceOptions before
-    // feeding each chunk into BERT.
+    // Segment long input text into overlapping chunks (split on Unicode 
whitespace) configured by
+    // InferenceOptions before feeding each chunk into BERT.
     // 
https://medium.com/analytics-vidhya/text-classification-with-bert-using-transformers-for-long-text-inputs-f54833994dfd
-    final String[] whitespaceTokenized = text.split("\\s+");
-
-    for (ChunkRange chunkRange : chunkRanges(
-        whitespaceTokenized.length, documentSplitSize, splitOverlapSize)) {
-
-      // The group is that subsection of string.
-      final String group = String.join(" ",
-          Arrays.copyOfRange(whitespaceTokenized, chunkRange.start(), 
chunkRange.end()));
+    for (final String group : whitespaceChunks(text, documentSplitSize, 
splitOverlapSize)) {
 
       // Now we can tokenize the group and continue.
       final String[] tokens = tokenizer.tokenize(group);
diff --git 
a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java
 
b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java
index e5b5c89b5..eff6b87d5 100644
--- 
a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java
+++ 
b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java
@@ -28,8 +28,6 @@ import java.util.List;
 import java.util.Map;
 import java.util.Objects;
 import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 import ai.onnxruntime.OnnxTensor;
 import ai.onnxruntime.OrtException;
@@ -356,7 +354,7 @@ public class NameFinderDL extends AbstractDL implements 
TokenNameFinder {
         continue;
       }
 
-      final SpanMatch match = findByRegex(text, spanText, characterStart, 
searchEnd);
+      final SpanMatch match = findInSource(text, spanText, characterStart, 
searchEnd);
       if (match.start() != -1) {
         spans.add(new Span(match.start(), match.end(), entityType, 
entity.probability()));
         characterStart = match.end();
@@ -567,35 +565,82 @@ public class NameFinderDL extends AbstractDL implements 
TokenNameFinder {
   /**
    * Locates reconstructed span text in a bounded region of the original input 
text.
    *
+   * <p>Matching is a single forward cursor scan, not a regular expression. 
Each space in the
+   * reconstructed span matches a run of zero or more Unicode whitespace 
characters in the source
+   * (so an entity whose WordPiece pieces were rejoined with spaces, such as 
{@code "AT & T"} for
+   * {@code "AT&T"}, is still located), and every other code point matches 
case-insensitively.
+   * Using a cursor avoids {@link java.util.regex.Pattern}/{@link 
java.util.regex.Matcher}
+   * allocation and the ReDoS surface of regular expressions, and recognizes 
Unicode whitespace
+   * that Java's {@code \s} does not.</p>
+   *
    * @param text The original text.
-   * @param span The reconstructed span text.
+   * @param span The reconstructed span text, with sub-tokens separated by 
single ASCII spaces.
    * @param searchStart The first character offset to search from.
    * @param searchEnd The exclusive upper bound of the region to search.
    * @return The matched character offsets, or {@code (-1, -1)} when the 
reconstructed text
    *     cannot be found in the requested region.
    */
-  private static SpanMatch findByRegex(String text, String span, int 
searchStart, int searchEnd) {
-
-    // Reconstructed span text normalizes whitespace, so match flexibly: a 
space in the span may
-    // map to any run of whitespace OR none in the source (e.g. 
punctuation/'&' inside "U.S.A",
-    // "AT&T" that wordpiece tokenization split apart). Use \s* rather than 
\s+ so such entities
-    // are still located instead of being silently dropped.
-    final String regex = Pattern.quote(span).replace(" ", "\\E\\s*\\Q");
+  private static SpanMatch findInSource(String text, String span, int 
searchStart, int searchEnd) {
 
-    final Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
-    final Matcher matcher = pattern.matcher(text);
     final int regionStart = Math.min(Math.max(searchStart, 0), text.length());
     final int regionEnd = Math.min(Math.max(searchEnd, regionStart), 
text.length());
-    matcher.region(regionStart, regionEnd);
 
-    if (matcher.find()) {
-      return new SpanMatch(matcher.start(), matcher.end());
+    int start = regionStart;
+    while (start < regionEnd) {
+      final int end = matchAt(text, span, start, regionEnd);
+      if (end != -1) {
+        return new SpanMatch(start, end);
+      }
+      start += Character.charCount(text.codePointAt(start));
     }
 
     return new SpanMatch(-1, -1);
 
   }
 
+  /**
+   * Attempts to match {@code span} against {@code text} beginning at {@code start} and bounded by
+   * {@code regionEnd}. A space in {@code span} consumes a run of zero or more Unicode whitespace
+   * code points in the source; every other code point must match case-insensitively. A code
+   * point is only consumed when it lies entirely before {@code regionEnd}, so the returned offset
+   * never exceeds the bound even if {@code regionEnd} falls inside a surrogate pair.
+   *
+   * @param text The original text.
+   * @param span The reconstructed span text, with sub-tokens separated by single ASCII spaces.
+   * @param start The offset in {@code text} at which the match must begin.
+   * @param regionEnd The exclusive upper bound of the region to search.
+   * @return The exclusive end offset of the match in {@code text}, or {@code -1} if no match
+   *     begins at {@code start}.
+   */
+  private static int matchAt(String text, String span, int start, int regionEnd) {
+
+    int t = start;
+    int s = 0;
+
+    while (s < span.length()) {
+      final int spanCp = span.codePointAt(s);
+      if (spanCp == ' ') {
+        // Zero or more whitespace code points: decode each one once and stay inside the region.
+        while (t < regionEnd) {
+          final int textCp = text.codePointAt(t);
+          final int width = Character.charCount(textCp);
+          if (t + width > regionEnd || !WHITESPACE.contains(textCp)) {
+            break;
+          }
+          t += width;
+        }
+        s += 1;
+      } else {
+        if (t >= regionEnd) {
+          return -1;
+        }
+        final int textCp = text.codePointAt(t);
+        final int width = Character.charCount(textCp);
+        // A surrogate pair that straddles regionEnd is not part of the searchable region.
+        if (t + width > regionEnd || !equalsIgnoreCase(spanCp, textCp)) {
+          return -1;
+        }
+        t += width;
+        s += Character.charCount(spanCp);
+      }
+    }
+
+    return t;
+
+  }
+
+  /**
+   * Compares two code points ignoring case. Both the lower-case and the upper-case mappings are
+   * consulted, so a pair that agrees under either mapping is treated as equal.
+   */
+  private static boolean equalsIgnoreCase(int a, int b) {
+    if (a == b) {
+      return true;
+    }
+    if (Character.toLowerCase(a) == Character.toLowerCase(b)) {
+      return true;
+    }
+    return Character.toUpperCase(a) == Character.toUpperCase(b);
+  }
+
   private record LabelPrediction(String label, double probability) {
   }
 
@@ -613,17 +658,10 @@ public class NameFinderDL extends AbstractDL implements 
TokenNameFinder {
 
     final List<Tokens> t = new LinkedList<>();
 
-    // Segment long input text into overlapping chunks configured by 
InferenceOptions before
-    // feeding each chunk into BERT.
+    // Segment long input text into overlapping chunks (split on Unicode 
whitespace) configured by
+    // InferenceOptions before feeding each chunk into BERT.
     // 
https://medium.com/analytics-vidhya/text-classification-with-bert-using-transformers-for-long-text-inputs-f54833994dfd
-    final String[] whitespaceTokenized = text.split("\\s+");
-
-    for (ChunkRange chunkRange : chunkRanges(
-        whitespaceTokenized.length, documentSplitSize, splitOverlapSize)) {
-
-      // The group is that subsection of string.
-      final String group = String.join(" ",
-          Arrays.copyOfRange(whitespaceTokenized, chunkRange.start(), 
chunkRange.end()));
+    for (final String group : whitespaceChunks(text, documentSplitSize, 
splitOverlapSize)) {
 
       // Now we can tokenize the group and continue.
       final String[] tokens = tokenizer.tokenize(group);
diff --git 
a/opennlp-core/opennlp-ml/opennlp-dl/src/test/java/opennlp/dl/AbstractDLChunkingTest.java
 
b/opennlp-core/opennlp-ml/opennlp-dl/src/test/java/opennlp/dl/AbstractDLChunkingTest.java
new file mode 100644
index 000000000..38ab38450
--- /dev/null
+++ 
b/opennlp-core/opennlp-ml/opennlp-dl/src/test/java/opennlp/dl/AbstractDLChunkingTest.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.dl;
+
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+/**
+ * Model-free tests for {@link AbstractDL#whitespaceChunks(String, int, int)}, 
the shared
+ * tokenize-and-chunk seam used by both {@code NameFinderDL} and {@code 
DocumentCategorizerDL}.
+ */
+public class AbstractDLChunkingTest {
+
+  @Test
+  void testSplitsOnUnicodeWhitespaceNotJustAscii() {
+    // A no-break space (U+00A0) and an ideographic space (U+3000) are not 
matched by Java's \s
+    // but must still separate tokens; the chunk is rejoined with single ASCII 
spaces.
+    final String nbsp = new String(Character.toChars(0x00A0));
+    final String ideographic = new String(Character.toChars(0x3000));
+    assertEquals(List.of("alpha beta gamma"),
+        AbstractDL.whitespaceChunks("alpha" + nbsp + "beta" + ideographic + 
"gamma", 100, 0));
+  }
+
+  @Test
+  void testDropsEmptyTokensFromLeadingTrailingAndRepeatedWhitespace() {
+    // String.split("\\s+") would keep an empty leading token for this input;
+    // the Unicode-aware split yields no empty tokens at either edge or within a run.
+    assertEquals(List.of("a b c"), AbstractDL.whitespaceChunks("  a   b\tc  ", 
100, 0));
+  }
+
+  @Test
+  void testAppliesChunkSizeWithoutOverlap() {
+    // Four tokens with a chunk size of 2 and no overlap give two disjoint chunks.
+    assertEquals(List.of("a b", "c d"), AbstractDL.whitespaceChunks("a b c d", 
2, 0));
+  }
+
+  @Test
+  void testAppliesChunkOverlap() {
+    // With an overlap of 1, consecutive chunks share one token.
+    assertEquals(List.of("a b", "b c", "c d"), AbstractDL.whitespaceChunks("a 
b c d", 2, 1));
+  }
+
+  @Test
+  void testEmptyTextYieldsNoChunks() {
+    assertEquals(List.of(), AbstractDL.whitespaceChunks("", 100, 0));
+  }
+}
diff --git 
a/opennlp-core/opennlp-ml/opennlp-dl/src/test/java/opennlp/dl/namefinder/NameFinderDLTest.java
 
b/opennlp-core/opennlp-ml/opennlp-dl/src/test/java/opennlp/dl/namefinder/NameFinderDLTest.java
index c0a8aede2..1c97e0ad1 100644
--- 
a/opennlp-core/opennlp-ml/opennlp-dl/src/test/java/opennlp/dl/namefinder/NameFinderDLTest.java
+++ 
b/opennlp-core/opennlp-ml/opennlp-dl/src/test/java/opennlp/dl/namefinder/NameFinderDLTest.java
@@ -169,7 +169,7 @@ public class NameFinderDLTest {
   void testDecodeSpansLocatesEntityWithInternalPunctuation() {
     // WordPiece splits "AT&T" into separate AT / & / T tokens, so the 
reconstructed span text
     // ("AT & T") must still be located in the contiguous source. Regression 
guard for the
-    // flexible-whitespace (\s*) matching in findByRegex.
+    // flexible-whitespace matching in findInSource (a span space matches zero 
source whitespace).
     final String text = "Buy AT&T stock";
     final String[] tokens = {"[CLS]", "Buy", "AT", "&", "T", "stock", "[SEP]"};
     final float[][] scores = {
@@ -184,6 +184,37 @@ public class NameFinderDLTest {
     assertEquals("AT&T", spans.get(0).getCoveredText(text));
   }
 
+  @Test
+  void testDecodeSpansMatchesEntitySeparatedByNoBreakSpace() {
+    // "New" and "York" are separated by U+00A0 NO-BREAK SPACE in the source. Java's \s does
+    // not cover that character, so the former regex matcher would have lost this LOC span; the
+    // Unicode-aware cursor matcher finds it, and the covered text keeps the no-break space.
+    final String noBreakSpace = Character.toString(0x00A0);
+    final String entity = "New" + noBreakSpace + "York";
+    final String text = "Visit " + entity + " today";
+    final String[] tokens = {"[CLS]", "New", "York", "[SEP]"};
+    final float[][] scores = {scoresFor(0), scoresFor(3), scoresFor(4), scoresFor(0)};
+
+    final List<Span> spans = NameFinderDL.decodeSpans(text, tokens, scores, ID_TO_LABELS);
+
+    assertEquals(1, spans.size());
+    final Span location = spans.get(0);
+    assertEquals("LOC", location.getType());
+    assertEquals(entity, location.getCoveredText(text));
+  }
+
+  @Test
+  void testDecodeSpansMatchesEntitySeparatedByIdeographicSpace() {
+    // The same scenario with U+3000 IDEOGRAPHIC SPACE, another character outside Java's \s.
+    final String ideographicSpace = Character.toString(0x3000);
+    final String entity = "New" + ideographicSpace + "York";
+    final String text = "from " + entity + " city";
+    final String[] tokens = {"[CLS]", "New", "York", "[SEP]"};
+    final float[][] scores = {scoresFor(0), scoresFor(3), scoresFor(4), scoresFor(0)};
+
+    final List<Span> spans = NameFinderDL.decodeSpans(text, tokens, scores, ID_TO_LABELS);
+
+    assertEquals(1, spans.size());
+    final Span location = spans.get(0);
+    assertEquals(entity, location.getCoveredText(text));
+  }
+
   @Test
   void testDecodeSpansDoesNotMatchBeyondSearchEnd() {
     final String text = "London was quiet. Later Paris was loud.";
diff --git 
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizer.java
 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizer.java
new file mode 100644
index 000000000..3a940b1b8
--- /dev/null
+++ 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizer.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.text.Normalizer;
+import java.util.Set;
+
+/**
+ * A {@link CharSequenceNormalizer} that removes diacritics for search and matching while
+ * staying safe for multilingual text, unlike a Latin-only ASCII folding filter.
+ *
+ * <p>The input is decomposed (NFD) and scanned once, code point by code point. A nonspacing
+ * combining mark is dropped only when the preceding base character belongs to one of the
+ * {@code foldScripts} (Latin, Greek, and Cyrillic for the shared instance); in every other
+ * script the mark is kept, because there it is essential orthography rather than decoration:
+ * removing an Indic vowel sign or a virama, an Arabic harakat, a Hebrew point, or a Thai vowel
+ * changes the word. Never strip all nonspacing marks globally. The result is recomposed (NFC)
+ * before it is returned.</p>
+ *
+ * <p>Some Latin letters are atomic and have no decomposition ({@code o} with stroke, the
+ * {@code ae}/{@code oe} ligatures, eszett, thorn, and so on). With {@code foldStrokeLetters}
+ * enabled (as in the shared instance) they are replaced by an ASCII approximation. Folding
+ * favors recall over linguistic correctness, so use it for a search/matching token and not for
+ * display or language-specific analysis.</p>
+ *
+ * <p>No regular expression is involved and no global {@code \p{Mn}} strip is performed.</p>
+ */
+public class AccentFoldCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private static final long serialVersionUID = 1L;
+
+  private static final Set<Character.UnicodeScript> DEFAULT_SCRIPTS = Set.of(
+      Character.UnicodeScript.LATIN,
+      Character.UnicodeScript.GREEK,
+      Character.UnicodeScript.CYRILLIC);
+
+  private static final AccentFoldCharSequenceNormalizer INSTANCE =
+      new AccentFoldCharSequenceNormalizer(DEFAULT_SCRIPTS, true);
+
+  private final Set<Character.UnicodeScript> foldScripts;
+  private final boolean foldStrokeLetters;
+
+  /**
+   * Creates a folder.
+   *
+   * @param foldScripts The scripts in which marks following a base character are removed; marks
+   *     in every other script are kept. The set is copied.
+   * @param foldStrokeLetters Whether atomic Latin letters such as the stroke letters and
+   *     ligatures are replaced by an ASCII approximation.
+   */
+  public AccentFoldCharSequenceNormalizer(Set<Character.UnicodeScript> foldScripts,
+                                          boolean foldStrokeLetters) {
+    this.foldScripts = Set.copyOf(foldScripts);
+    this.foldStrokeLetters = foldStrokeLetters;
+  }
+
+  /** {@return the shared instance that folds Latin, Greek, and Cyrillic and maps the stroke
+   *     letters} */
+  public static AccentFoldCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  @Override
+  public CharSequence normalize(CharSequence text) {
+    final String nfd = Normalizer.normalize(text, Normalizer.Form.NFD);
+    final StringBuilder folded = new StringBuilder(nfd.length());
+
+    // Script of the most recent non-mark code point; null until one has been seen.
+    Character.UnicodeScript baseScript = null;
+    int index = 0;
+    while (index < nfd.length()) {
+      final int cp = nfd.codePointAt(index);
+      index += Character.charCount(cp);
+
+      if (Character.getType(cp) == Character.NON_SPACING_MARK) {
+        // A mark is removed only when its base character belongs to a folded script.
+        final boolean drop = baseScript != null && foldScripts.contains(baseScript);
+        if (!drop) {
+          folded.appendCodePoint(cp);
+        }
+        continue;
+      }
+
+      final String ascii = foldStrokeLetters ? strokeLetter(cp) : null;
+      if (ascii == null) {
+        folded.appendCodePoint(cp);
+        baseScript = Character.UnicodeScript.of(cp);
+      } else {
+        // The replacement is ASCII, so marks that follow are judged against LATIN.
+        folded.append(ascii);
+        baseScript = Character.UnicodeScript.LATIN;
+      }
+    }
+
+    return Normalizer.normalize(folded, Normalizer.Form.NFC);
+  }
+
+  // ASCII approximations for atomic Latin letters that NFD leaves intact; null for anything else.
+  private static String strokeLetter(int codePoint) {
+    return switch (codePoint) {
+      case 0x00F8 -> "o";           // o with stroke
+      case 0x00D8 -> "O";           // O with stroke
+      case 0x00E6 -> "ae";          // ae ligature
+      case 0x00C6 -> "AE";          // AE ligature
+      case 0x0153 -> "oe";          // oe ligature
+      case 0x0152 -> "OE";          // OE ligature
+      case 0x00DF -> "ss";          // eszett
+      case 0x1E9E -> "SS";          // capital eszett
+      case 0x00FE -> "th";          // thorn
+      case 0x00DE -> "TH";          // capital thorn
+      case 0x00F0, 0x0111 -> "d";   // eth, d with stroke
+      case 0x00D0, 0x0110 -> "D";   // capital eth, D with stroke
+      case 0x0142 -> "l";           // l with stroke
+      case 0x0141 -> "L";           // L with stroke
+      case 0x0127 -> "h";           // h with stroke
+      case 0x0126 -> "H";           // H with stroke
+      case 0x0131 -> "i";           // dotless i
+      default -> null;
+    };
+  }
+}
diff --git 
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/CaseFoldCharSequenceNormalizer.java
 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/CaseFoldCharSequenceNormalizer.java
new file mode 100644
index 000000000..176dd108b
--- /dev/null
+++ 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/CaseFoldCharSequenceNormalizer.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.Locale;
+
+/**
+ * A {@link CharSequenceNormalizer} that lower cases its input under {@link Locale#ROOT}, giving
+ * case-insensitive matching that does not depend on the JVM's default locale.
+ *
+ * <p>It serves as the case-folding step of a search / BM25 analysis chain (the counterpart to
+ * Lucene's lower-case filter). Using {@code Locale.ROOT} sidesteps locale surprises such as the
+ * Turkish dotless-i mapping; callers that need language-specific case rules should fold with an
+ * explicit locale upstream. Full Unicode case folding (for example German eszett,
+ * {@code U+00DF}, to {@code ss}) is a distinct, heavier transform and is intentionally out of
+ * scope here.</p>
+ */
+public class CaseFoldCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private static final long serialVersionUID = 1L;
+
+  private static final CaseFoldCharSequenceNormalizer INSTANCE =
+      new CaseFoldCharSequenceNormalizer();
+
+  /** {@return the shared, stateless instance} */
+  public static CaseFoldCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  @Override
+  public CharSequence normalize(CharSequence text) {
+    final String source = text.toString();
+    return source.toLowerCase(Locale.ROOT);
+  }
+}
diff --git 
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DashCharSequenceNormalizer.java
 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DashCharSequenceNormalizer.java
new file mode 100644
index 000000000..31237e73f
--- /dev/null
+++ 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DashCharSequenceNormalizer.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+/**
+ * A {@link CharSequenceNormalizer} that replaces every Unicode dash with the ASCII hyphen-minus
+ * ({@code U+002D}) by delegating to the cursor based {@link CharClass#dashes()} engine.
+ *
+ * <p>The many dash code points (en dash, em dash, figure dash, non-breaking hyphen, fullwidth
+ * hyphen, and so on) collapse to one form, so {@code "state-of-the-art"} matches whichever dash
+ * the source used. The mathematical minus signs are left untouched by default, and
+ * {@code U+00AD} SOFT HYPHEN (a format character) is not treated as a dash.</p>
+ */
+public class DashCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private static final long serialVersionUID = 1L;
+
+  private static final CharClass DASHES = CharClass.dashes();
+
+  private static final DashCharSequenceNormalizer INSTANCE = new DashCharSequenceNormalizer();
+
+  /** {@return the shared, stateless instance} */
+  public static DashCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  @Override
+  public CharSequence normalize(CharSequence text) {
+    // All of the work happens in the shared dash character class.
+    final CharSequence hyphenated = DASHES.normalize(text);
+    return hyphenated;
+  }
+}
diff --git 
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/NfcCharSequenceNormalizer.java
 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/NfcCharSequenceNormalizer.java
new file mode 100644
index 000000000..72d25d93b
--- /dev/null
+++ 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/NfcCharSequenceNormalizer.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.text.Normalizer;
+
+/**
+ * A {@link CharSequenceNormalizer} that converts text to Unicode Normalization Form C
+ * (canonical composition, UAX #15).
+ *
+ * <p>NFC is the safe baseline for matching and is lossless under canonical equivalence: the
+ * precomposed and the decomposed spelling of the same text (for example {@code U+00E9} versus
+ * {@code e} followed by a combining acute accent) become identical, so equal text compares
+ * equal however it was encoded. It changes no characters' meaning and is the W3C-recommended
+ * interchange form.</p>
+ */
+public class NfcCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private static final long serialVersionUID = 1L;
+
+  private static final NfcCharSequenceNormalizer INSTANCE = new NfcCharSequenceNormalizer();
+
+  /** {@return the shared, stateless instance} */
+  public static NfcCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  @Override
+  public CharSequence normalize(CharSequence text) {
+    final Normalizer.Form form = Normalizer.Form.NFC;
+    return Normalizer.normalize(text, form);
+  }
+}
diff --git 
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/NfkcCharSequenceNormalizer.java
 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/NfkcCharSequenceNormalizer.java
new file mode 100644
index 000000000..c95568fab
--- /dev/null
+++ 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/NfkcCharSequenceNormalizer.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.text.Normalizer;
+
+/**
+ * A {@link CharSequenceNormalizer} that converts text to Unicode Normalization Form KC
+ * (compatibility composition, UAX #15).
+ *
+ * <p>NFKC replaces compatibility variants with their plain equivalents: fullwidth and halfwidth
+ * letters, the {@code U+FB01} ligature to {@code fi}, and super/subscript digits to plain
+ * digits. It is more aggressive than {@link NfcCharSequenceNormalizer NFC} and is lossy (it can
+ * change a character's appearance or meaning, e.g. a superscript two becomes a plain
+ * {@code 2}), so it is a deliberate choice for search/recall rather than a safe default.</p>
+ */
+public class NfkcCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private static final long serialVersionUID = 1L;
+
+  private static final NfkcCharSequenceNormalizer INSTANCE = new NfkcCharSequenceNormalizer();
+
+  /** {@return the shared, stateless instance} */
+  public static NfkcCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  @Override
+  public CharSequence normalize(CharSequence text) {
+    final Normalizer.Form form = Normalizer.Form.NFKC;
+    return Normalizer.normalize(text, form);
+  }
+}
diff --git 
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/WhitespaceCharSequenceNormalizer.java
 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/WhitespaceCharSequenceNormalizer.java
new file mode 100644
index 000000000..affa82745
--- /dev/null
+++ 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/WhitespaceCharSequenceNormalizer.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+/**
+ * A {@link CharSequenceNormalizer} that collapses each run of Unicode
+ * whitespace to a single ASCII space and trims the edges, reusing the
+ * cursor-based {@link CharClass#whitespace()} engine.
+ *
+ * <p>Unlike a {@code \s} regular expression, this recognizes the full Unicode
+ * {@code White_Space} set (no-break space, ideographic space, the typographic
+ * spaces, line and paragraph separators, and so on), so spacing copied from
+ * the web, PDFs, or non-Latin sources normalizes consistently. It is the
+ * Unicode-aware, regex-free counterpart to
+ * {@link ShrinkCharSequenceNormalizer}.</p>
+ *
+ * <p>The class declares no instance fields; {@link #getInstance()} hands out
+ * one shared instance.</p>
+ */
+public class WhitespaceCharSequenceNormalizer implements 
CharSequenceNormalizer {
+
+  // Explicit serialization version id.
+  // NOTE(review): presumably CharSequenceNormalizer is Serializable - confirm.
+  private static final long serialVersionUID = 1L;
+
+  // Whitespace character class, looked up once and reused by every call.
+  private static final CharClass WHITESPACE = CharClass.whitespace();
+
+  // Created once when the class is initialized; returned by getInstance().
+  private static final WhitespaceCharSequenceNormalizer INSTANCE =
+      new WhitespaceCharSequenceNormalizer();
+
+  /** {@return the shared, stateless instance} */
+  public static WhitespaceCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  /**
+   * Collapses whitespace runs in {@code text} and then trims the result.
+   *
+   * <p>{@code CharClass.collapse} runs first and its output is passed to
+   * {@code CharClass.trim}; both are invoked on the shared whitespace
+   * class.</p>
+   *
+   * @param text the character sequence to normalize
+   * @return the collapsed and trimmed sequence
+   */
+  @Override
+  public CharSequence normalize(CharSequence text) {
+    return WHITESPACE.trim(WHITESPACE.collapse(text));
+  }
+}
diff --git 
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizerTest.java
 
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizerTest.java
new file mode 100644
index 000000000..ba4a6ea4b
--- /dev/null
+++ 
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizerTest.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.Set;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertSame;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Tests for {@link AccentFoldCharSequenceNormalizer}: accent folding on the
+ * shared default instance, the explicit stroke/ligature replacements, the
+ * marks that must be left alone, and the two constructor arguments.
+ */
+public class AccentFoldCharSequenceNormalizerTest {
+
+  /** Builds a string that holds exactly the given code point. */
+  private static String cp(int codePoint) {
+    return new String(Character.toChars(codePoint));
+  }
+
+  /** Runs {@code text} through the shared default instance. */
+  private static String fold(String text) {
+    return 
AccentFoldCharSequenceNormalizer.getInstance().normalize(text).toString();
+  }
+
+  /** Accented Latin letters come back as their unaccented base letters. */
+  @Test
+  void testFoldsLatinAccents() {
+    assertEquals("cafe", fold("caf" + cp(0x00E9)));        // cafe with acute e
+    assertEquals("naive", fold("na" + cp(0x00EF) + "ve")); // naive with diaeresis i
+    assertEquals("Muller", fold("M" + cp(0x00FC) + "ller")); // Muller with umlaut u
+    assertEquals("anos", fold("a" + cp(0x00F1) + "os"));   // anos with tilde n
+  }
+
+  /**
+   * Stroke letters, ligatures, eszett, thorn and dotless i are replaced by
+   * the ASCII spellings asserted below when the default instance is used.
+   */
+  @Test
+  void testMapsStrokeAndLigatureLetters() {
+    assertEquals("o", fold(cp(0x00F8)));   // o with stroke
+    assertEquals("ae", fold(cp(0x00E6)));  // ae ligature
+    assertEquals("oe", fold(cp(0x0153)));  // oe ligature
+    assertEquals("Strasse", fold("Stra" + cp(0x00DF) + "e")); // eszett
+    assertEquals("th", fold(cp(0x00FE)));  // thorn
+    assertEquals("l", fold(cp(0x0142)));   // l with stroke
+    assertEquals("i", fold(cp(0x0131)));   // dotless i
+  }
+
+  /** Greek and Cyrillic letters lose their marks but keep their script. */
+  @Test
+  void testFoldsGreekAndCyrillicAccents() {
+    assertEquals(cp(0x03B1), fold(cp(0x03AC))); // Greek alpha with tonos -> alpha
+    assertEquals(cp(0x0438), fold(cp(0x0439))); // Cyrillic short i -> i
+  }
+
+  /** Plain ASCII input is returned with the same content. */
+  @Test
+  void testLeavesAsciiUnchanged() {
+    assertEquals("hello world", fold("hello world"));
+  }
+
+  /**
+   * The default instance must return Devanagari, Arabic and Hebrew input
+   * unchanged, including the combining marks.
+   */
+  @Test
+  void testDoesNotTouchDevanagariArabicOrHebrewMarks() {
+    // The critical guard: marks on non-folded scripts are essential
+    // orthography and must survive.
+    final String devanagari = cp(0x0915) + cp(0x093E); // ka + aa vowel sign
+    assertEquals(devanagari, fold(devanagari));
+
+    final String arabic = cp(0x0628) + cp(0x064E);     // beh + fatha (a nonspacing mark)
+    assertEquals(arabic, fold(arabic));
+    assertTrue(fold(arabic).indexOf(0x064E) >= 0, "the Arabic fatha must not 
be stripped");
+
+    final String hebrew = cp(0x05D0) + cp(0x05B8);     // alef + qamats (a nonspacing mark)
+    assertEquals(hebrew, fold(hebrew));
+    assertTrue(fold(hebrew).indexOf(0x05B8) >= 0, "the Hebrew point must not 
be stripped");
+  }
+
+  /**
+   * The first constructor argument selects the scripts whose marks are
+   * folded: an empty set folds nothing, and adding ARABIC strips the fatha.
+   */
+  @Test
+  void testScriptScopeIsConfigurable() {
+    // With no folded scripts, Latin accents are preserved.
+    final AccentFoldCharSequenceNormalizer none =
+        new AccentFoldCharSequenceNormalizer(Set.of(), false);
+    assertEquals("caf" + cp(0x00E9), none.normalize("caf" + 
cp(0x00E9)).toString());
+
+    // Widening the scope to Arabic folds an Arabic mark that the default
+    // leaves untouched.
+    final AccentFoldCharSequenceNormalizer arabicToo =
+        new 
AccentFoldCharSequenceNormalizer(Set.of(Character.UnicodeScript.ARABIC), false);
+    assertEquals(cp(0x0628), arabicToo.normalize(cp(0x0628) + 
cp(0x064E)).toString());
+  }
+
+  /**
+   * With {@code false} as the second constructor argument the eszett is
+   * returned unchanged even though LATIN is in the folded set.
+   * NOTE(review): presumably that flag toggles the stroke/ligature mapping;
+   * confirm against the constructor's documentation.
+   */
+  @Test
+  void testStrokeLetterMappingIsConfigurable() {
+    final AccentFoldCharSequenceNormalizer noStroke =
+        new 
AccentFoldCharSequenceNormalizer(Set.of(Character.UnicodeScript.LATIN), false);
+    assertEquals(cp(0x00DF), noStroke.normalize(cp(0x00DF)).toString()); // eszett kept as-is
+  }
+
+  /**
+   * Chains case folding and then accent folding through
+   * {@link AggregateCharSequenceNormalizer} and checks the combined output.
+   */
+  @Test
+  void testComposesAfterCaseFold() {
+    final CharSequenceNormalizer pipeline = new 
AggregateCharSequenceNormalizer(
+        CaseFoldCharSequenceNormalizer.getInstance(),
+        AccentFoldCharSequenceNormalizer.getInstance());
+    assertEquals("cafe", pipeline.normalize("CAF" + cp(0x00C9)).toString()); // CAFE with acute E
+  }
+
+  /** Two calls to {@code getInstance()} return the same object. */
+  @Test
+  void testInstanceIsSharedSingleton() {
+    assertSame(AccentFoldCharSequenceNormalizer.getInstance(),
+        AccentFoldCharSequenceNormalizer.getInstance());
+  }
+}
diff --git 
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/UnicodeCharSequenceNormalizerTest.java
 
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/UnicodeCharSequenceNormalizerTest.java
new file mode 100644
index 000000000..7a700739f
--- /dev/null
+++ 
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/UnicodeCharSequenceNormalizerTest.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertSame;
+
+/**
+ * Tests for the {@link CharClass}-backed and Unicode-normalization
+ * {@link CharSequenceNormalizer} implementations, and their composition
+ * through {@link AggregateCharSequenceNormalizer}.
+ */
+public class UnicodeCharSequenceNormalizerTest {
+
+  /** Builds a string that holds exactly the given code point. */
+  private static String cp(int codePoint) {
+    return new String(Character.toChars(codePoint));
+  }
+
+  /**
+   * Leading and trailing whitespace is removed and an inner run of two
+   * U+00A0 characters becomes one ASCII space; U+3000 at the end is trimmed
+   * together with the ASCII spaces.
+   */
+  @Test
+  void testWhitespaceCollapsesUnicodeRunsAndTrims() {
+    final String input = "  a" + cp(0x00A0) + cp(0x00A0) + "b" + cp(0x3000) + 
"  ";
+    assertEquals("a b",
+        
WhitespaceCharSequenceNormalizer.getInstance().normalize(input).toString());
+  }
+
+  /**
+   * U+2014 is folded to an ASCII hyphen, while input containing U+2212 is
+   * returned unchanged.
+   */
+  @Test
+  void testDashFoldsUnicodeDashesButNotMathMinus() {
+    assertEquals("a-b",
+        DashCharSequenceNormalizer.getInstance().normalize("a" + cp(0x2014) + 
"b").toString());
+    final String math = "5" + cp(0x2212) + "3";
+    assertEquals(math, 
DashCharSequenceNormalizer.getInstance().normalize(math).toString());
+  }
+
+  /** A base letter followed by a combining mark composes to one code point. */
+  @Test
+  void testNfcComposesDecomposedSequences() {
+    // "e" + combining acute accent -> the precomposed letter U+00E9.
+    assertEquals(cp(0x00E9),
+        NfcCharSequenceNormalizer.getInstance().normalize("e" + 
cp(0x0301)).toString());
+  }
+
+  /** U+FF21 becomes ASCII {@code A} and U+FB01 becomes {@code fi}. */
+  @Test
+  void testNfkcFoldsCompatibilityForms() {
+    assertEquals("A",
+        
NfkcCharSequenceNormalizer.getInstance().normalize(cp(0xFF21)).toString());
+    assertEquals("fi",
+        
NfkcCharSequenceNormalizer.getInstance().normalize(cp(0xFB01)).toString());
+  }
+
+  /** Case folding lowercases ASCII and accented letters alike. */
+  @Test
+  void testCaseFoldLowercasesIndependentOfLocale() {
+    assertEquals("abc", 
CaseFoldCharSequenceNormalizer.getInstance().normalize("ABC").toString());
+    // Accents are preserved; only case changes (CAFE-acute -> cafe-acute).
+    assertEquals("caf" + cp(0x00E9),
+        CaseFoldCharSequenceNormalizer.getInstance().normalize("CAF" + 
cp(0x00C9)).toString());
+  }
+
+  /** Each normalizer's {@code getInstance()} returns the same object twice. */
+  @Test
+  void testInstancesAreSharedSingletons() {
+    assertSame(WhitespaceCharSequenceNormalizer.getInstance(),
+        WhitespaceCharSequenceNormalizer.getInstance());
+    assertSame(DashCharSequenceNormalizer.getInstance(),
+        DashCharSequenceNormalizer.getInstance());
+    assertSame(NfcCharSequenceNormalizer.getInstance(),
+        NfcCharSequenceNormalizer.getInstance());
+    assertSame(NfkcCharSequenceNormalizer.getInstance(),
+        NfkcCharSequenceNormalizer.getInstance());
+    assertSame(CaseFoldCharSequenceNormalizer.getInstance(),
+        CaseFoldCharSequenceNormalizer.getInstance());
+  }
+
+  /**
+   * Builds a three-stage aggregate and checks the end-to-end result for an
+   * input wrapped in U+00A0 with a U+2014 between the two letters.
+   */
+  @Test
+  void testComposeIntoAUnifiedPipeline() {
+    // NFC, then Unicode whitespace, then dash folding, applied in order
+    // through the aggregate.
+    final CharSequenceNormalizer pipeline = new 
AggregateCharSequenceNormalizer(
+        NfcCharSequenceNormalizer.getInstance(),
+        WhitespaceCharSequenceNormalizer.getInstance(),
+        DashCharSequenceNormalizer.getInstance());
+
+    final String input = cp(0x00A0) + "a" + cp(0x2014) + "b" + cp(0x00A0);
+    assertEquals("a-b", pipeline.normalize(input).toString());
+  }
+}

(opennlp) 02/05: OPENNLP-1850 - Add robust character sequence normalization utilities and tests

Reply via email to