bullet normalizers, the TextNormalizer pipeline, and offset-preserving TextAnalyzer

kristian Thu, 18 Jun 2026 22:14:12 -0700

This is an automated email from the ASF dual-hosted git repository.

krickert pushed a commit to branch OPENNLP-1850_Whitespace-UTF-Normalizae
in repository https://gitbox.apache.org/repos/asf/opennlp.git


commit ab15d7523bc768a39d46e3b291417cb8220501e3
Author: Kristian Rickert <[email protected]>
AuthorDate: Thu Jun 18 23:12:04 2026 -0400

    OPENNLP-1850 - Add quote/digit/invisible/ellipsis/bullet normalizers, the TextNormalizer pipeline, and offset-preserving TextAnalyzer
    
    Adds quote, digit, decimal, invisible-control, ellipsis, and bullet normalizers,
    all reusing the cursor-based CharClass engine (O(1) membership, no regex).
    TextNormalizer is a fluent builder that composes the rungs into an
    AggregateCharSequenceNormalizer, with a conservative searchDefault() chain.
    TextAnalyzer/AnalyzedToken tokenize and normalize per token while keeping each
    token's source span, the offset-preserving building block for BM25 matching.
---
 .../tools/util/normalizer/AnalyzedToken.java       |  34 +++++
 .../tools/util/normalizer/TextAnalyzer.java        |  93 ++++++++++++
 .../tools/util/normalizer/TextAnalyzerTest.java    | 102 +++++++++++++
 .../normalizer/BulletCharSequenceNormalizer.java   |  51 +++++++
 .../normalizer/DigitCharSequenceNormalizer.java    |  57 +++++++
 .../normalizer/EllipsisCharSequenceNormalizer.java |  60 ++++++++
 .../InvisibleCharSequenceNormalizer.java           |  71 +++++++++
 .../normalizer/QuoteCharSequenceNormalizer.java    |  69 +++++++++
 .../tools/util/normalizer/TextNormalizer.java      | 142 ++++++++++++++++++
 .../AccentFoldCharSequenceNormalizerTest.java      |  30 ++++
 .../util/normalizer/SetBasedNormalizerTest.java    | 163 +++++++++++++++++++++
 .../tools/util/normalizer/TextNormalizerTest.java  |  77 ++++++++++
 12 files changed, 949 insertions(+)

diff --git 
a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/AnalyzedToken.java 
b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/AnalyzedToken.java
new file mode 100644
index 000000000..389146596
--- /dev/null
+++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/AnalyzedToken.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import opennlp.tools.util.Span;
+
+/**
+ * The result of analyzing a single token: where it sits in the source text, what it looked like
+ * there, and the normalized form that matching or indexing should use.
+ *
+ * <p>Because {@link #span()} always refers to the original text, a search hit on
+ * {@link #normalized()} can be highlighted in the source even when normalization changed the
+ * token's length (for example after diacritic folding).</p>
+ *
+ * @param span The character span of the token in the source text.
+ * @param original The token text exactly as it appears in the source.
+ * @param normalized The normalized token text, i.e. the match/index form.
+ */
+public record AnalyzedToken(Span span, String original, String normalized) {
+}
diff --git 
a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/TextAnalyzer.java 
b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/TextAnalyzer.java
new file mode 100644
index 000000000..7e8ce8d77
--- /dev/null
+++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/TextAnalyzer.java
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+import opennlp.tools.util.Span;
+
+/**
+ * Tokenizes text and normalizes it token by token while remembering where each token came from.
+ *
+ * <p>Token boundaries come from a {@link CharClass} splitter and every token's text is passed
+ * through a {@link CharSequenceNormalizer}. Each resulting {@link AnalyzedToken} carries the
+ * source {@link Span} next to the normalized form, so a match on the normalized term can be
+ * reported and highlighted against the original text even when normalization changes the
+ * token's length. This makes the class an offset-preserving building block for search and
+ * BM25-style matching.</p>
+ */
+public final class TextAnalyzer {
+
+  private final CharClass splitter;
+  private final CharSequenceNormalizer normalizer;
+
+  /**
+   * Creates an analyzer from a token delimiter class and a per-token normalizer.
+   *
+   * @param splitter The character class whose members delimit tokens (typically
+   *     {@link CharClass#whitespace()}).
+   * @param normalizer The normalizer applied to each token.
+   * @throws NullPointerException if either argument is {@code null}.
+   */
+  public TextAnalyzer(CharClass splitter, CharSequenceNormalizer normalizer) {
+    Objects.requireNonNull(splitter, "splitter");
+    Objects.requireNonNull(normalizer, "normalizer");
+    this.splitter = splitter;
+    this.normalizer = normalizer;
+  }
+
+  /**
+   * Creates an analyzer whose tokens are delimited by {@link CharClass#whitespace()}.
+   *
+   * @param normalizer The normalizer applied to each token.
+   * @return The analyzer.
+   */
+  public static TextAnalyzer whitespace(CharSequenceNormalizer normalizer) {
+    final CharClass whitespaceSplitter = CharClass.whitespace();
+    return new TextAnalyzer(whitespaceSplitter, normalizer);
+  }
+
+  /**
+   * Splits {@code text} into tokens and normalizes each of them.
+   *
+   * @param text The text to analyze.
+   * @return The analyzed tokens in source order, each with its source span, its original text,
+   *     and its normalized form.
+   * @throws NullPointerException if {@code text} is {@code null}.
+   */
+  public List<AnalyzedToken> analyze(CharSequence text) {
+    Objects.requireNonNull(text, "text");
+    final List<AnalyzedToken> result = new ArrayList<>();
+    for (final Span tokenSpan : splitter.splitSpans(text)) {
+      // The span indexes into the caller's text, so the original form is cut straight from it.
+      final CharSequence covered = text.subSequence(tokenSpan.getStart(), tokenSpan.getEnd());
+      final String source = covered.toString();
+      final CharSequence folded = normalizer.normalize(source);
+      result.add(new AnalyzedToken(tokenSpan, source, folded.toString()));
+    }
+    return result;
+  }
+
+  /**
+   * Splits {@code text} into tokens and returns just their normalized forms.
+   *
+   * @param text The text to analyze.
+   * @return The normalized token terms, in source order.
+   * @throws NullPointerException if {@code text} is {@code null}.
+   */
+  public List<String> terms(CharSequence text) {
+    final List<AnalyzedToken> analyzed = analyze(text);
+    final List<String> normalizedTerms = new ArrayList<>(analyzed.size());
+    analyzed.forEach(token -> normalizedTerms.add(token.normalized()));
+    return normalizedTerms;
+  }
+}
diff --git 
a/opennlp-api/src/test/java/opennlp/tools/util/normalizer/TextAnalyzerTest.java 
b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/TextAnalyzerTest.java
new file mode 100644
index 000000000..77decf860
--- /dev/null
+++ 
b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/TextAnalyzerTest.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.List;
+import java.util.Locale;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Tests for {@link TextAnalyzer}: span preservation, per-token normalization, splitting on
+ * Unicode whitespace, supplementary code points, and {@code null} rejection.
+ */
+public class TextAnalyzerTest {
+
+  private static final CharSequenceNormalizer LOWER = s -> s.toString().toLowerCase(Locale.ROOT);
+
+  /** Returns the string consisting of the single given code point. */
+  private static String cp(int codePoint) {
+    return new String(Character.toChars(codePoint));
+  }
+
+  @Test
+  void testAnalyzePreservesSpansAndNormalizesTokens() {
+    final String text = "Hello WORLD";
+    final List<AnalyzedToken> tokens = TextAnalyzer.whitespace(LOWER).analyze(text);
+
+    assertEquals(2, tokens.size());
+    assertEquals(0, tokens.get(0).span().getStart());
+    assertEquals(5, tokens.get(0).span().getEnd());
+    assertEquals("Hello", tokens.get(0).original());
+    assertEquals("hello", tokens.get(0).normalized());
+    assertEquals("Hello", tokens.get(0).span().getCoveredText(text).toString());
+    // The second token must carry offsets into the whole text, not offsets relative to itself.
+    assertEquals(6, tokens.get(1).span().getStart());
+    assertEquals(11, tokens.get(1).span().getEnd());
+    assertEquals("WORLD", tokens.get(1).original());
+    assertEquals("world", tokens.get(1).normalized());
+  }
+
+  @Test
+  void testSpanStaysCorrectWhenNormalizedLengthChanges() {
+    // The normalizer grows every token by two characters; the spans must not follow suit.
+    final CharSequenceNormalizer bracket = s -> "[" + s + "]";
+    final String text = "ab cd";
+    final List<AnalyzedToken> tokens = TextAnalyzer.whitespace(bracket).analyze(text);
+
+    assertEquals("[ab]", tokens.get(0).normalized());
+    assertEquals(0, tokens.get(0).span().getStart());
+    assertEquals(2, tokens.get(0).span().getEnd());
+    assertEquals(3, tokens.get(1).span().getStart());
+    assertEquals(5, tokens.get(1).span().getEnd());
+  }
+
+  @Test
+  void testSplitsOnUnicodeWhitespace() {
+    // U+00A0 NO-BREAK SPACE is not ASCII whitespace but must still delimit tokens.
+    final String text = "alpha" + cp(0x00A0) + "beta";
+    final List<AnalyzedToken> tokens = TextAnalyzer.whitespace(LOWER).analyze(text);
+
+    assertEquals(2, tokens.size());
+    assertEquals("alpha", tokens.get(0).normalized());
+    assertEquals("beta", tokens.get(1).normalized());
+  }
+
+  @Test
+  void testSupplementaryTokenIsKeptIntact() {
+    final String emoji = cp(0x1F600);
+    final String text = "a " + emoji + " b";
+    final List<AnalyzedToken> tokens = TextAnalyzer.whitespace(LOWER).analyze(text);
+
+    assertEquals(3, tokens.size());
+    final AnalyzedToken emojiToken = tokens.get(1);
+    assertEquals(emoji, emojiToken.original());
+    // The surrogate pair must survive as one code point rather than being cut in half.
+    assertTrue(Character.isSupplementaryCodePoint(emojiToken.original().codePointAt(0)));
+    // assertEquals (not assertTrue on ==) so a failure reports the actual span length.
+    assertEquals(emoji.length(), emojiToken.span().getEnd() - emojiToken.span().getStart());
+  }
+
+  @Test
+  void testTermsReturnsNormalizedFormsOnly() {
+    assertEquals(List.of("a", "b", "c"), TextAnalyzer.whitespace(LOWER).terms("A B C"));
+  }
+
+  @Test
+  void testEmptyAndWhitespaceOnlyYieldNoTokens() {
+    assertEquals(List.of(), TextAnalyzer.whitespace(LOWER).analyze(""));
+    assertEquals(List.of(), TextAnalyzer.whitespace(LOWER).analyze("   "));
+  }
+
+  @Test
+  void testRejectsNullArguments() {
+    assertThrows(NullPointerException.class, () -> new TextAnalyzer(null, LOWER));
+    assertThrows(NullPointerException.class,
+        () -> new TextAnalyzer(CharClass.whitespace(), null));
+    assertThrows(NullPointerException.class,
+        () -> TextAnalyzer.whitespace(LOWER).analyze(null));
+  }
+}
diff --git 
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/BulletCharSequenceNormalizer.java
 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/BulletCharSequenceNormalizer.java
new file mode 100644
index 000000000..a58a45c53
--- /dev/null
+++ 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/BulletCharSequenceNormalizer.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+/**
+ * A {@link CharSequenceNormalizer} that turns unambiguous list-bullet characters into a space,
+ * so that a bullet separates tokens instead of clinging to the word that follows it.
+ *
+ * <p>The bullets are held in a {@link CharClass}, which performs the substitution. The middle
+ * dot ({@code U+00B7}) is deliberately <em>not</em> a member: it is part of ordinary words in
+ * Catalan (the geminated l, written as an l, a middle dot, and another l) and in other
+ * orthographies. Only characters that are unambiguously list bullets are replaced.</p>
+ */
+public class BulletCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private static final long serialVersionUID = 1L;
+
+  /** The list-bullet code points, paired with U+0020 SPACE as their replacement. */
+  private static final CharClass BULLETS = CharClass.of(
+      CodePointSet.of(
+          0x2022,   // bullet
+          0x2023,   // triangular bullet
+          0x2043,   // hyphen bullet
+          0x2219,   // bullet operator
+          0x25E6),  // white bullet
+      0x0020);
+
+  private static final BulletCharSequenceNormalizer INSTANCE =
+      new BulletCharSequenceNormalizer();
+
+  /** {@return the shared, stateless instance} */
+  public static BulletCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  /**
+   * Replaces each list bullet in {@code text} with a space.
+   *
+   * @param text The text to normalize.
+   * @return The text as produced by the bullet {@link CharClass}.
+   */
+  @Override
+  public CharSequence normalize(CharSequence text) {
+    return BULLETS.normalize(text);
+  }
+}
diff --git 
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DigitCharSequenceNormalizer.java
 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DigitCharSequenceNormalizer.java
new file mode 100644
index 000000000..90ee0d3d1
--- /dev/null
+++ 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DigitCharSequenceNormalizer.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+/**
+ * A {@link CharSequenceNormalizer} that rewrites every Unicode decimal digit as its ASCII
+ * counterpart, so that for example Arabic-Indic, Devanagari, or fullwidth digits all become
+ * {@code 0}-{@code 9}.
+ *
+ * <p>A code point is rewritten exactly when {@link Character#digit(int, int)} yields a
+ * non-negative value for radix ten. Other numeric forms (Roman numerals, superscripts, circled
+ * numbers, fractions) are not decimal digits and pass through unchanged. The text is scanned
+ * once, code point by code point, without any regular expression.</p>
+ */
+public class DigitCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private static final long serialVersionUID = 1L;
+
+  private static final DigitCharSequenceNormalizer INSTANCE = new DigitCharSequenceNormalizer();
+
+  /** {@return the shared, stateless instance} */
+  public static DigitCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  /**
+   * Copies {@code text}, substituting the ASCII digit for every decimal digit code point.
+   *
+   * @param text The text to normalize.
+   * @return A new string with all decimal digits in ASCII form.
+   */
+  @Override
+  public CharSequence normalize(CharSequence text) {
+    final StringBuilder folded = new StringBuilder(text.length());
+    final int end = text.length();
+    int pos = 0;
+    while (pos < end) {
+      final int current = Character.codePointAt(text, pos);
+      // Advance by one or two chars so that a surrogate pair is consumed as a unit.
+      pos += Character.charCount(current);
+      final int digit = Character.digit(current, 10);
+      if (digit < 0) {
+        folded.appendCodePoint(current);
+      } else {
+        folded.append((char) ('0' + digit));
+      }
+    }
+    return folded.toString();
+  }
+}
diff --git 
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/EllipsisCharSequenceNormalizer.java
 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/EllipsisCharSequenceNormalizer.java
new file mode 100644
index 000000000..8eccf2e5c
--- /dev/null
+++ 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/EllipsisCharSequenceNormalizer.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+/**
+ * A {@link CharSequenceNormalizer} that spells out the ellipsis and leader characters as ASCII
+ * dots: the horizontal ellipsis ({@code U+2026}) becomes {@code "..."} and the two-dot leader
+ * ({@code U+2025}) becomes {@code ".."}.
+ *
+ * <p>The text is scanned once, code point by code point, without any regular expression. Runs
+ * of ASCII dots are left unchanged.</p>
+ */
+public class EllipsisCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private static final long serialVersionUID = 1L;
+
+  private static final EllipsisCharSequenceNormalizer INSTANCE =
+      new EllipsisCharSequenceNormalizer();
+
+  /** {@return the shared, stateless instance} */
+  public static EllipsisCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  /**
+   * Copies {@code text}, expanding {@code U+2026} and {@code U+2025} to ASCII dots.
+   *
+   * @param text The text to normalize.
+   * @return A new string in which both characters are written as ASCII dots.
+   */
+  @Override
+  public CharSequence normalize(CharSequence text) {
+    final int end = text.length();
+    final StringBuilder expanded = new StringBuilder(end);
+    int pos = 0;
+    while (pos < end) {
+      final int current = Character.codePointAt(text, pos);
+      switch (current) {
+        case 0x2026 -> expanded.append("...");  // horizontal ellipsis
+        case 0x2025 -> expanded.append("..");   // two dot leader
+        default -> expanded.appendCodePoint(current);
+      }
+      pos += Character.charCount(current);
+    }
+    return expanded.toString();
+  }
+}
diff --git 
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/InvisibleCharSequenceNormalizer.java
 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/InvisibleCharSequenceNormalizer.java
new file mode 100644
index 000000000..3828f6f22
--- /dev/null
+++ 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/InvisibleCharSequenceNormalizer.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+/**
+ * A {@link CharSequenceNormalizer} that deletes invisible format and bidirectional control
+ * characters. They add no textual content and are a common source of noise and spoofing: the
+ * byte-order mark, zero width space, word joiner, bidi marks/embeddings/overrides/isolates, the
+ * invisible math operators, soft hyphen, and the Arabic letter mark.
+ *
+ * <p>The members are held in a {@link CharClass}, which performs the removal. The zero width
+ * joiner ({@code U+200D}) and non-joiner ({@code U+200C}) are deliberately <em>kept</em>,
+ * because they carry meaning in Persian, Indic scripts, and emoji sequences; variation
+ * selectors are kept as well. Use this only for a matching/search form, not for display.</p>
+ */
+public class InvisibleCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private static final long serialVersionUID = 1L;
+
+  // The replacement argument is unused here: removeAll deletes members rather than
+  // substituting them.
+  private static final CharClass INVISIBLE = CharClass.of(
+      CodePointSet.of(
+          0x00AD,   // soft hyphen
+          0x061C,   // arabic letter mark
+          0x200B,   // zero width space
+          0x200E,   // left-to-right mark
+          0x200F,   // right-to-left mark
+          0x202A,   // left-to-right embedding
+          0x202B,   // right-to-left embedding
+          0x202C,   // pop directional formatting
+          0x202D,   // left-to-right override
+          0x202E,   // right-to-left override
+          0x2060,   // word joiner
+          0x2061,   // function application
+          0x2062,   // invisible times
+          0x2063,   // invisible separator
+          0x2064,   // invisible plus
+          0x2066,   // left-to-right isolate
+          0x2067,   // right-to-left isolate
+          0x2068,   // first strong isolate
+          0x2069,   // pop directional isolate
+          0xFEFF),  // zero width no-break space (byte order mark)
+      0x0020);
+
+  private static final InvisibleCharSequenceNormalizer INSTANCE =
+      new InvisibleCharSequenceNormalizer();
+
+  /** {@return the shared, stateless instance} */
+  public static InvisibleCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  /**
+   * Removes every invisible control character listed above from {@code text}.
+   *
+   * @param text The text to normalize.
+   * @return The text as produced by the invisible-character {@link CharClass}.
+   */
+  @Override
+  public CharSequence normalize(CharSequence text) {
+    return INVISIBLE.removeAll(text);
+  }
+}
diff --git 
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/QuoteCharSequenceNormalizer.java
 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/QuoteCharSequenceNormalizer.java
new file mode 100644
index 000000000..acef8dcd0
--- /dev/null
+++ 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/QuoteCharSequenceNormalizer.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+/**
+ * A {@link CharSequenceNormalizer} that folds typographic quotation marks to ASCII: single
+ * quotes and apostrophes become {@code '} and double quotes become {@code "}.
+ *
+ * <p>This matters for matching, since curly quotes, guillemets, and fullwidth quotes would
+ * otherwise keep {@code "don't"} from matching {@code "don" + U+2019 + "t"}. The folding is
+ * done by two {@link CharClass} sets, applied one after the other. ASCII quotes are left
+ * unchanged.</p>
+ */
+public class QuoteCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private static final long serialVersionUID = 1L;
+
+  /** Single quotes and apostrophes, all folded to U+0027 APOSTROPHE. */
+  private static final CharClass SINGLE = CharClass.of(
+      CodePointSet.of(
+          0x2018,   // left single quotation mark
+          0x2019,   // right single quotation mark
+          0x201A,   // single low-9 quotation mark
+          0x201B,   // single high-reversed-9 quotation mark
+          0x2039,   // single left-pointing angle quotation mark
+          0x203A,   // single right-pointing angle quotation mark
+          0x02BC,   // modifier letter apostrophe
+          0xFF07),  // fullwidth apostrophe
+      '\'');
+
+  /** Double quotes, all folded to U+0022 QUOTATION MARK. */
+  private static final CharClass DOUBLE = CharClass.of(
+      CodePointSet.of(
+          0x201C,   // left double quotation mark
+          0x201D,   // right double quotation mark
+          0x201E,   // double low-9 quotation mark
+          0x201F,   // double high-reversed-9 quotation mark
+          0x00AB,   // left-pointing double angle quotation mark
+          0x00BB,   // right-pointing double angle quotation mark
+          0x301D,   // reversed double prime quotation mark
+          0x301E,   // double prime quotation mark
+          0x301F,   // low double prime quotation mark
+          0xFF02),  // fullwidth quotation mark
+      '"');
+
+  private static final QuoteCharSequenceNormalizer INSTANCE =
+      new QuoteCharSequenceNormalizer();
+
+  /** {@return the shared, stateless instance} */
+  public static QuoteCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  /**
+   * Folds the single quotes first and then the double quotes.
+   *
+   * @param text The text to normalize.
+   * @return The text after both folding passes.
+   */
+  @Override
+  public CharSequence normalize(CharSequence text) {
+    final CharSequence singlesFolded = SINGLE.normalize(text);
+    return DOUBLE.normalize(singlesFolded);
+  }
+}
diff --git 
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TextNormalizer.java
 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TextNormalizer.java
new file mode 100644
index 000000000..c1bac6409
--- /dev/null
+++ 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TextNormalizer.java
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * A fluent builder that chains normalization rungs into one {@link CharSequenceNormalizer}.
+ *
+ * <p>Rungs run in the order in which they were added, so the caller decides the chain. A
+ * conservative, search-oriented chain is available from {@link #searchDefault()}. The built-in
+ * rungs are shared instances obtained through {@code getInstance()}; {@link #build()} wraps the
+ * collected rungs in an {@link AggregateCharSequenceNormalizer}.</p>
+ *
+ * <pre>{@code
+ * CharSequenceNormalizer n = TextNormalizer.builder()
+ *     .nfc().caseFold().accentFold()
+ *     .build();
+ * }</pre>
+ */
+public final class TextNormalizer {
+
+  /** The rungs collected so far, in application order. */
+  private final List<CharSequenceNormalizer> rungs = new ArrayList<>();
+
+  private TextNormalizer() {
+  }
+
+  /** {@return a new, empty builder} */
+  public static TextNormalizer builder() {
+    return new TextNormalizer();
+  }
+
+  /** {@return this builder with NFC canonical composition appended} */
+  public TextNormalizer nfc() {
+    return append(NfcCharSequenceNormalizer.getInstance());
+  }
+
+  /** {@return this builder with NFKC compatibility composition appended} */
+  public TextNormalizer nfkc() {
+    return append(NfkcCharSequenceNormalizer.getInstance());
+  }
+
+  /** {@return this builder with invisible/bidi control stripping appended} */
+  public TextNormalizer stripInvisible() {
+    return append(InvisibleCharSequenceNormalizer.getInstance());
+  }
+
+  /** {@return this builder with Unicode whitespace collapsing appended} */
+  public TextNormalizer whitespace() {
+    return append(WhitespaceCharSequenceNormalizer.getInstance());
+  }
+
+  /** {@return this builder with quotation-mark folding appended} */
+  public TextNormalizer quotes() {
+    return append(QuoteCharSequenceNormalizer.getInstance());
+  }
+
+  /** {@return this builder with dash folding appended} */
+  public TextNormalizer dashes() {
+    return append(DashCharSequenceNormalizer.getInstance());
+  }
+
+  /** {@return this builder with decimal-digit folding appended} */
+  public TextNormalizer digits() {
+    return append(DigitCharSequenceNormalizer.getInstance());
+  }
+
+  /** {@return this builder with ellipsis expansion appended} */
+  public TextNormalizer ellipsis() {
+    return append(EllipsisCharSequenceNormalizer.getInstance());
+  }
+
+  /** {@return this builder with list-bullet replacement appended} */
+  public TextNormalizer bullets() {
+    return append(BulletCharSequenceNormalizer.getInstance());
+  }
+
+  /** {@return this builder with case folding appended} */
+  public TextNormalizer caseFold() {
+    return append(CaseFoldCharSequenceNormalizer.getInstance());
+  }
+
+  /** {@return this builder with script-gated diacritic folding appended} */
+  public TextNormalizer accentFold() {
+    return append(AccentFoldCharSequenceNormalizer.getInstance());
+  }
+
+  /**
+   * Appends a caller-supplied normalizer to the chain.
+   *
+   * @param custom The normalizer to append.
+   * @return This builder.
+   * @throws NullPointerException if {@code custom} is {@code null}.
+   */
+  public TextNormalizer with(CharSequenceNormalizer custom) {
+    Objects.requireNonNull(custom, "custom");
+    return append(custom);
+  }
+
+  /** {@return the composed normalizer for the rungs added so far} */
+  public CharSequenceNormalizer build() {
+    // The rungs are copied into a fresh array, so later builder calls do not alter the list
+    // handed to the aggregate.
+    final CharSequenceNormalizer[] chain = rungs.toArray(CharSequenceNormalizer[]::new);
+    return new AggregateCharSequenceNormalizer(chain);
+  }
+
+  /**
+   * {@return a conservative search/matching chain}
+   *
+   * <p>In order, the chain strips invisible controls, applies NFC, collapses whitespace, folds
+   * quotes and dashes, case folds, and finally applies script-gated diacritic folding.</p>
+   */
+  public static CharSequenceNormalizer searchDefault() {
+    return builder()
+        .stripInvisible().nfc().whitespace()
+        .quotes().dashes()
+        .caseFold().accentFold()
+        .build();
+  }
+
+  /** Adds one rung to the end of the chain and returns this builder for fluent chaining. */
+  private TextNormalizer append(CharSequenceNormalizer rung) {
+    rungs.add(rung);
+    return this;
+  }
+}
diff --git 
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizerTest.java
 
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizerTest.java
index ba4a6ea4b..5db1a4683 100644
--- 
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizerTest.java
+++ 
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizerTest.java
@@ -53,6 +53,36 @@ public class AccentFoldCharSequenceNormalizerTest {
     assertEquals("i", fold(cp(0x0131)));   // dotless i
   }
 
+  @Test
+  void testEveryStrokeAndLigatureLetterMaps() {
+    assertEquals("o", fold(cp(0x00F8)));   // o with stroke
+    assertEquals("O", fold(cp(0x00D8)));   // O with stroke
+    assertEquals("ae", fold(cp(0x00E6)));  // ae
+    assertEquals("AE", fold(cp(0x00C6)));  // AE
+    assertEquals("oe", fold(cp(0x0153)));  // oe
+    assertEquals("OE", fold(cp(0x0152)));  // OE
+    assertEquals("ss", fold(cp(0x00DF)));  // eszett
+    assertEquals("SS", fold(cp(0x1E9E)));  // capital eszett
+    assertEquals("th", fold(cp(0x00FE)));  // thorn
+    assertEquals("TH", fold(cp(0x00DE)));  // capital thorn
+    assertEquals("d", fold(cp(0x00F0)));   // eth
+    assertEquals("D", fold(cp(0x00D0)));   // capital eth
+    assertEquals("d", fold(cp(0x0111)));   // d with stroke
+    assertEquals("D", fold(cp(0x0110)));   // D with stroke
+    assertEquals("l", fold(cp(0x0142)));   // l with stroke
+    assertEquals("L", fold(cp(0x0141)));   // L with stroke
+    assertEquals("h", fold(cp(0x0127)));   // h with stroke
+    assertEquals("H", fold(cp(0x0126)));   // H with stroke
+    assertEquals("i", fold(cp(0x0131)));   // dotless i
+  }
+
+  @Test
+  void testLeadingCombiningMarkWithNoBaseIsKept() {
+    // The combining acute opens the text, so it has no preceding base letter;
+    // folding must keep it rather than drop it.
+    final String orphanMark = cp(0x0301);
+    final String text = orphanMark + "x";
+    assertEquals(text, fold(text));
+  }
+
   @Test
   void testFoldsGreekAndCyrillicAccents() {
     assertEquals(cp(0x03B1), fold(cp(0x03AC))); // Greek alpha with tonos -> 
alpha
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/SetBasedNormalizerTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/SetBasedNormalizerTest.java
new file mode 100644
index 000000000..ea333f06b
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/SetBasedNormalizerTest.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertSame;
+
+/**
+ * Tests for the single-purpose quote, digit, invisible-control, ellipsis, and bullet
+ * normalizers. Inputs are built from code points so the source file stays pure ASCII.
+ */
+public class SetBasedNormalizerTest {
+
+  /** Returns the given code point as a string (one or two UTF-16 units). */
+  private static String cp(int codePoint) {
+    return new String(Character.toChars(codePoint));
+  }
+
+  /** Runs {@code text} through the quote normalizer. */
+  private static String quotes(String text) {
+    return QuoteCharSequenceNormalizer.getInstance().normalize(text).toString();
+  }
+
+  /** Runs {@code text} through the digit normalizer. */
+  private static String digits(String text) {
+    return DigitCharSequenceNormalizer.getInstance().normalize(text).toString();
+  }
+
+  /** Runs {@code text} through the invisible-control normalizer. */
+  private static String invisible(String text) {
+    return InvisibleCharSequenceNormalizer.getInstance().normalize(text).toString();
+  }
+
+  /** Runs {@code text} through the ellipsis normalizer. */
+  private static String ellipsis(String text) {
+    return EllipsisCharSequenceNormalizer.getInstance().normalize(text).toString();
+  }
+
+  /** Runs {@code text} through the bullet normalizer. */
+  private static String bullet(String text) {
+    return BulletCharSequenceNormalizer.getInstance().normalize(text).toString();
+  }
+
+  // --- quotes -----------------------------------------------------------------------------
+
+  @Test
+  void testQuotesFoldSingleAndDouble() {
+    assertEquals("don't", quotes("don" + cp(0x2019) + "t"));   // right single quote
+    assertEquals("\"hi\"", quotes(cp(0x201C) + "hi" + cp(0x201D))); // curly double quotes
+    assertEquals("\"x\"", quotes(cp(0x00AB) + "x" + cp(0x00BB)));   // guillemets
+    assertEquals("'y'", quotes(cp(0x2039) + "y" + cp(0x203A)));     // single angle quotes
+    assertEquals("'", quotes(cp(0xFF07)));   // fullwidth apostrophe
+    assertEquals("\"", quotes(cp(0xFF02)));  // fullwidth quotation mark
+    assertEquals("'", quotes(cp(0x02BC)));   // modifier letter apostrophe
+  }
+
+  @Test
+  void testQuotesLeaveAsciiAndNonQuotesAlone() {
+    assertEquals("'a' \"b\"", quotes("'a' \"b\""));
+    assertEquals("abc", quotes("abc"));
+    assertEquals(cp(0x2014), quotes(cp(0x2014))); // em dash is not a quote
+  }
+
+  @Test
+  void testQuotesSingleton() {
+    assertSame(QuoteCharSequenceNormalizer.getInstance(),
+        QuoteCharSequenceNormalizer.getInstance());
+  }
+
+  // --- digits -----------------------------------------------------------------------------
+
+  @Test
+  void testDigitsMapDecimalDigitsToAscii() {
+    assertEquals("123", digits(cp(0x0661) + cp(0x0662) + cp(0x0663))); // arabic-indic 1 2 3
+    assertEquals("12", digits(cp(0x0967) + cp(0x0968)));               // devanagari 1 2
+    assertEquals("15", digits(cp(0xFF11) + cp(0xFF15)));               // fullwidth 1 5
+    assertEquals("a5b", digits("a" + cp(0x0665) + "b"));               // arabic-indic 5
+  }
+
+  @Test
+  void testDigitsLeaveAsciiAndNonDecimalNumeralsAlone() {
+    assertEquals("0123456789", digits("0123456789"));
+    assertEquals(cp(0x00B2), digits(cp(0x00B2)));   // superscript two (category No)
+    assertEquals(cp(0x2160), digits(cp(0x2160)));   // roman numeral one (category Nl)
+    assertEquals(cp(0x00BD), digits(cp(0x00BD)));   // vulgar fraction one half (category No)
+    assertEquals("abc", digits("abc"));
+  }
+
+  @Test
+  void testDigitsSingleton() {
+    assertSame(DigitCharSequenceNormalizer.getInstance(),
+        DigitCharSequenceNormalizer.getInstance());
+  }
+
+  // --- invisible / bidi controls ----------------------------------------------------------
+
+  @Test
+  void testInvisibleRemovesFormatAndBidiControls() {
+    assertEquals("ab", invisible("a" + cp(0xFEFF) + "b"));    // byte order mark
+    assertEquals("ab", invisible("a" + cp(0x200B) + "b"));    // zero width space
+    assertEquals("ab", invisible("a" + cp(0x2060) + "b"));    // word joiner
+    assertEquals("softhyphen", invisible("soft" + cp(0x00AD) + "hyphen"));
+    assertEquals("evil", invisible(cp(0x202E) + "evil" + cp(0x202C))); // bidi override + pop
+  }
+
+  @Test
+  void testInvisibleKeepsJoinersVariationSelectorsAndText() {
+    final String zwj = "a" + cp(0x200D) + "b";   // zero width joiner is meaningful
+    assertEquals(zwj, invisible(zwj));
+    final String zwnj = "a" + cp(0x200C) + "b";  // zero width non-joiner is meaningful
+    assertEquals(zwnj, invisible(zwnj));
+    final String family = cp(0x1F468) + cp(0x200D) + cp(0x1F469); // ZWJ emoji sequence preserved
+    assertEquals(family, invisible(family));
+    // The test name promises variation selectors are kept, so pin one explicitly:
+    // heavy black heart followed by variation selector 16 (emoji presentation).
+    final String heart = cp(0x2764) + cp(0xFE0F);
+    assertEquals(heart, invisible(heart));
+    assertEquals("hello", invisible("hello"));
+  }
+
+  @Test
+  void testInvisibleSingleton() {
+    assertSame(InvisibleCharSequenceNormalizer.getInstance(),
+        InvisibleCharSequenceNormalizer.getInstance());
+  }
+
+  // --- ellipsis ---------------------------------------------------------------------------
+
+  @Test
+  void testEllipsisExpandsToAsciiDots() {
+    assertEquals("...", ellipsis(cp(0x2026)));               // horizontal ellipsis
+    assertEquals("wait...", ellipsis("wait" + cp(0x2026)));
+    assertEquals("..", ellipsis(cp(0x2025)));                // two dot leader
+    assertEquals("...", ellipsis("..."));                    // ascii dots unchanged
+  }
+
+  @Test
+  void testEllipsisSingleton() {
+    assertSame(EllipsisCharSequenceNormalizer.getInstance(),
+        EllipsisCharSequenceNormalizer.getInstance());
+  }
+
+  // --- bullets ----------------------------------------------------------------------------
+
+  @Test
+  void testBulletsBecomeSeparatorSpaces() {
+    assertEquals(" item", bullet(cp(0x2022) + "item"));      // bullet
+    assertEquals(" item", bullet(cp(0x25E6) + "item"));      // white bullet
+    assertEquals("a b", bullet("a" + cp(0x2043) + "b"));     // hyphen bullet
+  }
+
+  @Test
+  void testBulletsLeaveMiddleDotAndTextAlone() {
+    assertEquals(cp(0x00B7), bullet(cp(0x00B7)));            // middle dot kept (Catalan)
+    assertEquals("plain", bullet("plain"));
+  }
+
+  @Test
+  void testBulletSingleton() {
+    assertSame(BulletCharSequenceNormalizer.getInstance(),
+        BulletCharSequenceNormalizer.getInstance());
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TextNormalizerTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TextNormalizerTest.java
new file mode 100644
index 000000000..64aa6df3a
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TextNormalizerTest.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.Locale;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+/**
+ * Tests for the {@link TextNormalizer} fluent builder: rung ordering, the empty pipeline,
+ * custom rungs, null rejection, and the {@code searchDefault()} chain.
+ */
+public class TextNormalizerTest {
+
+  /** Returns the given code point as a string (one or two UTF-16 units). */
+  private static String cp(int codePoint) {
+    return new String(Character.toChars(codePoint));
+  }
+
+  @Test
+  void testRungsApplyInOrder() {
+    final CharSequenceNormalizer n = TextNormalizer.builder().caseFold().accentFold().build();
+    assertEquals("cafe", n.normalize("CAF" + cp(0x00C9)).toString()); // CAFE-acute -> cafe
+
+    // Case and accent folding give the same result in either order on that input, so also
+    // pin the order with two rungs that do not commute.
+    final CharSequenceNormalizer appendB = s -> s + "b";
+    final CharSequenceNormalizer upper = s -> s.toString().toUpperCase(Locale.ROOT);
+    assertEquals("AB",
+        TextNormalizer.builder().with(appendB).with(upper).build().normalize("a").toString());
+    assertEquals("Ab",
+        TextNormalizer.builder().with(upper).with(appendB).build().normalize("a").toString());
+  }
+
+  @Test
+  void testEmptyBuilderIsIdentity() {
+    assertEquals("UnChanged",
+        TextNormalizer.builder().build().normalize("UnChanged").toString());
+  }
+
+  @Test
+  void testWhitespaceAndFoldChain() {
+    final CharSequenceNormalizer n = TextNormalizer.builder()
+        .nfc().whitespace().caseFold().accentFold().build();
+    assertEquals("cafe", n.normalize("  CAF" + cp(0x00C9) + "  ").toString());
+  }
+
+  @Test
+  void testWithCustomNormalizer() {
+    final CharSequenceNormalizer up = s -> s.toString().toUpperCase(Locale.ROOT);
+    assertEquals("AB", TextNormalizer.builder().with(up).build().normalize("ab").toString());
+  }
+
+  @Test
+  void testWithRejectsNull() {
+    assertThrows(NullPointerException.class, () -> TextNormalizer.builder().with(null));
+  }
+
+  @Test
+  void testSearchDefaultCleansMessyInput() {
+    // BOM + curly-quoted, mixed-case, accented text -> stripped, ASCII-quoted, folded.
+    final String input = cp(0xFEFF) + cp(0x201C) + "Caf" + cp(0x00C9) + cp(0x201D);
+    assertEquals("\"cafe\"", TextNormalizer.searchDefault().normalize(input).toString());
+  }
+
+  @Test
+  void testEveryRungIsInvokable() {
+    final CharSequenceNormalizer n = TextNormalizer.builder()
+        .stripInvisible().nfc().nfkc().whitespace().quotes().dashes()
+        .digits().ellipsis().bullets()
+        .caseFold().accentFold().build();
+    // BOM stripped, Arabic-Indic 1 -> 1, case + accent folded.
+    final String input = cp(0xFEFF) + "CAF" + cp(0x00C9) + " " + cp(0x0661);
+    assertEquals("cafe 1", n.normalize(input).toString());
+  }
+}

(opennlp) 03/05: OPENNLP-1850 - Add quote/digit/invisible/ellipsis/bullet normalizers, the TextNormalizer pipeline, and offset-preserving TextAnalyzer

Reply via email to