This is an automated email from the ASF dual-hosted git repository.

krickert pushed a commit to branch OPENNLP-1850_Whitespace-UTF-Normalizae
in repository https://gitbox.apache.org/repos/asf/opennlp.git

commit 19fb1b6300f7d671f31bf58d8af10067d46a12b4
Author: Kristian Rickert <[email protected]>
AuthorDate: Thu Jun 18 23:12:04 2026 -0400

    OPENNLP-1850 - Add offset-safe input normalization opt-ins to the DL components
    
    InferenceOptions gains setNormalizeWhitespace and setNormalizeDashes (both
    off by default). When enabled, NameFinderDL and DocumentCategorizerDL fold
    input whitespace and/or dashes to their ASCII forms before inference via a
    shared AbstractDL.normalizeInput helper. The mapping is one code point to
    one ASCII character, so it is offset preserving for characters in the Basic
    Multilingual Plane, and any spans the model produces still align with the
    input.
---
 .../src/main/java/opennlp/dl/AbstractDL.java       | 26 +++++++++++++++++
 .../src/main/java/opennlp/dl/InferenceOptions.java | 34 ++++++++++++++++++++++
 .../opennlp/dl/doccat/DocumentCategorizerDL.java   | 11 ++++++-
 .../java/opennlp/dl/namefinder/NameFinderDL.java   |  7 ++++-
 .../java/opennlp/dl/AbstractDLChunkingTest.java    | 21 +++++++++++++
 5 files changed, 97 insertions(+), 2 deletions(-)

diff --git 
a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/AbstractDL.java 
b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/AbstractDL.java
index 5b0a14f88..483788366 100644
--- 
a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/AbstractDL.java
+++ 
b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/AbstractDL.java
@@ -336,6 +336,32 @@ public abstract class AbstractDL implements AutoCloseable {
    */
   protected static final CharClass WHITESPACE = CharClass.whitespace();
 
+  /** Unicode dashes (excluding the mathematical minus signs), used for 
optional input folding. */
+  protected static final CharClass DASHES = CharClass.dashes();
+
+  /**
+   * Optionally folds Unicode whitespace and/or dashes in the input to their 
ASCII forms before
+   * inference. Each member code point maps to exactly one ASCII character, so 
the transform is
+   * offset preserving for Basic Multilingual Plane characters and any spans a 
model produces still
+   * align with the input.
+   *
+   * @param text The input text.
+   * @param normalizeWhitespace Whether to fold whitespace to ASCII spaces.
+   * @param normalizeDashes Whether to fold dashes to the ASCII hyphen.
+   * @return The optionally normalized text.
+   */
+  protected static String normalizeInput(final String text, final boolean 
normalizeWhitespace,
+                                         final boolean normalizeDashes) {
+    String result = text;
+    if (normalizeWhitespace) {
+      result = WHITESPACE.normalize(result).toString();
+    }
+    if (normalizeDashes) {
+      result = DASHES.normalize(result).toString();
+    }
+    return result;
+  }
+
   /**
    * Splits {@code text} on Unicode whitespace and groups the resulting tokens 
into overlapping
    * chunks, each rejoined with single ASCII spaces, ready for WordPiece 
tokenization. The split
diff --git 
a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/InferenceOptions.java
 
b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/InferenceOptions.java
index 344c5846d..f74effb29 100644
--- 
a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/InferenceOptions.java
+++ 
b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/InferenceOptions.java
@@ -26,6 +26,8 @@ public class InferenceOptions {
   private int documentSplitSize = 250;
   private int splitOverlapSize = 50;
   private Boolean lowerCase;
+  private boolean normalizeWhitespace;
+  private boolean normalizeDashes;
 
   public boolean isIncludeAttentionMask() {
     return includeAttentionMask;
@@ -75,6 +77,38 @@ public class InferenceOptions {
     this.splitOverlapSize = splitOverlapSize;
   }
 
+  /** {@return whether input whitespace is normalized to ASCII spaces before 
inference} */
+  public boolean isNormalizeWhitespace() {
+    return normalizeWhitespace;
+  }
+
+  /**
+   * Replaces every Unicode whitespace character in the input with an ASCII 
space before inference.
+   * This is offset preserving (each whitespace code point maps to one space), 
so any spans a model
+   * produces still align with the input. Off by default.
+   *
+   * @param normalizeWhitespace Whether to normalize whitespace.
+   */
+  public void setNormalizeWhitespace(boolean normalizeWhitespace) {
+    this.normalizeWhitespace = normalizeWhitespace;
+  }
+
+  /** {@return whether input dashes are normalized to the ASCII hyphen before 
inference} */
+  public boolean isNormalizeDashes() {
+    return normalizeDashes;
+  }
+
+  /**
+   * Replaces Unicode dashes in the input with the ASCII hyphen-minus before 
inference. This is
+   * offset preserving for the dash characters in the Basic Multilingual Plane 
(the common case).
+   * The mathematical minus signs are not affected. Off by default.
+   *
+   * @param normalizeDashes Whether to normalize dashes.
+   */
+  public void setNormalizeDashes(boolean normalizeDashes) {
+    this.normalizeDashes = normalizeDashes;
+  }
+
   /**
    * Returns whether tokenization should lower case the input text and strip
    * accents, as required by uncased models.
diff --git 
a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java
 
b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java
index c7293fc8b..c73ef6de0 100644
--- 
a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java
+++ 
b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java
@@ -85,6 +85,8 @@ public class DocumentCategorizerDL extends AbstractDL 
implements DocumentCategor
   private final boolean includeTokenTypeIds;
   private final int documentSplitSize;
   private final int splitOverlapSize;
+  private final boolean normalizeWhitespace;
+  private final boolean normalizeDashes;
 
   /**
    * Test-only constructor that injects an already-built {@link OrtSession} 
(or {@code null}),
@@ -101,6 +103,8 @@ public class DocumentCategorizerDL extends AbstractDL 
implements DocumentCategor
     this.includeTokenTypeIds = inferenceOptions.isIncludeTokenTypeIds();
     this.documentSplitSize = inferenceOptions.getDocumentSplitSize();
     this.splitOverlapSize = inferenceOptions.getSplitOverlapSize();
+    this.normalizeWhitespace = inferenceOptions.isNormalizeWhitespace();
+    this.normalizeDashes = inferenceOptions.isNormalizeDashes();
   }
 
   /**
@@ -132,6 +136,8 @@ public class DocumentCategorizerDL extends AbstractDL 
implements DocumentCategor
     this.includeTokenTypeIds = inferenceOptions.isIncludeTokenTypeIds();
     this.documentSplitSize = inferenceOptions.getDocumentSplitSize();
     this.splitOverlapSize = inferenceOptions.getSplitOverlapSize();
+    this.normalizeWhitespace = inferenceOptions.isNormalizeWhitespace();
+    this.normalizeDashes = inferenceOptions.isNormalizeDashes();
 
   }
 
@@ -165,6 +171,8 @@ public class DocumentCategorizerDL extends AbstractDL 
implements DocumentCategor
     this.includeTokenTypeIds = inferenceOptions.isIncludeTokenTypeIds();
     this.documentSplitSize = inferenceOptions.getDocumentSplitSize();
     this.splitOverlapSize = inferenceOptions.getSplitOverlapSize();
+    this.normalizeWhitespace = inferenceOptions.isNormalizeWhitespace();
+    this.normalizeDashes = inferenceOptions.isNormalizeDashes();
 
   }
 
@@ -327,8 +335,9 @@ public class DocumentCategorizerDL extends AbstractDL 
implements DocumentCategor
 
   }
 
-  private List<Tokens> tokenize(final String text) {
+  private List<Tokens> tokenize(final String input) {
 
+    final String text = normalizeInput(input, normalizeWhitespace, 
normalizeDashes);
     final List<Tokens> t = new LinkedList<>();
 
     // Segment long input text into overlapping chunks (split on Unicode 
whitespace) configured by
diff --git 
a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java
 
b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java
index eff6b87d5..555f71323 100644
--- 
a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java
+++ 
b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java
@@ -102,6 +102,8 @@ public class NameFinderDL extends AbstractDL implements 
TokenNameFinder {
   private final boolean includeTokenTypeIds;
   private final int documentSplitSize;
   private final int splitOverlapSize;
+  private final boolean normalizeWhitespace;
+  private final boolean normalizeDashes;
 
   /**
    * Instantiates a {@link TokenNameFinder name finder} using ONNX models.
@@ -151,6 +153,8 @@ public class NameFinderDL extends AbstractDL implements 
TokenNameFinder {
     this.includeTokenTypeIds = inferenceOptions.isIncludeTokenTypeIds();
     this.documentSplitSize = inferenceOptions.getDocumentSplitSize();
     this.splitOverlapSize = inferenceOptions.getSplitOverlapSize();
+    this.normalizeWhitespace = inferenceOptions.isNormalizeWhitespace();
+    this.normalizeDashes = inferenceOptions.isNormalizeDashes();
     this.sentenceDetector = sentenceDetector;
 
   }
@@ -183,7 +187,8 @@ public class NameFinderDL extends AbstractDL implements 
TokenNameFinder {
     final List<Span> spans = new ArrayList<>();
 
     // Join the tokens here because they will be tokenized using Wordpiece 
during inference.
-    final String text = String.join(" ", input);
+    final String text =
+        normalizeInput(String.join(" ", input), normalizeWhitespace, 
normalizeDashes);
 
     // sentPosDetect (not sentDetect) so each sentence's offset in the full 
text is known.
     final Span[] sentenceSpans = sentenceDetector.sentPosDetect(text);
diff --git 
a/opennlp-core/opennlp-ml/opennlp-dl/src/test/java/opennlp/dl/AbstractDLChunkingTest.java
 
b/opennlp-core/opennlp-ml/opennlp-dl/src/test/java/opennlp/dl/AbstractDLChunkingTest.java
index 38ab38450..386f47ee7 100644
--- 
a/opennlp-core/opennlp-ml/opennlp-dl/src/test/java/opennlp/dl/AbstractDLChunkingTest.java
+++ 
b/opennlp-core/opennlp-ml/opennlp-dl/src/test/java/opennlp/dl/AbstractDLChunkingTest.java
@@ -58,4 +58,25 @@ public class AbstractDLChunkingTest {
   void testEmptyTextYieldsNoChunks() {
     assertEquals(List.of(), AbstractDL.whitespaceChunks("", 100, 0));
   }
+
+  @Test
+  void testNormalizeInputIsOptInAndOffsetPreserving() {
+    final String nbsp = new String(Character.toChars(0x00A0));
+    final String emDash = new String(Character.toChars(0x2014));
+    final String input = "a" + nbsp + "b" + emDash + "c";
+
+    // Off by default: unchanged.
+    assertEquals(input, AbstractDL.normalizeInput(input, false, false));
+
+    // Whitespace only: the no-break space becomes a space, and the length is 
preserved.
+    final String ws = AbstractDL.normalizeInput(input, true, false);
+    assertEquals("a b" + emDash + "c", ws);
+    assertEquals(input.length(), ws.length());
+
+    // Dashes only: the em dash becomes an ASCII hyphen.
+    assertEquals("a" + nbsp + "b-c", AbstractDL.normalizeInput(input, false, 
true));
+
+    // Both.
+    assertEquals("a b-c", AbstractDL.normalizeInput(input, true, true));
+  }
 }

Reply via email to