This is an automated email from the ASF dual-hosted git repository. krickert pushed a commit to branch OPENNLP-1850_Whitespace-UTF-Normalizae in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit 19fb1b6300f7d671f31bf58d8af10067d46a12b4 Author: Kristian Rickert <[email protected]> AuthorDate: Thu Jun 18 23:12:04 2026 -0400 OPENNLP-1850 - Add offset-safe input normalization opt-ins to the DL components InferenceOptions gains setNormalizeWhitespace and setNormalizeDashes (both off by default). When enabled, NameFinderDL and DocumentCategorizerDL fold input whitespace and/or dashes to their ASCII forms before inference via a shared AbstractDL.normalizeInput helper. The mapping is one code point to one ASCII character, so it is offset preserving for the Basic Multilingual Plane and any spans the model produces still align with the input. --- .../src/main/java/opennlp/dl/AbstractDL.java | 26 +++++++++++++++++ .../src/main/java/opennlp/dl/InferenceOptions.java | 34 ++++++++++++++++++++++ .../opennlp/dl/doccat/DocumentCategorizerDL.java | 11 ++++++- .../java/opennlp/dl/namefinder/NameFinderDL.java | 7 ++++- .../java/opennlp/dl/AbstractDLChunkingTest.java | 21 +++++++++++++ 5 files changed, 97 insertions(+), 2 deletions(-) diff --git a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/AbstractDL.java b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/AbstractDL.java index 5b0a14f88..483788366 100644 --- a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/AbstractDL.java +++ b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/AbstractDL.java @@ -336,6 +336,32 @@ public abstract class AbstractDL implements AutoCloseable { */ protected static final CharClass WHITESPACE = CharClass.whitespace(); + /** Unicode dashes (excluding the mathematical minus signs), used for optional input folding. */ + protected static final CharClass DASHES = CharClass.dashes(); + + /** + * Optionally folds Unicode whitespace and/or dashes in the input to their ASCII forms before + * inference. Each member code point maps to exactly one ASCII character, so the transform is + * offset preserving for Basic Multilingual Plane characters and any spans a model produces still + * align with the input. + * + * @param text The input text. + * @param normalizeWhitespace Whether to fold whitespace to ASCII spaces. + * @param normalizeDashes Whether to fold dashes to the ASCII hyphen. + * @return The optionally normalized text. + */ + protected static String normalizeInput(final String text, final boolean normalizeWhitespace, + final boolean normalizeDashes) { + String result = text; + if (normalizeWhitespace) { + result = WHITESPACE.normalize(result).toString(); + } + if (normalizeDashes) { + result = DASHES.normalize(result).toString(); + } + return result; + } + /** * Splits {@code text} on Unicode whitespace and groups the resulting tokens into overlapping * chunks, each rejoined with single ASCII spaces, ready for WordPiece tokenization. The split diff --git a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/InferenceOptions.java b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/InferenceOptions.java index 344c5846d..f74effb29 100644 --- a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/InferenceOptions.java +++ b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/InferenceOptions.java @@ -26,6 +26,8 @@ public class InferenceOptions { private int documentSplitSize = 250; private int splitOverlapSize = 50; private Boolean lowerCase; + private boolean normalizeWhitespace; + private boolean normalizeDashes; public boolean isIncludeAttentionMask() { return includeAttentionMask; @@ -75,6 +77,38 @@ public class InferenceOptions { this.splitOverlapSize = splitOverlapSize; } + /** {@return whether input whitespace is normalized to ASCII spaces before inference} */ + public boolean isNormalizeWhitespace() { + return normalizeWhitespace; + } + + /** + * Replaces every Unicode whitespace character in the input with an ASCII space before inference. + * This is offset preserving (each whitespace code point maps to one space), so any spans a model + * produces still align with the input. Off by default. + * + * @param normalizeWhitespace Whether to normalize whitespace. + */ + public void setNormalizeWhitespace(boolean normalizeWhitespace) { + this.normalizeWhitespace = normalizeWhitespace; + } + + /** {@return whether input dashes are normalized to the ASCII hyphen before inference} */ + public boolean isNormalizeDashes() { + return normalizeDashes; + } + + /** + * Replaces Unicode dashes in the input with the ASCII hyphen-minus before inference. This is + * offset preserving for the dash characters in the Basic Multilingual Plane (the common case). + * The mathematical minus signs are not affected. Off by default. + * + * @param normalizeDashes Whether to normalize dashes. + */ + public void setNormalizeDashes(boolean normalizeDashes) { + this.normalizeDashes = normalizeDashes; + } + /** * Returns whether tokenization should lower case the input text and strip * accents, as required by uncased models. diff --git a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java index c7293fc8b..c73ef6de0 100644 --- a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java +++ b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java @@ -85,6 +85,8 @@ public class DocumentCategorizerDL extends AbstractDL implements DocumentCategor private final boolean includeTokenTypeIds; private final int documentSplitSize; private final int splitOverlapSize; + private final boolean normalizeWhitespace; + private final boolean normalizeDashes; /** * Test-only constructor that injects an already-built {@link OrtSession} (or {@code null}), @@ -101,6 +103,8 @@ public class DocumentCategorizerDL extends AbstractDL implements DocumentCategor this.includeTokenTypeIds = inferenceOptions.isIncludeTokenTypeIds(); this.documentSplitSize = inferenceOptions.getDocumentSplitSize(); this.splitOverlapSize = inferenceOptions.getSplitOverlapSize(); + this.normalizeWhitespace = inferenceOptions.isNormalizeWhitespace(); + this.normalizeDashes = inferenceOptions.isNormalizeDashes(); } /** @@ -132,6 +136,8 @@ public class DocumentCategorizerDL extends AbstractDL implements DocumentCategor this.includeTokenTypeIds = inferenceOptions.isIncludeTokenTypeIds(); this.documentSplitSize = inferenceOptions.getDocumentSplitSize(); this.splitOverlapSize = inferenceOptions.getSplitOverlapSize(); + this.normalizeWhitespace = inferenceOptions.isNormalizeWhitespace(); + this.normalizeDashes = inferenceOptions.isNormalizeDashes(); } @@ -165,6 +171,8 @@ public class DocumentCategorizerDL extends AbstractDL implements DocumentCategor this.includeTokenTypeIds = inferenceOptions.isIncludeTokenTypeIds(); this.documentSplitSize = inferenceOptions.getDocumentSplitSize(); this.splitOverlapSize = inferenceOptions.getSplitOverlapSize(); + this.normalizeWhitespace = inferenceOptions.isNormalizeWhitespace(); + this.normalizeDashes = inferenceOptions.isNormalizeDashes(); } @@ -327,8 +335,9 @@ public class DocumentCategorizerDL extends AbstractDL implements DocumentCategor } - private List<Tokens> tokenize(final String text) { + private List<Tokens> tokenize(final String input) { + final String text = normalizeInput(input, normalizeWhitespace, normalizeDashes); final List<Tokens> t = new LinkedList<>(); // Segment long input text into overlapping chunks (split on Unicode whitespace) configured by diff --git a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java index eff6b87d5..555f71323 100644 --- a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java +++ b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java @@ -102,6 +102,8 @@ public class NameFinderDL extends AbstractDL implements TokenNameFinder { private final boolean includeTokenTypeIds; private final int documentSplitSize; private final int splitOverlapSize; + private final boolean normalizeWhitespace; + private final boolean normalizeDashes; /** * Instantiates a {@link TokenNameFinder name finder} using ONNX models. @@ -151,6 +153,8 @@ public class NameFinderDL extends AbstractDL implements TokenNameFinder { this.includeTokenTypeIds = inferenceOptions.isIncludeTokenTypeIds(); this.documentSplitSize = inferenceOptions.getDocumentSplitSize(); this.splitOverlapSize = inferenceOptions.getSplitOverlapSize(); + this.normalizeWhitespace = inferenceOptions.isNormalizeWhitespace(); + this.normalizeDashes = inferenceOptions.isNormalizeDashes(); this.sentenceDetector = sentenceDetector; } @@ -183,7 +187,8 @@ public class NameFinderDL extends AbstractDL implements TokenNameFinder { final List<Span> spans = new ArrayList<>(); // Join the tokens here because they will be tokenized using Wordpiece during inference. - final String text = String.join(" ", input); + final String text = + normalizeInput(String.join(" ", input), normalizeWhitespace, normalizeDashes); // sentPosDetect (not sentDetect) so each sentence's offset in the full text is known. final Span[] sentenceSpans = sentenceDetector.sentPosDetect(text); diff --git a/opennlp-core/opennlp-ml/opennlp-dl/src/test/java/opennlp/dl/AbstractDLChunkingTest.java b/opennlp-core/opennlp-ml/opennlp-dl/src/test/java/opennlp/dl/AbstractDLChunkingTest.java index 38ab38450..386f47ee7 100644 --- a/opennlp-core/opennlp-ml/opennlp-dl/src/test/java/opennlp/dl/AbstractDLChunkingTest.java +++ b/opennlp-core/opennlp-ml/opennlp-dl/src/test/java/opennlp/dl/AbstractDLChunkingTest.java @@ -58,4 +58,25 @@ public class AbstractDLChunkingTest { void testEmptyTextYieldsNoChunks() { assertEquals(List.of(), AbstractDL.whitespaceChunks("", 100, 0)); } + + @Test + void testNormalizeInputIsOptInAndOffsetPreserving() { + final String nbsp = new String(Character.toChars(0x00A0)); + final String emDash = new String(Character.toChars(0x2014)); + final String input = "a" + nbsp + "b" + emDash + "c"; + + // Off by default: unchanged. + assertEquals(input, AbstractDL.normalizeInput(input, false, false)); + + // Whitespace only: the no-break space becomes a space, and the length is preserved. + final String ws = AbstractDL.normalizeInput(input, true, false); + assertEquals("a b" + emDash + "c", ws); + assertEquals(input.length(), ws.length()); + + // Dashes only: the em dash becomes an ASCII hyphen. + assertEquals("a" + nbsp + "b-c", AbstractDL.normalizeInput(input, false, true)); + + // Both. + assertEquals("a b-c", AbstractDL.normalizeInput(input, true, true)); + } }
