This is an automated email from the ASF dual-hosted git repository. krickert pushed a commit to branch OPENNLP-1850-2-tokenizer in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit 7a3c25ac762853f4d0d986b44559c44e40f107eb Author: Kristian Rickert <[email protected]> AuthorDate: Tue Jun 23 08:20:15 2026 -0400 OPENNLP-1850 Address review: fail-loud TermAnalyzer default; harden WordBreakProperty Throw a targeted IllegalStateException when a requested character-level Dimension has no default normalizer instead of NPEing. Mask the byte-backed Word_Break ordinals as unsigned on read (defensive for >127 ordinals) and bulk-fill the BMP range with Arrays.fill. Drop a stdout print from the conformance test and rename the punctuation/symbols test to match what it asserts. --- .../java/opennlp/tools/tokenize/uax29/WordBreakProperty.java | 9 +++++---- .../main/java/opennlp/tools/util/normalizer/TermAnalyzer.java | 4 ++++ .../tools/tokenize/uax29/WordBoundaryConformanceTest.java | 2 -- .../java/opennlp/tools/tokenize/uax29/WordBreakPropertyTest.java | 3 ++- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordBreakProperty.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordBreakProperty.java index ed4fa5189..f650f01a4 100644 --- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordBreakProperty.java +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordBreakProperty.java @@ -23,6 +23,7 @@ import java.io.InputStreamReader; import java.io.UncheckedIOException; import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; /** @@ -105,8 +106,8 @@ public final class WordBreakProperty { private static void assign(int start, int end, byte ordinal, List<int[]> supplementary) { final int bmpEnd = Math.min(end, 0xFFFF); - for (int codePoint = start; codePoint <= bmpEnd; codePoint++) { - BMP[codePoint] = ordinal; + if (start <= bmpEnd) { + Arrays.fill(BMP, start, bmpEnd + 1, ordinal); // bulk fill the BMP portion of the range } if (end > 0xFFFF) { supplementary.add(new int[] {Math.max(start, 0x10000), end, ordinal}); @@ -132,7 +133,7 @@ public final class WordBreakProperty { */ public static int ordinalOf(int codePoint) { if (codePoint >= 0 && codePoint <= 0xFFFF) { - return BMP[codePoint]; + return BMP[codePoint] & 0xFF; // unsigned: ordinals are stored as bytes, guard sign extension } return ordinalOfSupplementary(codePoint); } @@ -148,7 +149,7 @@ public final class WordBreakProperty { } else if (codePoint > SUPPLEMENTARY_END[mid]) { low = mid + 1; } else { - return SUPPLEMENTARY_VALUE[mid]; + return SUPPLEMENTARY_VALUE[mid] & 0xFF; // unsigned, as in the BMP path } } } diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TermAnalyzer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TermAnalyzer.java index d382fc09c..0d9956e8e 100644 --- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TermAnalyzer.java +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TermAnalyzer.java @@ -151,6 +151,10 @@ public final class TermAnalyzer { // A builder override wins; otherwise the dimension's own default normalizer. final CharSequenceNormalizer normalizer = transforms.containsKey(dimension) ? transforms.get(dimension) : dimension.defaultNormalizer(); + if (normalizer == null) { + throw new IllegalStateException("Dimension " + dimension + " has no default normalizer; " + + "configure it with builder().transform(" + dimension + ", ...)"); + } return normalizer.normalize(input).toString(); } } diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordBoundaryConformanceTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordBoundaryConformanceTest.java index 80339a801..a763f6786 100644 --- a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordBoundaryConformanceTest.java +++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordBoundaryConformanceTest.java @@ -88,8 +88,6 @@ public class WordBoundaryConformanceTest { } final int passRate = total == 0 ? 0 : passed * 100 / total; - System.out.println("UAX#29 word-break conformance: " + passed + "/" + total - + " (" + passRate + "%)"); assertTrue(total > 1900, "expected the full conformance suite to load, ran only " + total); assertTrue(failures.isEmpty(), "UAX#29 word-break conformance: " + passed + "/" + total + " (" + passRate diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordBreakPropertyTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordBreakPropertyTest.java index 5735fca03..86383013b 100644 --- a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordBreakPropertyTest.java +++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordBreakPropertyTest.java @@ -66,9 +66,10 @@ public class WordBreakPropertyTest { assertSame(WordBreak.OTHER, WordBreakProperty.of(0x1F600)); // grinning face } + // Assigned punctuation/symbols ('!', '@', em dash) carry no Word_Break property and map to OTHER. @ParameterizedTest @ValueSource(ints = {0x0021, 0x0040, 0x2014}) - void testUnassignedCodePointsAreOther(int codePoint) { + void testPunctuationAndSymbolsAreOther(int codePoint) { assertSame(WordBreak.OTHER, WordBreakProperty.of(codePoint)); }
