This is an automated email from the ASF dual-hosted git repository. krickert pushed a commit to branch OPENNLP-1850-2-tokenizer in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit cc89abf5206ad45574d21a4634aba7e2c74c060c Author: Kristian Rickert <[email protected]> AuthorDate: Sun Jun 21 14:14:37 2026 -0400 OPENNLP-1850 Address tokenizer review comments - Term.at: document that an unconfigured dimension is applied on top of normalized() rather than in canonical pipeline order, with a non-commutative example. - WordType.IDEOGRAPHIC: soften javadoc ('a token containing a Han ideograph', not 'a single Han ideograph'). - WordTokenizer: note the deliberate choice to implement Tokenizer directly instead of extending AbstractTokenizer. - WordSegmenter.IntList: overflow-aware 1.5x growth instead of length*2. --- .../main/java/opennlp/tools/tokenize/uax29/WordSegmenter.java | 9 ++++++++- .../main/java/opennlp/tools/tokenize/uax29/WordTokenizer.java | 3 +++ .../src/main/java/opennlp/tools/tokenize/uax29/WordType.java | 2 +- .../src/main/java/opennlp/tools/util/normalizer/Term.java | 8 ++++++++ 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordSegmenter.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordSegmenter.java index eddbebacc..c1529d740 100644 --- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordSegmenter.java +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordSegmenter.java @@ -380,12 +380,19 @@ public final class WordSegmenter { // A minimal growable int array, so boundaries() makes one backing allocation instead of one per // boundary (an ArrayList<Integer> would box every offset). private static final class IntList { + private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8; private int[] values = new int[16]; private int size; void add(int value) { if (size == values.length) { - values = Arrays.copyOf(values, values.length * 2); + // Overflow-aware 1.5x growth so a very large boundary count never wraps to a negative + // capacity (NegativeArraySizeException); it degrades to a clean OutOfMemoryError instead. + int newCapacity = values.length + (values.length >> 1); + if (newCapacity < 0 || newCapacity > MAX_ARRAY_SIZE) { + newCapacity = MAX_ARRAY_SIZE; + } + values = Arrays.copyOf(values, newCapacity); } values[size++] = value; } diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordTokenizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordTokenizer.java index 8ab9189c1..e09bbed8e 100644 --- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordTokenizer.java +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordTokenizer.java @@ -37,6 +37,9 @@ import opennlp.tools.util.Span; * carries each token's {@link WordType}, and {@link #tokenize(CharSequence, TokenHandler)} streams * tokens with no per-token allocation. Instances are immutable and thread-safe.</p> */ +// Implements Tokenizer directly rather than extending AbstractTokenizer: this tokenizer produces +// its spans from the UAX #29 segmenter in one pass and shares none of AbstractTokenizer's +// per-character probability/merge machinery, so subclassing it would only add unused state. public final class WordTokenizer implements Tokenizer { /** Receives each word token as a character range and its type, with no allocation. */ diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordType.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordType.java index 652772069..4dcd7cc27 100644 --- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordType.java +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordType.java @@ -30,7 +30,7 @@ public enum WordType { /** A token made up entirely of digits and numeric connectors. */ NUMERIC, - /** A single Han ideograph. */ + /** A token containing a Han ideograph (one ideograph per token under UAX #29 segmentation). */ IDEOGRAPHIC, /** A Hiragana token. */ diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Term.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Term.java index 08d8793f1..eda3c4107 100644 --- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Term.java +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Term.java @@ -82,6 +82,14 @@ public final class Term { * Returns the token at {@code dimension}. Configured dimensions are cached; an unconfigured * dimension is computed by applying its transform to {@link #normalized()} and then cached. * + * <p>Note: an unconfigured dimension is applied on top of {@link #normalized()} (the most + * aggressive configured layer), not spliced into canonical pipeline order. Because the transforms + * do not commute (see {@link Dimension}), requesting a dimension that ranks <em>earlier</em> than + * the configured ones can differ from having configured it. For example, asking for + * {@link Dimension#CASE_FOLD} on an analyzer configured only through {@link Dimension#ACCENT_FOLD} + * case-folds the already accent-folded text, which is not the same as case-folding first. + * Configure the dimension on the analyzer when canonical order matters.</p> + * * @param dimension The dimension to project to. * @return The token at that dimension. * @throws IllegalStateException if the dimension needs an engine or tag that was not configured
