(opennlp) 03/06: OPENNLP-1850 Address tokenizer review comments

kristian Tue, 23 Jun 2026 15:44:36 -0700

This is an automated email from the ASF dual-hosted git repository.

krickert pushed a commit to branch OPENNLP-1850-2-tokenizer
in repository https://gitbox.apache.org/repos/asf/opennlp.git


commit cc89abf5206ad45574d21a4634aba7e2c74c060c
Author: Kristian Rickert <[email protected]>
AuthorDate: Sun Jun 21 14:14:37 2026 -0400

    OPENNLP-1850 Address tokenizer review comments
    
    - Term.at: document that an unconfigured dimension is applied on top of normalized()
      rather than in canonical pipeline order, with a non-commutative example.
    - WordType.IDEOGRAPHIC: soften javadoc ('a token containing a Han ideograph', not 'a
      single Han ideograph').
    - WordTokenizer: note the deliberate choice to implement Tokenizer directly instead of
      extending AbstractTokenizer.
    - WordSegmenter.IntList: overflow-aware 1.5x growth instead of length*2.
---
 .../main/java/opennlp/tools/tokenize/uax29/WordSegmenter.java    | 9 ++++++++-
 .../main/java/opennlp/tools/tokenize/uax29/WordTokenizer.java    | 3 +++
 .../src/main/java/opennlp/tools/tokenize/uax29/WordType.java     | 2 +-
 .../src/main/java/opennlp/tools/util/normalizer/Term.java        | 8 ++++++++
 4 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordSegmenter.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordSegmenter.java
index eddbebacc..c1529d740 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordSegmenter.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordSegmenter.java
@@ -380,12 +380,19 @@ public final class WordSegmenter {
   // A minimal growable int array, so boundaries() makes one backing 
allocation instead of one per
   // boundary (an ArrayList<Integer> would box every offset).
   private static final class IntList {
+    private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8;
     private int[] values = new int[16];
     private int size;
 
     void add(int value) {
       if (size == values.length) {
-        values = Arrays.copyOf(values, values.length * 2);
+        // Overflow-aware 1.5x growth so a very large boundary count never 
wraps to a negative
+        // capacity (NegativeArraySizeException); it degrades to a clean 
OutOfMemoryError instead.
+        int newCapacity = values.length + (values.length >> 1);
+        if (newCapacity < 0 || newCapacity > MAX_ARRAY_SIZE) {
+          newCapacity = MAX_ARRAY_SIZE;
+        }
+        values = Arrays.copyOf(values, newCapacity);
       }
       values[size++] = value;
     }
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordTokenizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordTokenizer.java
index 8ab9189c1..e09bbed8e 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordTokenizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordTokenizer.java
@@ -37,6 +37,9 @@ import opennlp.tools.util.Span;
  * carries each token's {@link WordType}, and {@link #tokenize(CharSequence, 
TokenHandler)} streams
  * tokens with no per-token allocation. Instances are immutable and 
thread-safe.</p>
  */
+// Implements Tokenizer directly rather than extending AbstractTokenizer: this 
tokenizer produces
+// its spans from the UAX #29 segmenter in one pass and shares none of 
AbstractTokenizer's
+// per-character probability/merge machinery, so subclassing it would only add 
unused state.
 public final class WordTokenizer implements Tokenizer {
 
   /** Receives each word token as a character range and its type, with no 
allocation. */
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordType.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordType.java
index 652772069..4dcd7cc27 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordType.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordType.java
@@ -30,7 +30,7 @@ public enum WordType {
   /** A token made up entirely of digits and numeric connectors. */
   NUMERIC,
 
-  /** A single Han ideograph. */
+  /** A token containing a Han ideograph (one ideograph per token under UAX 
#29 segmentation). */
   IDEOGRAPHIC,
 
   /** A Hiragana token. */
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Term.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Term.java
index 08d8793f1..eda3c4107 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Term.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Term.java
@@ -82,6 +82,14 @@ public final class Term {
    * Returns the token at {@code dimension}. Configured dimensions are cached; 
an unconfigured
    * dimension is computed by applying its transform to {@link #normalized()} 
and then cached.
    *
+   * <p>Note: an unconfigured dimension is applied on top of {@link 
#normalized()} (the most
+   * aggressive configured layer), not spliced into canonical pipeline order. 
Because the transforms
+   * do not commute (see {@link Dimension}), requesting a dimension that ranks 
<em>earlier</em> than
+   * the configured ones can differ from having configured it. For example, 
asking for
+   * {@link Dimension#CASE_FOLD} on an analyzer configured only through {@link 
Dimension#ACCENT_FOLD}
+   * case-folds the already accent-folded text, which is not the same as 
case-folding first.
+   * Configure the dimension on the analyzer when canonical order matters.</p>
+   *
    * @param dimension The dimension to project to.
    * @return The token at that dimension.
    * @throws IllegalStateException if the dimension needs an engine or tag 
that was not configured

(opennlp) 03/06: OPENNLP-1850 Address tokenizer review comments

Reply via email to