(opennlp) branch opennlp-2.x updated: OPENNLP-53: Add Parse.createFromTokens() for convenient tokenized input (#1012)

mawiesne Mon, 06 Apr 2026 22:37:06 -0700

This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch opennlp-2.x
in repository https://gitbox.apache.org/repos/asf/opennlp.git



The following commit(s) were added to refs/heads/opennlp-2.x by this push:
     new 6431df81 OPENNLP-53: Add Parse.createFromTokens() for convenient tokenized input (#1012)
6431df81 is described below

commit 6431df8122e54f00a14fc48c57d9ebb559bf03cc
Author: Martin Wiesner <[email protected]>
AuthorDate: Tue Apr 7 07:33:36 2026 +0200

    OPENNLP-53: Add Parse.createFromTokens() for convenient tokenized input (#1012)
    
    OPENNLP-53: Add Parse.createFromTokens() for convenient tokenized input
    
    Add a static factory method to Parse that creates a parse structure directly from a String[] of tokens, avoiding the need to manually join tokens, compute character offsets, and build the initial parse tree.
    Adds a ParserTool.parseLine(String[], Parser, int) overload and refactors existing code to use the new method.
    
    ---------
    
    Co-authored-by: Richard Zowalla <[email protected]>
    (cherry picked from commit 35227c726db6c5e218d9de4ca32d6edf530084e6)
---
 .../opennlp/tools/cmdline/parser/ParserTool.java   | 35 +++++++++-------
 .../src/main/java/opennlp/tools/parser/Parse.java  | 38 +++++++++++++++++
 .../tools/parser/AbstractParserModelTest.java      | 33 ++-------------
 .../test/java/opennlp/tools/parser/ParseTest.java  | 49 ++++++++++++++++++++++
 4 files changed, 111 insertions(+), 44 deletions(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTool.java
index 5006d300..be1f02e7 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTool.java
@@ -19,9 +19,6 @@ package opennlp.tools.cmdline.parser;
 
 import java.io.File;
 import java.io.IOException;
-import java.util.Arrays;
-import java.util.Iterator;
-import java.util.List;
 import java.util.regex.Pattern;
 
 import org.slf4j.Logger;
@@ -44,7 +41,6 @@ import opennlp.tools.tokenize.TokenizerModel;
 import opennlp.tools.tokenize.WhitespaceTokenizer;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
-import opennlp.tools.util.Span;
 
 public final class ParserTool extends BasicCmdLineTool {
 
@@ -78,17 +74,26 @@ public final class ParserTool extends BasicCmdLineTool {
     line = UNTOKENIZED_PAREN_PATTERN_2.matcher(line).replaceAll("$1 $2");
 
     // tokenize
-    List<String> tokens = Arrays.asList( tokenizer.tokenize(line));
-    String text = String.join(" ", tokens);
-
-    Parse p = new Parse(text, new Span(0, text.length()), AbstractBottomUpParser.INC_NODE, 0, 0);
-    int start = 0;
-    int i = 0;
-    for (Iterator<String> ti = tokens.iterator(); ti.hasNext(); i++) {
-      String tok = ti.next();
-      p.insert(new Parse(text, new Span(start, start + tok.length()), AbstractBottomUpParser.TOK_NODE, 0, i));
-      start += tok.length() + 1;
-    }
+    String[] tokens = tokenizer.tokenize(line);
+    return parseLine(tokens, parser, numParses);
+  }
+
+  /**
+   * Parses the specified pre-tokenized sentence and returns the requested number of parses
+   * or fewer.
+   * <p>
+   * This is a convenience method for cases where the input has already been tokenized
+   * into individual tokens. It avoids re-tokenizing and the need to manually construct
+   * the whitespace-separated text and compute character offsets.
+   *
+   * @param tokens    The tokens of the sentence to parse.
+   * @param parser    The {@link Parser} to use.
+   * @param numParses The number of parses desired.
+   * @return The specified number of {@link Parse parses} for the given tokens.
+   * @throws IllegalArgumentException if {@code tokens} is {@code null} or empty.
+   */
+  public static Parse[] parseLine(String[] tokens, Parser parser, int numParses) {
+    Parse p = Parse.createFromTokens(tokens);
     Parse[] parses;
     if (numParses == 1) {
       parses = new Parse[]{parser.parse(p)};
diff --git a/opennlp-tools/src/main/java/opennlp/tools/parser/Parse.java b/opennlp-tools/src/main/java/opennlp/tools/parser/Parse.java
index 3eaa7319..cb40c1a9 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/parser/Parse.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/parser/Parse.java
@@ -803,6 +803,44 @@ public class Parse implements Cloneable, Comparable<Parse> {
   }
 
 
+  /**
+   * Creates a {@link Parse} structure from an array of
+   * pre-tokenized strings.
+   * <p>
+   * This is a convenience factory method for cases
+   * where the input sentence is already tokenized
+   * (e.g., as a {@code String[]}). It joins the
+   * tokens with whitespace, computes character offset
+   * {@link Span spans} for each token, and builds the
+   * initial flat parse tree expected by
+   * {@link Parser#parse(Parse)}.
+   *
+   * @param tokens The tokens of the sentence.
+   * @return A flat {@link Parse} structure with token
+   *         nodes ready for a {@link Parser}.
+   * @throws IllegalArgumentException if {@code tokens}
+   *         is {@code null} or empty.
+   */
+  public static Parse createFromTokens(final String[] tokens) {
+    if (tokens == null) {
+      throw new IllegalArgumentException("tokens must not be null");
+    }
+    if (tokens.length == 0) {
+      throw new IllegalArgumentException("tokens must not be empty");
+    }
+    String text = String.join(" ", tokens);
+    final Parse p = new Parse(text,
+        new Span(0, text.length()),
+        Parser.INC_NODE, 0, 0);
+    int start = 0;
+    for (int i = 0; i < tokens.length; i++) {
+      p.insert(new Parse(text, new Span(start, start + tokens[i].length()),
+          Parser.TOK_NODE, 0, i));
+      start += tokens[i].length() + 1;
+    }
+    return p;
+  }
+
   /**
    * Parses the specified tree-bank style parse string and return a {@link Parse} structure
    * for that string.
diff --git a/opennlp-tools/src/test/java/opennlp/tools/parser/AbstractParserModelTest.java b/opennlp-tools/src/test/java/opennlp/tools/parser/AbstractParserModelTest.java
index 5b4f8df4..67296aac 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/parser/AbstractParserModelTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/parser/AbstractParserModelTest.java
@@ -20,9 +20,6 @@ package opennlp.tools.parser;
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
-import java.util.Arrays;
-import java.util.Iterator;
-import java.util.List;
 import java.util.stream.Stream;
 
 import org.junit.jupiter.api.Assertions;
@@ -100,19 +97,8 @@ public abstract class AbstractParserModelTest {
     // fixtures
     final String sent = "Martin is testing.";
     // prepare
-    List<String> tokens = Arrays.asList(WhitespaceTokenizer.INSTANCE.tokenize(sent));
-    String text = String.join(" ", tokens);
-
-    Parse sentP = new Parse(text, new Span(0, text.length()),
-            AbstractBottomUpParser.INC_NODE, 0, null);
-    int start = 0;
-    int i = 0;
-    for (Iterator<String> ti = tokens.iterator(); ti.hasNext(); i++) {
-      String tok = ti.next();
-      sentP.insert(new Parse(text, new Span(start, start + tok.length()),
-              AbstractBottomUpParser.TOK_NODE, 0, i));
-      start += tok.length() + 1;
-    }
+    String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(sent);
+    Parse sentP = Parse.createFromTokens(tokens);
 
     Parser parser = ParserFactory.create(getModel());
     Assertions.assertNotNull(parser);
@@ -159,19 +145,8 @@ public abstract class AbstractParserModelTest {
             "(TOP (S (NP (NNP Eric)) (VP (VBZ is) (NN testing.))))";
 
     // prepare
-    List<String> tokens = Arrays.asList(WhitespaceTokenizer.INSTANCE.tokenize(sent));
-    String text = String.join(" ", tokens);
-
-    Parse sentP = new Parse(text, new Span(0, text.length()),
-            AbstractBottomUpParser.INC_NODE, 0, 0);
-    int start = 0;
-    int i = 0;
-    for (Iterator<String> ti = tokens.iterator(); ti.hasNext(); i++) {
-      String tok = ti.next();
-      sentP.insert(new Parse(text, new Span(start, start + tok.length()),
-              AbstractBottomUpParser.TOK_NODE, 0, i));
-      start += tok.length() + 1;
-    }
+    String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(sent);
+    Parse sentP = Parse.createFromTokens(tokens);
 
     opennlp.tools.parser.Parser parser = ParserFactory.create(getModel());
     Assertions.assertNotNull(parser);
diff --git a/opennlp-tools/src/test/java/opennlp/tools/parser/ParseTest.java b/opennlp-tools/src/test/java/opennlp/tools/parser/ParseTest.java
index 08cf70a3..bdf5b6de 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/parser/ParseTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/parser/ParseTest.java
@@ -20,6 +20,8 @@ package opennlp.tools.parser;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
 
+import opennlp.tools.util.Span;
+
 /**
  * Tests for the {@link Parse} class.
  */
@@ -127,4 +129,51 @@ public class ParseTest {
     Assertions.assertEquals("NN", tags[16].getType());
     Assertions.assertEquals(".", tags[17].getType());
   }
+
+  @Test
+  void testCreateFromTokens() {
+    String[] tokens = {"The", "cat", "sat", "on", "the", "mat"};
+    Parse p = Parse.createFromTokens(tokens);
+
+    // Verify text is space-joined
+    Assertions.assertEquals("The cat sat on the mat", p.getText());
+
+    // Verify root span covers full text
+    Assertions.assertEquals(new Span(0, 22), p.getSpan());
+
+    // Verify root type is INC
+    Assertions.assertEquals(Parser.INC_NODE, p.getType());
+
+    // Verify token children
+    Parse[] children = p.getChildren();
+    Assertions.assertEquals(tokens.length, children.length);
+
+    int start = 0;
+    for (int i = 0; i < tokens.length; i++) {
+      Assertions.assertEquals(Parser.TOK_NODE, children[i].getType());
+      Assertions.assertEquals(new Span(start, start + tokens[i].length()), children[i].getSpan());
+      Assertions.assertEquals(tokens[i], children[i].getCoveredText());
+      start += tokens[i].length() + 1;
+    }
+  }
+
+  @Test
+  void testCreateFromTokensNullThrows() {
+    Assertions.assertThrows(IllegalArgumentException.class, () -> Parse.createFromTokens(null));
+  }
+
+  @Test
+  void testCreateFromTokensEmptyThrows() {
+    Assertions.assertThrows(IllegalArgumentException.class, () -> Parse.createFromTokens(new String[0]));
+  }
+
+  @Test
+  void testCreateFromTokensSingleToken() {
+    String[] tokens = {"Hello"};
+    Parse p = Parse.createFromTokens(tokens);
+
+    Assertions.assertEquals("Hello", p.getText());
+    Assertions.assertEquals(1, p.getChildren().length);
+    Assertions.assertEquals(new Span(0, 5), p.getChildren()[0].getSpan());
+  }
 }

(opennlp) branch opennlp-2.x updated: OPENNLP-53: Add Parse.createFromTokens() for convenient tokenized input (#1012)

Reply via email to