This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch opennlp-2.x
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/opennlp-2.x by this push:
new 6431df81 OPENNLP-53: Add Parse.createFromTokens() for convenient
tokenized input (#1012)
6431df81 is described below
commit 6431df8122e54f00a14fc48c57d9ebb559bf03cc
Author: Martin Wiesner <[email protected]>
AuthorDate: Tue Apr 7 07:33:36 2026 +0200
OPENNLP-53: Add Parse.createFromTokens() for convenient tokenized input
(#1012)
OPENNLP-53: Add Parse.createFromTokens() for convenient tokenized input
Add a static factory method to Parse that creates a parse structure
directly from a String[] of tokens, avoiding the need to manually join tokens,
compute character offsets, and build the initial parse tree.
Adds a ParserTool.parseLine(String[], Parser, int) overload and refactors
existing code to use the new method.
---------
Co-authored-by: Richard Zowalla <[email protected]>
(cherry picked from commit 35227c726db6c5e218d9de4ca32d6edf530084e6)
---
.../opennlp/tools/cmdline/parser/ParserTool.java | 35 +++++++++-------
.../src/main/java/opennlp/tools/parser/Parse.java | 38 +++++++++++++++++
.../tools/parser/AbstractParserModelTest.java | 33 ++-------------
.../test/java/opennlp/tools/parser/ParseTest.java | 49 ++++++++++++++++++++++
4 files changed, 111 insertions(+), 44 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTool.java
index 5006d300..be1f02e7 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTool.java
@@ -19,9 +19,6 @@ package opennlp.tools.cmdline.parser;
import java.io.File;
import java.io.IOException;
-import java.util.Arrays;
-import java.util.Iterator;
-import java.util.List;
import java.util.regex.Pattern;
import org.slf4j.Logger;
@@ -44,7 +41,6 @@ import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
-import opennlp.tools.util.Span;
public final class ParserTool extends BasicCmdLineTool {
@@ -78,17 +74,26 @@ public final class ParserTool extends BasicCmdLineTool {
line = UNTOKENIZED_PAREN_PATTERN_2.matcher(line).replaceAll("$1 $2");
// tokenize
- List<String> tokens = Arrays.asList( tokenizer.tokenize(line));
- String text = String.join(" ", tokens);
-
- Parse p = new Parse(text, new Span(0, text.length()), AbstractBottomUpParser.INC_NODE, 0, 0);
- int start = 0;
- int i = 0;
- for (Iterator<String> ti = tokens.iterator(); ti.hasNext(); i++) {
- String tok = ti.next();
- p.insert(new Parse(text, new Span(start, start + tok.length()), AbstractBottomUpParser.TOK_NODE, 0, i));
- start += tok.length() + 1;
- }
+ String[] tokens = tokenizer.tokenize(line);
+ return parseLine(tokens, parser, numParses);
+ }
+
+ /**
+ * Parses the specified pre-tokenized sentence and returns the requested number of parses
+ * or fewer.
+ * <p>
+ * This is a convenience method for cases where the input has already been tokenized
+ * into individual tokens. It avoids re-tokenizing and the need to manually construct
+ * the whitespace-separated text and compute character offsets.
+ *
+ * @param tokens The tokens of the sentence to parse.
+ * @param parser The {@link Parser} to use.
+ * @param numParses The number of parses desired.
+ * @return The specified number of {@link Parse parses} for the given tokens.
+ * @throws IllegalArgumentException if {@code tokens} is {@code null} or empty.
+ */
+ public static Parse[] parseLine(String[] tokens, Parser parser, int numParses) {
+ Parse p = Parse.createFromTokens(tokens);
Parse[] parses;
if (numParses == 1) {
parses = new Parse[]{parser.parse(p)};
diff --git a/opennlp-tools/src/main/java/opennlp/tools/parser/Parse.java b/opennlp-tools/src/main/java/opennlp/tools/parser/Parse.java
index 3eaa7319..cb40c1a9 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/parser/Parse.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/parser/Parse.java
@@ -803,6 +803,44 @@ public class Parse implements Cloneable, Comparable<Parse> {
}
+ /**
+ * Creates a {@link Parse} structure from an array of
+ * pre-tokenized strings.
+ * <p>
+ * This is a convenience factory method for cases
+ * where the input sentence is already tokenized
+ * (e.g., as a {@code String[]}). It joins the
+ * tokens with whitespace, computes character offset
+ * {@link Span spans} for each token, and builds the
+ * initial flat parse tree expected by
+ * {@link Parser#parse(Parse)}.
+ *
+ * @param tokens The tokens of the sentence.
+ * @return A flat {@link Parse} structure with token
+ * nodes ready for a {@link Parser}.
+ * @throws IllegalArgumentException if {@code tokens}
+ * is {@code null} or empty.
+ */
+ public static Parse createFromTokens(final String[] tokens) {
+ if (tokens == null) {
+ throw new IllegalArgumentException("tokens must not be null");
+ }
+ if (tokens.length == 0) {
+ throw new IllegalArgumentException("tokens must not be empty");
+ }
+ String text = String.join(" ", tokens);
+ final Parse p = new Parse(text,
+ new Span(0, text.length()),
+ Parser.INC_NODE, 0, 0);
+ int start = 0;
+ for (int i = 0; i < tokens.length; i++) {
+ p.insert(new Parse(text, new Span(start, start + tokens[i].length()),
+ Parser.TOK_NODE, 0, i));
+ start += tokens[i].length() + 1;
+ }
+ return p;
+ }
+
/**
* Parses the specified tree-bank style parse string and return a {@link Parse} structure
* for that string.
diff --git a/opennlp-tools/src/test/java/opennlp/tools/parser/AbstractParserModelTest.java b/opennlp-tools/src/test/java/opennlp/tools/parser/AbstractParserModelTest.java
index 5b4f8df4..67296aac 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/parser/AbstractParserModelTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/parser/AbstractParserModelTest.java
@@ -20,9 +20,6 @@ package opennlp.tools.parser;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
-import java.util.Arrays;
-import java.util.Iterator;
-import java.util.List;
import java.util.stream.Stream;
import org.junit.jupiter.api.Assertions;
@@ -100,19 +97,8 @@ public abstract class AbstractParserModelTest {
// fixtures
final String sent = "Martin is testing.";
// prepare
- List<String> tokens = Arrays.asList(WhitespaceTokenizer.INSTANCE.tokenize(sent));
- String text = String.join(" ", tokens);
-
- Parse sentP = new Parse(text, new Span(0, text.length()),
- AbstractBottomUpParser.INC_NODE, 0, null);
- int start = 0;
- int i = 0;
- for (Iterator<String> ti = tokens.iterator(); ti.hasNext(); i++) {
- String tok = ti.next();
- sentP.insert(new Parse(text, new Span(start, start + tok.length()),
- AbstractBottomUpParser.TOK_NODE, 0, i));
- start += tok.length() + 1;
- }
+ String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(sent);
+ Parse sentP = Parse.createFromTokens(tokens);
Parser parser = ParserFactory.create(getModel());
Assertions.assertNotNull(parser);
@@ -159,19 +145,8 @@ public abstract class AbstractParserModelTest {
"(TOP (S (NP (NNP Eric)) (VP (VBZ is) (NN testing.))))";
// prepare
- List<String> tokens = Arrays.asList(WhitespaceTokenizer.INSTANCE.tokenize(sent));
- String text = String.join(" ", tokens);
-
- Parse sentP = new Parse(text, new Span(0, text.length()),
- AbstractBottomUpParser.INC_NODE, 0, 0);
- int start = 0;
- int i = 0;
- for (Iterator<String> ti = tokens.iterator(); ti.hasNext(); i++) {
- String tok = ti.next();
- sentP.insert(new Parse(text, new Span(start, start + tok.length()),
- AbstractBottomUpParser.TOK_NODE, 0, i));
- start += tok.length() + 1;
- }
+ String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(sent);
+ Parse sentP = Parse.createFromTokens(tokens);
opennlp.tools.parser.Parser parser = ParserFactory.create(getModel());
Assertions.assertNotNull(parser);
diff --git a/opennlp-tools/src/test/java/opennlp/tools/parser/ParseTest.java b/opennlp-tools/src/test/java/opennlp/tools/parser/ParseTest.java
index 08cf70a3..bdf5b6de 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/parser/ParseTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/parser/ParseTest.java
@@ -20,6 +20,8 @@ package opennlp.tools.parser;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
+import opennlp.tools.util.Span;
+
/**
* Tests for the {@link Parse} class.
*/
@@ -127,4 +129,51 @@ public class ParseTest {
Assertions.assertEquals("NN", tags[16].getType());
Assertions.assertEquals(".", tags[17].getType());
}
+
+ @Test
+ void testCreateFromTokens() {
+ String[] tokens = {"The", "cat", "sat", "on", "the", "mat"};
+ Parse p = Parse.createFromTokens(tokens);
+
+ // Verify text is space-joined
+ Assertions.assertEquals("The cat sat on the mat", p.getText());
+
+ // Verify root span covers full text
+ Assertions.assertEquals(new Span(0, 22), p.getSpan());
+
+ // Verify root type is INC
+ Assertions.assertEquals(Parser.INC_NODE, p.getType());
+
+ // Verify token children
+ Parse[] children = p.getChildren();
+ Assertions.assertEquals(tokens.length, children.length);
+
+ int start = 0;
+ for (int i = 0; i < tokens.length; i++) {
+ Assertions.assertEquals(Parser.TOK_NODE, children[i].getType());
+ Assertions.assertEquals(new Span(start, start + tokens[i].length()), children[i].getSpan());
+ Assertions.assertEquals(tokens[i], children[i].getCoveredText());
+ start += tokens[i].length() + 1;
+ }
+ }
+
+ @Test
+ void testCreateFromTokensNullThrows() {
+ Assertions.assertThrows(IllegalArgumentException.class, () -> Parse.createFromTokens(null));
+ }
+
+ @Test
+ void testCreateFromTokensEmptyThrows() {
+ Assertions.assertThrows(IllegalArgumentException.class, () -> Parse.createFromTokens(new String[0]));
+ }
+
+ @Test
+ void testCreateFromTokensSingleToken() {
+ String[] tokens = {"Hello"};
+ Parse p = Parse.createFromTokens(tokens);
+
+ Assertions.assertEquals("Hello", p.getText());
+ Assertions.assertEquals(1, p.getChildren().length);
+ Assertions.assertEquals(new Span(0, 5), p.getChildren()[0].getSpan());
+ }
}