This is an automated email from the ASF dual-hosted git repository. rzo1 pushed a commit to branch OPENNLP-53 in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit fba846591d5429351d8a347f69b39278fef2c262 Author: Richard Zowalla <[email protected]> AuthorDate: Thu Apr 2 16:36:21 2026 +0200 OPENNLP-53: Add Parse.createFromTokens() for convenient tokenized input Add a static factory method to Parse that creates a parse structure directly from a String[] of tokens, avoiding the need to manually join tokens, compute character offsets, and build the initial parse tree. Also add a ParserTool.parseLine(String[], Parser, int) overload and refactor existing code to use the new method. --- .../src/main/java/opennlp/tools/parser/Parse.java | 36 ++++++++++++++++ .../opennlp/tools/cmdline/parser/ParserTool.java | 34 ++++++++------- .../tools/parser/AbstractParserModelTest.java | 33 ++------------- .../test/java/opennlp/tools/parser/ParseTest.java | 49 ++++++++++++++++++++++ 4 files changed, 108 insertions(+), 44 deletions(-) diff --git a/opennlp-api/src/main/java/opennlp/tools/parser/Parse.java b/opennlp-api/src/main/java/opennlp/tools/parser/Parse.java index 88d7bc05..a11442ab 100644 --- a/opennlp-api/src/main/java/opennlp/tools/parser/Parse.java +++ b/opennlp-api/src/main/java/opennlp/tools/parser/Parse.java @@ -803,6 +803,42 @@ public class Parse implements Cloneable, Comparable<Parse> { } + /** + * Creates a {@link Parse} structure from an array of + * pre-tokenized strings. + * <p> + * This is a convenience factory method for cases + * where the input sentence is already tokenized + * (e.g., as a {@code String[]}). It joins the + * tokens with whitespace, computes character offset + * {@link Span spans} for each token, and builds the + * initial flat parse tree expected by + * {@link Parser#parse(Parse)}. + * + * @param tokens The tokens of the sentence. + * @return A flat {@link Parse} structure with token + * nodes ready for a {@link Parser}. + * @throws NullPointerException if {@code tokens} is {@code null}. + * @throws IllegalArgumentException if {@code tokens} is empty. 
+ */ + public static Parse createFromTokens(final String[] tokens) { + Objects.requireNonNull(tokens, "tokens must not be null"); + if (tokens.length == 0) { + throw new IllegalArgumentException("tokens must not be empty"); + } + String text = String.join(" ", tokens); + Parse p = new Parse(text, + new Span(0, text.length()), + Parser.INC_NODE, 0, 0); + int start = 0; + for (int i = 0; i < tokens.length; i++) { + p.insert(new Parse(text, new Span(start, start + tokens[i].length()), + Parser.TOK_NODE, 0, i)); + start += tokens[i].length() + 1; + } + return p; + } + /** * Parses the specified tree-bank style parse string and return a {@link Parse} structure * for that string. diff --git a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/parser/ParserTool.java b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/parser/ParserTool.java index 5006d300..1470e3b9 100644 --- a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/parser/ParserTool.java +++ b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/parser/ParserTool.java @@ -19,9 +19,6 @@ package opennlp.tools.cmdline.parser; import java.io.File; import java.io.IOException; -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; import java.util.regex.Pattern; import org.slf4j.Logger; @@ -44,7 +41,6 @@ import opennlp.tools.tokenize.TokenizerModel; import opennlp.tools.tokenize.WhitespaceTokenizer; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.PlainTextByLineStream; -import opennlp.tools.util.Span; public final class ParserTool extends BasicCmdLineTool { @@ -78,17 +74,25 @@ public final class ParserTool extends BasicCmdLineTool { line = UNTOKENIZED_PAREN_PATTERN_2.matcher(line).replaceAll("$1 $2"); // tokenize - List<String> tokens = Arrays.asList( tokenizer.tokenize(line)); - String text = String.join(" ", tokens); - - Parse p = new Parse(text, new Span(0, text.length()), AbstractBottomUpParser.INC_NODE, 0, 0); - int start = 0; - int 
i = 0; - for (Iterator<String> ti = tokens.iterator(); ti.hasNext(); i++) { - String tok = ti.next(); - p.insert(new Parse(text, new Span(start, start + tok.length()), AbstractBottomUpParser.TOK_NODE, 0, i)); - start += tok.length() + 1; - } + String[] tokens = tokenizer.tokenize(line); + return parseLine(tokens, parser, numParses); + } + + /** + * Parses the specified pre-tokenized sentence and returns the requested number of parses + * or fewer. + * <p> + * This is a convenience method for cases where the input has already been tokenized + * into individual tokens. It avoids re-tokenizing and the need to manually construct + * the whitespace-separated text and compute character offsets. + * + * @param tokens The tokens of the sentence to parse. + * @param parser The {@link Parser} to use. + * @param numParses The number of parses desired. + * @return The specified number of {@link Parse parses} for the given tokens. + */ + public static Parse[] parseLine(String[] tokens, Parser parser, int numParses) { + Parse p = Parse.createFromTokens(tokens); Parse[] parses; if (numParses == 1) { parses = new Parse[]{parser.parse(p)}; diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/parser/AbstractParserModelTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/parser/AbstractParserModelTest.java index 5b4f8df4..67296aac 100644 --- a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/parser/AbstractParserModelTest.java +++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/parser/AbstractParserModelTest.java @@ -20,9 +20,6 @@ package opennlp.tools.parser; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; import java.util.stream.Stream; import org.junit.jupiter.api.Assertions; @@ -100,19 +97,8 @@ public abstract class AbstractParserModelTest { // fixtures final String sent = "Martin is testing."; // 
prepare - List<String> tokens = Arrays.asList(WhitespaceTokenizer.INSTANCE.tokenize(sent)); - String text = String.join(" ", tokens); - - Parse sentP = new Parse(text, new Span(0, text.length()), - AbstractBottomUpParser.INC_NODE, 0, null); - int start = 0; - int i = 0; - for (Iterator<String> ti = tokens.iterator(); ti.hasNext(); i++) { - String tok = ti.next(); - sentP.insert(new Parse(text, new Span(start, start + tok.length()), - AbstractBottomUpParser.TOK_NODE, 0, i)); - start += tok.length() + 1; - } + String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(sent); + Parse sentP = Parse.createFromTokens(tokens); Parser parser = ParserFactory.create(getModel()); Assertions.assertNotNull(parser); @@ -159,19 +145,8 @@ public abstract class AbstractParserModelTest { "(TOP (S (NP (NNP Eric)) (VP (VBZ is) (NN testing.))))"; // prepare - List<String> tokens = Arrays.asList(WhitespaceTokenizer.INSTANCE.tokenize(sent)); - String text = String.join(" ", tokens); - - Parse sentP = new Parse(text, new Span(0, text.length()), - AbstractBottomUpParser.INC_NODE, 0, 0); - int start = 0; - int i = 0; - for (Iterator<String> ti = tokens.iterator(); ti.hasNext(); i++) { - String tok = ti.next(); - sentP.insert(new Parse(text, new Span(start, start + tok.length()), - AbstractBottomUpParser.TOK_NODE, 0, i)); - start += tok.length() + 1; - } + String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(sent); + Parse sentP = Parse.createFromTokens(tokens); opennlp.tools.parser.Parser parser = ParserFactory.create(getModel()); Assertions.assertNotNull(parser); diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/parser/ParseTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/parser/ParseTest.java index 08cf70a3..632c0ea7 100644 --- a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/parser/ParseTest.java +++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/parser/ParseTest.java @@ -20,6 +20,8 @@ package opennlp.tools.parser; import 
org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import opennlp.tools.util.Span; + /** * Tests for the {@link Parse} class. */ @@ -127,4 +129,51 @@ public class ParseTest { Assertions.assertEquals("NN", tags[16].getType()); Assertions.assertEquals(".", tags[17].getType()); } + + @Test + void testCreateFromTokens() { + String[] tokens = {"The", "cat", "sat", "on", "the", "mat"}; + Parse p = Parse.createFromTokens(tokens); + + // Verify text is space-joined + Assertions.assertEquals("The cat sat on the mat", p.getText()); + + // Verify root span covers full text + Assertions.assertEquals(new Span(0, 22), p.getSpan()); + + // Verify root type is INC + Assertions.assertEquals(Parser.INC_NODE, p.getType()); + + // Verify token children + Parse[] children = p.getChildren(); + Assertions.assertEquals(tokens.length, children.length); + + int start = 0; + for (int i = 0; i < tokens.length; i++) { + Assertions.assertEquals(Parser.TOK_NODE, children[i].getType()); + Assertions.assertEquals(new Span(start, start + tokens[i].length()), children[i].getSpan()); + Assertions.assertEquals(tokens[i], children[i].getCoveredText()); + start += tokens[i].length() + 1; + } + } + + @Test + void testCreateFromTokensNullThrows() { + Assertions.assertThrows(NullPointerException.class, () -> Parse.createFromTokens(null)); + } + + @Test + void testCreateFromTokensEmptyThrows() { + Assertions.assertThrows(IllegalArgumentException.class, () -> Parse.createFromTokens(new String[0])); + } + + @Test + void testCreateFromTokensSingleToken() { + String[] tokens = {"Hello"}; + Parse p = Parse.createFromTokens(tokens); + + Assertions.assertEquals("Hello", p.getText()); + Assertions.assertEquals(1, p.getChildren().length); + Assertions.assertEquals(new Span(0, 5), p.getChildren()[0].getSpan()); + } }
