This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/main by this push:
new 97f97b68 OPENNLP-1220: Add support for Byte Pair Encoding (BPE) (#1011)
97f97b68 is described below
commit 97f97b687917021fffb3b2eb03cb634ed1d4dc1b
Author: Martin Wiesner <[email protected]>
AuthorDate: Tue Apr 7 10:37:29 2026 +0200
OPENNLP-1220: Add support for Byte Pair Encoding (BPE) (#1011)
* OPENNLP-1220 - Add support for Byte Pair Encoding (BPE)
---------
Co-authored-by: Richard Zowalla <[email protected]>
---
.../main/java/opennlp/tools/tokenize/BPEModel.java | 162 ++++++++++
.../java/opennlp/tools/tokenize/BPETokenizer.java | 290 +++++++++++++++++
.../tools/tokenize/BPETokenizerFactory.java | 187 +++++++++++
.../tools/tokenize/BPETokenizerTrainer.java | 281 +++++++++++++++++
.../tools/tokenize/AbstractBPEModelTest.java | 158 ++++++++++
.../AbstractBPETokenizerRealisticTest.java | 347 +++++++++++++++++++++
.../opennlp/tools/tokenize/BPEModelDeTest.java | 48 +++
.../opennlp/tools/tokenize/BPEModelEnTest.java | 45 +++
.../opennlp/tools/tokenize/BPEModelFrTest.java | 47 +++
.../tools/tokenize/BPETokenizerFactoryTest.java | 149 +++++++++
.../tokenize/BPETokenizerRealisticDeTest.java | 119 +++++++
.../tokenize/BPETokenizerRealisticEnTest.java | 124 ++++++++
.../tokenize/BPETokenizerRealisticEsTest.java | 148 +++++++++
.../tokenize/BPETokenizerRealisticFrTest.java | 146 +++++++++
.../tokenize/BPETokenizerRealisticItTest.java | 119 +++++++
.../opennlp/tools/tokenize/BPETokenizerTest.java | 230 ++++++++++++++
.../tools/tokenize/BPETokenizerTrainerTest.java | 188 +++++++++++
17 files changed, 2788 insertions(+)
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/BPEModel.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/BPEModel.java
new file mode 100644
index 00000000..fe59de46
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/BPEModel.java
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.tokenize.BPETokenizer.SymbolPair;
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.BaseModel;
+
+/**
+ * A {@link BaseModel} that holds the ordered BPE merge operations learned
+ * during training, so that they can be serialized and reloaded later.
+ * <p>
+ * Models are produced by the {@link BPETokenizerTrainer}. The merge operations
+ * are {@link BPETokenizer.SymbolPair} instances; their position in the list
+ * defines the BPE vocabulary. The model is persisted as a standard OpenNLP ZIP
+ * package with a {@code bpe.merges} artifact containing the merge rules.
+ * <p>
+ * <b>Usage:</b>
+ * <pre>{@code
+ * // Create via training
+ * BPETokenizerTrainer trainer = new BPETokenizerTrainer();
+ * BPEModel model = trainer.train(corpus, 10000, "en");
+ *
+ * // Save to disk
+ * model.serialize(Path.of("bpe-en.bin"));
+ *
+ * // Load from disk
+ * BPEModel loaded = new BPEModel(Path.of("bpe-en.bin"));
+ *
+ * // Use for tokenization
+ * BPETokenizer tokenizer = new BPETokenizer(loaded);
+ * }</pre>
+ *
+ * @see BPETokenizer
+ * @see BPETokenizerTrainer
+ * @see BPETokenizerFactory
+ */
+public final class BPEModel extends BaseModel {
+
+  private static final long serialVersionUID = 1L;
+
+  /** The component name for this model type. */
+  private static final String COMPONENT_NAME = "BPETokenizer";
+
+  /**
+   * Builds a {@link BPEModel} in memory from freshly trained merge rules.
+   * <p>
+   * The given list is copied before it is stored, so later changes to
+   * {@code merges} do not affect this model.
+   *
+   * @param merges The merge operations in priority order.
+   *               Must not be {@code null}.
+   * @param manifestInfoEntries Additional manifest info.
+   * @param factory The {@link BPETokenizerFactory}; its language code is
+   *                passed on as the language of the model.
+   */
+  public BPEModel(final List<SymbolPair> merges,
+                  final Map<String, String> manifestInfoEntries,
+                  final BPETokenizerFactory factory) {
+    super(COMPONENT_NAME, factory.getLanguageCode(), manifestInfoEntries, factory);
+    final List<SymbolPair> mergesCopy = new ArrayList<>(merges);
+    artifactMap.put(BPETokenizerFactory.MERGES_ENTRY_NAME, mergesCopy);
+    checkArtifactMap();
+  }
+
+  /**
+   * Loads a {@link BPEModel} from an {@link InputStream}.
+   *
+   * @param in The {@link InputStream} to read the model from.
+   * @throws IOException Thrown if IO errors occurred.
+   */
+  public BPEModel(final InputStream in) throws IOException {
+    super(COMPONENT_NAME, in);
+  }
+
+  /**
+   * Loads a {@link BPEModel} from a {@link File}.
+   *
+   * @param modelFile The {@link File} to read the model from.
+   * @throws IOException Thrown if IO errors occurred.
+   */
+  public BPEModel(final File modelFile) throws IOException {
+    super(COMPONENT_NAME, modelFile);
+  }
+
+  /**
+   * Loads a {@link BPEModel} from a {@link Path}.
+   *
+   * @param modelPath The {@link Path} to read the model from.
+   * @throws IOException Thrown if IO errors occurred.
+   */
+  public BPEModel(final Path modelPath) throws IOException {
+    super(COMPONENT_NAME, modelPath);
+  }
+
+  /**
+   * Loads a {@link BPEModel} from a {@link URL}.
+   *
+   * @param modelURL The {@link URL} to read the model from.
+   * @throws IOException Thrown if IO errors occurred.
+   */
+  public BPEModel(final URL modelURL) throws IOException {
+    super(COMPONENT_NAME, modelURL);
+  }
+
+  /**
+   * Runs the inherited validation and additionally requires that the merges
+   * artifact is present and is a {@link List}.
+   *
+   * @throws InvalidFormatException Thrown if the merges artifact is missing
+   *                                or is not a {@link List}.
+   */
+  @Override
+  protected void validateArtifactMap() throws InvalidFormatException {
+    super.validateArtifactMap();
+
+    final boolean hasMerges =
+        artifactMap.get(BPETokenizerFactory.MERGES_ENTRY_NAME) instanceof List<?>;
+    if (hasMerges) {
+      return;
+    }
+    throw new InvalidFormatException("BPE model is incomplete: missing merge rules!");
+  }
+
+  /**
+   * @return The factory class used when the model does not name one.
+   */
+  @Override
+  protected Class<? extends BaseToolFactory> getDefaultFactory() {
+    return BPETokenizerFactory.class;
+  }
+
+  /**
+   * @return The active {@link BPETokenizerFactory}.
+   */
+  public BPETokenizerFactory getFactory() {
+    return (BPETokenizerFactory) this.toolFactory;
+  }
+
+  /**
+   * @return An unmodifiable, ordered list of BPE merge operations stored in this model.
+   */
+  @SuppressWarnings("unchecked")
+  public List<SymbolPair> getMerges() {
+    // validateArtifactMap() only guarantees that this artifact is a List;
+    // the element type is not checked, hence the unchecked cast.
+    final List<SymbolPair> stored =
+        (List<SymbolPair>) artifactMap.get(BPETokenizerFactory.MERGES_ENTRY_NAME);
+    return Collections.unmodifiableList(stored);
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/BPETokenizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/BPETokenizer.java
new file mode 100644
index 00000000..1a937da5
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/BPETokenizer.java
@@ -0,0 +1,290 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+
+import opennlp.tools.util.Span;
+
+/**
+ * A {@link Tokenizer} implementation that performs subword tokenization
+ * using Byte Pair Encoding (BPE).
+ * <p>
+ * BPE iteratively merges the most frequent pair of adjacent symbols,
+ * starting from a character-level representation of each word. This allows
+ * the tokenizer to handle out-of-vocabulary words by decomposing them into
+ * known subword units.
+ * <p>
+ * <b>Usage:</b>
+ * <pre>{@code
+ * // Train a BPE model from a corpus
+ * BPETokenizerTrainer trainer = new BPETokenizerTrainer();
+ * BPEModel model = trainer.train(corpus, 10000, "en");
+ *
+ * // Save the model for later reuse
+ * model.serialize(Path.of("bpe-en.bin"));
+ *
+ * // Load and tokenize
+ * BPEModel loaded = new BPEModel(Path.of("bpe-en.bin"));
+ * BPETokenizer tokenizer = new BPETokenizer(loaded);
+ * String[] tokens = tokenizer.tokenize("unseen words are split into subwords");
+ * }</pre>
+ * <p>
+ * The tokenizer first splits text on whitespace, then applies learned merge
+ * operations to each word independently. Words are decomposed into characters
+ * (Unicode code points) with an {@link #END_OF_WORD} marker on the final
+ * character, then merges are applied in priority order (as learned during
+ * training) until no more merges are applicable. The resulting subword units
+ * are returned as tokens.
+ * <p>
+ * For reference see:
+ * <ul>
+ *   <li>Sennrich, R., Haddow, B., &amp; Birch, A. (2016).
+ *   Neural Machine Translation of Rare Words with Subword Units.
+ *   <a href="https://arxiv.org/abs/1508.07909">
+ *   https://arxiv.org/abs/1508.07909</a>
+ *   </li>
+ * </ul>
+ *
+ * @see BPEModel
+ * @see BPETokenizerTrainer
+ * @see WordpieceTokenizer
+ */
+public class BPETokenizer implements Tokenizer {
+
+  /**
+   * Suffix appended to the last symbol of each word during BPE encoding
+   * to distinguish word-final characters from word-internal ones.
+   * <p>
+   * Users constructing {@link SymbolPair} merge rules manually must use this
+   * constant to mark word-final symbols
+   * (e.g., {@code new SymbolPair("a", "b" + END_OF_WORD)}).
+   */
+  public static final String END_OF_WORD = "</w>";
+
+  /** Maps each merge pair to its priority rank (its index in the model's merge list). */
+  private final LinkedHashMap<SymbolPair, Integer> mergeRanks;
+
+  /**
+   * Initializes a {@link BPETokenizer} from a trained
+   * {@link BPEModel}.
+   *
+   * @param model The trained BPE model. Must not be {@code null}.
+   * @throws IllegalArgumentException if {@code model} is {@code null}.
+   */
+  public BPETokenizer(final BPEModel model) {
+    if (model == null) {
+      throw new IllegalArgumentException("model must not be null");
+    }
+    final List<SymbolPair> merges = model.getMerges();
+    this.mergeRanks = new LinkedHashMap<>();
+    for (int i = 0; i < merges.size(); i++) {
+      mergeRanks.put(merges.get(i), i);
+    }
+  }
+
+  /**
+   * {@inheritDoc}
+   * <p>
+   * Splits the input text on whitespace, then applies BPE merge operations
+   * to each word to produce subword tokens. Words not fully covered by
+   * learned merges are decomposed into individual characters.
+   *
+   * @return The subword tokens in text order; an empty array if {@code text}
+   *         is {@code null} or empty.
+   */
+  @Override
+  public String[] tokenize(final String text) {
+    if (text == null || text.isEmpty()) {
+      return new String[0];
+    }
+
+    final List<String> allTokens = new ArrayList<>();
+    for (final String word : WhitespaceTokenizer.INSTANCE.tokenize(text)) {
+      allTokens.addAll(encodeToBPE(word));
+    }
+
+    return allTokens.toArray(new String[0]);
+  }
+
+  /**
+   * {@inheritDoc}
+   * <p>
+   * Returns {@link Span} offsets into the original text for each subword token.
+   * Each span maps back to the exact character range in the input string.
+   *
+   * @return The spans of the subword tokens in text order; an empty array if
+   *         {@code text} is {@code null} or empty.
+   */
+  @Override
+  public Span[] tokenizePos(final String text) {
+    if (text == null || text.isEmpty()) {
+      return new Span[0];
+    }
+
+    final List<Span> allSpans = new ArrayList<>();
+    for (final Span wordSpan : WhitespaceTokenizer.INSTANCE.tokenizePos(text)) {
+      final String word = wordSpan.getCoveredText(text).toString();
+
+      // The tokens of a word concatenate back to the word itself, so walking
+      // their lengths from the word start yields offsets into the input text.
+      int offset = wordSpan.getStart();
+      for (final String token : encodeToBPE(word)) {
+        final int end = offset + token.length();
+        allSpans.add(new Span(offset, end));
+        offset = end;
+      }
+    }
+
+    return allSpans.toArray(new Span[0]);
+  }
+
+  /**
+   * Splits a word into its initial character-level BPE symbol sequence.
+   * Each Unicode code point becomes its own symbol, with {@link #END_OF_WORD}
+   * appended to the final one.
+   * <p>
+   * Splitting by code point rather than by UTF-16 {@code char} keeps
+   * supplementary characters (e.g., emoji) in one piece; splitting them would
+   * yield lone surrogates, which are not valid text on their own.
+   *
+   * @param word The word to split. Must not be {@code null}.
+   * @return A mutable list of character symbols; empty if {@code word} is empty.
+   */
+  private List<String> splitToSymbols(final String word) {
+    final int length = word.length();
+    final List<String> symbols = new ArrayList<>(length);
+    int start = 0;
+    while (start < length) {
+      // An unpaired surrogate is reported as a code point of its own (one char).
+      final int end = start + Character.charCount(word.codePointAt(start));
+      final String symbol = word.substring(start, end);
+      symbols.add(end == length ? symbol + END_OF_WORD : symbol);
+      start = end;
+    }
+    return symbols;
+  }
+
+  /**
+   * Encodes a single word into BPE subword tokens by splitting it into
+   * character-level symbols, applying learned merge operations, and stripping
+   * the {@link #END_OF_WORD} marker from the resulting tokens.
+   *
+   * @param word The word to encode. Must not be {@code null}.
+   * @return A list of subword token strings whose concatenation equals the original word.
+   */
+  private List<String> encodeToBPE(final String word) {
+    if (word.isEmpty()) {
+      return List.of();
+    }
+
+    final List<String> tokens = applyMerges(splitToSymbols(word));
+
+    // Only the last symbol was given the marker, and merging concatenates
+    // neighbours, so the marker can only sit at the end of the last token.
+    // An earlier token ending in the literal text "</w>" comes from the input
+    // itself and must be left untouched.
+    final int lastIndex = tokens.size() - 1;
+    final String lastToken = tokens.get(lastIndex);
+    if (lastToken.endsWith(END_OF_WORD)) {
+      tokens.set(lastIndex,
+          lastToken.substring(0, lastToken.length() - END_OF_WORD.length()));
+    }
+
+    return tokens;
+  }
+
+  /**
+   * Iteratively applies learned BPE merge operations to a list of symbols.
+   * In each iteration, the highest-priority (lowest-rank) adjacent pair is merged
+   * into a single symbol. The process continues until no more applicable merges
+   * remain or the symbol list is reduced to a single element.
+   *
+   * @param symbols The mutable list of symbols to merge. Must not be {@code null}.
+   * @return A mutable list of symbols after all applicable merges have been
+   *         applied; this is {@code symbols} itself if it holds at most one element.
+   */
+  private List<String> applyMerges(final List<String> symbols) {
+    if (symbols.size() <= 1) {
+      return symbols;
+    }
+
+    List<String> current = new ArrayList<>(symbols);
+
+    while (current.size() > 1) {
+      // Find the adjacent pair with the lowest rank among those known to the model.
+      int bestRank = Integer.MAX_VALUE;
+      SymbolPair bestPair = null;
+
+      for (int i = 0; i < current.size() - 1; i++) {
+        final SymbolPair pair = new SymbolPair(
+            current.get(i), current.get(i + 1));
+        final Integer rank = mergeRanks.get(pair);
+        if (rank != null && rank < bestRank) {
+          bestRank = rank;
+          bestPair = pair;
+        }
+      }
+
+      if (bestPair == null) {
+        // No adjacent pair is a known merge; the sequence is final.
+        break;
+      }
+
+      // Replace every non-overlapping occurrence of the pair, left to right.
+      final List<String> next = new ArrayList<>();
+      int i = 0;
+      while (i < current.size()) {
+        if (i < current.size() - 1
+            && current.get(i).equals(bestPair.left())
+            && current.get(i + 1).equals(bestPair.right())) {
+          next.add(bestPair.left() + bestPair.right());
+          i += 2;
+        } else {
+          next.add(current.get(i));
+          i++;
+        }
+      }
+      current = next;
+    }
+
+    return current;
+  }
+
+  /**
+   * Represents a pair of adjacent symbols in BPE.
+   *
+   * @param left The left symbol.
+   * @param right The right symbol.
+   */
+  public record SymbolPair(String left, String right) {
+
+    /**
+     * Creates a new {@link SymbolPair}.
+     *
+     * @param left The left symbol. Must not be {@code null}.
+     * @param right The right symbol. Must not be {@code null}.
+     * @throws IllegalArgumentException if {@code left} or {@code right} is {@code null}.
+     */
+    public SymbolPair {
+      if (left == null) {
+        throw new IllegalArgumentException("left must not be null");
+      }
+      if (right == null) {
+        throw new IllegalArgumentException("right must not be null");
+      }
+    }
+
+    /**
+     * @return The two symbols separated by a single space.
+     */
+    @Override
+    public String toString() {
+      return left + " " + right;
+    }
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/BPETokenizerFactory.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/BPETokenizerFactory.java
new file mode 100644
index 00000000..881bbcfa
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/BPETokenizerFactory.java
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.tokenize.BPETokenizer.SymbolPair;
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.ArtifactSerializer;
+
+/**
+ * A {@link BaseToolFactory} for BPE tokenization that manages
+ * the serialization and validation of the BPE merge rules
+ * artifact within a {@link BPEModel}.
+ * <p>
+ * This factory is responsible for:
+ * <ul>
+ *   <li>Providing the {@link BPEMergesSerializer} that reads
+ *   and writes BPE merge rules as a text-based artifact
+ *   ({@code bpe.merges}) inside the model ZIP package.
+ *   </li>
+ *   <li>Validating that a loaded model contains valid merge
+ *   rules.</li>
+ * </ul>
+ * <p>
+ * The merge rules themselves are placed into the model by the
+ * {@link BPEModel} constructor, not by this factory.
+ * <p>
+ * This class is typically not used directly. It is
+ * instantiated internally by {@link BPETokenizerTrainer}
+ * during training and by {@link BPEModel} during model
+ * loading.
+ *
+ * @see BPEModel
+ * @see BPETokenizer
+ * @see BPETokenizerTrainer
+ */
+public class BPETokenizerFactory extends BaseToolFactory {
+
+  /** The artifact entry name for BPE merge rules. */
+  static final String MERGES_ENTRY_NAME = "bpe.merges";
+
+  /** The ISO language code; {@code null} if created via the no-arg constructor. */
+  private String languageCode;
+
+  /**
+   * Creates a {@link BPETokenizerFactory}.
+   * Required empty constructor for model loading.
+   */
+  public BPETokenizerFactory() {
+  }
+
+  /**
+   * Creates a {@link BPETokenizerFactory} with the given
+   * language code.
+   *
+   * @param langCode The ISO language code.
+   *                 Must not be {@code null}.
+   * @throws IllegalArgumentException if {@code langCode}
+   *         is {@code null}.
+   */
+  public BPETokenizerFactory(final String langCode) {
+    if (langCode == null) {
+      throw new IllegalArgumentException(
+          "languageCode must not be null");
+    }
+    this.languageCode = langCode;
+  }
+
+  /**
+   * {@inheritDoc}
+   * <p>
+   * Adds a {@link BPEMergesSerializer} under the key {@code merges} to the
+   * serializers provided by the super class.
+   */
+  @Override
+  public Map<String, ArtifactSerializer<?>>
+      createArtifactSerializersMap() {
+    Map<String, ArtifactSerializer<?>> serializers =
+        super.createArtifactSerializersMap();
+    // NOTE(review): "merges" presumably has to match the extension of
+    // MERGES_ENTRY_NAME ("bpe.merges") for the serializer lookup - confirm.
+    serializers.put("merges", new BPEMergesSerializer());
+    return serializers;
+  }
+
+  /**
+   * {@inheritDoc}
+   * <p>
+   * No BPE-specific manifest entries are added.
+   */
+  @Override
+  public Map<String, String> createManifestEntries() {
+    return super.createManifestEntries();
+  }
+
+  /**
+   * {@inheritDoc}
+   *
+   * @throws InvalidFormatException Thrown if the artifact provider has no
+   *         {@code bpe.merges} artifact or it is not a {@link List}.
+   */
+  @Override
+  public void validateArtifactMap() throws InvalidFormatException {
+    Object mergesArtifact =
+        this.artifactProvider.getArtifact(MERGES_ENTRY_NAME);
+    if (!(mergesArtifact instanceof List<?>)) {
+      throw new InvalidFormatException(
+          "Missing or invalid BPE merges artifact!");
+    }
+  }
+
+  /**
+   * @return The ISO language code for this factory. If the factory was created
+   *         without one (as happens via the no-arg constructor during model
+   *         loading), the language reported by the artifact provider is
+   *         returned instead; {@code null} if neither is available.
+   */
+  public String getLanguageCode() {
+    // A factory instantiated for model loading never gets a language code via
+    // its constructor; fall back to the language of the model it is bound to.
+    if (languageCode == null && artifactProvider != null) {
+      return artifactProvider.getLanguage();
+    }
+    return languageCode;
+  }
+
+  /**
+   * An {@link ArtifactSerializer} for BPE merge rules.
+   * <p>
+   * Serializes merge rules as a UTF-8 text file with one merge pair per line,
+   * in the format: {@code left right}.
+   */
+  static class BPEMergesSerializer
+      implements ArtifactSerializer<List<SymbolPair>> {
+
+    /**
+     * Reads merge rules from the given {@link InputStream}.
+     * <p>
+     * Each line is trimmed; blank lines are skipped. The first space of a
+     * line separates the left from the right symbol. The stream is read to
+     * its end but is <em>not</em> closed by this method.
+     *
+     * @param in The UTF-8 encoded stream to read from.
+     * @return The merge rules in the order they appear in the stream.
+     * @throws InvalidFormatException Thrown if a non-blank line contains no space.
+     * @throws IOException Thrown if IO errors occurred.
+     */
+    @Override
+    public List<SymbolPair> create(final InputStream in)
+        throws IOException {
+      final List<SymbolPair> merges = new ArrayList<>();
+      final BufferedReader reader = new BufferedReader(
+          new InputStreamReader(in, StandardCharsets.UTF_8));
+      String line;
+      while ((line = reader.readLine()) != null) {
+        line = line.trim();
+        if (line.isEmpty()) {
+          continue;
+        }
+        final int space = line.indexOf(' ');
+        if (space < 0) {
+          throw new InvalidFormatException(
+              "Invalid BPE merge line (expected "
+                  + "'left right'): " + line);
+        }
+        merges.add(new SymbolPair(
+            line.substring(0, space),
+            line.substring(space + 1)));
+      }
+      return merges;
+    }
+
+    /**
+     * Serializes the merge rules to the given {@link OutputStream}.
+     * <p>
+     * <b>Note:</b> This method wraps the provided {@link OutputStream}
+     * in a {@link BufferedWriter} and flushes it upon completion,
+     * but does <em>not</em> close the underlying stream. The caller
+     * is responsible for closing {@code out}.
+     *
+     * @param artifact The merge rules to write, one per line.
+     * @param out The stream to write the UTF-8 encoded rules to.
+     * @throws IOException Thrown if IO errors occurred.
+     */
+    @Override
+    public void serialize(final List<SymbolPair> artifact,
+                          final OutputStream out)
+        throws IOException {
+      final BufferedWriter writer = new BufferedWriter(
+          new OutputStreamWriter(out, StandardCharsets.UTF_8));
+      for (final SymbolPair merge : artifact) {
+        writer.write(merge.left());
+        writer.write(' ');
+        writer.write(merge.right());
+        writer.newLine();
+      }
+      writer.flush();
+    }
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/BPETokenizerTrainer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/BPETokenizerTrainer.java
new file mode 100644
index 00000000..22d32ee5
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/BPETokenizerTrainer.java
@@ -0,0 +1,281 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.commons.Trainer;
+import opennlp.tools.tokenize.BPETokenizer.SymbolPair;
+import opennlp.tools.util.Parameters;
+import opennlp.tools.util.TrainingConfiguration;
+
+/**
+ * Learns BPE merge operations from a training corpus and
+ * produces a {@link BPEModel}.
+ * <p>
+ * Implements the BPE learning algorithm from
+ * Sennrich et al. (2016):
+ * <ol>
+ *   <li>Build a vocabulary of character-level symbol
+ *   sequences from the corpus, where each word is split
+ *   into individual characters (Unicode code points) with
+ *   an end-of-word marker.</li>
+ *   <li>Count all adjacent symbol pairs across the
+ *   vocabulary, weighted by word frequency.</li>
+ *   <li>Merge the most frequent pair into a single new
+ *   symbol.</li>
+ *   <li>Repeat until the desired number of merges
+ *   ({@code numMerges}) is reached.</li>
+ * </ol>
+ * <p>
+ * The number of merges controls the granularity of the
+ * resulting vocabulary: fewer merges produce finer-grained
+ * (more character-level) tokens, while more merges produce
+ * coarser (more word-level) tokens. A typical value ranges
+ * from a few thousand to tens of thousands, depending on
+ * the corpus size and language.
+ * <p>
+ * <b>Usage:</b>
+ * <pre>{@code
+ * List<String> corpus = List.of(
+ *     "the cat sat on the mat",
+ *     "the dog sat on the log"
+ * );
+ *
+ * BPETokenizerTrainer trainer = new BPETokenizerTrainer();
+ * BPEModel model = trainer.train(corpus, 10000, "en");
+ *
+ * // Persist the model
+ * model.serialize(Path.of("bpe-en.bin"));
+ *
+ * // Use it for tokenization
+ * BPETokenizer tokenizer = new BPETokenizer(model);
+ * String[] tokens = tokenizer.tokenize("the cat");
+ * }</pre>
+ * <p>
+ * For reference see:
+ * <ul>
+ *   <li>Sennrich, R., Haddow, B., &amp; Birch, A. (2016).
+ *   Neural Machine Translation of Rare Words with Subword Units.
+ *   <a href="https://arxiv.org/abs/1508.07909">
+ *   https://arxiv.org/abs/1508.07909</a>
+ *   </li>
+ * </ul>
+ *
+ * @see BPETokenizer
+ * @see BPEModel
+ */
+public final class BPETokenizerTrainer implements Trainer<Parameters> {
+
+  // Stored by the init(...) methods; not read by train(...) in this class.
+  private Parameters trainingParameters;
+  private Map<String, String> reportMap;
+  private TrainingConfiguration trainingConfiguration;
+
+  /**
+   * Creates a new {@link BPETokenizerTrainer}.
+   */
+  public BPETokenizerTrainer() {
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  public void init(final Parameters trainParams,
+                   final Map<String, String> reportMap) {
+    this.trainingParameters = trainParams;
+    this.reportMap = reportMap;
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  public void init(final Parameters trainParams,
+                   final Map<String, String> reportMap,
+                   final TrainingConfiguration config) {
+    init(trainParams, reportMap);
+    this.trainingConfiguration = config;
+  }
+
+  /**
+   * Learns BPE merge operations from a training corpus
+   * and returns a {@link BPEModel}.
+   *
+   * @param corpus An iterable of text strings
+   *               (e.g., sentences or documents).
+   *               Must not be {@code null}.
+   * @param numMerges The number of merge operations
+   *                  to learn. Must be positive.
+   * @param languageCode The ISO language code
+   *                     (e.g., "en", "de").
+   *                     Must not be {@code null}.
+   * @return A trained {@link BPEModel} containing the
+   *         learned merge operations.
+   * @throws IllegalArgumentException if {@code numMerges}
+   *         is not positive, or if {@code corpus} or
+   *         {@code languageCode} is {@code null}.
+   */
+  public BPEModel train(final Iterable<String> corpus,
+                        final int numMerges,
+                        final String languageCode) {
+    if (corpus == null) {
+      throw new IllegalArgumentException(
+          "corpus must not be null");
+    }
+    if (languageCode == null) {
+      throw new IllegalArgumentException(
+          "languageCode must not be null");
+    }
+    if (numMerges <= 0) {
+      throw new IllegalArgumentException(
+          "numMerges must be positive, got: " + numMerges);
+    }
+
+    final List<SymbolPair> merges = learnMerges(corpus, numMerges);
+    final BPETokenizerFactory factory =
+        new BPETokenizerFactory(languageCode);
+
+    return new BPEModel(merges, new HashMap<>(), factory);
+  }
+
+  /**
+   * Learns BPE merge operations from the given corpus.
+   * <p>
+   * The algorithm proceeds as follows:
+   * <ol>
+   *   <li>Builds a word frequency map from the corpus using whitespace tokenization.</li>
+   *   <li>Converts each word into a character-level symbol sequence with an
+   *       end-of-word marker on the final character.</li>
+   *   <li>Iteratively counts all adjacent symbol pairs (weighted by word frequency),
+   *       selects the most frequent pair, records it as a merge operation, and applies
+   *       the merge to all vocabulary entries.</li>
+   *   <li>Stops after {@code numMerges} iterations or when no further pairs exist.</li>
+   * </ol>
+   *
+   * @param corpus The training corpus, where each element is a text string.
+   * @param numMerges The maximum number of merge operations to learn.
+   * @return An ordered list of learned {@link SymbolPair} merge operations.
+   */
+  private List<SymbolPair> learnMerges(
+      final Iterable<String> corpus,
+      final int numMerges) {
+    // Step 1: Build word frequency map from corpus
+    final Map<String, Integer> wordFreqs = new HashMap<>();
+    for (final String line : corpus) {
+      final String[] words = WhitespaceTokenizer.INSTANCE.tokenize(line);
+      for (final String word : words) {
+        wordFreqs.merge(word, 1, Integer::sum);
+      }
+    }
+
+    // Step 2: Convert to symbol sequences with frequencies
+    final Map<List<String>, Integer> vocab = new HashMap<>();
+    for (final Map.Entry<String, Integer> entry : wordFreqs.entrySet()) {
+      final List<String> symbols = splitToSymbols(entry.getKey());
+      vocab.put(symbols, entry.getValue());
+    }
+
+    // Step 3: Iteratively learn merges
+    final List<SymbolPair> merges = new ArrayList<>();
+
+    for (int step = 0; step < numMerges; step++) {
+      // Count all adjacent pairs
+      final Map<SymbolPair, Integer> pairCounts = new HashMap<>();
+      for (final Map.Entry<List<String>, Integer> entry : vocab.entrySet()) {
+        final List<String> symbols = entry.getKey();
+        final int freq = entry.getValue();
+        for (int i = 0; i < symbols.size() - 1; i++) {
+          final SymbolPair pair = new SymbolPair(
+              symbols.get(i), symbols.get(i + 1));
+          pairCounts.merge(pair, freq, Integer::sum);
+        }
+      }
+
+      if (pairCounts.isEmpty()) {
+        // Every word has collapsed into a single symbol; nothing left to merge.
+        break;
+      }
+
+      // Find most frequent pair
+      // NOTE(review): with the strict '>' below, ties are won by whichever
+      // pair the HashMap iterates first - confirm this is acceptable.
+      SymbolPair bestPair = null;
+      int bestCount = 0;
+      for (final Map.Entry<SymbolPair, Integer> entry : pairCounts.entrySet()) {
+        if (entry.getValue() > bestCount) {
+          bestCount = entry.getValue();
+          bestPair = entry.getKey();
+        }
+      }
+
+      if (bestPair == null || bestCount < 1) {
+        break;
+      }
+
+      merges.add(bestPair);
+
+      // Apply merge to vocabulary; words that become identical sequences
+      // have their frequencies summed.
+      final Map<List<String>, Integer> newVocab = new HashMap<>();
+      for (final Map.Entry<List<String>, Integer> entry : vocab.entrySet()) {
+        final List<String> merged = applyMerge(entry.getKey(), bestPair);
+        newVocab.merge(merged, entry.getValue(), Integer::sum);
+      }
+      vocab.clear();
+      vocab.putAll(newVocab);
+    }
+
+    return merges;
+  }
+
+  /**
+   * Splits a word into its initial character-level BPE symbol sequence.
+   * Each Unicode code point becomes its own symbol, with
+   * {@link BPETokenizer#END_OF_WORD} appended to the final one.
+   * <p>
+   * Splitting by code point rather than by UTF-16 {@code char} keeps
+   * supplementary characters (e.g., emoji) in one piece. Splitting them would
+   * put lone surrogates into the learned merges, which cannot be encoded as
+   * UTF-8 when the merges are written out.
+   *
+   * @param word The word to split. Must not be {@code null}.
+   * @return A mutable list of character symbols; empty if {@code word} is empty.
+   */
+  private List<String> splitToSymbols(final String word) {
+    final int length = word.length();
+    final List<String> symbols = new ArrayList<>(length);
+    int start = 0;
+    while (start < length) {
+      // An unpaired surrogate is reported as a code point of its own (one char).
+      final int end = start + Character.charCount(word.codePointAt(start));
+      final String symbol = word.substring(start, end);
+      symbols.add(end == length ? symbol + BPETokenizer.END_OF_WORD : symbol);
+      start = end;
+    }
+    return symbols;
+  }
+
+  /**
+   * Applies a single merge operation to a symbol sequence.
+   * Scans the list for adjacent symbols matching the given pair and replaces
+   * each occurrence with a single concatenated symbol.
+   *
+   * @param symbols The current symbol sequence for a word.
+   * @param pair The {@link SymbolPair} to merge.
+   * @return A new list with all occurrences of the pair merged.
+   */
+  private List<String> applyMerge(
+      final List<String> symbols,
+      final SymbolPair pair) {
+    final List<String> result = new ArrayList<>();
+    int i = 0;
+    while (i < symbols.size()) {
+      if (i < symbols.size() - 1
+          && symbols.get(i).equals(pair.left())
+          && symbols.get(i + 1).equals(pair.right())) {
+        result.add(pair.left() + pair.right());
+        i += 2;
+      } else {
+        result.add(symbols.get(i));
+        i++;
+      }
+    }
+    return result;
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/AbstractBPEModelTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/AbstractBPEModelTest.java
new file mode 100644
index 00000000..22b50011
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/AbstractBPEModelTest.java
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.List;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.tokenize.BPETokenizer.SymbolPair;
+
+/**
+ * Abstract base class for {@link BPEModel} tests.
+ * <p>
+ * Subclasses provide a language-specific corpus and language code.
+ *
+ * @see BPEModel
+ */
+public abstract class AbstractBPEModelTest {
+
+ /**
+ * @return a corpus of sentences for training.
+ */
+ protected abstract List<String> getCorpus();
+
+ /**
+ * @return the ISO language code to use during training.
+ */
+ protected abstract String getLanguageCode();
+
+ protected BPEModel trainModel(int numMerges) {
+ return new BPETokenizerTrainer().train(getCorpus(), numMerges,
getLanguageCode());
+ }
+
+ /**
+ * Tests that a model can be serialized and deserialized without data loss.
+ */
+ @Test
+ void testBPEModelSerialization() throws IOException {
+ final BPEModel model = trainModel(10);
+ Assertions.assertFalse(model.isLoadedFromSerialized());
+
+ try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
+ model.serialize(out);
+
+ final BPEModel restored = new BPEModel(new
ByteArrayInputStream(out.toByteArray()));
+ Assertions.assertNotNull(restored);
+ Assertions.assertTrue(restored.isLoadedFromSerialized());
+ }
+ }
+
+ /**
+ * Tests that merge rules are preserved after serialization roundtrip.
+ */
+ @Test
+ void testMergesPreservedAfterSerialization() throws IOException {
+ final BPEModel original = trainModel(10);
+
+ try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
+ original.serialize(out);
+
+ final BPEModel restored = new BPEModel(new
ByteArrayInputStream(out.toByteArray()));
+
+ final List<SymbolPair> originalMerges = original.getMerges();
+ final List<SymbolPair> restoredMerges = restored.getMerges();
+
+ Assertions.assertEquals(originalMerges.size(), restoredMerges.size());
+ for (int i = 0; i < originalMerges.size(); i++) {
+ Assertions.assertEquals(originalMerges.get(i), restoredMerges.get(i));
+ }
+ }
+ }
+
+ /**
+ * Tests that merge order is preserved — order determines priority.
+ */
+ @Test
+ void testMergeOrderPreserved() throws IOException {
+ final BPEModel model = trainModel(5);
+
+ try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
+ model.serialize(out);
+
+ final BPEModel restored = new BPEModel(new
ByteArrayInputStream(out.toByteArray()));
+
+ // Verify exact order matches
+ Assertions.assertEquals(model.getMerges(), restored.getMerges());
+ }
+ }
+
+ /**
+ * Tests that a deserialized model can be used to tokenize text.
+ */
+ @Test
+ void testDeserializedModelCanTokenize() throws IOException {
+ final BPEModel original = trainModel(10);
+
+ try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
+ original.serialize(out);
+
+ final BPEModel loaded = new BPEModel(new
ByteArrayInputStream(out.toByteArray()));
+ final BPETokenizer tokenizer = new BPETokenizer(loaded);
+
+ final String[] tokens = tokenizer.tokenize("low");
+ Assertions.assertTrue(tokens.length >= 1);
+ Assertions.assertEquals("low", String.join("", tokens));
+ }
+ }
+
+ /**
+ * Tests that the language code is preserved in the model.
+ */
+ @Test
+ void testLanguagePreserved() throws IOException {
+ final BPEModel model = trainModel(5);
+
+ try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
+ model.serialize(out);
+
+ final BPEModel restored = new BPEModel(new
ByteArrayInputStream(out.toByteArray()));
+ Assertions.assertEquals(getLanguageCode(), restored.getLanguage());
+ }
+ }
+
+ /**
+ * Tests that the factory is accessible from a deserialized model.
+ */
+ @Test
+ void testFactoryAccessibleAfterDeserialization() throws IOException {
+ final BPEModel original = trainModel(5);
+
+ try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
+ original.serialize(out);
+
+ final BPEModel restored = new BPEModel(new
ByteArrayInputStream(out.toByteArray()));
+ Assertions.assertNotNull(restored.getFactory());
+ Assertions.assertInstanceOf(BPETokenizerFactory.class,
restored.getFactory());
+ }
+ }
+}
diff --git
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/AbstractBPETokenizerRealisticTest.java
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/AbstractBPETokenizerRealisticTest.java
new file mode 100644
index 00000000..e93b62d9
--- /dev/null
+++
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/AbstractBPETokenizerRealisticTest.java
@@ -0,0 +1,347 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.TestInstance;
+
+import opennlp.tools.util.Span;
+
+/**
+ * Abstract base class for realistic BPE tokenization integration tests.
+ * <p>
+ * Subclasses provide language-specific training corpora and test inputs.
+ * This class contains all common test methods that exercise the BPE pipeline
+ * end-to-end: training, tokenization, serialization, and consistency checks.
+ *
+ * @see BPETokenizer
+ * @see BPETokenizerTrainer
+ * @see BPEModel
+ */
+@TestInstance(TestInstance.Lifecycle.PER_CLASS)
+abstract class AbstractBPETokenizerRealisticTest {
+
+  private BPEModel trainedModel;
+
+  // --- Abstract methods for language-specific data ---
+
+  /**
+   * Returns a realistic training corpus for the target language.
+   */
+  abstract List<String> getTrainingCorpus();
+
+  /**
+   * Returns the ISO language code (e.g., "en", "de", "fr").
+   */
+  abstract String getLanguageCode();
+
+  /**
+   * Returns the number of BPE merges to use during training. Default is 100.
+   */
+  int getNumMerges() {
+    return 100;
+  }
+
+  /**
+   * Returns a simple sentence whose words all appear in the training corpus.
+   */
+  abstract String getSimpleSentence();
+
+  /**
+   * Returns the expected words for {@link #getSimpleSentence()}.
+   */
+  abstract String[] getSimpleSentenceExpectedWords();
+
+  /**
+   * Returns a list of words expected to be single tokens after training.
+   */
+  abstract List<String> getFrequentWords();
+
+  /**
+   * Returns a word not seen in the training corpus.
+   */
+  abstract String getUnseenWord();
+
+  /**
+   * Returns a sentence for span coverage testing.
+   */
+  abstract String getSpanTestSentence();
+
+  /**
+   * Returns the expected words for {@link #getSpanTestSentence()}.
+   */
+  abstract String[] getSpanTestExpectedWords();
+
+  /**
+   * Returns a multi-word sentence for general tokenization testing.
+   */
+  abstract String getMultiWordSentence();
+
+  /**
+   * Returns a sentence for serialization roundtrip testing.
+   */
+  abstract String getSerializationTestSentence();
+
+  /**
+   * Returns a sentence for consistency testing between tokenize() and tokenizePos().
+   */
+  abstract String getConsistencyTestSentence();
+
+  /**
+   * Returns a sentence containing punctuation for testing.
+   */
+  abstract String getPunctuationTestSentence();
+
+  /**
+   * Returns the expected words (whitespace-delimited, punctuation attached)
+   * for {@link #getPunctuationTestSentence()}.
+   */
+  abstract String[] getExpectedPunctuationWords();
+
+  /**
+   * Returns a sentence for testing that more merges produce coarser tokens.
+   */
+  abstract String getCoarseTokenizationSentence();
+
+  @BeforeAll
+  void setUpClass() {
+    trainedModel = new BPETokenizerTrainer().train(
+        getTrainingCorpus(), getNumMerges(), getLanguageCode());
+  }
+
+  /**
+   * Tests basic tokenization of a simple sentence with the trained model.
+   * All words appear in the training corpus and should be fully merged.
+   */
+  @Test
+  void testTokenizerSimpleModel() {
+    final BPETokenizer tokenizer = new BPETokenizer(trainedModel);
+    final String text = getSimpleSentence();
+
+    final String[] tokens = tokenizer.tokenize(text);
+    final Span[] spans = tokenizer.tokenizePos(text);
+
+    final String[] words = reconstructWords(tokens, spans, text);
+    Assertions.assertArrayEquals(getSimpleSentenceExpectedWords(), words);
+  }
+
+  /**
+   * Tests tokenization of frequent words seen during training.
+   * Frequent words should be tokenized into single tokens.
+   */
+  @Test
+  void testFrequentWordsTokenizeEfficiently() {
+    final BPETokenizer tokenizer = new BPETokenizer(trainedModel);
+
+    for (final String word : getFrequentWords()) {
+      final String[] tokens = tokenizer.tokenize(word);
+      Assertions.assertEquals(1, tokens.length,
+          "Expected '" + word + "' as single token");
+      Assertions.assertEquals(word, tokens[0]);
+    }
+  }
+
+  /**
+   * Tests tokenization of unseen words -- they should be split into subword pieces
+   * but concatenation must still reconstruct the original.
+   */
+  @Test
+  void testUnseenWordsTokenization() {
+    final BPETokenizer tokenizer = new BPETokenizer(trainedModel);
+    final String unseen = getUnseenWord();
+
+    final String[] tokens = tokenizer.tokenize(unseen);
+
+    Assertions.assertTrue(tokens.length > 1,
+        "Unseen word '" + unseen + "' should be split into multiple subword tokens");
+    Assertions.assertEquals(unseen, String.join("", tokens),
+        "Concatenation of subword tokens must reconstruct the original word");
+  }
+
+  /**
+   * Tests that tokenizePos spans cover the full input text without gaps or overlaps
+   * and that reconstructed words match the original sentence.
+   */
+  @Test
+  void testTokenizePosSpanCoverage() {
+    final BPETokenizer tokenizer = new BPETokenizer(trainedModel);
+    final String text = getSpanTestSentence();
+    final String[] tokens = tokenizer.tokenize(text);
+    final Span[] spans = tokenizer.tokenizePos(text);
+
+    // Walk the spans once: each must cover non-empty text, and spans plus the
+    // whitespace between them must fully reconstruct the original text
+    final StringBuilder sb = new StringBuilder();
+    int lastEnd = 0;
+    for (final Span span : spans) {
+      if (span.getStart() > lastEnd) {
+        // A gap between two spans may only consist of whitespace
+        final String gap = text.substring(lastEnd, span.getStart());
+        Assertions.assertTrue(gap.isBlank(), "Unexpected non-whitespace gap '" + gap + "'");
+        sb.append(gap);
+      }
+      final CharSequence covered = span.getCoveredText(text);
+      Assertions.assertNotNull(covered);
+      Assertions.assertFalse(covered.toString().isEmpty());
+      sb.append(covered);
+      lastEnd = span.getEnd();
+    }
+    Assertions.assertEquals(text, sb.toString());
+
+    // Verify reconstructed words match expected
+    final String[] words = reconstructWords(tokens, spans, text);
+    Assertions.assertArrayEquals(getSpanTestExpectedWords(), words);
+  }
+
+  /**
+   * Tests that the BPE tokenizer handles multi-word input correctly.
+   */
+  @Test
+  void testTokenizer() {
+    final BPETokenizer tokenizer = new BPETokenizer(trainedModel);
+    final String sentence = getMultiWordSentence();
+    final String[] tokens = tokenizer.tokenize(sentence);
+
+    // Each word produces at least one token
+    final String[] words = sentence.split(" ");
+    Assertions.assertTrue(tokens.length >= words.length);
+
+    // Reconstruct each word from its subword tokens via spans
+    final Span[] spans = tokenizer.tokenizePos(sentence);
+    final String[] reconstructed = reconstructWords(tokens, spans, sentence);
+    Assertions.assertArrayEquals(words, reconstructed);
+  }
+
+  /**
+   * Tests the full pipeline: train, serialize, deserialize, tokenize.
+   */
+  @Test
+  void testTrainSerializeDeserializeTokenize() throws IOException {
+    // Serialize
+    try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
+      trainedModel.serialize(out);
+
+      // Deserialize
+      final BPEModel loaded = new BPEModel(new ByteArrayInputStream(out.toByteArray()));
+
+      // Tokenize with both original and deserialized model -- results should match
+      final BPETokenizer original = new BPETokenizer(trainedModel);
+      final BPETokenizer restored = new BPETokenizer(loaded);
+
+      final String sentence = getSerializationTestSentence();
+      Assertions.assertArrayEquals(
+          original.tokenize(sentence),
+          restored.tokenize(sentence));
+    }
+  }
+
+  /**
+   * Tests that the BPE tokenizer fulfills the {@link Tokenizer} contract:
+   * tokenize() and tokenizePos() must be consistent.
+   */
+  @Test
+  void testTokenizeAndTokenizePosConsistency() {
+    final BPETokenizer tokenizer = new BPETokenizer(trainedModel);
+    final String text = getConsistencyTestSentence();
+
+    final String[] tokens = tokenizer.tokenize(text);
+    final Span[] spans = tokenizer.tokenizePos(text);
+
+    Assertions.assertEquals(tokens.length, spans.length);
+
+    for (int i = 0; i < tokens.length; i++) {
+      Assertions.assertEquals(tokens[i], spans[i].getCoveredText(text).toString(),
+          "Token at index " + i + " should match span-covered text");
+    }
+  }
+
+  /**
+   * Tests that the BPE tokenizer handles punctuation mixed with words.
+   * BPE treats punctuation as characters -- they stay attached to the word
+   * since BPE splits on whitespace first.
+   */
+  @Test
+  void testPunctuationHandling() {
+    final BPETokenizer tokenizer = new BPETokenizer(trainedModel);
+    final String text = getPunctuationTestSentence();
+    final String[] expectedWords = getExpectedPunctuationWords();
+
+    final String[] tokens = tokenizer.tokenize(text);
+    final Span[] spans = tokenizer.tokenizePos(text);
+
+    final String[] words = reconstructWords(tokens, spans, text);
+    Assertions.assertEquals(expectedWords.length, words.length);
+    Assertions.assertArrayEquals(expectedWords, words);
+  }
+
+  /**
+   * Tests that training with a larger number of merges produces
+   * coarser tokenization (fewer subword tokens per word).
+   */
+  @Test
+  void testMoreMergesProducesCoarserTokens() {
+    final List<String> corpus = getTrainingCorpus();
+    final String lang = getLanguageCode();
+
+    final BPEModel fewMerges = new BPETokenizerTrainer().train(corpus, 5, lang);
+    final BPEModel manyMerges = new BPETokenizerTrainer().train(corpus, 100, lang);
+
+    final BPETokenizer fewTokenizer = new BPETokenizer(fewMerges);
+    final BPETokenizer manyTokenizer = new BPETokenizer(manyMerges);
+
+    final String text = getCoarseTokenizationSentence();
+    final int fewCount = fewTokenizer.tokenize(text).length;
+    final int manyCount = manyTokenizer.tokenize(text).length;
+
+    Assertions.assertTrue(manyCount <= fewCount,
+        "More merges (" + manyCount + " tokens) should produce fewer or equal tokens "
+            + "than fewer merges (" + fewCount + " tokens)");
+  }
+
+  /**
+   * Reconstructs whitespace-separated words from subword tokens using span positions.
+   */
+  String[] reconstructWords(String[] tokens, Span[] spans, String text) {
+    final List<String> words = new ArrayList<>();
+    final StringBuilder currentWord = new StringBuilder();
+    int lastWordEnd = -1;
+
+    for (final Span span : spans) {
+      if (lastWordEnd >= 0 && span.getStart() > lastWordEnd) {
+        // Gap between spans means a whitespace boundary -- new word
+        words.add(currentWord.toString());
+        currentWord.setLength(0);
+      }
+      currentWord.append(span.getCoveredText(text));
+      lastWordEnd = span.getEnd();
+    }
+    if (!currentWord.isEmpty()) {
+      words.add(currentWord.toString());
+    }
+
+    return words.toArray(new String[0]);
+  }
+}
diff --git
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPEModelDeTest.java
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPEModelDeTest.java
new file mode 100644
index 00000000..86b35358
--- /dev/null
+++
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPEModelDeTest.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.List;
+
+/**
+ * Runs the {@link AbstractBPEModelTest} suite against a small German corpus.
+ *
+ * @see AbstractBPEModelTest
+ * @see BPEModel
+ */
+public class BPEModelDeTest extends AbstractBPEModelTest {
+
+  private static final List<String> GERMAN_SENTENCES = List.of(
+      "Der schnelle braune Fuchs springt über den faulen Hund, der gerade schlief; "
+          + "er hat ihn nicht kommen sehen",
+      "Natürliche Sprachverarbeitung ist faszinierend: sie verbindet Linguistik, "
+          + "Informatik und Statistik",
+      "Nachdem der Regen aufgehört hatte, kam die Sonne heraus; die Kinder spielten "
+          + "draußen und die Vögel sangen laut"
+  );
+
+  @Override
+  protected String getLanguageCode() {
+    return "de";
+  }
+
+  @Override
+  protected List<String> getCorpus() {
+    return GERMAN_SENTENCES;
+  }
+}
diff --git
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPEModelEnTest.java
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPEModelEnTest.java
new file mode 100644
index 00000000..123bb3a6
--- /dev/null
+++
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPEModelEnTest.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.List;
+
+/**
+ * English-specific tests for the {@link BPEModel} class.
+ *
+ * @see AbstractBPEModelTest
+ * @see BPEModel
+ */
+public class BPEModelEnTest extends AbstractBPEModelTest {
+
+  private static final List<String> CORPUS = List.of(
+      "The quick brown fox jumps over the lazy dog, which was sleeping; it never saw the fox coming",
+      "Natural language processing is fascinating: it combines linguistics, computer science, and statistics",
+      "After the rain stopped, the sun came out; the children played outside, and the birds sang loudly"
+  );
+
+  @Override
+  protected List<String> getCorpus() {
+    return CORPUS;
+  }
+
+  @Override
+  protected String getLanguageCode() {
+    return "en";
+  }
+}
diff --git
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPEModelFrTest.java
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPEModelFrTest.java
new file mode 100644
index 00000000..48a3fbb0
--- /dev/null
+++
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPEModelFrTest.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.List;
+
+/**
+ * French-specific tests for the {@link BPEModel} class.
+ *
+ * @see AbstractBPEModelTest
+ * @see BPEModel
+ */
+public class BPEModelFrTest extends AbstractBPEModelTest {
+
+  private static final List<String> CORPUS = List.of(
+      "Le renard brun rapide saute par-dessus le chien paresseux, "
+          + "qui dormait; il ne l'a jamais vu venir",
+      "Le traitement du langage naturel est fascinant: "
+          + "il combine la linguistique et les statistiques",
+      "Après la pluie, le soleil est apparu; les enfants ont joué dehors, et les oiseaux ont chanté fort"
+  );
+
+  @Override
+  protected List<String> getCorpus() {
+    return CORPUS;
+  }
+
+  @Override
+  protected String getLanguageCode() {
+    return "fr";
+  }
+}
diff --git
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerFactoryTest.java
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerFactoryTest.java
new file mode 100644
index 00000000..4ca60e1f
--- /dev/null
+++
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerFactoryTest.java
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.List;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.tokenize.BPETokenizer.SymbolPair;
+
+/**
+ * Tests for the {@link BPETokenizerFactory} class.
+ * <p>
+ * Covers the factory's language code and artifact serializer map, and checks
+ * that the merge rules of a {@link BPEModel} survive serialization roundtrips.
+ *
+ * @see BPETokenizerFactory
+ * @see BPEModel
+ */
+public class BPETokenizerFactoryTest {
+
+  private static final List<String> TRAINING_LINES = List.of(
+      "low low low low low",
+      "lower lower lower",
+      "newest newest newest newest"
+  );
+
+  /** Trains a model on the shared corpus with the given merge count and language code "en". */
+  private static BPEModel trainModel(final int numMerges) {
+    return new BPETokenizerTrainer().train(TRAINING_LINES, numMerges, "en");
+  }
+
+  /** Serializes the model into memory and returns a new model read back from those bytes. */
+  private static BPEModel roundTrip(final BPEModel model) throws IOException {
+    final ByteArrayOutputStream sink = new ByteArrayOutputStream();
+    model.serialize(sink);
+    return new BPEModel(new ByteArrayInputStream(sink.toByteArray()));
+  }
+
+  /**
+   * Tests that a trained model exposes its factory and a non-empty list of merge rules.
+   */
+  @Test
+  void testFactoryProvidesMerges() {
+    final BPEModel model = trainModel(10);
+    final BPETokenizerFactory factory = model.getFactory();
+
+    Assertions.assertNotNull(factory);
+    Assertions.assertNotNull(model.getMerges());
+    Assertions.assertFalse(model.getMerges().isEmpty());
+  }
+
+  /**
+   * Tests that the factory returns the language code it was constructed with.
+   */
+  @Test
+  void testFactoryLanguageCode() {
+    Assertions.assertEquals("de", new BPETokenizerFactory("de").getLanguageCode());
+  }
+
+  /**
+   * Tests that the factory and the merge rules are available on a model restored
+   * from its serialized form, and that the number of merge rules is unchanged.
+   */
+  @Test
+  void testFactorySurvivesSerialization() throws IOException {
+    final BPEModel original = trainModel(10);
+    final BPEModel restored = roundTrip(original);
+
+    Assertions.assertNotNull(restored.getFactory());
+    Assertions.assertNotNull(restored.getMerges());
+    Assertions.assertEquals(original.getMerges().size(), restored.getMerges().size());
+  }
+
+  /**
+   * Tests that the merge rules read from a trained model equal the ones
+   * read from its deserialized copy.
+   */
+  @Test
+  void testMergesConsistentAfterRoundtrip() throws IOException {
+    final BPEModel original = trainModel(5);
+    final List<SymbolPair> before = original.getMerges();
+    final List<SymbolPair> after = roundTrip(original).getMerges();
+
+    Assertions.assertEquals(before, after);
+  }
+
+  /**
+   * Tests that the factory's artifact serializer map has a {@code "merges"} key.
+   */
+  @Test
+  void testArtifactSerializersMapContainsMergesSerializer() {
+    final BPETokenizerFactory factory = new BPETokenizerFactory("en");
+    final boolean hasMerges = factory.createArtifactSerializersMap().containsKey("merges");
+
+    Assertions.assertTrue(hasMerges);
+  }
+
+  /**
+   * Tests that a trained model returns a non-empty list of merge rules.
+   */
+  @Test
+  void testArtifactMapContainsMergesEntry() {
+    final BPEModel model = trainModel(5);
+
+    Assertions.assertNotNull(model.getMerges());
+    Assertions.assertFalse(model.getMerges().isEmpty());
+  }
+
+  /**
+   * Tests that the empty constructor creates a valid factory (for model loading).
+   */
+  @Test
+  void testEmptyConstructor() {
+    final BPETokenizerFactory factory = new BPETokenizerFactory();
+
+    // Empty factory should not throw
+    Assertions.assertNotNull(factory);
+    Assertions.assertNotNull(factory.createArtifactSerializersMap());
+  }
+
+  /**
+   * Tests null parameter validation.
+   */
+  @Test
+  void testNullLanguageCodeThrows() {
+    Assertions.assertThrows(IllegalArgumentException.class,
+        () -> new BPETokenizerFactory(null));
+  }
+}
diff --git
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticDeTest.java
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticDeTest.java
new file mode 100644
index 00000000..95e853c5
--- /dev/null
+++
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticDeTest.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.List;
+
+/**
+ * German-language realistic BPE tokenization integration tests.
+ *
+ * @see AbstractBPETokenizerRealisticTest
+ */
+public class BPETokenizerRealisticDeTest extends
AbstractBPETokenizerRealisticTest {
+
+ @Override
+ List<String> getTrainingCorpus() {
+ return List.of(
+ "Ich habe gestern einen alten Schulfreund in der Stadt getroffen",
+ "Er hat mir von seiner neuen Arbeit in Berlin erzählt",
+ "Die Arbeit ist sehr interessant und er ist sehr zufrieden",
+ "Wir haben zusammen in einem kleinen Restaurant zu Mittag gegessen",
+ "Das Essen war ausgezeichnet und die Bedienung sehr freundlich",
+ "Nach dem Essen sind wir durch den Park spazieren gegangen",
+ "Der Park war sehr schön und die Bäume hatten bunte Blätter",
+ "Am Abend haben wir uns einen Film im Kino angesehen",
+ "Der Film war spannend und hat uns beiden sehr gut gefallen",
+ "Danach sind wir noch in eine Bar gegangen und haben geredet",
+ "Er hat mir von seiner Reise nach Italien erzählt",
+ "Die Reise war wunderbar und er hat viele Fotos gemacht",
+ "Ich habe ihm von meiner Arbeit an die Monographie erzählt",
+ "Die Monographie behandelt die Geschichte der botanischen Forschung",
+ "Er fand das Thema sehr interessant und wollte mehr erfahren",
+ "Wir haben uns verabredet nächste Woche wieder zu treffen",
+ "Ich freue mich schon sehr auf unser nächstes Treffen",
+ "Die Stadt ist im Herbst besonders schön mit den bunten Blättern",
+ "Mein Freund wohnt jetzt in der Nähe vom Hauptbahnhof",
+ "Er nimmt jeden Tag die Bahn zur Arbeit in die Innenstadt"
+ );
+ }
+
+ @Override
+ String getLanguageCode() {
+ return "de";
+ }
+
+ @Override
+ String getSimpleSentence() {
+ return "Die Arbeit ist sehr interessant";
+ }
+
+ @Override
+ String[] getSimpleSentenceExpectedWords() {
+ return new String[] {"Die", "Arbeit", "ist", "sehr", "interessant"};
+ }
+
+ @Override
+ List<String> getFrequentWords() {
+ return List.of("die", "und", "er");
+ }
+
+ @Override
+ String getUnseenWord() {
+ return "Wissenschaftler";
+ }
+
+ @Override
+ String getSpanTestSentence() {
+ return "die Monographie behandelt die Geschichte";
+ }
+
+ @Override
+ String[] getSpanTestExpectedWords() {
+ return new String[] {"die", "Monographie", "behandelt", "die",
"Geschichte"};
+ }
+
+ @Override
+ String getMultiWordSentence() {
+ return "Er hat mir von seiner Reise erzählt";
+ }
+
+ @Override
+ String getSerializationTestSentence() {
+ return "Wir haben zusammen in einem Restaurant gegessen";
+ }
+
+ @Override
+ String getConsistencyTestSentence() {
+ return "Der Park war sehr schön und die Bäume hatten bunte Blätter";
+ }
+
+ @Override
+ String getPunctuationTestSentence() {
+ return "Hallo, Welt!";
+ }
+
+ @Override
+ String[] getExpectedPunctuationWords() {
+ return new String[] {"Hallo,", "Welt!"};
+ }
+
+ @Override
+ String getCoarseTokenizationSentence() {
+ return "Ich habe ihm von meiner Arbeit an die Monographie erzählt";
+ }
+}
diff --git
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticEnTest.java
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticEnTest.java
new file mode 100644
index 00000000..f8abde9d
--- /dev/null
+++
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticEnTest.java
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.List;
+
+/**
+ * English-language realistic BPE tokenization integration tests.
+ *
+ * @see AbstractBPETokenizerRealisticTest
+ */
+public class BPETokenizerRealisticEnTest extends AbstractBPETokenizerRealisticTest {
+
+ @Override
+ List<String> getTrainingCorpus() { // 25 unpunctuated English sentence fragments
+ return List.of(
+ "Last September I tried to find out the address of an old school friend",
+ "whom I had not seen for 15 years",
+ "I just knew his name Alan McKennedy and I had heard the rumour",
+ "that he had moved to Scotland the country of his ancestors",
+ "So I called Julie a friend who is still in contact with him",
+ "She told me that he lived in Edinburgh Worcesterstreet 12",
+ "I wrote him a letter right away and he answered soon",
+ "sounding very happy and delighted",
+ "Last year I wanted to write a letter to my grandaunt",
+ "Her 86th birthday was on October 6 and I no longer wanted",
+ "to be hesitant to get in touch with her",
+ "I did not know her face to face and so it was not easy",
+ "for me to find out her address",
+ "As she had two apartments in different countries",
+ "I decided to write to both",
+ "The first was in Paris in Rue de Grandes Illusions 5",
+ "But Marie Clara as my aunt is called preferred her apartment in Berlin",
+ "She lived there in beautiful Kaiserstrasse 13 particularly in summer",
+ "Hi my name is Michael Graf how much is a taxi",
+ "from Ostbahnhof to Hauptbahnhof",
+ "About 10 Euro I reckon",
+ "That sounds good",
+ "So please call a driver to Leonardstrasse 112 near the Ostbahnhof",
+ "I would like to be at Silberhornstrasse 12 as soon as possible",
+ "Thank you very much"
+ );
+ }
+
+ @Override
+ String getLanguageCode() {
+ return "en"; // two-letter language code this subclass supplies through the override
+ }
+
+ @Override
+ String getSimpleSentence() {
+ return "I wrote a letter"; // four words, each of which occurs in the corpus above
+ }
+
+ @Override
+ String[] getSimpleSentenceExpectedWords() {
+ return new String[] {"I", "wrote", "a", "letter"}; // getSimpleSentence() split on spaces
+ }
+
+ @Override
+ List<String> getFrequentWords() {
+ return List.of("the", "in"); // both occur several times in the corpus above
+ }
+
+ @Override
+ String getUnseenWord() {
+ return "unbelievable"; // does not occur in the corpus above
+ }
+
+ @Override
+ String getSpanTestSentence() {
+ return "She lived in Edinburgh"; // every word occurs in the corpus above
+ }
+
+ @Override
+ String[] getSpanTestExpectedWords() {
+ return new String[] {"She", "lived", "in", "Edinburgh"}; // getSpanTestSentence() split on spaces
+ }
+
+ @Override
+ String getMultiWordSentence() {
+ return "I had not seen him for years"; // seven space-separated words
+ }
+
+ @Override
+ String getSerializationTestSentence() {
+ return "I wrote him a letter right away"; // prefix of a corpus sentence
+ }
+
+ @Override
+ String getConsistencyTestSentence() {
+ return "She told me that he lived in Edinburgh"; // prefix of a corpus sentence
+ }
+
+ @Override
+ String getPunctuationTestSentence() {
+ return "Hello, world!"; // comma and exclamation mark are attached directly to the words
+ }
+
+ @Override
+ String[] getExpectedPunctuationWords() {
+ return new String[] {"Hello,", "world!"}; // punctuation stays on the words; only the space separates them
+ }
+
+ @Override
+ String getCoarseTokenizationSentence() {
+ return "I wanted to write a letter to my grandaunt"; // tail of the corpus sentence starting "Last year"
+ }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticEsTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticEsTest.java
new file mode 100644
index 00000000..23be28db
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticEsTest.java
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.List;
+
+/**
+ * Spanish-language realistic BPE tokenization integration tests.
+ *
+ * @see AbstractBPETokenizerRealisticTest
+ */
+public class BPETokenizerRealisticEsTest
+ extends AbstractBPETokenizerRealisticTest {
+
+ @Override
+ List<String> getTrainingCorpus() { // 20 unpunctuated Spanish sentences, each built from two concatenated literals
+ return List.of(
+ "Ayer fui al mercado para comprar frutas y"
+ + " verduras frescas",
+ "El mercado estaba lleno de gente y los precios"
+ + " eran buenos",
+ "Las manzanas y las naranjas estaban muy frescas"
+ + " y baratas",
+ "Volví a casa y preparé una comida muy buena"
+ + " para todos",
+ "La comida estaba deliciosa y toda la familia"
+ + " estaba contenta",
+ "Después de la comida hicimos un paseo por el"
+ + " parque grande",
+ "El parque estaba muy bonito con los árboles"
+ + " en flor",
+ "Los niños jugaban en el jardín y los pájaros"
+ + " cantaban",
+ "Por la noche vimos una película muy buena"
+ + " en la televisión",
+ "La película era muy interesante y nos gustó"
+ + " mucho a todos",
+ "Mi amigo Carlos vive en una casa grande"
+ + " en Madrid",
+ "Él trabaja en una empresa de tecnología"
+ + " desde hace cinco años",
+ "Su esposa María es profesora en la universidad"
+ + " central",
+ "Tienen dos hijos que van a una escuela cerca"
+ + " de la casa",
+ "Los fines de semana les gusta hacer excursiones"
+ + " por el campo",
+ "Madrid es una ciudad muy bonita con una"
+ + " historia muy rica",
+ "La cocina española es conocida en todo el"
+ + " mundo por su calidad",
+ "Los museos de Madrid atraen a millones de"
+ + " visitantes cada año",
+ "El Prado es el museo más visitado de toda"
+ + " la ciudad",
+ "La vida en España es muy agradable y el clima"
+ + " es muy bueno"
+ );
+ }
+
+ @Override
+ String getLanguageCode() {
+ return "es"; // two-letter language code this subclass supplies through the override
+ }
+
+ @Override
+ String getSimpleSentence() {
+ return "La comida estaba deliciosa"; // prefix of a corpus sentence
+ }
+
+ @Override
+ String[] getSimpleSentenceExpectedWords() { // getSimpleSentence() split on spaces
+ return new String[] {
+ "La", "comida", "estaba", "deliciosa"
+ };
+ }
+
+ @Override
+ List<String> getFrequentWords() {
+ return List.of("muy", "los", "en"); // each occurs several times in the corpus above
+ }
+
+ @Override
+ String getUnseenWord() {
+ return "impresionante"; // does not occur in the corpus above
+ }
+
+ @Override
+ String getSpanTestSentence() {
+ return "Los niños jugaban en el jardín"; // prefix of a corpus sentence; contains ñ and í
+ }
+
+ @Override
+ String[] getSpanTestExpectedWords() { // getSpanTestSentence() split on spaces
+ return new String[] {
+ "Los", "niños", "jugaban", "en", "el", "jardín"
+ };
+ }
+
+ @Override
+ String getMultiWordSentence() { // identical to one corpus sentence
+ return "El parque estaba muy bonito con los árboles"
+ + " en flor";
+ }
+
+ @Override
+ String getSerializationTestSentence() { // corpus sentence without its last word "frescas"
+ return "Ayer fui al mercado para comprar frutas"
+ + " y verduras";
+ }
+
+ @Override
+ String getConsistencyTestSentence() { // identical to one corpus sentence
+ return "Mi amigo Carlos vive en una casa grande"
+ + " en Madrid";
+ }
+
+ @Override
+ String getPunctuationTestSentence() {
+ return "Hola, mundo!"; // comma and exclamation mark are attached directly to the words
+ }
+
+ @Override
+ String[] getExpectedPunctuationWords() {
+ return new String[] {"Hola,", "mundo!"}; // punctuation stays on the words; only the space separates them
+ }
+
+ @Override
+ String getCoarseTokenizationSentence() { // prefix of a corpus sentence
+ return "La cocina española es conocida en todo"
+ + " el mundo";
+ }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticFrTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticFrTest.java
new file mode 100644
index 00000000..3f3eda64
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticFrTest.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.List;
+
+/**
+ * French-language realistic BPE tokenization integration tests.
+ *
+ * @see AbstractBPETokenizerRealisticTest
+ */
+public class BPETokenizerRealisticFrTest
+ extends AbstractBPETokenizerRealisticTest {
+
+ @Override
+ List<String> getTrainingCorpus() { // 20 French sentences, each built from two concatenated literals
+ return List.of(
+ "Hier je suis allé au marché pour acheter des fruits"
+ + " et des légumes",
+ "Le marché était plein de monde et les prix"
+ + " étaient raisonnables",
+ "Les pommes et les oranges étaient"
+ + " particulièrement fraîches",
+ "Je suis rentré à la maison et j'ai préparé"
+ + " un bon repas",
+ "Le repas était délicieux et toute la famille"
+ + " était contente",
+ "Après le repas nous avons fait une promenade"
+ + " dans le parc",
+ "Le parc était magnifique avec les arbres"
+ + " en fleurs",
+ "Les enfants jouaient dans le jardin et les"
+ + " oiseaux chantaient",
+ "Le soir nous avons regardé un film"
+ + " à la télévision",
+ "Le film était très intéressant et nous avons"
+ + " bien aimé",
+ "Mon ami Pierre habite dans une grande maison"
+ + " à Paris",
+ "Il travaille dans une entreprise de technologie"
+ + " depuis cinq ans",
+ "Sa femme Marie est professeur à une"
+ + " université",
+ "Ils ont deux enfants qui vont à une école"
+ + " près de la maison",
+ "Le weekend ils aiment faire des randonnées"
+ + " dans la campagne",
+ "La France est un beau pays avec une riche"
+ + " histoire",
+ "Paris est la capitale et la plus grande ville"
+ + " du pays",
+ "La cuisine française est connue dans le monde"
+ + " entier",
+ "Les musées de Paris attirent des millions"
+ + " de visiteurs chaque année",
+ "La Tour Eiffel est le monument le plus visité"
+ + " de France"
+ );
+ }
+
+ @Override
+ String getLanguageCode() {
+ return "fr"; // two-letter language code this subclass supplies through the override
+ }
+
+ @Override
+ String getSimpleSentence() {
+ return "Le repas était délicieux"; // prefix of a corpus sentence
+ }
+
+ @Override
+ String[] getSimpleSentenceExpectedWords() {
+ return new String[] {"Le", "repas", "était", "délicieux"}; // getSimpleSentence() split on spaces
+ }
+
+ @Override
+ List<String> getFrequentWords() {
+ return List.of("les", "dans", "le"); // each occurs several times in the corpus above
+ }
+
+ @Override
+ String getUnseenWord() {
+ return "extraordinaire"; // does not occur in the corpus above
+ }
+
+ @Override
+ String getSpanTestSentence() {
+ return "Les enfants jouaient dans le jardin"; // prefix of a corpus sentence
+ }
+
+ @Override
+ String[] getSpanTestExpectedWords() { // getSpanTestSentence() split on spaces
+ return new String[] {
+ "Les", "enfants", "jouaient", "dans", "le", "jardin"
+ };
+ }
+
+ @Override
+ String getMultiWordSentence() { // identical to one corpus sentence
+ return "Le parc était magnifique avec les arbres"
+ + " en fleurs";
+ }
+
+ @Override
+ String getSerializationTestSentence() { // part of the first corpus sentence, but with a capitalised "Je"
+ return "Je suis allé au marché pour acheter"
+ + " des fruits";
+ }
+
+ @Override
+ String getConsistencyTestSentence() { // identical to one corpus sentence
+ return "Mon ami Pierre habite dans une grande"
+ + " maison à Paris";
+ }
+
+ @Override
+ String getPunctuationTestSentence() {
+ return "Bonjour, monde!"; // comma and exclamation mark are attached directly to the words
+ }
+
+ @Override
+ String[] getExpectedPunctuationWords() {
+ return new String[] {"Bonjour,", "monde!"}; // punctuation stays on the words; only the space separates them
+ }
+
+ @Override
+ String getCoarseTokenizationSentence() { // identical to one corpus sentence
+ return "La cuisine française est connue dans le"
+ + " monde entier";
+ }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticItTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticItTest.java
new file mode 100644
index 00000000..f6bd50a4
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticItTest.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.List;
+
+/**
+ * Italian-language realistic BPE tokenization integration tests.
+ *
+ * @see AbstractBPETokenizerRealisticTest
+ */
+public class BPETokenizerRealisticItTest extends
AbstractBPETokenizerRealisticTest {
+
+ @Override
+ List<String> getTrainingCorpus() {
+ return List.of(
+ "Ieri sono andato al mercato per comprare della frutta e della
verdura",
+ "Il mercato era pieno di gente e i prezzi erano ragionevoli",
+ "Le mele e le arance erano particolarmente fresche e buone",
+ "Sono tornato a casa e ho preparato un buon pranzo",
+ "Il pranzo era delizioso e tutta la famiglia era contenta",
+ "Dopo il pranzo abbiamo fatto una passeggiata nel parco",
+ "Il parco era bellissimo con gli alberi in fiore",
+ "I bambini giocavano nel giardino e gli uccelli cantavano",
+ "La sera abbiamo guardato un film alla televisione",
+ "Il film era molto interessante e ci e piaciuto tanto",
+ "Il mio amico Marco abita in una grande casa a Roma",
+ "Lui lavora in una azienda di tecnologia da cinque anni",
+ "Sua moglie Giulia e professoressa alla universita",
+ "Hanno due bambini che vanno a una scuola vicino a casa",
+ "Nel fine settimana amano fare delle escursioni in campagna",
+ "Roma e una citta bellissima con una storia molto ricca",
+ "La cucina italiana e conosciuta in tutto il mondo",
+ "I musei di Roma attraggono milioni di visitatori ogni anno",
+ "Il Colosseo e il monumento piu visitato di Roma",
+ "La vita in Italia e molto piacevole e rilassante"
+ );
+ }
+
+ @Override
+ String getLanguageCode() {
+ return "it";
+ }
+
+ @Override
+ String getSimpleSentence() {
+ return "Il pranzo era delizioso";
+ }
+
+ @Override
+ String[] getSimpleSentenceExpectedWords() {
+ return new String[] {"Il", "pranzo", "era", "delizioso"};
+ }
+
+ @Override
+ List<String> getFrequentWords() {
+ return List.of("il", "una", "in");
+ }
+
+ @Override
+ String getUnseenWord() {
+ return "straordinario";
+ }
+
+ @Override
+ String getSpanTestSentence() {
+ return "I bambini giocavano nel giardino";
+ }
+
+ @Override
+ String[] getSpanTestExpectedWords() {
+ return new String[] {"I", "bambini", "giocavano", "nel", "giardino"};
+ }
+
+ @Override
+ String getMultiWordSentence() {
+ return "Il parco era bellissimo con gli alberi in fiore";
+ }
+
+ @Override
+ String getSerializationTestSentence() {
+ return "Sono andato al mercato per comprare della frutta";
+ }
+
+ @Override
+ String getConsistencyTestSentence() {
+ return "Il mio amico Marco abita in una grande casa a Roma";
+ }
+
+ @Override
+ String getPunctuationTestSentence() {
+ return "Ciao, mondo!";
+ }
+
+ @Override
+ String[] getExpectedPunctuationWords() {
+ return new String[] {"Ciao,", "mondo!"};
+ }
+
+ @Override
+ String getCoarseTokenizationSentence() {
+ return "La cucina italiana e conosciuta in tutto il mondo";
+ }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerTest.java
new file mode 100644
index 00000000..2666683c
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerTest.java
@@ -0,0 +1,230 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.HashMap;
+import java.util.List;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.tokenize.BPETokenizer.SymbolPair;
+import opennlp.tools.util.Span;
+
+/**
+ * Tests for the {@link BPETokenizer} class.
+ * <p>
+ * Verifies that BPE tokenization correctly splits text into subword tokens
+ * based on learned merge operations, and that span positions map back to
+ * the original text.
+ *
+ * @see BPETokenizer
+ */
+public class BPETokenizerTest {
+
+ private static BPEModel createModel(List<SymbolPair> merges) { // "en" factory, the given merges, an empty map as second argument
+ final BPETokenizerFactory factory = new BPETokenizerFactory("en");
+ return new BPEModel(merges, new HashMap<>(), factory);
+ }
+
+ /**
+ * Tests that a fully merged word produces a single token.
+ */
+ @Test
+ void testBasicBPETokenization() {
+ final List<SymbolPair> merges = List.of(
+ new SymbolPair("l", "o"),
+ new SymbolPair("lo", "w" + BPETokenizer.END_OF_WORD),
+ new SymbolPair("e", "r" + BPETokenizer.END_OF_WORD)
+ );
+
+ final BPETokenizer tokenizer = new BPETokenizer(createModel(merges));
+ final String[] tokens = tokenizer.tokenize("low");
+
+ Assertions.assertArrayEquals(new String[]{"low"}, tokens);
+ }
+
+ /**
+ * Tests that a word not fully covered by merges is split into subword tokens.
+ */
+ @Test
+ void testSubwordSplitting() {
+ final List<SymbolPair> merges = List.of(
+ new SymbolPair("l", "o"),
+ new SymbolPair("lo", "w" + BPETokenizer.END_OF_WORD)
+ );
+
+ final BPETokenizer tokenizer = new BPETokenizer(createModel(merges));
+ final String[] tokens = tokenizer.tokenize("lower");
+
+ // "lower" cannot fully merge since "w" is not word-final here
+ Assertions.assertTrue(tokens.length > 1);
+ Assertions.assertEquals("lower", String.join("", tokens));
+ }
+
+ /**
+ * Tests tokenization of multiple whitespace-separated words.
+ */
+ @Test
+ void testMultipleWords() {
+ final List<SymbolPair> merges = List.of(
+ new SymbolPair("l", "o"),
+ new SymbolPair("lo", "w" + BPETokenizer.END_OF_WORD)
+ );
+
+ final BPETokenizer tokenizer = new BPETokenizer(createModel(merges));
+ final String[] tokens = tokenizer.tokenize("low low");
+
+ Assertions.assertEquals(2, tokens.length);
+ Assertions.assertEquals("low", tokens[0]);
+ Assertions.assertEquals("low", tokens[1]);
+ }
+
+ /**
+ * Tests that empty and null input produce empty arrays.
+ */
+ @Test
+ void testEmptyInput() {
+ final BPETokenizer tokenizer = new BPETokenizer(createModel(List.of()));
+
+ Assertions.assertArrayEquals(new String[0], tokenizer.tokenize(""));
+ Assertions.assertArrayEquals(new String[0], tokenizer.tokenize(null));
+ Assertions.assertArrayEquals(new Span[0], tokenizer.tokenizePos(""));
+ Assertions.assertArrayEquals(new Span[0], tokenizer.tokenizePos(null));
+ }
+
+ /**
+ * Tests that with no merges, each character becomes a separate token.
+ */
+ @Test
+ void testNoMergesProducesCharacterTokens() {
+ final BPETokenizer tokenizer = new BPETokenizer(createModel(List.of()));
+ final String[] tokens = tokenizer.tokenize("hi");
+
+ Assertions.assertArrayEquals(new String[]{"h", "i"}, tokens);
+ }
+
+ /**
+ * Tests single-character word tokenization.
+ */
+ @Test
+ void testSingleCharacterWord() {
+ final BPETokenizer tokenizer = new BPETokenizer(createModel(List.of()));
+ final String[] tokens = tokenizer.tokenize("a");
+
+ Assertions.assertArrayEquals(new String[]{"a"}, tokens);
+ }
+
+ /**
+ * Tests that {@link BPETokenizer#tokenizePos(String)} returns correct spans
+ * that map back to the original text.
+ */
+ @Test
+ void testTokenizePos() {
+ final List<SymbolPair> merges = List.of(
+ new SymbolPair("l", "o"),
+ new SymbolPair("lo", "w" + BPETokenizer.END_OF_WORD)
+ );
+
+ final BPETokenizer tokenizer = new BPETokenizer(createModel(merges));
+ final String text = "low hi";
+ final Span[] spans = tokenizer.tokenizePos(text);
+
+ // "low" -> 1 token, "hi" -> 2 tokens (no merges for h, i)
+ Assertions.assertEquals(3, spans.length);
+ Assertions.assertEquals(0, spans[0].getStart());
+ Assertions.assertEquals(3, spans[0].getEnd());
+ Assertions.assertEquals("low", spans[0].getCoveredText(text));
+ // "h"
+ Assertions.assertEquals(4, spans[1].getStart());
+ Assertions.assertEquals(5, spans[1].getEnd());
+ Assertions.assertEquals("h", spans[1].getCoveredText(text));
+ // "i"
+ Assertions.assertEquals(5, spans[2].getStart());
+ Assertions.assertEquals(6, spans[2].getEnd());
+ Assertions.assertEquals("i", spans[2].getCoveredText(text));
+ }
+
+ /**
+ * Tests that span offsets are correct for subword-split words.
+ */
+ @Test
+ void testTokenizePosWithSubwords() {
+ final BPETokenizer tokenizer = new BPETokenizer(createModel(List.of()));
+ final String text = "ab cd";
+ final Span[] spans = tokenizer.tokenizePos(text);
+
+ // "ab" -> a, b; "cd" -> c, d
+ Assertions.assertEquals(4, spans.length);
+ Assertions.assertEquals("a", spans[0].getCoveredText(text));
+ Assertions.assertEquals("b", spans[1].getCoveredText(text));
+ Assertions.assertEquals("c", spans[2].getCoveredText(text));
+ Assertions.assertEquals("d", spans[3].getCoveredText(text));
+ }
+
+ /**
+ * Tests that concatenating all tokens reconstructs the original word.
+ */
+ @Test
+ void testTokenConcatenationEqualsOriginal() {
+ final List<SymbolPair> merges = List.of(
+ new SymbolPair("l", "o"),
+ new SymbolPair("lo", "w" + BPETokenizer.END_OF_WORD)
+ );
+
+ final BPETokenizer tokenizer = new BPETokenizer(createModel(merges));
+ final String[] tokens = tokenizer.tokenize("lower");
+
+ Assertions.assertEquals("lower", String.join("", tokens));
+ }
+
+ /**
+ * Tests that a null model throws IllegalArgumentException.
+ */
+ @Test
+ void testNullModelThrows() {
+ Assertions.assertThrows(IllegalArgumentException.class, () -> new BPETokenizer(null));
+ }
+
+ @Test
+ void testSymbolPairNullLeftThrows() { // a null left symbol must be rejected with IllegalArgumentException
+ Assertions.assertThrows(IllegalArgumentException.class, () -> new SymbolPair(null, "b"));
+ }
+
+ @Test
+ void testSymbolPairNullRightThrows() { // a null right symbol must be rejected with IllegalArgumentException
+ Assertions.assertThrows(IllegalArgumentException.class, () -> new SymbolPair("a", null));
+ }
+
+ @Test
+ void testSymbolPairEquality() {
+ final SymbolPair a = new SymbolPair("lo", "w");
+ final SymbolPair b = new SymbolPair("lo", "w");
+ final SymbolPair c = new SymbolPair("l", "ow"); // same concatenation "low", different split point
+
+ Assertions.assertEquals(a, b);
+ Assertions.assertEquals(a.hashCode(), b.hashCode());
+ Assertions.assertNotEquals(a, c);
+ }
+
+ @Test
+ void testSymbolPairToString() {
+ final SymbolPair pair = new SymbolPair("lo", "w");
+ Assertions.assertEquals("lo w", pair.toString()); // left and right joined by a single space
+ }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerTrainerTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerTrainerTest.java
new file mode 100644
index 00000000..0e5dba45
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerTrainerTest.java
@@ -0,0 +1,188 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests for the {@link BPETokenizerTrainer} class.
+ * <p>
+ * Verifies that BPE merge operations are learned correctly from
+ * a training corpus and that the resulting model can be used for tokenization.
+ *
+ * @see BPETokenizerTrainer
+ * @see BPEModel
+ */
+public class BPETokenizerTrainerTest {
+
+ private BPETokenizerTrainer trainer;
+
+ @BeforeEach
+ void setUp() {
+ trainer = new BPETokenizerTrainer(); // fresh trainer instance for every test method
+ }
+
+ /**
+ * Tests that training produces a non-null model with merge rules.
+ */
+ @Test
+ void testTrainProducesModel() {
+ final List<String> corpus = List.of(
+ "low low low low low",
+ "lower lower lower",
+ "newest newest newest newest",
+ "widest widest widest"
+ );
+
+ final BPEModel model = trainer.train(corpus, 10, "en");
+
+ Assertions.assertNotNull(model);
+ Assertions.assertFalse(model.getMerges().isEmpty());
+ Assertions.assertTrue(model.getMerges().size() <= 10);
+ }
+
+ /**
+ * Tests that the first merge is the most frequent adjacent pair.
+ * For the corpus "ab ab ab ...", the most frequent pair is ("a", "b</w>").
+ */
+ @Test
+ void testFirstMergeIsMostFrequentPair() {
+ final List<String> corpus = List.of(
+ "ab ab ab ab ab ab ab ab ab ab"
+ );
+
+ final BPEModel model = trainer.train(corpus, 1, "en");
+
+ Assertions.assertEquals(1, model.getMerges().size());
+ Assertions.assertEquals("a", model.getMerges().getFirst().left());
+ Assertions.assertEquals("b" + BPETokenizer.END_OF_WORD, model.getMerges().getFirst().right());
+ }
+
+ /**
+ * Tests that requesting more merges than possible stops gracefully.
+ */
+ @Test
+ void testMoreMergesThanPossible() {
+ final List<String> corpus = List.of("ab");
+
+ // "ab" has only one possible pair: ("a", "b</w>")
+ final BPEModel model = trainer.train(corpus, 100, "en");
+
+ // Should stop after exhausting all possible merges
+ Assertions.assertTrue(model.getMerges().size() < 100);
+ Assertions.assertFalse(model.getMerges().isEmpty());
+ }
+
+ /**
+ * Tests that frequent words get merged into fewer tokens.
+ */
+ @Test
+ void testFrequentWordsProduceFewerTokens() {
+ final List<String> corpus = List.of(
+ "the the the the the the the the the the",
+ "the the the the the the the the the the",
+ "xyzzy"
+ );
+
+ final BPEModel model = trainer.train(corpus, 20, "en");
+ final BPETokenizer tokenizer = new BPETokenizer(model);
+
+ final String[] theTokens = tokenizer.tokenize("the");
+ final String[] xyzzyTokens = tokenizer.tokenize("xyzzy");
+
+ // "the" (very frequent) should have fewer or equal tokens compared to "xyzzy" (rare)
+ Assertions.assertTrue(theTokens.length <= xyzzyTokens.length,
+ "Expected 'the' (" + Arrays.toString(theTokens) + ") to have no more tokens than 'xyzzy' ("
+ + Arrays.toString(xyzzyTokens) + ")");
+ }
+
+ /**
+ * Tests that the trained model produces a tokenizer that reconstructs
+ * the original words when tokens are concatenated.
+ */
+ @Test
+ void testTrainAndTokenizeRoundtrip() {
+ final List<String> corpus = List.of(
+ "the cat sat on the mat",
+ "the cat sat on the mat",
+ "the cat sat on the mat",
+ "the dog sat on the log",
+ "the dog sat on the log"
+ );
+
+ final BPEModel model = trainer.train(corpus, 20, "en");
+ final BPETokenizer tokenizer = new BPETokenizer(model);
+
+ // Verify token concatenation restores the original word
+ for (final String word : new String[]{"the", "cat", "sat", "dog"}) {
+ final String[] tokens = tokenizer.tokenize(word);
+ Assertions.assertEquals(word, String.join("", tokens),
+ "Token concatenation should reconstruct '" + word + "'");
+ }
+ }
+
+ /**
+ * Tests that training with an empty corpus produces a model with no merges.
+ */
+ @Test
+ void testEmptyCorpus() {
+ final BPEModel model = trainer.train(List.of(), 10, "en");
+
+ Assertions.assertNotNull(model);
+ Assertions.assertTrue(model.getMerges().isEmpty());
+ }
+
+ /**
+ * Tests that the language code is set on the produced model.
+ */
+ @Test
+ void testLanguageCodePreserved() {
+ final BPEModel model = trainer.train(List.of("hello world"), 5, "de");
+
+ Assertions.assertEquals("de", model.getLanguage());
+ }
+
+ @Test
+ void testNullCorpusThrows() { // a null corpus must be rejected with IllegalArgumentException
+ Assertions.assertThrows(IllegalArgumentException.class,
+ () -> trainer.train(null, 10, "en"));
+ }
+
+ @Test
+ void testNullLanguageThrows() { // a null language code must be rejected with IllegalArgumentException
+ Assertions.assertThrows(IllegalArgumentException.class,
+ () -> trainer.train(List.of("hello"), 10, null));
+ }
+
+ @Test
+ void testZeroMergesThrows() { // a merge count of 0 must be rejected with IllegalArgumentException
+ Assertions.assertThrows(IllegalArgumentException.class,
+ () -> trainer.train(List.of("hello"), 0, "en"));
+ }
+
+ @Test
+ void testNegativeMergesThrows() { // a negative merge count must be rejected with IllegalArgumentException
+ Assertions.assertThrows(IllegalArgumentException.class,
+ () -> trainer.train(List.of("hello"), -1, "en"));
+ }
+}