(opennlp) branch main updated: OPENNLP-1220: Add support for Byte Pair Encoding (BPE) (#1011)

mawiesne Tue, 07 Apr 2026 01:37:50 -0700

This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git



The following commit(s) were added to refs/heads/main by this push:
     new 97f97b68 OPENNLP-1220: Add support for Byte Pair Encoding (BPE) (#1011)
97f97b68 is described below

commit 97f97b687917021fffb3b2eb03cb634ed1d4dc1b
Author: Martin Wiesner <[email protected]>
AuthorDate: Tue Apr 7 10:37:29 2026 +0200

    OPENNLP-1220: Add support for Byte Pair Encoding (BPE) (#1011)
    
    * OPENNLP-1220 - Add support for Byte Pair Encoding (BPE)
    
    ---------
    
    Co-authored-by: Richard Zowalla <[email protected]>
---
 .../main/java/opennlp/tools/tokenize/BPEModel.java | 162 ++++++++++
 .../java/opennlp/tools/tokenize/BPETokenizer.java  | 290 +++++++++++++++++
 .../tools/tokenize/BPETokenizerFactory.java        | 187 +++++++++++
 .../tools/tokenize/BPETokenizerTrainer.java        | 281 +++++++++++++++++
 .../tools/tokenize/AbstractBPEModelTest.java       | 158 ++++++++++
 .../AbstractBPETokenizerRealisticTest.java         | 347 +++++++++++++++++++++
 .../opennlp/tools/tokenize/BPEModelDeTest.java     |  48 +++
 .../opennlp/tools/tokenize/BPEModelEnTest.java     |  45 +++
 .../opennlp/tools/tokenize/BPEModelFrTest.java     |  47 +++
 .../tools/tokenize/BPETokenizerFactoryTest.java    | 149 +++++++++
 .../tokenize/BPETokenizerRealisticDeTest.java      | 119 +++++++
 .../tokenize/BPETokenizerRealisticEnTest.java      | 124 ++++++++
 .../tokenize/BPETokenizerRealisticEsTest.java      | 148 +++++++++
 .../tokenize/BPETokenizerRealisticFrTest.java      | 146 +++++++++
 .../tokenize/BPETokenizerRealisticItTest.java      | 119 +++++++
 .../opennlp/tools/tokenize/BPETokenizerTest.java   | 230 ++++++++++++++
 .../tools/tokenize/BPETokenizerTrainerTest.java    | 188 +++++++++++
 17 files changed, 2788 insertions(+)

diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/BPEModel.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/BPEModel.java
new file mode 100644
index 00000000..fe59de46
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/BPEModel.java
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.tokenize.BPETokenizer.SymbolPair;
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.BaseModel;
+
+/**
+ * A {@link BPEModel} holds the merge operations learned for Byte Pair Encoding
+ * (BPE) and can be serialized and deserialized for later reuse.
+ * <p>
+ * Instances are produced by the {@link BPETokenizerTrainer}. Each model wraps an
+ * ordered list of {@link BPETokenizer.SymbolPair} merge operations that together
+ * define the BPE vocabulary. The model is persisted as a standard OpenNLP ZIP
+ * package whose {@code bpe.merges} artifact contains the merge rules.
+ * <p>
+ * <b>Usage:</b>
+ * <pre>{@code
+ * // Create via training
+ * BPETokenizerTrainer trainer = new BPETokenizerTrainer();
+ * BPEModel model = trainer.train(corpus, 10000, "en");
+ *
+ * // Save to disk
+ * model.serialize(Path.of("bpe-en.bin"));
+ *
+ * // Load from disk
+ * BPEModel loaded = new BPEModel(Path.of("bpe-en.bin"));
+ *
+ * // Use for tokenization
+ * BPETokenizer tokenizer = new BPETokenizer(loaded);
+ * }</pre>
+ *
+ * @see BPETokenizer
+ * @see BPETokenizerTrainer
+ * @see BPETokenizerFactory
+ */
+public final class BPEModel extends BaseModel {
+
+  private static final long serialVersionUID = 1L;
+
+  /** Name under which this model type registers itself as a component. */
+  private static final String COMPONENT_NAME = "BPETokenizer";
+
+  /**
+   * Builds a {@link BPEModel} around freshly trained merge rules.
+   *
+   * @param merges              The merge operations, in the order they were learned.
+   *                            Must not be {@code null}.
+   * @param manifestInfoEntries Extra entries to record in the model manifest.
+   * @param factory             The {@link BPETokenizerFactory} supplying the language
+   *                            code and the artifact serializers.
+   */
+  public BPEModel(final List<SymbolPair> merges,
+                  final Map<String, String> manifestInfoEntries,
+                  final BPETokenizerFactory factory) {
+    super(COMPONENT_NAME, factory.getLanguageCode(), manifestInfoEntries, factory);
+    // Keep a private copy so later changes to the caller's list cannot leak into the model.
+    final List<SymbolPair> mergesCopy = new ArrayList<>(merges);
+    artifactMap.put(BPETokenizerFactory.MERGES_ENTRY_NAME, mergesCopy);
+    checkArtifactMap();
+  }
+
+  /**
+   * Loads a {@link BPEModel} from an {@link InputStream}.
+   *
+   * @param in The {@link InputStream} to read the model from.
+   * @throws IOException Thrown if IO errors occurred.
+   */
+  public BPEModel(final InputStream in) throws IOException {
+    super(COMPONENT_NAME, in);
+  }
+
+  /**
+   * Loads a {@link BPEModel} from a {@link File}.
+   *
+   * @param modelFile The {@link File} to read the model from.
+   * @throws IOException Thrown if IO errors occurred.
+   */
+  public BPEModel(final File modelFile) throws IOException {
+    super(COMPONENT_NAME, modelFile);
+  }
+
+  /**
+   * Loads a {@link BPEModel} from a {@link Path}.
+   *
+   * @param modelPath The {@link Path} to read the model from.
+   * @throws IOException Thrown if IO errors occurred.
+   */
+  public BPEModel(final Path modelPath) throws IOException {
+    super(COMPONENT_NAME, modelPath);
+  }
+
+  /**
+   * Loads a {@link BPEModel} from a {@link URL}.
+   *
+   * @param modelURL The {@link URL} to read the model from.
+   * @throws IOException Thrown if IO errors occurred.
+   */
+  public BPEModel(final URL modelURL) throws IOException {
+    super(COMPONENT_NAME, modelURL);
+  }
+
+  /**
+   * Runs the inherited validation and additionally requires that the merges
+   * artifact is present and is a {@link List}.
+   *
+   * @throws InvalidFormatException Thrown if the merges artifact is missing
+   *                                or is not a {@link List}.
+   */
+  @Override
+  protected void validateArtifactMap() throws InvalidFormatException {
+    super.validateArtifactMap();
+
+    final Object stored = artifactMap.get(BPETokenizerFactory.MERGES_ENTRY_NAME);
+    if (stored instanceof List<?>) {
+      return;
+    }
+    throw new InvalidFormatException(
+        "BPE model is incomplete: missing merge rules!");
+  }
+
+  /**
+   * @return The factory class to use when the model does not name one.
+   */
+  @Override
+  protected Class<? extends BaseToolFactory> getDefaultFactory() {
+    return BPETokenizerFactory.class;
+  }
+
+  /**
+   * @return The active {@link BPETokenizerFactory}.
+   */
+  public BPETokenizerFactory getFactory() {
+    return (BPETokenizerFactory) this.toolFactory;
+  }
+
+  /**
+   * @return An unmodifiable, ordered list of BPE merge operations stored in this model.
+   */
+  @SuppressWarnings("unchecked")
+  public List<SymbolPair> getMerges() {
+    final List<SymbolPair> stored =
+        (List<SymbolPair>) artifactMap.get(BPETokenizerFactory.MERGES_ENTRY_NAME);
+    return Collections.unmodifiableList(stored);
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/BPETokenizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/BPETokenizer.java
new file mode 100644
index 00000000..1a937da5
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/BPETokenizer.java
@@ -0,0 +1,290 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+
+import opennlp.tools.util.Span;
+
+/**
+ * A {@link Tokenizer} implementation that performs subword tokenization
+ * using Byte Pair Encoding (BPE).
+ * <p>
+ * BPE iteratively merges the most frequent pair of adjacent symbols,
+ * starting from a character-level representation of each word. This allows
+ * the tokenizer to handle out-of-vocabulary words by decomposing them into
+ * known subword units.
+ * <p>
+ * <b>Usage:</b>
+ * <pre>{@code
+ * // Train a BPE model from a corpus
+ * BPETokenizerTrainer trainer = new BPETokenizerTrainer();
+ * BPEModel model = trainer.train(corpus, 10000, "en");
+ *
+ * // Save the model for later reuse
+ * model.serialize(Path.of("bpe-en.bin"));
+ *
+ * // Load and tokenize
+ * BPEModel loaded = new BPEModel(Path.of("bpe-en.bin"));
+ * BPETokenizer tokenizer = new BPETokenizer(loaded);
+ * String[] tokens = tokenizer.tokenize("unseen words are split into subwords");
+ * }</pre>
+ * <p>
+ * The tokenizer first splits text on whitespace, then applies learned merge
+ * operations to each word independently. Words are decomposed into Unicode
+ * code points (so a supplementary character is never split into two unpaired
+ * surrogates) with an {@link #END_OF_WORD} marker on the final symbol, then
+ * merges are applied in priority order (as learned during training) until no
+ * more merges are applicable. The resulting subword units are returned as tokens.
+ * <p>
+ * For reference see:
+ * <ul>
+ *   <li>Sennrich, R., Haddow, B., &amp; Birch, A. (2016).
+ *       Neural Machine Translation of Rare Words with Subword Units.
+ *       <a href="https://arxiv.org/abs/1508.07909">
+ *       https://arxiv.org/abs/1508.07909</a>
+ *   </li>
+ * </ul>
+ *
+ * @see BPEModel
+ * @see BPETokenizerTrainer
+ * @see WordpieceTokenizer
+ */
+public class BPETokenizer implements Tokenizer {
+
+  /**
+   * Suffix appended to the last symbol of each word during BPE encoding
+   * to distinguish word-final characters from word-internal ones.
+   * <p>
+   * Users constructing {@link SymbolPair} merge rules manually must use this
+   * constant to mark word-final symbols
+   * (e.g., {@code new SymbolPair("a", "b" + END_OF_WORD)}).
+   */
+  public static final String END_OF_WORD = "</w>";
+
+  /** Maps each merge pair to its priority rank (lower rank = applied earlier). */
+  private final LinkedHashMap<SymbolPair, Integer> mergeRanks;
+
+  /**
+   * Initializes a {@link BPETokenizer} from a trained
+   * {@link BPEModel}.
+   *
+   * @param model The trained BPE model. Must not be {@code null}.
+   * @throws IllegalArgumentException if {@code model} is {@code null}.
+   */
+  public BPETokenizer(final BPEModel model) {
+    if (model == null) {
+      throw new IllegalArgumentException("model must not be null");
+    }
+    final List<SymbolPair> merges = model.getMerges();
+    this.mergeRanks = new LinkedHashMap<>();
+    for (int i = 0; i < merges.size(); i++) {
+      // If a rule is listed twice, its first (highest-priority) position wins.
+      mergeRanks.putIfAbsent(merges.get(i), i);
+    }
+  }
+
+  /**
+   * {@inheritDoc}
+   * <p>
+   * Splits the input text on whitespace, then applies BPE merge operations
+   * to each word to produce subword tokens. Words not fully covered by
+   * learned merges are decomposed into individual characters.
+   *
+   * @return The subword tokens in input order; an empty array if {@code text}
+   *         is {@code null} or empty.
+   */
+  @Override
+  public String[] tokenize(final String text) {
+    if (text == null || text.isEmpty()) {
+      return new String[0];
+    }
+
+    final String[] words = WhitespaceTokenizer.INSTANCE.tokenize(text);
+    final List<String> allTokens = new ArrayList<>();
+
+    for (final String word : words) {
+      allTokens.addAll(encodeToBPE(word));
+    }
+
+    return allTokens.toArray(new String[0]);
+  }
+
+  /**
+   * {@inheritDoc}
+   * <p>
+   * Returns {@link Span} offsets into the original text for each subword token.
+   * Each span maps back to the exact character range in the input string.
+   *
+   * @return One span per subword token, in input order; an empty array if
+   *         {@code text} is {@code null} or empty.
+   */
+  @Override
+  public Span[] tokenizePos(final String text) {
+    if (text == null || text.isEmpty()) {
+      return new Span[0];
+    }
+
+    final Span[] wordSpans = WhitespaceTokenizer.INSTANCE.tokenizePos(text);
+    final List<Span> allSpans = new ArrayList<>();
+
+    for (final Span wordSpan : wordSpans) {
+      final String word = wordSpan.getCoveredText(text).toString();
+
+      // The tokens of a word concatenate back to the word itself, so walking
+      // their lengths from the word's start yields each token's exact range.
+      int offset = wordSpan.getStart();
+      for (final String token : encodeToBPE(word)) {
+        final int end = offset + token.length();
+        allSpans.add(new Span(offset, end));
+        offset = end;
+      }
+    }
+
+    return allSpans.toArray(new Span[0]);
+  }
+
+  /**
+   * Splits a word into its initial BPE symbol sequence.
+   * Each Unicode code point becomes its own symbol, with {@link #END_OF_WORD}
+   * appended to the final one. Iterating by code point (rather than by
+   * {@code char}) keeps surrogate pairs together.
+   *
+   * @param word The word to split. Must not be {@code null} or empty.
+   * @return A mutable list of symbols.
+   */
+  private List<String> splitToSymbols(final String word) {
+    final int length = word.length();
+    final List<String> symbols = new ArrayList<>(length);
+    int start = 0;
+    while (start < length) {
+      final int end = start + Character.charCount(word.codePointAt(start));
+      final String symbol = word.substring(start, end);
+      symbols.add(end == length ? symbol + END_OF_WORD : symbol);
+      start = end;
+    }
+    return symbols;
+  }
+
+  /**
+   * Encodes a single word into BPE subword tokens by splitting it into
+   * symbols, applying learned merge operations, and stripping the
+   * {@link #END_OF_WORD} marker from the final token.
+   * <p>
+   * Only the final token carries the marker added by this class; tokens in
+   * other positions are left untouched, so input text that happens to contain
+   * the literal marker sequence is not truncated.
+   *
+   * @param word The word to encode. Must not be {@code null}.
+   * @return A list of subword token strings whose concatenation equals the original word.
+   */
+  private List<String> encodeToBPE(final String word) {
+    if (word.isEmpty()) {
+      return List.of();
+    }
+
+    final List<String> merged = applyMerges(splitToSymbols(word));
+
+    // The marker is appended to the last symbol and merges only concatenate
+    // neighbours, so the last merged symbol always ends with the marker.
+    final int lastIndex = merged.size() - 1;
+    final String last = merged.get(lastIndex);
+    merged.set(lastIndex,
+        last.substring(0, last.length() - END_OF_WORD.length()));
+
+    return merged;
+  }
+
+  /**
+   * Iteratively applies learned BPE merge operations to a list of symbols.
+   * In each iteration, the highest-priority (lowest-rank) adjacent pair is merged
+   * into a single symbol. The process continues until no more applicable merges
+   * remain or the symbol list is reduced to a single element.
+   *
+   * @param symbols The mutable list of symbols to merge. Must not be {@code null}.
+   * @return A mutable list of symbols after all applicable merges have been applied.
+   */
+  private List<String> applyMerges(final List<String> symbols) {
+    if (symbols.size() <= 1) {
+      return symbols;
+    }
+
+    List<String> current = new ArrayList<>(symbols);
+
+    while (current.size() > 1) {
+      final SymbolPair bestPair = findBestPair(current);
+      if (bestPair == null) {
+        break;
+      }
+      current = mergePair(current, bestPair);
+    }
+
+    return current;
+  }
+
+  /**
+   * Finds the adjacent pair with the lowest merge rank.
+   *
+   * @param symbols The current symbol sequence.
+   * @return The best-ranked adjacent pair, or {@code null} if no adjacent pair
+   *         has a learned merge rule.
+   */
+  private SymbolPair findBestPair(final List<String> symbols) {
+    int bestRank = Integer.MAX_VALUE;
+    SymbolPair bestPair = null;
+
+    for (int i = 0; i < symbols.size() - 1; i++) {
+      final SymbolPair pair = new SymbolPair(symbols.get(i), symbols.get(i + 1));
+      final Integer rank = mergeRanks.get(pair);
+      if (rank != null && rank < bestRank) {
+        bestRank = rank;
+        bestPair = pair;
+      }
+    }
+    return bestPair;
+  }
+
+  /**
+   * Replaces every non-overlapping, left-to-right occurrence of {@code pair}
+   * in {@code symbols} with the concatenated symbol.
+   *
+   * @param symbols The current symbol sequence.
+   * @param pair    The pair to merge.
+   * @return A new mutable list with the pair merged.
+   */
+  private List<String> mergePair(final List<String> symbols, final SymbolPair pair) {
+    final String mergedSymbol = pair.left() + pair.right();
+    final List<String> next = new ArrayList<>(symbols.size());
+    int i = 0;
+    while (i < symbols.size()) {
+      if (i < symbols.size() - 1
+          && symbols.get(i).equals(pair.left())
+          && symbols.get(i + 1).equals(pair.right())) {
+        next.add(mergedSymbol);
+        i += 2;
+      } else {
+        next.add(symbols.get(i));
+        i++;
+      }
+    }
+    return next;
+  }
+
+  /**
+   * Represents a pair of adjacent symbols in BPE.
+   *
+   * @param left  The left symbol.
+   * @param right The right symbol.
+   */
+  public record SymbolPair(String left, String right) {
+
+    /**
+     * Creates a new {@link SymbolPair}.
+     *
+     * @param left  The left symbol. Must not be {@code null}.
+     * @param right The right symbol. Must not be {@code null}.
+     * @throws IllegalArgumentException if {@code left} or {@code right} is {@code null}.
+     */
+    public SymbolPair {
+      if (left == null) {
+        throw new IllegalArgumentException("left must not be null");
+      }
+      if (right == null) {
+        throw new IllegalArgumentException("right must not be null");
+      }
+    }
+
+    /**
+     * @return The two symbols separated by a single space.
+     */
+    @Override
+    public String toString() {
+      return left + " " + right;
+    }
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/BPETokenizerFactory.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/BPETokenizerFactory.java
new file mode 100644
index 00000000..881bbcfa
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/BPETokenizerFactory.java
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.tokenize.BPETokenizer.SymbolPair;
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.ArtifactSerializer;
+
+/**
+ * A {@link BaseToolFactory} for BPE tokenization that manages
+ * the BPE merge rules artifact and its serialization within a
+ * {@link BPEModel}.
+ * <p>
+ * This factory is responsible for:
+ * <ul>
+ *   <li>Providing the {@link BPEMergesSerializer} that reads
+ *       and writes BPE merge rules as a text-based artifact
+ *       ({@code bpe.merges}) inside the model ZIP package.
+ *   </li>
+ *   <li>Validating that a loaded model contains valid merge
+ *       rules.</li>
+ *   <li>Exposing the language code of the model it belongs to.</li>
+ * </ul>
+ * <p>
+ * This class is typically not used directly. It is
+ * instantiated internally by {@link BPETokenizerTrainer}
+ * during training and by {@link BPEModel} during model
+ * loading.
+ *
+ * @see BPEModel
+ * @see BPETokenizer
+ * @see BPETokenizerTrainer
+ */
+public class BPETokenizerFactory extends BaseToolFactory {
+
+  /** The artifact entry name for BPE merge rules. */
+  static final String MERGES_ENTRY_NAME = "bpe.merges";
+
+  /**
+   * The ISO language code given at construction time; {@code null} when the
+   * factory was created through the no-argument constructor.
+   */
+  private String languageCode;
+
+  /**
+   * Creates a {@link BPETokenizerFactory}.
+   * Required empty constructor for model loading.
+   */
+  public BPETokenizerFactory() {
+  }
+
+  /**
+   * Creates a {@link BPETokenizerFactory} with the given
+   * language code.
+   *
+   * @param langCode The ISO language code.
+   *                 Must not be {@code null}.
+   * @throws IllegalArgumentException if {@code langCode}
+   *         is {@code null}.
+   */
+  public BPETokenizerFactory(final String langCode) {
+    if (langCode == null) {
+      throw new IllegalArgumentException(
+          "languageCode must not be null");
+    }
+    this.languageCode = langCode;
+  }
+
+  /**
+   * {@inheritDoc}
+   * <p>
+   * Adds a {@link BPEMergesSerializer} under the key {@code "merges"}.
+   */
+  @Override
+  public Map<String, ArtifactSerializer<?>>
+      createArtifactSerializersMap() {
+    Map<String, ArtifactSerializer<?>> serializers =
+        super.createArtifactSerializersMap();
+    // NOTE(review): "merges" looks like the extension of MERGES_ENTRY_NAME ("bpe.merges");
+    // presumably serializers are looked up by artifact file extension - confirm.
+    serializers.put("merges", new BPEMergesSerializer());
+    return serializers;
+  }
+
+  /**
+   * {@inheritDoc}
+   * <p>
+   * No BPE-specific manifest entries are added.
+   */
+  @Override
+  public Map<String, String> createManifestEntries() {
+    return super.createManifestEntries();
+  }
+
+  /**
+   * {@inheritDoc}
+   *
+   * @throws InvalidFormatException if the {@code bpe.merges} artifact is
+   *         missing or is not a {@link List}.
+   */
+  @Override
+  public void validateArtifactMap() throws InvalidFormatException {
+    Object mergesArtifact =
+        this.artifactProvider.getArtifact(MERGES_ENTRY_NAME);
+    if (!(mergesArtifact instanceof List<?>)) {
+      throw new InvalidFormatException(
+          "Missing or invalid BPE merges artifact!");
+    }
+  }
+
+  /**
+   * Returns the language code passed to the constructor. When the factory was
+   * created through the no-argument constructor (as happens while a model is
+   * loaded), the language recorded by the attached artifact provider is
+   * returned instead.
+   *
+   * @return The ISO language code for this factory, or {@code null} if none
+   *         was given and no artifact provider is attached.
+   */
+  public String getLanguageCode() {
+    if (languageCode == null && this.artifactProvider != null) {
+      return this.artifactProvider.getLanguage();
+    }
+    return languageCode;
+  }
+
+  /**
+   * An {@link ArtifactSerializer} for BPE merge rules.
+   * <p>
+   * Serializes merge rules as a UTF-8 text file with one merge pair per line,
+   * in the format: {@code left right}.
+   */
+  static class BPEMergesSerializer
+      implements ArtifactSerializer<List<SymbolPair>> {
+
+    /**
+     * Reads merge rules from the given {@link InputStream}.
+     * <p>
+     * Each line is trimmed, blank lines are skipped, and the remaining lines
+     * are split at the first space into the left and right symbol. The stream
+     * is not closed by this method.
+     *
+     * @param in The stream to read UTF-8 encoded merge rules from.
+     * @return The merge rules in file order.
+     * @throws InvalidFormatException if a non-blank line contains no space.
+     * @throws IOException if reading from {@code in} fails.
+     */
+    @Override
+    public List<SymbolPair> create(final InputStream in)
+        throws IOException {
+      final List<SymbolPair> merges = new ArrayList<>();
+      // Deliberately not closed: the underlying stream is not owned here.
+      final BufferedReader reader = new BufferedReader(
+          new InputStreamReader(in, StandardCharsets.UTF_8));
+      String line;
+      while ((line = reader.readLine()) != null) {
+        line = line.trim();
+        if (line.isEmpty()) {
+          continue;
+        }
+        final int space = line.indexOf(' ');
+        if (space < 0) {
+          throw new InvalidFormatException(
+              "Invalid BPE merge line (expected "
+              + "'left right'): " + line);
+        }
+        merges.add(new SymbolPair(
+            line.substring(0, space),
+            line.substring(space + 1)));
+      }
+      return merges;
+    }
+
+    /**
+     * Serializes the merge rules to the given {@link OutputStream}.
+     * <p>
+     * <b>Note:</b> This method wraps the provided {@link OutputStream}
+     * in a {@link BufferedWriter} and flushes it upon completion,
+     * but does <em>not</em> close the underlying stream. The caller
+     * is responsible for closing {@code out}.
+     *
+     * @param artifact The merge rules to write, one per line.
+     * @param out      The stream to write UTF-8 encoded text to.
+     * @throws IOException if writing to {@code out} fails.
+     */
+    @Override
+    public void serialize(final List<SymbolPair> artifact,
+                          final OutputStream out)
+        throws IOException {
+      final BufferedWriter writer = new BufferedWriter(
+          new OutputStreamWriter(out, StandardCharsets.UTF_8));
+      for (final SymbolPair merge : artifact) {
+        writer.write(merge.left());
+        writer.write(' ');
+        writer.write(merge.right());
+        writer.newLine();
+      }
+      writer.flush();
+    }
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/BPETokenizerTrainer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/BPETokenizerTrainer.java
new file mode 100644
index 00000000..22d32ee5
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/BPETokenizerTrainer.java
@@ -0,0 +1,281 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.commons.Trainer;
+import opennlp.tools.tokenize.BPETokenizer.SymbolPair;
+import opennlp.tools.util.Parameters;
+import opennlp.tools.util.TrainingConfiguration;
+
+/**
+ * Learns BPE merge operations from a training corpus and
+ * produces a {@link BPEModel}.
+ * <p>
+ * Implements the BPE learning algorithm from
+ * Sennrich et al. (2016):
+ * <ol>
+ *   <li>Build a vocabulary of character-level symbol
+ *       sequences from the corpus, where each word is split
+ *       into individual characters (Unicode code points) with
+ *       an end-of-word marker.</li>
+ *   <li>Count all adjacent symbol pairs across the
+ *       vocabulary, weighted by word frequency.</li>
+ *   <li>Merge the most frequent pair into a single new
+ *       symbol.</li>
+ *   <li>Repeat until the desired number of merges
+ *       ({@code numMerges}) is reached.</li>
+ * </ol>
+ * <p>
+ * The number of merges controls the granularity of the
+ * resulting vocabulary: fewer merges produce finer-grained
+ * (more character-level) tokens, while more merges produce
+ * coarser (more word-level) tokens. A typical value ranges
+ * from a few thousand to tens of thousands, depending on
+ * the corpus size and language.
+ * <p>
+ * <b>Usage:</b>
+ * <pre>{@code
+ * List<String> corpus = List.of(
+ *     "the cat sat on the mat",
+ *     "the dog sat on the log"
+ * );
+ *
+ * BPETokenizerTrainer trainer = new BPETokenizerTrainer();
+ * BPEModel model = trainer.train(corpus, 10000, "en");
+ *
+ * // Persist the model
+ * model.serialize(Path.of("bpe-en.bin"));
+ *
+ * // Use it for tokenization
+ * BPETokenizer tokenizer = new BPETokenizer(model);
+ * String[] tokens = tokenizer.tokenize("the cat");
+ * }</pre>
+ * <p>
+ * For reference see:
+ * <ul>
+ *   <li>Sennrich, R., Haddow, B., &amp; Birch, A. (2016).
+ *       Neural Machine Translation of Rare Words with Subword Units.
+ *       <a href="https://arxiv.org/abs/1508.07909">
+ *       https://arxiv.org/abs/1508.07909</a>
+ *   </li>
+ * </ul>
+ *
+ * @see BPETokenizer
+ * @see BPEModel
+ */
+public final class BPETokenizerTrainer implements Trainer<Parameters> {
+
+  // Stored by init(...); not read by train(...) in this class.
+  private Parameters trainingParameters;
+  private Map<String, String> reportMap;
+  private TrainingConfiguration trainingConfiguration;
+
+  /**
+   * Creates a new {@link BPETokenizerTrainer}.
+   */
+  public BPETokenizerTrainer() {
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  public void init(final Parameters trainParams,
+                   final Map<String, String> reportMap) {
+    this.trainingParameters = trainParams;
+    this.reportMap = reportMap;
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  public void init(final Parameters trainParams,
+                   final Map<String, String> reportMap,
+                   final TrainingConfiguration config) {
+    init(trainParams, reportMap);
+    this.trainingConfiguration = config;
+  }
+
+  /**
+   * Learns BPE merge operations from a training corpus
+   * and returns a {@link BPEModel}.
+   *
+   * @param corpus       An iterable of text strings
+   *                     (e.g., sentences or documents).
+   *                     Must not be {@code null}.
+   * @param numMerges    The number of merge operations
+   *                     to learn. Must be positive.
+   * @param languageCode The ISO language code
+   *                     (e.g., "en", "de").
+   *                     Must not be {@code null}.
+   * @return A trained {@link BPEModel} containing the
+   *         learned merge operations.
+   * @throws IllegalArgumentException if {@code numMerges}
+   *         is not positive, or if {@code corpus} or
+   *         {@code languageCode} is {@code null}.
+   */
+  public BPEModel train(final Iterable<String> corpus,
+                         final int numMerges,
+                         final String languageCode) {
+    if (corpus == null) {
+      throw new IllegalArgumentException(
+          "corpus must not be null");
+    }
+    if (languageCode == null) {
+      throw new IllegalArgumentException(
+          "languageCode must not be null");
+    }
+    if (numMerges <= 0) {
+      throw new IllegalArgumentException(
+          "numMerges must be positive, got: " + numMerges);
+    }
+
+    final List<SymbolPair> merges = learnMerges(corpus, numMerges);
+    final BPETokenizerFactory factory =
+        new BPETokenizerFactory(languageCode);
+
+    return new BPEModel(merges, new HashMap<>(), factory);
+  }
+
+  /**
+   * Learns BPE merge operations from the given corpus.
+   * <p>
+   * The algorithm proceeds as follows:
+   * <ol>
+   *   <li>Builds a word frequency map from the corpus using whitespace tokenization.</li>
+   *   <li>Converts each word into a character-level symbol sequence with an
+   *       end-of-word marker on the final character.</li>
+   *   <li>Iteratively counts all adjacent symbol pairs (weighted by word frequency),
+   *       selects the most frequent pair, records it as a merge operation, and applies
+   *       the merge to all vocabulary entries.</li>
+   *   <li>Stops after {@code numMerges} iterations or when no further pairs exist.</li>
+   * </ol>
+   *
+   * @param corpus    The training corpus, where each element is a text string.
+   * @param numMerges The maximum number of merge operations to learn.
+   * @return An ordered list of learned {@link SymbolPair} merge operations.
+   */
+  private List<SymbolPair> learnMerges(
+      final Iterable<String> corpus,
+      final int numMerges) {
+    // Step 1: Build word frequency map from corpus
+    final Map<String, Integer> wordFreqs = new HashMap<>();
+    for (final String line : corpus) {
+      final String[] words = WhitespaceTokenizer.INSTANCE.tokenize(line);
+      for (final String word : words) {
+        wordFreqs.merge(word, 1, Integer::sum);
+      }
+    }
+
+    // Step 2: Convert to symbol sequences with frequencies
+    Map<List<String>, Integer> vocab = new HashMap<>();
+    for (final Map.Entry<String, Integer> entry : wordFreqs.entrySet()) {
+      vocab.put(splitToSymbols(entry.getKey()), entry.getValue());
+    }
+
+    // Step 3: Iteratively learn merges
+    final List<SymbolPair> merges = new ArrayList<>();
+
+    for (int step = 0; step < numMerges; step++) {
+      // Count all adjacent pairs, weighting each by the frequency of its word
+      final Map<SymbolPair, Integer> pairCounts = new HashMap<>();
+      for (final Map.Entry<List<String>, Integer> entry : vocab.entrySet()) {
+        final List<String> symbols = entry.getKey();
+        final int freq = entry.getValue();
+        for (int i = 0; i < symbols.size() - 1; i++) {
+          final SymbolPair pair = new SymbolPair(
+              symbols.get(i), symbols.get(i + 1));
+          pairCounts.merge(pair, freq, Integer::sum);
+        }
+      }
+
+      // Every word has collapsed into a single symbol: nothing left to merge
+      if (pairCounts.isEmpty()) {
+        break;
+      }
+
+      // Find most frequent pair. Among equally frequent pairs the one that
+      // comes first in the map's iteration order is kept (strict '>').
+      SymbolPair bestPair = null;
+      int bestCount = 0;
+      for (final Map.Entry<SymbolPair, Integer> entry : pairCounts.entrySet()) {
+        if (entry.getValue() > bestCount) {
+          bestCount = entry.getValue();
+          bestPair = entry.getKey();
+        }
+      }
+
+      if (bestPair == null || bestCount < 1) {
+        break;
+      }
+
+      merges.add(bestPair);
+
+      // Apply merge to vocabulary; entries that become identical are combined
+      final Map<List<String>, Integer> newVocab = new HashMap<>();
+      for (final Map.Entry<List<String>, Integer> entry : vocab.entrySet()) {
+        final List<String> merged = applyMerge(entry.getKey(), bestPair);
+        newVocab.merge(merged, entry.getValue(), Integer::sum);
+      }
+      vocab = newVocab;
+    }
+
+    return merges;
+  }
+
+  /**
+   * Splits a word into its initial BPE symbol sequence.
+   * Each Unicode code point becomes its own symbol, with
+   * {@link BPETokenizer#END_OF_WORD} appended to the final one.
+   * <p>
+   * Iterating by code point (rather than by {@code char}) keeps surrogate
+   * pairs together, so a learned merge never consists of an unpaired
+   * surrogate, which could not be encoded when the merges are written as
+   * UTF-8 text.
+   *
+   * @param word The word to split. Must not be {@code null}.
+   * @return A mutable list of symbols; empty if {@code word} is empty.
+   */
+  private List<String> splitToSymbols(final String word) {
+    final int length = word.length();
+    final List<String> symbols = new ArrayList<>(length);
+    int start = 0;
+    while (start < length) {
+      final int end = start + Character.charCount(word.codePointAt(start));
+      final String symbol = word.substring(start, end);
+      symbols.add(end == length ? symbol + BPETokenizer.END_OF_WORD : symbol);
+      start = end;
+    }
+    return symbols;
+  }
+
+  /**
+   * Applies a single merge operation to a symbol sequence.
+   * Scans the list for adjacent symbols matching the given pair and replaces
+   * each occurrence with a single concatenated symbol.
+   *
+   * @param symbols The current symbol sequence for a word.
+   * @param pair    The {@link SymbolPair} to merge.
+   * @return A new list with all occurrences of the pair merged.
+   */
+  private List<String> applyMerge(
+      final List<String> symbols,
+      final SymbolPair pair) {
+    final String mergedSymbol = pair.left() + pair.right();
+    final List<String> result = new ArrayList<>(symbols.size());
+    int i = 0;
+    while (i < symbols.size()) {
+      if (i < symbols.size() - 1
+          && symbols.get(i).equals(pair.left())
+          && symbols.get(i + 1).equals(pair.right())) {
+        result.add(mergedSymbol);
+        i += 2;
+      } else {
+        result.add(symbols.get(i));
+        i++;
+      }
+    }
+    return result;
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/AbstractBPEModelTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/AbstractBPEModelTest.java
new file mode 100644
index 00000000..22b50011
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/AbstractBPEModelTest.java
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.List;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.tokenize.BPETokenizer.SymbolPair;
+
+/**
+ * Abstract base class for {@link BPEModel} tests.
+ * <p>
+ * Subclasses provide a language-specific corpus and language code.
+ *
+ * @see BPEModel
+ */
+public abstract class AbstractBPEModelTest {
+
+  /**
+   * @return a corpus of sentences for training.
+   */
+  protected abstract List<String> getCorpus();
+
+  /**
+   * @return the ISO language code to use during training.
+   */
+  protected abstract String getLanguageCode();
+
+  protected BPEModel trainModel(int numMerges) {
+    return new BPETokenizerTrainer().train(getCorpus(), numMerges, 
getLanguageCode());
+  }
+
+  /**
+   * Tests that a model can be serialized and deserialized without data loss.
+   */
+  @Test
+  void testBPEModelSerialization() throws IOException {
+    final BPEModel model = trainModel(10);
+    Assertions.assertFalse(model.isLoadedFromSerialized());
+
+    try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
+      model.serialize(out);
+
+      final BPEModel restored = new BPEModel(new 
ByteArrayInputStream(out.toByteArray()));
+      Assertions.assertNotNull(restored);
+      Assertions.assertTrue(restored.isLoadedFromSerialized());
+    }
+  }
+
+  /**
+   * Tests that merge rules are preserved after serialization roundtrip.
+   */
+  @Test
+  void testMergesPreservedAfterSerialization() throws IOException {
+    final BPEModel original = trainModel(10);
+
+    try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
+      original.serialize(out);
+
+      final BPEModel restored = new BPEModel(new 
ByteArrayInputStream(out.toByteArray()));
+
+      final List<SymbolPair> originalMerges = original.getMerges();
+      final List<SymbolPair> restoredMerges = restored.getMerges();
+
+      Assertions.assertEquals(originalMerges.size(), restoredMerges.size());
+      for (int i = 0; i < originalMerges.size(); i++) {
+        Assertions.assertEquals(originalMerges.get(i), restoredMerges.get(i));
+      }
+    }
+  }
+
+  /**
+   * Tests that merge order is preserved — order determines priority.
+   */
+  @Test
+  void testMergeOrderPreserved() throws IOException {
+    final BPEModel model = trainModel(5);
+
+    try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
+      model.serialize(out);
+
+      final BPEModel restored = new BPEModel(new 
ByteArrayInputStream(out.toByteArray()));
+
+      // Verify exact order matches
+      Assertions.assertEquals(model.getMerges(), restored.getMerges());
+    }
+  }
+
+  /**
+   * Tests that a deserialized model can be used to tokenize text.
+   */
+  @Test
+  void testDeserializedModelCanTokenize() throws IOException {
+    final BPEModel original = trainModel(10);
+
+    try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
+      original.serialize(out);
+
+      final BPEModel loaded = new BPEModel(new 
ByteArrayInputStream(out.toByteArray()));
+      final BPETokenizer tokenizer = new BPETokenizer(loaded);
+
+      final String[] tokens = tokenizer.tokenize("low");
+      Assertions.assertTrue(tokens.length >= 1);
+      Assertions.assertEquals("low", String.join("", tokens));
+    }
+  }
+
+  /**
+   * Tests that the language code is preserved in the model.
+   */
+  @Test
+  void testLanguagePreserved() throws IOException {
+    final BPEModel model = trainModel(5);
+
+    try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
+      model.serialize(out);
+
+      final BPEModel restored = new BPEModel(new 
ByteArrayInputStream(out.toByteArray()));
+      Assertions.assertEquals(getLanguageCode(), restored.getLanguage());
+    }
+  }
+
+  /**
+   * Tests that the factory is accessible from a deserialized model.
+   */
+  @Test
+  void testFactoryAccessibleAfterDeserialization() throws IOException {
+    final BPEModel original = trainModel(5);
+
+    try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
+      original.serialize(out);
+
+      final BPEModel restored = new BPEModel(new 
ByteArrayInputStream(out.toByteArray()));
+      Assertions.assertNotNull(restored.getFactory());
+      Assertions.assertInstanceOf(BPETokenizerFactory.class, 
restored.getFactory());
+    }
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/AbstractBPETokenizerRealisticTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/AbstractBPETokenizerRealisticTest.java
new file mode 100644
index 00000000..e93b62d9
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/AbstractBPETokenizerRealisticTest.java
@@ -0,0 +1,347 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.TestInstance;
+
+import opennlp.tools.util.Span;
+
+/**
+ * Language-independent end-to-end tests for BPE tokenization.
+ * <p>
+ * A concrete subclass contributes the training corpus and the test inputs for
+ * one language. The model is trained once per test class and then shared by
+ * the test methods, which cover training, tokenization, serialization and the
+ * consistency between tokens and spans.
+ *
+ * @see BPETokenizer
+ * @see BPETokenizerTrainer
+ * @see BPEModel
+ */
+@TestInstance(TestInstance.Lifecycle.PER_CLASS)
+abstract class AbstractBPETokenizerRealisticTest {
+
+  private BPEModel trainedModel;
+
+  // --- Language-specific data supplied by subclasses ---
+
+  /**
+   * Returns the corpus the shared model is trained on.
+   */
+  abstract List<String> getTrainingCorpus();
+
+  /**
+   * Returns the ISO language code (e.g., "en", "de", "fr").
+   */
+  abstract String getLanguageCode();
+
+  /**
+   * Returns the number of BPE merges used to train the shared model.
+   * Subclasses may override this; the default is 100.
+   */
+  int getNumMerges() {
+    return 100;
+  }
+
+  /**
+   * Returns a simple sentence consisting only of words from the training corpus.
+   */
+  abstract String getSimpleSentence();
+
+  /**
+   * Returns the words expected for {@link #getSimpleSentence()}.
+   */
+  abstract String[] getSimpleSentenceExpectedWords();
+
+  /**
+   * Returns words that are expected to come out as a single token each.
+   */
+  abstract List<String> getFrequentWords();
+
+  /**
+   * Returns a word that does not occur in the training corpus.
+   */
+  abstract String getUnseenWord();
+
+  /**
+   * Returns the sentence used by the span coverage test.
+   */
+  abstract String getSpanTestSentence();
+
+  /**
+   * Returns the words expected for {@link #getSpanTestSentence()}.
+   */
+  abstract String[] getSpanTestExpectedWords();
+
+  /**
+   * Returns a sentence of several words for the general tokenization test.
+   */
+  abstract String getMultiWordSentence();
+
+  /**
+   * Returns the sentence used by the serialization round-trip test.
+   */
+  abstract String getSerializationTestSentence();
+
+  /**
+   * Returns the sentence used to compare tokenize() with tokenizePos().
+   */
+  abstract String getConsistencyTestSentence();
+
+  /**
+   * Returns a sentence that contains punctuation.
+   */
+  abstract String getPunctuationTestSentence();
+
+  /**
+   * Returns the words (whitespace-delimited, punctuation attached) expected
+   * for {@link #getPunctuationTestSentence()}.
+   */
+  abstract String[] getExpectedPunctuationWords();
+
+  /**
+   * Returns the sentence used to compare models trained with few and many merges.
+   */
+  abstract String getCoarseTokenizationSentence();
+
+  /**
+   * Trains the model shared by all tests of the concrete class.
+   */
+  @BeforeAll
+  void setUpClass() {
+    final BPETokenizerTrainer trainer = new BPETokenizerTrainer();
+    trainedModel = trainer.train(
+        getTrainingCorpus(), getNumMerges(), getLanguageCode());
+  }
+
+  /**
+   * Creates a new tokenizer backed by the shared trained model.
+   */
+  private BPETokenizer newTokenizer() {
+    return new BPETokenizer(trainedModel);
+  }
+
+  /**
+   * Tokenizes a simple sentence made of corpus words and checks that the
+   * words rebuilt from the spans equal the expected words.
+   */
+  @Test
+  void testTokenizerSimpleModel() {
+    final BPETokenizer tokenizer = newTokenizer();
+    final String sentence = getSimpleSentence();
+
+    final String[] subwords = tokenizer.tokenize(sentence);
+    final Span[] positions = tokenizer.tokenizePos(sentence);
+
+    final String[] rebuilt = reconstructWords(subwords, positions, sentence);
+    Assertions.assertArrayEquals(getSimpleSentenceExpectedWords(), rebuilt);
+  }
+
+  /**
+   * Every frequent word must be tokenized into exactly one token that equals
+   * the word itself.
+   */
+  @Test
+  void testFrequentWordsTokenizeEfficiently() {
+    final BPETokenizer tokenizer = newTokenizer();
+
+    for (final String frequent : getFrequentWords()) {
+      final String[] subwords = tokenizer.tokenize(frequent);
+      Assertions.assertEquals(1, subwords.length,
+          "Expected '" + frequent + "' as single token");
+      Assertions.assertEquals(frequent, subwords[0]);
+    }
+  }
+
+  /**
+   * An unseen word must be split into more than one subword piece, and the
+   * pieces must concatenate back to the original word.
+   */
+  @Test
+  void testUnseenWordsTokenization() {
+    final BPETokenizer tokenizer = newTokenizer();
+    final String unseen = getUnseenWord();
+
+    final String[] subwords = tokenizer.tokenize(unseen);
+
+    Assertions.assertTrue(subwords.length > 1,
+        "Unseen word '" + unseen
+            + "' should be split into multiple subword tokens");
+    Assertions.assertEquals(unseen, String.join("", subwords),
+        "Concatenation of subword tokens must reconstruct the original word");
+  }
+
+  /**
+   * Checks the spans returned by tokenizePos: each covers non-empty text,
+   * stitching them together with the text in between yields the input again,
+   * and the words rebuilt from them equal the expected words.
+   */
+  @Test
+  void testTokenizePosSpanCoverage() {
+    final BPETokenizer tokenizer = newTokenizer();
+    final String sentence = getSpanTestSentence();
+    final String[] subwords = tokenizer.tokenize(sentence);
+    final Span[] positions = tokenizer.tokenizePos(sentence);
+
+    // No span may cover an empty piece of the input
+    for (int i = 0; i < positions.length; i++) {
+      final CharSequence piece = positions[i].getCoveredText(sentence);
+      Assertions.assertNotNull(piece);
+      Assertions.assertFalse(piece.toString().isEmpty());
+    }
+
+    // Spans plus the text between them must add up to the original input
+    final StringBuilder rebuilt = new StringBuilder();
+    int cursor = 0;
+    for (final Span position : positions) {
+      final int start = position.getStart();
+      if (cursor < start) {
+        rebuilt.append(sentence, cursor, start);
+      }
+      rebuilt.append(position.getCoveredText(sentence));
+      cursor = position.getEnd();
+    }
+    Assertions.assertEquals(sentence, rebuilt.toString());
+
+    // Words rebuilt from the spans must match the expectation
+    final String[] words = reconstructWords(subwords, positions, sentence);
+    Assertions.assertArrayEquals(getSpanTestExpectedWords(), words);
+  }
+
+  /**
+   * Tokenizes a sentence of several words: there must be at least one token
+   * per word, and the words rebuilt from the spans must equal the
+   * space-separated words of the sentence.
+   */
+  @Test
+  void testTokenizer() {
+    final BPETokenizer tokenizer = newTokenizer();
+    final String sentence = getMultiWordSentence();
+    final String[] subwords = tokenizer.tokenize(sentence);
+
+    // At least one token per word
+    final String[] expectedWords = sentence.split(" ");
+    Assertions.assertTrue(subwords.length >= expectedWords.length);
+
+    // Rebuild the words from the subword spans
+    final Span[] positions = tokenizer.tokenizePos(sentence);
+    final String[] rebuilt = reconstructWords(subwords, positions, sentence);
+    Assertions.assertArrayEquals(expectedWords, rebuilt);
+  }
+
+  /**
+   * Runs the full pipeline: the trained model is serialized, read back, and
+   * must then tokenize a sentence exactly like the original model.
+   */
+  @Test
+  void testTrainSerializeDeserializeTokenize() throws IOException {
+    try (ByteArrayOutputStream buffer = new ByteArrayOutputStream()) {
+      // Write the model and read it back from the same bytes
+      trainedModel.serialize(buffer);
+      final BPEModel reloaded =
+          new BPEModel(new ByteArrayInputStream(buffer.toByteArray()));
+
+      // Both models must produce identical tokens
+      final BPETokenizer fromTrained = newTokenizer();
+      final BPETokenizer fromReloaded = new BPETokenizer(reloaded);
+
+      final String sentence = getSerializationTestSentence();
+      final String[] expected = fromTrained.tokenize(sentence);
+      final String[] actual = fromReloaded.tokenize(sentence);
+      Assertions.assertArrayEquals(expected, actual);
+    }
+  }
+
+  /**
+   * Checks the {@link Tokenizer} contract: tokenize() and tokenizePos() must
+   * return the same number of results, and each token must equal the text
+   * covered by the span at the same index.
+   */
+  @Test
+  void testTokenizeAndTokenizePosConsistency() {
+    final BPETokenizer tokenizer = newTokenizer();
+    final String sentence = getConsistencyTestSentence();
+
+    final String[] subwords = tokenizer.tokenize(sentence);
+    final Span[] positions = tokenizer.tokenizePos(sentence);
+
+    Assertions.assertEquals(subwords.length, positions.length);
+
+    for (int i = 0; i < subwords.length; i++) {
+      final String covered = positions[i].getCoveredText(sentence).toString();
+      Assertions.assertEquals(subwords[i], covered,
+          "Token at index " + i + " should match span-covered text");
+    }
+  }
+
+  /**
+   * Tokenizes a sentence containing punctuation. The expected words keep the
+   * punctuation attached, so the words rebuilt from the spans must match them.
+   */
+  @Test
+  void testPunctuationHandling() {
+    final BPETokenizer tokenizer = newTokenizer();
+    final String sentence = getPunctuationTestSentence();
+    final String[] expected = getExpectedPunctuationWords();
+
+    final String[] subwords = tokenizer.tokenize(sentence);
+    final Span[] positions = tokenizer.tokenizePos(sentence);
+
+    final String[] rebuilt = reconstructWords(subwords, positions, sentence);
+    Assertions.assertEquals(expected.length, rebuilt.length);
+    Assertions.assertArrayEquals(expected, rebuilt);
+  }
+
+  /**
+   * Trains one model with 5 and one with 100 merges on the same corpus; the
+   * model with more merges must not produce more tokens for the same text.
+   */
+  @Test
+  void testMoreMergesProducesCoarserTokens() {
+    final List<String> corpus = getTrainingCorpus();
+    final String languageCode = getLanguageCode();
+
+    final BPEModel smallModel =
+        new BPETokenizerTrainer().train(corpus, 5, languageCode);
+    final BPEModel largeModel =
+        new BPETokenizerTrainer().train(corpus, 100, languageCode);
+
+    final BPETokenizer fineTokenizer = new BPETokenizer(smallModel);
+    final BPETokenizer coarseTokenizer = new BPETokenizer(largeModel);
+
+    final String sentence = getCoarseTokenizationSentence();
+    final int fewCount = fineTokenizer.tokenize(sentence).length;
+    final int manyCount = coarseTokenizer.tokenize(sentence).length;
+
+    Assertions.assertTrue(manyCount <= fewCount,
+        "More merges (" + manyCount
+            + " tokens) should produce fewer or equal tokens "
+            + "than fewer merges (" + fewCount + " tokens)");
+  }
+
+  /**
+   * Rebuilds words from subword spans. Consecutive spans that touch each
+   * other are joined into one word; a gap between the end of one span and the
+   * start of the next begins a new word.
+   *
+   * @param tokens The subword tokens; not read by this method.
+   * @param spans  The spans of the subword tokens within {@code text}.
+   * @param text   The text the spans refer to.
+   * @return The rebuilt words in order of appearance.
+   */
+  String[] reconstructWords(String[] tokens, Span[] spans, String text) {
+    final List<String> rebuilt = new ArrayList<>();
+    final StringBuilder pending = new StringBuilder();
+    int previousEnd = -1;
+
+    for (int i = 0; i < spans.length; i++) {
+      final Span span = spans[i];
+      // previousEnd stays negative until the first span has been consumed
+      final boolean gapBefore = previousEnd >= 0 && previousEnd < span.getStart();
+      if (gapBefore) {
+        rebuilt.add(pending.toString());
+        pending.setLength(0);
+      }
+      pending.append(span.getCoveredText(text));
+      previousEnd = span.getEnd();
+    }
+    if (pending.length() > 0) {
+      rebuilt.add(pending.toString());
+    }
+
+    return rebuilt.toArray(new String[0]);
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPEModelDeTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPEModelDeTest.java
new file mode 100644
index 00000000..86b35358
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPEModelDeTest.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.List;
+
+/**
+ * Runs the shared {@link BPEModel} test suite against a German corpus.
+ *
+ * @see AbstractBPEModelTest
+ * @see BPEModel
+ */
+public class BPEModelDeTest extends AbstractBPEModelTest {
+
+  private static final String LANGUAGE = "de";
+
+  // Three German sentences including umlauts, sharp s and punctuation
+  private static final List<String> SENTENCES = List.of(
+      "Der schnelle braune Fuchs springt über den faulen Hund, der gerade schlief; "
+          + "er hat ihn nicht kommen sehen",
+      "Natürliche Sprachverarbeitung ist faszinierend: sie verbindet Linguistik, "
+          + "Informatik und Statistik",
+      "Nachdem der Regen aufgehört hatte, kam die Sonne heraus; die Kinder spielten "
+          + "draußen und die Vögel sangen laut"
+  );
+
+  @Override
+  protected String getLanguageCode() {
+    return LANGUAGE;
+  }
+
+  @Override
+  protected List<String> getCorpus() {
+    return SENTENCES;
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPEModelEnTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPEModelEnTest.java
new file mode 100644
index 00000000..123bb3a6
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPEModelEnTest.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.List;
+
+/**
+ * Runs the shared {@link BPEModel} test suite against an English corpus.
+ *
+ * @see AbstractBPEModelTest
+ * @see BPEModel
+ */
+public class BPEModelEnTest extends AbstractBPEModelTest {
+
+  private static final String LANGUAGE = "en";
+
+  // Three English sentences including punctuation
+  private static final List<String> SENTENCES = List.of(
+      "The quick brown fox jumps over the lazy dog, which was sleeping; "
+          + "it never saw the fox coming",
+      "Natural language processing is fascinating: it combines linguistics, "
+          + "computer science, and statistics",
+      "After the rain stopped, the sun came out; the children played outside, "
+          + "and the birds sang loudly"
+  );
+
+  @Override
+  protected String getLanguageCode() {
+    return LANGUAGE;
+  }
+
+  @Override
+  protected List<String> getCorpus() {
+    return SENTENCES;
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPEModelFrTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPEModelFrTest.java
new file mode 100644
index 00000000..48a3fbb0
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPEModelFrTest.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.List;
+
+/**
+ * Runs the shared {@link BPEModel} test suite against a French corpus.
+ *
+ * @see AbstractBPEModelTest
+ * @see BPEModel
+ */
+public class BPEModelFrTest extends AbstractBPEModelTest {
+
+  private static final String LANGUAGE = "fr";
+
+  // Three French sentences including accents, an apostrophe and a hyphen
+  private static final List<String> SENTENCES = List.of(
+      "Le renard brun rapide saute par-dessus le chien paresseux, qui dormait; "
+          + "il ne l'a jamais vu venir",
+      "Le traitement du langage naturel est fascinant: il combine la linguistique "
+          + "et les statistiques",
+      "Après la pluie, le soleil est apparu; les enfants ont joué dehors, "
+          + "et les oiseaux ont chanté fort"
+  );
+
+  @Override
+  protected String getLanguageCode() {
+    return LANGUAGE;
+  }
+
+  @Override
+  protected List<String> getCorpus() {
+    return SENTENCES;
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerFactoryTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerFactoryTest.java
new file mode 100644
index 00000000..4ca60e1f
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerFactoryTest.java
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.List;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.tokenize.BPETokenizer.SymbolPair;
+
+/**
+ * Tests for the {@link BPETokenizerFactory} class.
+ * <p>
+ * Covers the language code, the artifact serializer map, and the availability
+ * of the merge rules on a model before and after a serialization round trip.
+ *
+ * @see BPETokenizerFactory
+ * @see BPEModel
+ */
+public class BPETokenizerFactoryTest {
+
+  private static final String LANGUAGE = "en";
+
+  private static final List<String> CORPUS = List.of(
+      "low low low low low",
+      "lower lower lower",
+      "newest newest newest newest"
+  );
+
+  /**
+   * Trains a model on {@link #CORPUS} with the given number of merges.
+   */
+  private static BPEModel train(final int numMerges) {
+    return new BPETokenizerTrainer().train(CORPUS, numMerges, LANGUAGE);
+  }
+
+  /**
+   * Serializes the given model into memory and reads it back.
+   */
+  private static BPEModel roundTrip(final BPEModel model) throws IOException {
+    final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+    model.serialize(buffer);
+    return new BPEModel(new ByteArrayInputStream(buffer.toByteArray()));
+  }
+
+  /**
+   * A trained model has a factory and a non-empty list of merge rules.
+   */
+  @Test
+  void testFactoryProvidesMerges() {
+    final BPEModel trained = train(10);
+    final BPETokenizerFactory factory = trained.getFactory();
+
+    Assertions.assertNotNull(factory);
+    Assertions.assertNotNull(trained.getMerges());
+    Assertions.assertFalse(trained.getMerges().isEmpty());
+  }
+
+  /**
+   * The factory reports the language code it was constructed with.
+   */
+  @Test
+  void testFactoryLanguageCode() {
+    final BPETokenizerFactory factory = new BPETokenizerFactory("de");
+
+    Assertions.assertEquals("de", factory.getLanguageCode());
+  }
+
+  /**
+   * After a serialization round trip the model still has a factory and the
+   * same number of merge rules as before.
+   */
+  @Test
+  void testFactorySurvivesSerialization() throws IOException {
+    final BPEModel trained = train(10);
+    final BPEModel reloaded = roundTrip(trained);
+    final BPETokenizerFactory factory = reloaded.getFactory();
+
+    Assertions.assertNotNull(factory);
+    Assertions.assertNotNull(reloaded.getMerges());
+    Assertions.assertEquals(
+        trained.getMerges().size(), reloaded.getMerges().size());
+  }
+
+  /**
+   * The merge rules read before serialization equal those of the reloaded model.
+   */
+  @Test
+  void testMergesConsistentAfterRoundtrip() throws IOException {
+    final BPEModel trained = train(5);
+    final List<SymbolPair> before = trained.getMerges();
+
+    final List<SymbolPair> after = roundTrip(trained).getMerges();
+
+    Assertions.assertEquals(before, after);
+  }
+
+  /**
+   * The artifact serializer map of the factory has an entry for "merges".
+   */
+  @Test
+  void testArtifactSerializersMapContainsMergesSerializer() {
+    final BPETokenizerFactory factory = new BPETokenizerFactory(LANGUAGE);
+
+    final boolean hasMergesSerializer =
+        factory.createArtifactSerializersMap().containsKey("merges");
+    Assertions.assertTrue(hasMergesSerializer);
+  }
+
+  /**
+   * A trained model exposes a non-empty list of merge rules. The artifact map
+   * is not inspected directly; only {@link BPEModel#getMerges()} is checked.
+   */
+  @Test
+  void testArtifactMapContainsMergesEntry() {
+    final BPEModel trained = train(5);
+
+    Assertions.assertNotNull(trained.getMerges());
+    Assertions.assertFalse(trained.getMerges().isEmpty());
+  }
+
+  /**
+   * The no-argument constructor (for model loading) yields a factory that can
+   * create its artifact serializer map.
+   */
+  @Test
+  void testEmptyConstructor() {
+    final BPETokenizerFactory factory = new BPETokenizerFactory();
+
+    // Neither construction nor map creation may fail
+    Assertions.assertNotNull(factory);
+    Assertions.assertNotNull(factory.createArtifactSerializersMap());
+  }
+
+  /**
+   * Constructing a factory with a {@code null} language code is rejected with
+   * an {@link IllegalArgumentException}.
+   */
+  @Test
+  void testNullLanguageCodeThrows() {
+    Assertions.assertThrows(IllegalArgumentException.class,
+        () -> new BPETokenizerFactory(null));
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticDeTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticDeTest.java
new file mode 100644
index 00000000..95e853c5
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticDeTest.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.List;
+
+/**
+ * German-language realistic BPE tokenization integration tests.
+ *
+ * @see AbstractBPETokenizerRealisticTest
+ */
+public class BPETokenizerRealisticDeTest extends AbstractBPETokenizerRealisticTest {
+
+  @Override
+  List<String> getTrainingCorpus() {
+    return List.of(
+        "Ich habe gestern einen alten Schulfreund in der Stadt getroffen",
+        "Er hat mir von seiner neuen Arbeit in Berlin erzählt",
+        "Die Arbeit ist sehr interessant und er ist sehr zufrieden",
+        "Wir haben zusammen in einem kleinen Restaurant zu Mittag gegessen",
+        "Das Essen war ausgezeichnet und die Bedienung sehr freundlich",
+        "Nach dem Essen sind wir durch den Park spazieren gegangen",
+        "Der Park war sehr schön und die Bäume hatten bunte Blätter",
+        "Am Abend haben wir uns einen Film im Kino angesehen",
+        "Der Film war spannend und hat uns beiden sehr gut gefallen",
+        "Danach sind wir noch in eine Bar gegangen und haben geredet",
+        "Er hat mir von seiner Reise nach Italien erzählt",
+        "Die Reise war wunderbar und er hat viele Fotos gemacht",
+        "Ich habe ihm von meiner Arbeit an die Monographie erzählt",
+        "Die Monographie behandelt die Geschichte der botanischen Forschung",
+        "Er fand das Thema sehr interessant und wollte mehr erfahren",
+        "Wir haben uns verabredet nächste Woche wieder zu treffen",
+        "Ich freue mich schon sehr auf unser nächstes Treffen",
+        "Die Stadt ist im Herbst besonders schön mit den bunten Blättern",
+        "Mein Freund wohnt jetzt in der Nähe vom Hauptbahnhof",
+        "Er nimmt jeden Tag die Bahn zur Arbeit in die Innenstadt"
+    );
+  }
+
+  @Override
+  String getLanguageCode() {
+    return "de";
+  }
+
+  @Override
+  String getSimpleSentence() {
+    return "Die Arbeit ist sehr interessant";
+  }
+
+  @Override
+  String[] getSimpleSentenceExpectedWords() {
+    return new String[] {"Die", "Arbeit", "ist", "sehr", "interessant"};
+  }
+
+  @Override
+  List<String> getFrequentWords() {
+    return List.of("die", "und", "er");
+  }
+
+  @Override
+  String getUnseenWord() {
+    return "Wissenschaftler";
+  }
+
+  @Override
+  String getSpanTestSentence() {
+    return "die Monographie behandelt die Geschichte";
+  }
+
+  @Override
+  String[] getSpanTestExpectedWords() {
+    return new String[] {"die", "Monographie", "behandelt", "die", "Geschichte"};
+  }
+
+  @Override
+  String getMultiWordSentence() {
+    return "Er hat mir von seiner Reise erzählt";
+  }
+
+  @Override
+  String getSerializationTestSentence() {
+    return "Wir haben zusammen in einem Restaurant gegessen";
+  }
+
+  @Override
+  String getConsistencyTestSentence() {
+    return "Der Park war sehr schön und die Bäume hatten bunte Blätter";
+  }
+
+  @Override
+  String getPunctuationTestSentence() {
+    return "Hallo, Welt!";
+  }
+
+  @Override
+  String[] getExpectedPunctuationWords() {
+    return new String[] {"Hallo,", "Welt!"};
+  }
+
+  @Override
+  String getCoarseTokenizationSentence() {
+    return "Ich habe ihm von meiner Arbeit an die Monographie erzählt";
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticEnTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticEnTest.java
new file mode 100644
index 00000000..f8abde9d
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticEnTest.java
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.List;
+
+/**
+ * English-language realistic BPE tokenization integration tests.
+ *
+ * @see AbstractBPETokenizerRealisticTest
+ */
+public class BPETokenizerRealisticEnTest extends AbstractBPETokenizerRealisticTest {
+
+  @Override
+  List<String> getTrainingCorpus() {
+    return List.of(
+        "Last September I tried to find out the address of an old school friend",
+        "whom I had not seen for 15 years",
+        "I just knew his name Alan McKennedy and I had heard the rumour",
+        "that he had moved to Scotland the country of his ancestors",
+        "So I called Julie a friend who is still in contact with him",
+        "She told me that he lived in Edinburgh Worcesterstreet 12",
+        "I wrote him a letter right away and he answered soon",
+        "sounding very happy and delighted",
+        "Last year I wanted to write a letter to my grandaunt",
+        "Her 86th birthday was on October 6 and I no longer wanted",
+        "to be hesitant to get in touch with her",
+        "I did not know her face to face and so it was not easy",
+        "for me to find out her address",
+        "As she had two apartments in different countries",
+        "I decided to write to both",
+        "The first was in Paris in Rue de Grandes Illusions 5",
+        "But Marie Clara as my aunt is called preferred her apartment in Berlin",
+        "She lived there in beautiful Kaiserstrasse 13 particularly in summer",
+        "Hi my name is Michael Graf how much is a taxi",
+        "from Ostbahnhof to Hauptbahnhof",
+        "About 10 Euro I reckon",
+        "That sounds good",
+        "So please call a driver to Leonardstrasse 112 near the Ostbahnhof",
+        "I would like to be at Silberhornstrasse 12 as soon as possible",
+        "Thank you very much"
+    );
+  }
+
+  @Override
+  String getLanguageCode() {
+    return "en";
+  }
+
+  @Override
+  String getSimpleSentence() {
+    return "I wrote a letter";
+  }
+
+  @Override
+  String[] getSimpleSentenceExpectedWords() {
+    return new String[] {"I", "wrote", "a", "letter"};
+  }
+
+  @Override
+  List<String> getFrequentWords() {
+    return List.of("the", "in");
+  }
+
+  @Override
+  String getUnseenWord() {
+    return "unbelievable";
+  }
+
+  @Override
+  String getSpanTestSentence() {
+    return "She lived in Edinburgh";
+  }
+
+  @Override
+  String[] getSpanTestExpectedWords() {
+    return new String[] {"She", "lived", "in", "Edinburgh"};
+  }
+
+  @Override
+  String getMultiWordSentence() {
+    return "I had not seen him for years";
+  }
+
+  @Override
+  String getSerializationTestSentence() {
+    return "I wrote him a letter right away";
+  }
+
+  @Override
+  String getConsistencyTestSentence() {
+    return "She told me that he lived in Edinburgh";
+  }
+
+  @Override
+  String getPunctuationTestSentence() {
+    return "Hello, world!";
+  }
+
+  @Override
+  String[] getExpectedPunctuationWords() {
+    return new String[] {"Hello,", "world!"};
+  }
+
+  @Override
+  String getCoarseTokenizationSentence() {
+    return "I wanted to write a letter to my grandaunt";
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticEsTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticEsTest.java
new file mode 100644
index 00000000..23be28db
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticEsTest.java
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.List;
+
+/**
+ * Spanish-language realistic BPE tokenization integration tests.
+ *
+ * @see AbstractBPETokenizerRealisticTest
+ */
+public class BPETokenizerRealisticEsTest
+    extends AbstractBPETokenizerRealisticTest {
+
+  @Override
+  List<String> getTrainingCorpus() {
+    return List.of(
+        "Ayer fui al mercado para comprar frutas y"
+            + " verduras frescas",
+        "El mercado estaba lleno de gente y los precios"
+            + " eran buenos",
+        "Las manzanas y las naranjas estaban muy frescas"
+            + " y baratas",
+        "Volví a casa y preparé una comida muy buena"
+            + " para todos",
+        "La comida estaba deliciosa y toda la familia"
+            + " estaba contenta",
+        "Después de la comida hicimos un paseo por el"
+            + " parque grande",
+        "El parque estaba muy bonito con los árboles"
+            + " en flor",
+        "Los niños jugaban en el jardín y los pájaros"
+            + " cantaban",
+        "Por la noche vimos una película muy buena"
+            + " en la televisión",
+        "La película era muy interesante y nos gustó"
+            + " mucho a todos",
+        "Mi amigo Carlos vive en una casa grande"
+            + " en Madrid",
+        "Él trabaja en una empresa de tecnología"
+            + " desde hace cinco años",
+        "Su esposa María es profesora en la universidad"
+            + " central",
+        "Tienen dos hijos que van a una escuela cerca"
+            + " de la casa",
+        "Los fines de semana les gusta hacer excursiones"
+            + " por el campo",
+        "Madrid es una ciudad muy bonita con una"
+            + " historia muy rica",
+        "La cocina española es conocida en todo el"
+            + " mundo por su calidad",
+        "Los museos de Madrid atraen a millones de"
+            + " visitantes cada año",
+        "El Prado es el museo más visitado de toda"
+            + " la ciudad",
+        "La vida en España es muy agradable y el clima"
+            + " es muy bueno"
+    );
+  }
+
+  @Override
+  String getLanguageCode() {
+    return "es";
+  }
+
+  @Override
+  String getSimpleSentence() {
+    return "La comida estaba deliciosa";
+  }
+
+  @Override
+  String[] getSimpleSentenceExpectedWords() {
+    return new String[] {
+        "La", "comida", "estaba", "deliciosa"
+    };
+  }
+
+  @Override
+  List<String> getFrequentWords() {
+    return List.of("muy", "los", "en");
+  }
+
+  @Override
+  String getUnseenWord() {
+    return "impresionante";
+  }
+
+  @Override
+  String getSpanTestSentence() {
+    return "Los niños jugaban en el jardín";
+  }
+
+  @Override
+  String[] getSpanTestExpectedWords() {
+    return new String[] {
+        "Los", "niños", "jugaban", "en", "el", "jardín"
+    };
+  }
+
+  @Override
+  String getMultiWordSentence() {
+    return "El parque estaba muy bonito con los árboles"
+        + " en flor";
+  }
+
+  @Override
+  String getSerializationTestSentence() {
+    return "Ayer fui al mercado para comprar frutas"
+        + " y verduras";
+  }
+
+  @Override
+  String getConsistencyTestSentence() {
+    return "Mi amigo Carlos vive en una casa grande"
+        + " en Madrid";
+  }
+
+  @Override
+  String getPunctuationTestSentence() {
+    return "Hola, mundo!";
+  }
+
+  @Override
+  String[] getExpectedPunctuationWords() {
+    return new String[] {"Hola,", "mundo!"};
+  }
+
+  @Override
+  String getCoarseTokenizationSentence() {
+    return "La cocina española es conocida en todo"
+        + " el mundo";
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticFrTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticFrTest.java
new file mode 100644
index 00000000..3f3eda64
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticFrTest.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.List;
+
+/**
+ * French-language realistic BPE tokenization integration tests.
+ *
+ * @see AbstractBPETokenizerRealisticTest
+ */
+public class BPETokenizerRealisticFrTest
+    extends AbstractBPETokenizerRealisticTest {
+
+  @Override
+  List<String> getTrainingCorpus() {
+    return List.of(
+        "Hier je suis allé au marché pour acheter des fruits"
+            + " et des légumes",
+        "Le marché était plein de monde et les prix"
+            + " étaient raisonnables",
+        "Les pommes et les oranges étaient"
+            + " particulièrement fraîches",
+        "Je suis rentré à la maison et j'ai préparé"
+            + " un bon repas",
+        "Le repas était délicieux et toute la famille"
+            + " était contente",
+        "Après le repas nous avons fait une promenade"
+            + " dans le parc",
+        "Le parc était magnifique avec les arbres"
+            + " en fleurs",
+        "Les enfants jouaient dans le jardin et les"
+            + " oiseaux chantaient",
+        "Le soir nous avons regardé un film"
+            + " à la télévision",
+        "Le film était très intéressant et nous avons"
+            + " bien aimé",
+        "Mon ami Pierre habite dans une grande maison"
+            + " à Paris",
+        "Il travaille dans une entreprise de technologie"
+            + " depuis cinq ans",
+        "Sa femme Marie est professeur à une"
+            + " université",
+        "Ils ont deux enfants qui vont à une école"
+            + " près de la maison",
+        "Le weekend ils aiment faire des randonnées"
+            + " dans la campagne",
+        "La France est un beau pays avec une riche"
+            + " histoire",
+        "Paris est la capitale et la plus grande ville"
+            + " du pays",
+        "La cuisine française est connue dans le monde"
+            + " entier",
+        "Les musées de Paris attirent des millions"
+            + " de visiteurs chaque année",
+        "La Tour Eiffel est le monument le plus visité"
+            + " de France"
+    );
+  }
+
+  @Override
+  String getLanguageCode() {
+    return "fr";
+  }
+
+  @Override
+  String getSimpleSentence() {
+    return "Le repas était délicieux";
+  }
+
+  @Override
+  String[] getSimpleSentenceExpectedWords() {
+    return new String[] {"Le", "repas", "était", "délicieux"};
+  }
+
+  @Override
+  List<String> getFrequentWords() {
+    return List.of("les", "dans", "le");
+  }
+
+  @Override
+  String getUnseenWord() {
+    return "extraordinaire";
+  }
+
+  @Override
+  String getSpanTestSentence() {
+    return "Les enfants jouaient dans le jardin";
+  }
+
+  @Override
+  String[] getSpanTestExpectedWords() {
+    return new String[] {
+        "Les", "enfants", "jouaient", "dans", "le", "jardin"
+    };
+  }
+
+  @Override
+  String getMultiWordSentence() {
+    return "Le parc était magnifique avec les arbres"
+        + " en fleurs";
+  }
+
+  @Override
+  String getSerializationTestSentence() {
+    return "Je suis allé au marché pour acheter"
+        + " des fruits";
+  }
+
+  @Override
+  String getConsistencyTestSentence() {
+    return "Mon ami Pierre habite dans une grande"
+        + " maison à Paris";
+  }
+
+  @Override
+  String getPunctuationTestSentence() {
+    return "Bonjour, monde!";
+  }
+
+  @Override
+  String[] getExpectedPunctuationWords() {
+    return new String[] {"Bonjour,", "monde!"};
+  }
+
+  @Override
+  String getCoarseTokenizationSentence() {
+    return "La cuisine française est connue dans le"
+        + " monde entier";
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticItTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticItTest.java
new file mode 100644
index 00000000..f6bd50a4
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerRealisticItTest.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.List;
+
+/**
+ * Italian-language realistic BPE tokenization integration tests.
+ *
+ * @see AbstractBPETokenizerRealisticTest
+ */
+public class BPETokenizerRealisticItTest extends AbstractBPETokenizerRealisticTest {
+
+  @Override
+  List<String> getTrainingCorpus() {
+    return List.of(
+        "Ieri sono andato al mercato per comprare della frutta e della verdura",
+        "Il mercato era pieno di gente e i prezzi erano ragionevoli",
+        "Le mele e le arance erano particolarmente fresche e buone",
+        "Sono tornato a casa e ho preparato un buon pranzo",
+        "Il pranzo era delizioso e tutta la famiglia era contenta",
+        "Dopo il pranzo abbiamo fatto una passeggiata nel parco",
+        "Il parco era bellissimo con gli alberi in fiore",
+        "I bambini giocavano nel giardino e gli uccelli cantavano",
+        "La sera abbiamo guardato un film alla televisione",
+        "Il film era molto interessante e ci e piaciuto tanto",
+        "Il mio amico Marco abita in una grande casa a Roma",
+        "Lui lavora in una azienda di tecnologia da cinque anni",
+        "Sua moglie Giulia e professoressa alla universita",
+        "Hanno due bambini che vanno a una scuola vicino a casa",
+        "Nel fine settimana amano fare delle escursioni in campagna",
+        "Roma e una citta bellissima con una storia molto ricca",
+        "La cucina italiana e conosciuta in tutto il mondo",
+        "I musei di Roma attraggono milioni di visitatori ogni anno",
+        "Il Colosseo e il monumento piu visitato di Roma",
+        "La vita in Italia e molto piacevole e rilassante"
+    );
+  }
+
+  @Override
+  String getLanguageCode() {
+    return "it";
+  }
+
+  @Override
+  String getSimpleSentence() {
+    return "Il pranzo era delizioso";
+  }
+
+  @Override
+  String[] getSimpleSentenceExpectedWords() {
+    return new String[] {"Il", "pranzo", "era", "delizioso"};
+  }
+
+  @Override
+  List<String> getFrequentWords() {
+    return List.of("il", "una", "in");
+  }
+
+  @Override
+  String getUnseenWord() {
+    return "straordinario";
+  }
+
+  @Override
+  String getSpanTestSentence() {
+    return "I bambini giocavano nel giardino";
+  }
+
+  @Override
+  String[] getSpanTestExpectedWords() {
+    return new String[] {"I", "bambini", "giocavano", "nel", "giardino"};
+  }
+
+  @Override
+  String getMultiWordSentence() {
+    return "Il parco era bellissimo con gli alberi in fiore";
+  }
+
+  @Override
+  String getSerializationTestSentence() {
+    return "Sono andato al mercato per comprare della frutta";
+  }
+
+  @Override
+  String getConsistencyTestSentence() {
+    return "Il mio amico Marco abita in una grande casa a Roma";
+  }
+
+  @Override
+  String getPunctuationTestSentence() {
+    return "Ciao, mondo!";
+  }
+
+  @Override
+  String[] getExpectedPunctuationWords() {
+    return new String[] {"Ciao,", "mondo!"};
+  }
+
+  @Override
+  String getCoarseTokenizationSentence() {
+    return "La cucina italiana e conosciuta in tutto il mondo";
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerTest.java
new file mode 100644
index 00000000..2666683c
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerTest.java
@@ -0,0 +1,230 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.HashMap;
+import java.util.List;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.tokenize.BPETokenizer.SymbolPair;
+import opennlp.tools.util.Span;
+
+/**
+ * Tests for the {@link BPETokenizer} class.
+ * <p>
+ * Verifies that BPE tokenization correctly splits text into subword tokens
+ * based on learned merge operations, and that span positions map back to
+ * the original text.
+ *
+ * @see BPETokenizer
+ */
+public class BPETokenizerTest {
+
+  private static BPEModel createModel(List<SymbolPair> merges) {
+    final BPETokenizerFactory factory = new BPETokenizerFactory("en");
+    return new BPEModel(merges, new HashMap<>(), factory);
+  }
+
+  /**
+   * Tests that a fully merged word produces a single token.
+   */
+  @Test
+  void testBasicBPETokenization() {
+    final List<SymbolPair> merges = List.of(
+        new SymbolPair("l", "o"),
+        new SymbolPair("lo", "w" + BPETokenizer.END_OF_WORD),
+        new SymbolPair("e", "r" + BPETokenizer.END_OF_WORD)
+    );
+
+    final BPETokenizer tokenizer = new BPETokenizer(createModel(merges));
+    final String[] tokens = tokenizer.tokenize("low");
+
+    Assertions.assertArrayEquals(new String[]{"low"}, tokens);
+  }
+
+  /**
+   * Tests that a word not fully covered by merges is split into subword tokens.
+   */
+  @Test
+  void testSubwordSplitting() {
+    final List<SymbolPair> merges = List.of(
+        new SymbolPair("l", "o"),
+        new SymbolPair("lo", "w" + BPETokenizer.END_OF_WORD)
+    );
+
+    final BPETokenizer tokenizer = new BPETokenizer(createModel(merges));
+    final String[] tokens = tokenizer.tokenize("lower");
+
+    // "lower" cannot fully merge since "w" is not word-final here
+    Assertions.assertTrue(tokens.length > 1);
+    Assertions.assertEquals("lower", String.join("", tokens));
+  }
+
+  /**
+   * Tests tokenization of multiple whitespace-separated words.
+   */
+  @Test
+  void testMultipleWords() {
+    final List<SymbolPair> merges = List.of(
+        new SymbolPair("l", "o"),
+        new SymbolPair("lo", "w" + BPETokenizer.END_OF_WORD)
+    );
+
+    final BPETokenizer tokenizer = new BPETokenizer(createModel(merges));
+    final String[] tokens = tokenizer.tokenize("low low");
+
+    Assertions.assertEquals(2, tokens.length);
+    Assertions.assertEquals("low", tokens[0]);
+    Assertions.assertEquals("low", tokens[1]);
+  }
+
+  /**
+   * Tests that empty and null input produce empty arrays.
+   */
+  @Test
+  void testEmptyInput() {
+    final BPETokenizer tokenizer = new BPETokenizer(createModel(List.of()));
+
+    Assertions.assertArrayEquals(new String[0], tokenizer.tokenize(""));
+    Assertions.assertArrayEquals(new String[0], tokenizer.tokenize(null));
+    Assertions.assertArrayEquals(new Span[0], tokenizer.tokenizePos(""));
+    Assertions.assertArrayEquals(new Span[0], tokenizer.tokenizePos(null));
+  }
+
+  /**
+   * Tests that with no merges, each character becomes a separate token.
+   */
+  @Test
+  void testNoMergesProducesCharacterTokens() {
+    final BPETokenizer tokenizer = new BPETokenizer(createModel(List.of()));
+    final String[] tokens = tokenizer.tokenize("hi");
+
+    Assertions.assertArrayEquals(new String[]{"h", "i"}, tokens);
+  }
+
+  /**
+   * Tests single-character word tokenization.
+   */
+  @Test
+  void testSingleCharacterWord() {
+    final BPETokenizer tokenizer = new BPETokenizer(createModel(List.of()));
+    final String[] tokens = tokenizer.tokenize("a");
+
+    Assertions.assertArrayEquals(new String[]{"a"}, tokens);
+  }
+
+  /**
+   * Tests that {@link BPETokenizer#tokenizePos(String)} returns correct spans
+   * that map back to the original text.
+   */
+  @Test
+  void testTokenizePos() {
+    final List<SymbolPair> merges = List.of(
+        new SymbolPair("l", "o"),
+        new SymbolPair("lo", "w" + BPETokenizer.END_OF_WORD)
+    );
+
+    final BPETokenizer tokenizer = new BPETokenizer(createModel(merges));
+    final String text = "low hi";
+    final Span[] spans = tokenizer.tokenizePos(text);
+
+    // "low" -> 1 token, "hi" -> 2 tokens (no merges for h, i)
+    Assertions.assertEquals(3, spans.length);
+    Assertions.assertEquals(0, spans[0].getStart());
+    Assertions.assertEquals(3, spans[0].getEnd());
+    Assertions.assertEquals("low", spans[0].getCoveredText(text));
+    // "h"
+    Assertions.assertEquals(4, spans[1].getStart());
+    Assertions.assertEquals(5, spans[1].getEnd());
+    Assertions.assertEquals("h", spans[1].getCoveredText(text));
+    // "i"
+    Assertions.assertEquals(5, spans[2].getStart());
+    Assertions.assertEquals(6, spans[2].getEnd());
+    Assertions.assertEquals("i", spans[2].getCoveredText(text));
+  }
+
+  /**
+   * Tests that span offsets are correct for subword-split words.
+   */
+  @Test
+  void testTokenizePosWithSubwords() {
+    final BPETokenizer tokenizer = new BPETokenizer(createModel(List.of()));
+    final String text = "ab cd";
+    final Span[] spans = tokenizer.tokenizePos(text);
+
+    // "ab" -> a, b; "cd" -> c, d
+    Assertions.assertEquals(4, spans.length);
+    Assertions.assertEquals("a", spans[0].getCoveredText(text));
+    Assertions.assertEquals("b", spans[1].getCoveredText(text));
+    Assertions.assertEquals("c", spans[2].getCoveredText(text));
+    Assertions.assertEquals("d", spans[3].getCoveredText(text));
+  }
+
+  /**
+   * Tests that concatenating all tokens reconstructs the original word.
+   */
+  @Test
+  void testTokenConcatenationEqualsOriginal() {
+    final List<SymbolPair> merges = List.of(
+        new SymbolPair("l", "o"),
+        new SymbolPair("lo", "w" + BPETokenizer.END_OF_WORD)
+    );
+
+    final BPETokenizer tokenizer = new BPETokenizer(createModel(merges));
+    final String[] tokens = tokenizer.tokenize("lower");
+
+    Assertions.assertEquals("lower", String.join("", tokens));
+  }
+
+  /**
+   * Tests that a null model throws IllegalArgumentException.
+   */
+  @Test
+  void testNullModelThrows() {
+    Assertions.assertThrows(IllegalArgumentException.class, () -> new BPETokenizer(null));
+  }
+
+  @Test
+  void testSymbolPairNullLeftThrows() {
+    Assertions.assertThrows(IllegalArgumentException.class, () -> new SymbolPair(null, "b"));
+  }
+
+  @Test
+  void testSymbolPairNullRightThrows() {
+    Assertions.assertThrows(IllegalArgumentException.class, () -> new SymbolPair("a", null));
+  }
+
+  @Test
+  void testSymbolPairEquality() {
+    final SymbolPair a = new SymbolPair("lo", "w");
+    final SymbolPair b = new SymbolPair("lo", "w");
+    final SymbolPair c = new SymbolPair("l", "ow");
+
+    Assertions.assertEquals(a, b);
+    Assertions.assertEquals(a.hashCode(), b.hashCode());
+    Assertions.assertNotEquals(a, c);
+  }
+
+  @Test
+  void testSymbolPairToString() {
+    final SymbolPair pair = new SymbolPair("lo", "w");
+    Assertions.assertEquals("lo w", pair.toString());
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerTrainerTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerTrainerTest.java
new file mode 100644
index 00000000..0e5dba45
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BPETokenizerTrainerTest.java
@@ -0,0 +1,188 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests for the {@link BPETokenizerTrainer} class.
+ * <p>
+ * Verifies that BPE merge operations are learned correctly from
+ * a training corpus and that the resulting model can be used for tokenization.
+ *
+ * @see BPETokenizerTrainer
+ * @see BPEModel
+ */
+public class BPETokenizerTrainerTest {
+
+  private BPETokenizerTrainer trainer;
+
+  @BeforeEach
+  void setUp() {
+    trainer = new BPETokenizerTrainer();
+  }
+
+  /**
+   * Tests that training produces a non-null model with merge rules.
+   */
+  @Test
+  void testTrainProducesModel() {
+    final List<String> corpus = List.of(
+        "low low low low low",
+        "lower lower lower",
+        "newest newest newest newest",
+        "widest widest widest"
+    );
+
+    final BPEModel model = trainer.train(corpus, 10, "en");
+
+    Assertions.assertNotNull(model);
+    Assertions.assertFalse(model.getMerges().isEmpty());
+    Assertions.assertTrue(model.getMerges().size() <= 10);
+  }
+
+  /**
+   * Tests that the first merge is the most frequent adjacent pair.
+   * For the corpus "ab ab ab ...", the most frequent pair is ("a", 
"b&lt;/w&gt;").
+   */
+  @Test
+  void testFirstMergeIsMostFrequentPair() {
+    final List<String> corpus = List.of(
+        "ab ab ab ab ab ab ab ab ab ab"
+    );
+
+    final BPEModel model = trainer.train(corpus, 1, "en");
+
+    Assertions.assertEquals(1, model.getMerges().size());
+    Assertions.assertEquals("a", model.getMerges().getFirst().left());
+    Assertions.assertEquals("b" + BPETokenizer.END_OF_WORD, 
model.getMerges().getFirst().right());
+  }
+
+  /**
+   * Tests that requesting more merges than possible stops gracefully.
+   */
+  @Test
+  void testMoreMergesThanPossible() {
+    final List<String> corpus = List.of("ab");
+
+    // "ab" has only one possible pair: ("a", "b</w>")
+    final BPEModel model = trainer.train(corpus, 100, "en");
+
+    // Should stop after exhausting all possible merges
+    Assertions.assertTrue(model.getMerges().size() < 100);
+    Assertions.assertFalse(model.getMerges().isEmpty());
+  }
+
+  /**
+   * Tests that frequent words get merged into fewer tokens.
+   */
+  @Test
+  void testFrequentWordsProduceFewerTokens() {
+    final List<String> corpus = List.of(
+        "the the the the the the the the the the",
+        "the the the the the the the the the the",
+        "xyzzy"
+    );
+
+    final BPEModel model = trainer.train(corpus, 20, "en");
+    final BPETokenizer tokenizer = new BPETokenizer(model);
+
+    final String[] theTokens = tokenizer.tokenize("the");
+    final String[] xyzzyTokens = tokenizer.tokenize("xyzzy");
+
+    // "the" (very frequent) should have fewer or equal tokens compared to 
"xyzzy" (rare)
+    Assertions.assertTrue(theTokens.length <= xyzzyTokens.length,
+        "Expected 'the' (" + Arrays.toString(theTokens) + ") to have fewer 
tokens than 'xyzzy' ("
+            + Arrays.toString(xyzzyTokens) + ")");
+  }
+
+  /**
+   * Tests that the trained model produces a tokenizer that reconstructs
+   * the original words when tokens are concatenated.
+   */
+  @Test
+  void testTrainAndTokenizeRoundtrip() {
+    final List<String> corpus = List.of(
+        "the cat sat on the mat",
+        "the cat sat on the mat",
+        "the cat sat on the mat",
+        "the dog sat on the log",
+        "the dog sat on the log"
+    );
+
+    final BPEModel model = trainer.train(corpus, 20, "en");
+    final BPETokenizer tokenizer = new BPETokenizer(model);
+
+    // Verify token concatenation restores the original word
+    for (final String word : new String[]{"the", "cat", "sat", "dog"}) {
+      final String[] tokens = tokenizer.tokenize(word);
+      Assertions.assertEquals(word, String.join("", tokens),
+          "Token concatenation should reconstruct '" + word + "'");
+    }
+  }
+
+  /**
+   * Tests that training with an empty corpus produces a model with no merges.
+   */
+  @Test
+  void testEmptyCorpus() {
+    final BPEModel model = trainer.train(List.of(), 10, "en");
+
+    Assertions.assertNotNull(model);
+    Assertions.assertTrue(model.getMerges().isEmpty());
+  }
+
+  /**
+   * Tests that the language code is set on the produced model.
+   */
+  @Test
+  void testLanguageCodePreserved() {
+    final BPEModel model = trainer.train(List.of("hello world"), 5, "de");
+
+    Assertions.assertEquals("de", model.getLanguage());
+  }
+
+  @Test
+  void testNullCorpusThrows() {
+    Assertions.assertThrows(IllegalArgumentException.class,
+        () -> trainer.train(null, 10, "en"));
+  }
+
+  @Test
+  void testNullLanguageThrows() {
+    Assertions.assertThrows(IllegalArgumentException.class,
+        () -> trainer.train(List.of("hello"), 10, null));
+  }
+
+  @Test
+  void testZeroMergesThrows() {
+    Assertions.assertThrows(IllegalArgumentException.class,
+        () -> trainer.train(List.of("hello"), 0, "en"));
+  }
+
+  @Test
+  void testNegativeMergesThrows() {
+    Assertions.assertThrows(IllegalArgumentException.class,
+        () -> trainer.train(List.of("hello"), -1, "en"));
+  }
+}

(opennlp) branch main updated: OPENNLP-1220: Add support for Byte Pair Encoding (BPE) (#1011)

Reply via email to