This is an automated email from the ASF dual-hosted git repository.

krickert pushed a commit to branch OPENNLP-1850-2-tokenizer
in repository https://gitbox.apache.org/repos/asf/opennlp.git

commit f70c1956a1c88c9db00f840e9a40b94c58b2fc11
Author: Kristian Rickert <[email protected]>
AuthorDate: Sun Jun 21 23:23:43 2026 -0400

    OPENNLP-1850 Clarify that Extended_Pictographic symbols are kept as emoji

    WordType classifies every Extended_Pictographic code point as EMOJI, which
    includes symbol-like characters (copyright, trademark, double-exclamation,
    arrows), so the word tokenizer keeps them rather than dropping them as
    punctuation. State this in the WordTokenizer javadoc and add a test.
---
 .../java/opennlp/tools/tokenize/uax29/WordTokenizer.java     |  4 +++-
 .../java/opennlp/tools/tokenize/uax29/WordTokenizerTest.java | 12 ++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordTokenizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordTokenizer.java
index e09bbed8e..1f6707b13 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordTokenizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordTokenizer.java
@@ -26,7 +26,9 @@ import opennlp.tools.util.Span;
  * A word tokenizer built on the Unicode Text Segmentation algorithm (UAX #29). It finds segments
  * with {@link WordSegmenter}, keeps the ones that are words (letters, digits, ideographs, kana,
  * Hangul, Southeast-Asian script, or emoji), drops whitespace and punctuation, and classifies each
- * kept token with a {@link WordType}.
+ * kept token with a {@link WordType}. Emoji here means any {@code Extended_Pictographic} code point,
+ * so symbol-like characters such as the copyright, trademark, and double-exclamation signs are kept
+ * (typed {@link WordType#EMOJI}) rather than dropped as punctuation.
  *
  * <p>A token longer than {@code maxTokenLength} is emitted as consecutive pieces, never splitting a
  * surrogate pair. The tokenizer reports offset {@link Span}s, so the original text and its character
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordTokenizerTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordTokenizerTest.java
index 83e8c20fb..3d46ce870 100644
--- a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordTokenizerTest.java
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordTokenizerTest.java
@@ -89,6 +89,18 @@ public class WordTokenizerTest {
     assertEquals(text, tokens.get(0).text(text));
   }
 
+  @Test
+  void testExtendedPictographicSymbolsAreKeptAsEmoji() {
+    // Extended_Pictographic includes symbol-like characters (copyright U+00A9, trademark U+2122,
+    // double exclamation U+203C), which WordType classifies as EMOJI, so the tokenizer keeps them
+    // rather than dropping them as punctuation.
+    final String text = "a " + cp(0x00A9) + " " + cp(0x2122) + " " + cp(0x203C) + " b";
+    final List<WordToken> tokens = TOKENIZER.tokenizeTyped(text);
+    assertEquals(List.of(WordType.ALPHANUMERIC, WordType.EMOJI, WordType.EMOJI,
+            WordType.EMOJI, WordType.ALPHANUMERIC),
+        tokens.stream().map(WordToken::type).toList());
+  }
+
   @Test
   void testHangulSyllablesStayTogether() {
     final String text = cp(0xAC00) + cp(0xB098); // ga + na
