This is an automated email from the ASF dual-hosted git repository.

krickert pushed a commit to branch OPENNLP-1850-2-tokenizer
in repository https://gitbox.apache.org/repos/asf/opennlp.git

commit f70c1956a1c88c9db00f840e9a40b94c58b2fc11
Author: Kristian Rickert <[email protected]>
AuthorDate: Sun Jun 21 23:23:43 2026 -0400

    OPENNLP-1850 Clarify that Extended_Pictographic symbols are kept as emoji
    
    WordType classifies every Extended_Pictographic code point as EMOJI, which
    includes symbol-like characters (copyright, trademark, double-exclamation,
    arrows), so the word tokenizer keeps them rather than dropping them as
    punctuation. State this in the WordTokenizer javadoc and add a test.
---
 .../java/opennlp/tools/tokenize/uax29/WordTokenizer.java     |  4 +++-
 .../java/opennlp/tools/tokenize/uax29/WordTokenizerTest.java | 12 ++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordTokenizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordTokenizer.java
index e09bbed8e..1f6707b13 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordTokenizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordTokenizer.java
@@ -26,7 +26,9 @@ import opennlp.tools.util.Span;
  * A word tokenizer built on the Unicode Text Segmentation algorithm (UAX #29). It finds segments
  * with {@link WordSegmenter}, keeps the ones that are words (letters, digits, ideographs, kana,
  * Hangul, Southeast-Asian script, or emoji), drops whitespace and punctuation, and classifies each
- * kept token with a {@link WordType}.
+ * kept token with a {@link WordType}. Emoji here means any {@code Extended_Pictographic} code point,
+ * so symbol-like characters such as the copyright, trademark, and double-exclamation signs are kept
+ * (typed {@link WordType#EMOJI}) rather than dropped as punctuation.
  *
  * <p>A token longer than {@code maxTokenLength} is emitted as consecutive pieces, never splitting a
  * surrogate pair. The tokenizer reports offset {@link Span}s, so the original text and its character
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordTokenizerTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordTokenizerTest.java
index 83e8c20fb..3d46ce870 100644
--- a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordTokenizerTest.java
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordTokenizerTest.java
@@ -89,6 +89,18 @@ public class WordTokenizerTest {
     assertEquals(text, tokens.get(0).text(text));
   }
 
+  @Test
+  void testExtendedPictographicSymbolsAreKeptAsEmoji() {
+    // Extended_Pictographic includes symbol-like characters (copyright U+00A9, trademark U+2122,
+    // double exclamation U+203C), which WordType classifies as EMOJI, so the tokenizer keeps them
+    // rather than dropping them as punctuation.
+    final String text = "a " + cp(0x00A9) + " " + cp(0x2122) + " " + cp(0x203C) + " b";
+    final List<WordToken> tokens = TOKENIZER.tokenizeTyped(text);
+    assertEquals(List.of(WordType.ALPHANUMERIC, WordType.EMOJI, WordType.EMOJI,
+            WordType.EMOJI, WordType.ALPHANUMERIC),
+        tokens.stream().map(WordToken::type).toList());
+  }
+
   @Test
   void testHangulSyllablesStayTogether() {
     final String text = cp(0xAC00) + cp(0xB098); // ga + na

Reply via email to