This is an automated email from the ASF dual-hosted git repository. krickert pushed a commit to branch OPENNLP-1850_Whitespace-UTF-Normalizae in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit ab15d7523bc768a39d46e3b291417cb8220501e3 Author: Kristian Rickert <[email protected]> AuthorDate: Thu Jun 18 23:12:04 2026 -0400 OPENNLP-1850 - Add quote/digit/invisible/ellipsis/bullet normalizers, the TextNormalizer pipeline, and offset-preserving TextAnalyzer Quote, digit, decimal, invisible-control, ellipsis, and bullet normalizers, all reusing the cursor-based CharClass engine (O(1) membership, no regex). TextNormalizer is a fluent builder that composes the rungs into an AggregateCharSequenceNormalizer, with a conservative searchDefault() chain. TextAnalyzer/AnalyzedToken tokenize and normalize per token while keeping each token's source span, the offset-preserving building block for BM25 matching. --- .../tools/util/normalizer/AnalyzedToken.java | 34 +++++ .../tools/util/normalizer/TextAnalyzer.java | 93 ++++++++++++ .../tools/util/normalizer/TextAnalyzerTest.java | 102 +++++++++++++ .../normalizer/BulletCharSequenceNormalizer.java | 51 +++++++ .../normalizer/DigitCharSequenceNormalizer.java | 57 +++++++ .../normalizer/EllipsisCharSequenceNormalizer.java | 60 ++++++++ .../InvisibleCharSequenceNormalizer.java | 71 +++++++++ .../normalizer/QuoteCharSequenceNormalizer.java | 69 +++++++++ .../tools/util/normalizer/TextNormalizer.java | 142 ++++++++++++++++++ .../AccentFoldCharSequenceNormalizerTest.java | 30 ++++ .../util/normalizer/SetBasedNormalizerTest.java | 163 +++++++++++++++++++++ .../tools/util/normalizer/TextNormalizerTest.java | 77 ++++++++++ 12 files changed, 949 insertions(+) diff --git a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/AnalyzedToken.java b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/AnalyzedToken.java new file mode 100644 index 000000000..389146596 --- /dev/null +++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/AnalyzedToken.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import opennlp.tools.util.Span; + +/** + * One analyzed token: its character span in the source text, the original token text, and the + * normalized form used for matching or indexing. + * + * <p>The span ties the normalized term back to the original text, so a search hit on + * {@link #normalized()} can be highlighted against the source using {@link #span()} even though + * the normalized form may differ in length (for example after diacritic folding).</p> + * + * @param span The character span of the token in the source text. + * @param original The original token text. + * @param normalized The normalized token text (the match/index form). + */ +public record AnalyzedToken(Span span, String original, String normalized) { +} diff --git a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/TextAnalyzer.java b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/TextAnalyzer.java new file mode 100644 index 000000000..7e8ce8d77 --- /dev/null +++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/TextAnalyzer.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +import opennlp.tools.util.Span; + +/** + * Splits text into tokens and normalizes each one, keeping every token's original character span. + * + * <p>This is the offset-preserving building block for search and BM25-style matching: tokens are + * found with a {@link CharClass} splitter (O(1) membership, a single cursor pass, no regular + * expression) and each token's text is run through a {@link CharSequenceNormalizer}. The result is + * a list of {@link AnalyzedToken}, each carrying the source {@link Span} alongside its normalized + * form, so a match on the normalized term can always be reported and highlighted against the + * original text even when normalization changes a token's length.</p> + */ +public final class TextAnalyzer { + + private final CharClass splitter; + private final CharSequenceNormalizer normalizer; + + /** + * Creates an analyzer. + * + * @param splitter The character class whose members delimit tokens (typically + * {@link CharClass#whitespace()}). + * @param normalizer The per-token normalizer. 
+ */ + public TextAnalyzer(CharClass splitter, CharSequenceNormalizer normalizer) { + this.splitter = Objects.requireNonNull(splitter, "splitter"); + this.normalizer = Objects.requireNonNull(normalizer, "normalizer"); + } + + /** + * Creates an analyzer that splits on Unicode whitespace. + * + * <p>Equivalent to constructing a {@code TextAnalyzer} with {@link CharClass#whitespace()} as the + * splitter.</p> + * + * @param normalizer The per-token normalizer. + * @return The analyzer. + * @throws NullPointerException if {@code normalizer} is {@code null}. + */ + public static TextAnalyzer whitespace(CharSequenceNormalizer normalizer) { + return new TextAnalyzer(CharClass.whitespace(), normalizer); + } + + /** + * Tokenizes {@code text} and normalizes each token. + * + * <p>Each span produced by the splitter is cut out of {@code text} as the token's original form, + * and that original form (not the whole text) is handed to the normalizer, so the span offsets + * always index into {@code text} itself and never into the normalized form.</p> + * + * @param text The text to analyze. + * @return The analyzed tokens, in order, each with its source span and normalized form. + * @throws NullPointerException if {@code text} is {@code null}. + */ + public List<AnalyzedToken> analyze(CharSequence text) { + Objects.requireNonNull(text, "text"); + final List<AnalyzedToken> tokens = new ArrayList<>(); + for (final Span span : splitter.splitSpans(text)) { + final String original = text.subSequence(span.getStart(), span.getEnd()).toString(); + final String normalized = normalizer.normalize(original).toString(); + tokens.add(new AnalyzedToken(span, original, normalized)); + } + return tokens; + } + + /** + * Tokenizes {@code text} and returns only the normalized terms. + * + * @param text The text to analyze. + * @return The normalized token terms, in order. 
+ */ + public List<String> terms(CharSequence text) { + final List<AnalyzedToken> analyzed = analyze(text); + final List<String> terms = new ArrayList<>(analyzed.size()); + for (final AnalyzedToken token : analyzed) { + terms.add(token.normalized()); + } + return terms; + } +} diff --git a/opennlp-api/src/test/java/opennlp/tools/util/normalizer/TextAnalyzerTest.java b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/TextAnalyzerTest.java new file mode 100644 index 000000000..77decf860 --- /dev/null +++ b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/TextAnalyzerTest.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package opennlp.tools.util.normalizer; + +import java.util.List; +import java.util.Locale; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TextAnalyzerTest { + + private static final CharSequenceNormalizer LOWER = s -> s.toString().toLowerCase(Locale.ROOT); + + private static String cp(int codePoint) { + return new String(Character.toChars(codePoint)); + } + + @Test + void testAnalyzePreservesSpansAndNormalizesTokens() { + final String text = "Hello WORLD"; + final List<AnalyzedToken> tokens = TextAnalyzer.whitespace(LOWER).analyze(text); + + assertEquals(2, tokens.size()); + assertEquals(0, tokens.get(0).span().getStart()); + assertEquals(5, tokens.get(0).span().getEnd()); + assertEquals("Hello", tokens.get(0).original()); + assertEquals("hello", tokens.get(0).normalized()); + assertEquals("WORLD", tokens.get(1).original()); + assertEquals("world", tokens.get(1).normalized()); + assertEquals("Hello", tokens.get(0).span().getCoveredText(text).toString()); + } + + @Test + void testSpanStaysCorrectWhenNormalizedLengthChanges() { + final CharSequenceNormalizer bracket = s -> "[" + s + "]"; + final String text = "ab cd"; + final List<AnalyzedToken> tokens = TextAnalyzer.whitespace(bracket).analyze(text); + + assertEquals("[ab]", tokens.get(0).normalized()); + assertEquals(0, tokens.get(0).span().getStart()); + assertEquals(2, tokens.get(0).span().getEnd()); + assertEquals(3, tokens.get(1).span().getStart()); + assertEquals(5, tokens.get(1).span().getEnd()); + } + + @Test + void testSplitsOnUnicodeWhitespace() { + final String text = "alpha" + cp(0x00A0) + "beta"; + final List<AnalyzedToken> tokens = TextAnalyzer.whitespace(LOWER).analyze(text); + + assertEquals(2, tokens.size()); + assertEquals("alpha", tokens.get(0).normalized()); + assertEquals("beta", 
tokens.get(1).normalized()); + } + + @Test + void testSupplementaryTokenIsKeptIntact() { + final String emoji = cp(0x1F600); + final String text = "a " + emoji + " b"; + final List<AnalyzedToken> tokens = TextAnalyzer.whitespace(LOWER).analyze(text); + + assertEquals(3, tokens.size()); + assertEquals(emoji, tokens.get(1).original()); + assertTrue(tokens.get(1).span().getEnd() - tokens.get(1).span().getStart() == emoji.length()); + } + + @Test + void testTermsReturnsNormalizedFormsOnly() { + assertEquals(List.of("a", "b", "c"), TextAnalyzer.whitespace(LOWER).terms("A B C")); + } + + @Test + void testEmptyAndWhitespaceOnlyYieldNoTokens() { + assertEquals(List.of(), TextAnalyzer.whitespace(LOWER).analyze("")); + assertEquals(List.of(), TextAnalyzer.whitespace(LOWER).analyze(" ")); + } + + @Test + void testRejectsNullArguments() { + assertThrows(NullPointerException.class, () -> new TextAnalyzer(null, LOWER)); + assertThrows(NullPointerException.class, () -> new TextAnalyzer(CharClass.whitespace(), null)); + assertThrows(NullPointerException.class, () -> TextAnalyzer.whitespace(LOWER).analyze(null)); + } +} diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/BulletCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/BulletCharSequenceNormalizer.java new file mode 100644 index 000000000..a58a45c53 --- /dev/null +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/BulletCharSequenceNormalizer.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +/** + * A {@link CharSequenceNormalizer} that replaces unambiguous list-bullet characters with a space, + * so a bullet acts as a token separator rather than sticking to the following word. + * + * <p>Membership is an O(1) {@link CharClass} lookup and scanning is a single cursor pass with no + * regular expression. The middle dot ({@code U+00B7}) is deliberately <em>not</em> included, + * because it is a letter in Catalan ({@code l..l}) and other orthographies; only characters that + * are unambiguously list bullets are replaced.</p> + */ +public class BulletCharSequenceNormalizer implements CharSequenceNormalizer { + + private static final long serialVersionUID = 1L; + + private static final CharClass BULLETS = CharClass.of(CodePointSet.of( + 0x2022, // bullet + 0x2023, // triangular bullet + 0x2043, // hyphen bullet + 0x2219, // bullet operator + 0x25E6), // white bullet + 0x0020); + + private static final BulletCharSequenceNormalizer INSTANCE = new BulletCharSequenceNormalizer(); + + /** {@return the shared, stateless instance} */ + public static BulletCharSequenceNormalizer getInstance() { + return INSTANCE; + } + + @Override + public CharSequence normalize(CharSequence text) { + return BULLETS.normalize(text); + } +} diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DigitCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DigitCharSequenceNormalizer.java new file mode 100644 index 000000000..90ee0d3d1 --- /dev/null 
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DigitCharSequenceNormalizer.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +/** + * A {@link CharSequenceNormalizer} that maps Unicode decimal digits to their ASCII equivalents, + * so for example Arabic-Indic, Devanagari, or fullwidth digits all become {@code 0}-{@code 9}. + * + * <p>It maps a code point when {@link Character#digit(int, int)} reports a value of {@code 0}- + * {@code 9} in radix ten, that is, when the code point is a Unicode decimal digit. Other numeric + * forms (Roman numerals, superscripts, circled numbers, fractions) are not decimal digits and are + * left unchanged. 
Scanning is a single O(1)-per-code-point cursor pass with no regular + * expression.</p> + */ +public class DigitCharSequenceNormalizer implements CharSequenceNormalizer { + + private static final long serialVersionUID = 1L; + + private static final DigitCharSequenceNormalizer INSTANCE = new DigitCharSequenceNormalizer(); + + /** {@return the shared, stateless instance} */ + public static DigitCharSequenceNormalizer getInstance() { + return INSTANCE; + } + + /** + * Rebuilds {@code text} one code point at a time, writing the ASCII digit for every code point + * whose {@link Character#digit(int, int)} value in radix ten is non-negative and copying every + * other code point through unchanged. The cursor advances by + * {@link Character#charCount(int)}, so a supplementary code point is read and written as a whole. + * + * @param text The text to normalize. + * @return A new string with the decimal digits folded to ASCII; a new string is built even when + * nothing was folded. + * @throws NullPointerException if {@code text} is {@code null}. + */ + @Override + public CharSequence normalize(CharSequence text) { + final StringBuilder out = new StringBuilder(text.length()); + final int length = text.length(); + int i = 0; + while (i < length) { + final int codePoint = Character.codePointAt(text, i); + final int value = Character.digit(codePoint, 10); + if (value >= 0) { + out.append((char) ('0' + value)); + } else { + out.appendCodePoint(codePoint); + } + i += Character.charCount(codePoint); + } + return out.toString(); + } +} diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/EllipsisCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/EllipsisCharSequenceNormalizer.java new file mode 100644 index 000000000..8eccf2e5c --- /dev/null +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/EllipsisCharSequenceNormalizer.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +/** + * A {@link CharSequenceNormalizer} that expands the ellipsis and leader characters to ASCII dots: + * the horizontal ellipsis ({@code U+2026}) to {@code "..."} and the two-dot leader + * ({@code U+2025}) to {@code ".."}. + * + * <p>Scanning is a single O(1)-per-code-point cursor pass with no regular expression. ASCII dot + * runs are left unchanged.</p> + */ +public class EllipsisCharSequenceNormalizer implements CharSequenceNormalizer { + + private static final long serialVersionUID = 1L; + + private static final EllipsisCharSequenceNormalizer INSTANCE = + new EllipsisCharSequenceNormalizer(); + + /** {@return the shared, stateless instance} */ + public static EllipsisCharSequenceNormalizer getInstance() { + return INSTANCE; + } + + /** + * Rebuilds {@code text} one code point at a time, writing three ASCII dots for {@code U+2026} and + * two ASCII dots for {@code U+2025} and copying every other code point through unchanged. The + * result can therefore be longer than the input. + * + * @param text The text to normalize. + * @return A new string with the ellipsis and two-dot leader expanded; a new string is built even + * when nothing was expanded. + * @throws NullPointerException if {@code text} is {@code null}. + */ + @Override + public CharSequence normalize(CharSequence text) { + final StringBuilder out = new StringBuilder(text.length()); + final int length = text.length(); + int i = 0; + while (i < length) { + final int codePoint = Character.codePointAt(text, i); + final String mapped = switch (codePoint) { + case 0x2026 -> "..."; // horizontal ellipsis + case 0x2025 -> ".."; // two dot leader + default -> null; + }; + if (mapped != null) { + out.append(mapped); + } else { + out.appendCodePoint(codePoint); + } + i += Character.charCount(codePoint); + } + return out.toString(); + } +} diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/InvisibleCharSequenceNormalizer.java 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/InvisibleCharSequenceNormalizer.java new file mode 100644 index 000000000..3828f6f22 --- /dev/null +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/InvisibleCharSequenceNormalizer.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +/** + * A {@link CharSequenceNormalizer} that removes invisible format and bidirectional control + * characters that add no textual content and are a common source of noise and spoofing (the + * byte-order mark, zero width space, word joiner, bidi marks/embeddings/overrides/isolates, the + * invisible math operators, soft hyphen, and the Arabic letter mark). + * + * <p>Membership is an O(1) {@link CharClass} lookup and removal is a single cursor pass with no + * regular expression. The zero width joiner ({@code U+200D}) and non-joiner ({@code U+200C}) are + * deliberately <em>kept</em>, because they carry meaning in Persian, Indic scripts, and emoji + * sequences; so are variation selectors. 
Use this only for a matching/search form, not for + * display.</p> + */ +public class InvisibleCharSequenceNormalizer implements CharSequenceNormalizer { + + private static final long serialVersionUID = 1L; + + // The replacement is unused: removeAll deletes members rather than substituting them. + private static final CharClass INVISIBLE = CharClass.of(CodePointSet.of( + 0x00AD, // soft hyphen + 0x061C, // arabic letter mark + 0x200B, // zero width space + 0x200E, // left-to-right mark + 0x200F, // right-to-left mark + 0x202A, // left-to-right embedding + 0x202B, // right-to-left embedding + 0x202C, // pop directional formatting + 0x202D, // left-to-right override + 0x202E, // right-to-left override + 0x2060, // word joiner + 0x2061, // function application + 0x2062, // invisible times + 0x2063, // invisible separator + 0x2064, // invisible plus + 0x2066, // left-to-right isolate + 0x2067, // right-to-left isolate + 0x2068, // first strong isolate + 0x2069, // pop directional isolate + 0xFEFF), // zero width no-break space (byte order mark) + 0x0020); + + private static final InvisibleCharSequenceNormalizer INSTANCE = + new InvisibleCharSequenceNormalizer(); + + /** {@return the shared, stateless instance} */ + public static InvisibleCharSequenceNormalizer getInstance() { + return INSTANCE; + } + + @Override + public CharSequence normalize(CharSequence text) { + return INVISIBLE.removeAll(text); + } +} diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/QuoteCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/QuoteCharSequenceNormalizer.java new file mode 100644 index 000000000..acef8dcd0 --- /dev/null +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/QuoteCharSequenceNormalizer.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +/** + * A {@link CharSequenceNormalizer} that folds typographic quotation marks to their ASCII forms: + * the single quotes and apostrophes to {@code '} and the double quotes to {@code "}. + * + * <p>This is high value for matching, since curly quotes, guillemets, and fullwidth quotes + * otherwise prevent {@code "don't"} from matching {@code "don" + U+2019 + "t"}. It is built from + * two {@link CharClass} sets, so membership is O(1) and scanning is a single cursor pass with no + * regular expression. ASCII quotes are left unchanged.</p> + */ +public class QuoteCharSequenceNormalizer implements CharSequenceNormalizer { + + private static final long serialVersionUID = 1L; + + // Single quotes / apostrophes -> U+0027 APOSTROPHE. + private static final CharClass SINGLE = CharClass.of(CodePointSet.of( + 0x2018, // left single quotation mark + 0x2019, // right single quotation mark + 0x201A, // single low-9 quotation mark + 0x201B, // single high-reversed-9 quotation mark + 0x2039, // single left-pointing angle quotation mark + 0x203A, // single right-pointing angle quotation mark + 0x02BC, // modifier letter apostrophe + 0xFF07), // fullwidth apostrophe + '\''); + + // Double quotes -> U+0022 QUOTATION MARK. 
+ private static final CharClass DOUBLE = CharClass.of(CodePointSet.of( + 0x201C, // left double quotation mark + 0x201D, // right double quotation mark + 0x201E, // double low-9 quotation mark + 0x201F, // double high-reversed-9 quotation mark + 0x00AB, // left-pointing double angle quotation mark + 0x00BB, // right-pointing double angle quotation mark + 0x301D, // reversed double prime quotation mark + 0x301E, // double prime quotation mark + 0x301F, // low double prime quotation mark + 0xFF02), // fullwidth quotation mark + '"'); + + private static final QuoteCharSequenceNormalizer INSTANCE = new QuoteCharSequenceNormalizer(); + + /** {@return the shared, stateless instance} */ + public static QuoteCharSequenceNormalizer getInstance() { + return INSTANCE; + } + + @Override + public CharSequence normalize(CharSequence text) { + return DOUBLE.normalize(SINGLE.normalize(text)); + } +} diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TextNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TextNormalizer.java new file mode 100644 index 000000000..c1bac6409 --- /dev/null +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TextNormalizer.java @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +/** + * A fluent builder that composes the normalization rungs into a single + * {@link CharSequenceNormalizer}. + * + * <p>The rungs are applied in the order they are added, so the caller controls the chain. A + * conservative, search-oriented chain is available through {@link #searchDefault()}. Each rung is + * a shared, stateless normalizer; the built normalizer is an {@link AggregateCharSequenceNormalizer} + * that applies them in sequence.</p> + * + * <pre>{@code + * CharSequenceNormalizer n = TextNormalizer.builder() + * .nfc().caseFold().accentFold() + * .build(); + * }</pre> + */ +public final class TextNormalizer { + + private final List<CharSequenceNormalizer> steps = new ArrayList<>(); + + private TextNormalizer() { + } + + /** {@return a new, empty builder} */ + public static TextNormalizer builder() { + return new TextNormalizer(); + } + + /** {@return this builder with NFC canonical composition appended} */ + public TextNormalizer nfc() { + return add(NfcCharSequenceNormalizer.getInstance()); + } + + /** {@return this builder with NFKC compatibility composition appended} */ + public TextNormalizer nfkc() { + return add(NfkcCharSequenceNormalizer.getInstance()); + } + + /** {@return this builder with invisible/bidi control stripping appended} */ + public TextNormalizer stripInvisible() { + return add(InvisibleCharSequenceNormalizer.getInstance()); + } + + /** {@return this builder with Unicode whitespace collapsing appended} */ + public TextNormalizer whitespace() { + return add(WhitespaceCharSequenceNormalizer.getInstance()); + } + + /** {@return this builder with quotation-mark folding appended} */ + public TextNormalizer quotes() { + return add(QuoteCharSequenceNormalizer.getInstance()); + } + + /** {@return this 
builder with dash folding appended} */ + public TextNormalizer dashes() { + return add(DashCharSequenceNormalizer.getInstance()); + } + + /** {@return this builder with decimal-digit folding appended} */ + public TextNormalizer digits() { + return add(DigitCharSequenceNormalizer.getInstance()); + } + + /** {@return this builder with ellipsis expansion appended} */ + public TextNormalizer ellipsis() { + return add(EllipsisCharSequenceNormalizer.getInstance()); + } + + /** {@return this builder with list-bullet replacement appended} */ + public TextNormalizer bullets() { + return add(BulletCharSequenceNormalizer.getInstance()); + } + + /** {@return this builder with case folding appended} */ + public TextNormalizer caseFold() { + return add(CaseFoldCharSequenceNormalizer.getInstance()); + } + + /** {@return this builder with script-gated diacritic folding appended} */ + public TextNormalizer accentFold() { + return add(AccentFoldCharSequenceNormalizer.getInstance()); + } + + /** + * Appends a custom normalizer. + * + * @param custom The normalizer to append. + * @return This builder. 
+ */ + public TextNormalizer with(CharSequenceNormalizer custom) { + return add(Objects.requireNonNull(custom, "custom")); + } + + /** {@return the composed normalizer for the rungs added so far} */ + public CharSequenceNormalizer build() { + return new AggregateCharSequenceNormalizer(steps.toArray(new CharSequenceNormalizer[0])); + } + + /** + * {@return a conservative search/matching chain} + * + * <p>The chain strips invisible controls, applies NFC, collapses whitespace, folds quotes and + * dashes, case folds, and finally applies script-gated diacritic folding.</p> + */ + public static CharSequenceNormalizer searchDefault() { + return builder() + .stripInvisible() + .nfc() + .whitespace() + .quotes() + .dashes() + .caseFold() + .accentFold() + .build(); + } + + private TextNormalizer add(CharSequenceNormalizer normalizer) { + steps.add(normalizer); + return this; + } +} diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizerTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizerTest.java index ba4a6ea4b..5db1a4683 100644 --- a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizerTest.java +++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizerTest.java @@ -53,6 +53,36 @@ public class AccentFoldCharSequenceNormalizerTest { assertEquals("i", fold(cp(0x0131))); // dotless i } + @Test + void testEveryStrokeAndLigatureLetterMaps() { + assertEquals("o", fold(cp(0x00F8))); // o with stroke + assertEquals("O", fold(cp(0x00D8))); // O with stroke + assertEquals("ae", fold(cp(0x00E6))); // ae + assertEquals("AE", fold(cp(0x00C6))); // AE + assertEquals("oe", fold(cp(0x0153))); // oe + assertEquals("OE", fold(cp(0x0152))); // OE + assertEquals("ss", fold(cp(0x00DF))); // eszett + assertEquals("SS", fold(cp(0x1E9E))); // capital eszett + 
assertEquals("th", fold(cp(0x00FE))); // thorn + assertEquals("TH", fold(cp(0x00DE))); // capital thorn + assertEquals("d", fold(cp(0x00F0))); // eth + assertEquals("D", fold(cp(0x00D0))); // capital eth + assertEquals("d", fold(cp(0x0111))); // d with stroke + assertEquals("D", fold(cp(0x0110))); // D with stroke + assertEquals("l", fold(cp(0x0142))); // l with stroke + assertEquals("L", fold(cp(0x0141))); // L with stroke + assertEquals("h", fold(cp(0x0127))); // h with stroke + assertEquals("H", fold(cp(0x0126))); // H with stroke + assertEquals("i", fold(cp(0x0131))); // dotless i + } + + @Test + void testLeadingCombiningMarkWithNoBaseIsKept() { + // A combining mark with no preceding base (baseScript == null) must be kept, not dropped. + final String input = cp(0x0301) + "x"; // combining acute, then x + assertEquals(input, fold(input)); + } + @Test void testFoldsGreekAndCyrillicAccents() { assertEquals(cp(0x03B1), fold(cp(0x03AC))); // Greek alpha with tonos -> alpha diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/SetBasedNormalizerTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/SetBasedNormalizerTest.java new file mode 100644 index 000000000..ea333f06b --- /dev/null +++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/SetBasedNormalizerTest.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertSame; + +public class SetBasedNormalizerTest { + + private static String cp(int codePoint) { + return new String(Character.toChars(codePoint)); + } + + private static String quotes(String text) { + return QuoteCharSequenceNormalizer.getInstance().normalize(text).toString(); + } + + private static String digits(String text) { + return DigitCharSequenceNormalizer.getInstance().normalize(text).toString(); + } + + private static String invisible(String text) { + return InvisibleCharSequenceNormalizer.getInstance().normalize(text).toString(); + } + + private static String ellipsis(String text) { + return EllipsisCharSequenceNormalizer.getInstance().normalize(text).toString(); + } + + private static String bullet(String text) { + return BulletCharSequenceNormalizer.getInstance().normalize(text).toString(); + } + + // --- quotes ------------------------------------------------------------------------------ + + @Test + void testQuotesFoldSingleAndDouble() { + assertEquals("don't", quotes("don" + cp(0x2019) + "t")); // right single quote + assertEquals("\"hi\"", quotes(cp(0x201C) + "hi" + cp(0x201D))); // curly double quotes + assertEquals("\"x\"", quotes(cp(0x00AB) + "x" + cp(0x00BB))); // guillemets + assertEquals("'y'", quotes(cp(0x2039) + "y" + cp(0x203A))); // single angle quotes + assertEquals("'", quotes(cp(0xFF07))); // fullwidth apostrophe + 
assertEquals("\"", quotes(cp(0xFF02))); // fullwidth quotation mark + assertEquals("'", quotes(cp(0x02BC))); // modifier letter apostrophe + } + + @Test + void testQuotesLeaveAsciiAndNonQuotesAlone() { + assertEquals("'a' \"b\"", quotes("'a' \"b\"")); + assertEquals("abc", quotes("abc")); + assertEquals(cp(0x2014), quotes(cp(0x2014))); // em dash is not a quote + } + + @Test + void testQuotesSingleton() { + assertSame(QuoteCharSequenceNormalizer.getInstance(), QuoteCharSequenceNormalizer.getInstance()); + } + + // --- digits ------------------------------------------------------------------------------ + + @Test + void testDigitsMapDecimalDigitsToAscii() { + assertEquals("123", digits(cp(0x0661) + cp(0x0662) + cp(0x0663))); // arabic-indic 1 2 3 + assertEquals("12", digits(cp(0x0967) + cp(0x0968))); // devanagari 1 2 + assertEquals("15", digits(cp(0xFF11) + cp(0xFF15))); // fullwidth 1 5 + assertEquals("a5b", digits("a" + cp(0x0665) + "b")); // arabic-indic 5 + } + + @Test + void testDigitsLeaveAsciiAndNonDecimalNumeralsAlone() { + assertEquals("0123456789", digits("0123456789")); + assertEquals(cp(0x00B2), digits(cp(0x00B2))); // superscript two (category No) + assertEquals(cp(0x2160), digits(cp(0x2160))); // roman numeral one (category Nl) + assertEquals(cp(0x00BD), digits(cp(0x00BD))); // vulgar fraction one half (category No) + assertEquals("abc", digits("abc")); + } + + @Test + void testDigitsSingleton() { + assertSame(DigitCharSequenceNormalizer.getInstance(), DigitCharSequenceNormalizer.getInstance()); + } + + // --- invisible / bidi controls ----------------------------------------------------------- + + @Test + void testInvisibleRemovesFormatAndBidiControls() { + assertEquals("ab", invisible("a" + cp(0xFEFF) + "b")); // byte order mark + assertEquals("ab", invisible("a" + cp(0x200B) + "b")); // zero width space + assertEquals("ab", invisible("a" + cp(0x2060) + "b")); // word joiner + assertEquals("softhyphen", invisible("soft" + cp(0x00AD) + "hyphen")); + 
assertEquals("evil", invisible(cp(0x202E) + "evil" + cp(0x202C))); // bidi override + pop + } + + @Test + void testInvisibleKeepsJoinersVariationSelectorsAndText() { + final String zwj = "a" + cp(0x200D) + "b"; // zero width joiner is meaningful + assertEquals(zwj, invisible(zwj)); + final String zwnj = "a" + cp(0x200C) + "b"; // zero width non-joiner is meaningful + assertEquals(zwnj, invisible(zwnj)); + final String family = cp(0x1F468) + cp(0x200D) + cp(0x1F469); // ZWJ emoji sequence preserved + assertEquals(family, invisible(family)); + assertEquals("hello", invisible("hello")); + } + + @Test + void testInvisibleSingleton() { + assertSame(InvisibleCharSequenceNormalizer.getInstance(), + InvisibleCharSequenceNormalizer.getInstance()); + } + + // --- ellipsis ---------------------------------------------------------------------------- + + @Test + void testEllipsisExpandsToAsciiDots() { + assertEquals("...", ellipsis(cp(0x2026))); // horizontal ellipsis + assertEquals("wait...", ellipsis("wait" + cp(0x2026))); + assertEquals("..", ellipsis(cp(0x2025))); // two dot leader + assertEquals("...", ellipsis("...")); // ascii dots unchanged + } + + @Test + void testEllipsisSingleton() { + assertSame(EllipsisCharSequenceNormalizer.getInstance(), + EllipsisCharSequenceNormalizer.getInstance()); + } + + // --- bullets ----------------------------------------------------------------------------- + + @Test + void testBulletsBecomeSeparatorSpaces() { + assertEquals(" item", bullet(cp(0x2022) + "item")); // bullet + assertEquals(" item", bullet(cp(0x25E6) + "item")); // white bullet + assertEquals("a b", bullet("a" + cp(0x2043) + "b")); // hyphen bullet + } + + @Test + void testBulletsLeaveMiddleDotAndTextAlone() { + assertEquals(cp(0x00B7), bullet(cp(0x00B7))); // middle dot kept (Catalan) + assertEquals("plain", bullet("plain")); + } + + @Test + void testBulletSingleton() { + assertSame(BulletCharSequenceNormalizer.getInstance(), + 
BulletCharSequenceNormalizer.getInstance()); + } +} diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TextNormalizerTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TextNormalizerTest.java new file mode 100644 index 000000000..64aa6df3a --- /dev/null +++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TextNormalizerTest.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package opennlp.tools.util.normalizer; + +import java.util.Locale; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class TextNormalizerTest { + + private static String cp(int codePoint) { + return new String(Character.toChars(codePoint)); + } + + @Test + void testRungsApplyInOrder() { + final CharSequenceNormalizer n = TextNormalizer.builder().caseFold().accentFold().build(); + assertEquals("cafe", n.normalize("CAF" + cp(0x00C9)).toString()); // CAFE-acute -> cafe + } + + @Test + void testEmptyBuilderIsIdentity() { + assertEquals("UnChanged", TextNormalizer.builder().build().normalize("UnChanged").toString()); + } + + @Test + void testWhitespaceAndFoldChain() { + final CharSequenceNormalizer n = TextNormalizer.builder() + .nfc().whitespace().caseFold().accentFold().build(); + assertEquals("cafe", n.normalize(" CAF" + cp(0x00C9) + " ").toString()); + } + + @Test + void testWithCustomNormalizer() { + final CharSequenceNormalizer up = s -> s.toString().toUpperCase(Locale.ROOT); + assertEquals("AB", TextNormalizer.builder().with(up).build().normalize("ab").toString()); + } + + @Test + void testWithRejectsNull() { + assertThrows(NullPointerException.class, () -> TextNormalizer.builder().with(null)); + } + + @Test + void testSearchDefaultCleansMessyInput() { + // BOM + curly-quoted, mixed-case, accented text -> stripped, ASCII-quoted, folded. + final String input = cp(0xFEFF) + cp(0x201C) + "Caf" + cp(0x00C9) + cp(0x201D); + assertEquals("\"cafe\"", TextNormalizer.searchDefault().normalize(input).toString()); + } + + @Test + void testEveryRungIsInvokable() { + final CharSequenceNormalizer n = TextNormalizer.builder() + .stripInvisible().nfc().nfkc().whitespace().quotes().dashes().digits().ellipsis().bullets() + .caseFold().accentFold().build(); + // BOM stripped, Arabic-Indic 1 -> 1, case + accent folded. 
+ final String input = cp(0xFEFF) + "CAF" + cp(0x00C9) + " " + cp(0x0661); + assertEquals("cafe 1", n.normalize(input).toString()); + } +}
