This is an automated email from the ASF dual-hosted git repository. krickert pushed a commit to branch OPENNLP-1850_Whitespace-UTF-Normalizae in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit 0d53e31bb3f6774ad4eb2ad3c8171b64ef5bfdce Author: Kristian Rickert <[email protected]> AuthorDate: Thu Jun 18 22:28:13 2026 -0400 OPENNLP-1850 - Add robust character sequence normalization utilities and tests Co-authored-by: Junie <[email protected]> Signed-off-by: Kristian Rickert <[email protected]> --- opennlp-api/pom.xml | 6 + .../opennlp/tools/util/normalizer/CharClass.java | 383 +++++++++++++++++++++ .../tools/util/normalizer/CodePointSet.java | 245 +++++++++++++ .../tools/util/normalizer/NormalizedText.java | 51 +++ .../opennlp/tools/util/normalizer/OffsetMap.java | 135 ++++++++ .../opennlp/tools/util/normalizer/UnicodeDash.java | 189 ++++++++++ .../tools/util/normalizer/UnicodeWhitespace.java | 242 +++++++++++++ .../tools/util/normalizer/CharClassTest.java | 292 ++++++++++++++++ .../tools/util/normalizer/CodePointSetTest.java | 241 +++++++++++++ .../tools/util/normalizer/UnicodeDashTest.java | 170 +++++++++ .../util/normalizer/UnicodeWhitespaceTest.java | 239 +++++++++++++ .../src/main/java/opennlp/dl/AbstractDL.java | 33 ++ .../opennlp/dl/doccat/DocumentCategorizerDL.java | 13 +- .../java/opennlp/dl/namefinder/NameFinderDL.java | 90 +++-- .../java/opennlp/dl/AbstractDLChunkingTest.java | 61 ++++ .../opennlp/dl/namefinder/NameFinderDLTest.java | 33 +- .../AccentFoldCharSequenceNormalizer.java | 133 +++++++ .../normalizer/CaseFoldCharSequenceNormalizer.java | 47 +++ .../normalizer/DashCharSequenceNormalizer.java | 45 +++ .../util/normalizer/NfcCharSequenceNormalizer.java | 45 +++ .../normalizer/NfkcCharSequenceNormalizer.java | 46 +++ .../WhitespaceCharSequenceNormalizer.java | 46 +++ .../AccentFoldCharSequenceNormalizerTest.java | 115 +++++++ .../UnicodeCharSequenceNormalizerTest.java | 97 ++++++ 24 files changed, 2960 insertions(+), 37 deletions(-) diff --git a/opennlp-api/pom.xml b/opennlp-api/pom.xml index 05404d154..516d9baec 100644 --- a/opennlp-api/pom.xml +++ b/opennlp-api/pom.xml @@ -49,6 +49,12 @@ <artifactId>junit-jupiter-engine</artifactId> 
<scope>test</scope> </dependency> + + <dependency> + <groupId>org.junit.jupiter</groupId> + <artifactId>junit-jupiter-params</artifactId> + <scope>test</scope> + </dependency> </dependencies> </project> \ No newline at end of file diff --git a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/CharClass.java b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/CharClass.java new file mode 100644 index 000000000..766f3324e --- /dev/null +++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/CharClass.java @@ -0,0 +1,383 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +import opennlp.tools.util.Span; + +/** + * A configurable class of Unicode code points and the cursor based operations over it. + * + * <p>A {@code CharClass} pairs a {@link CodePointSet} of member code points with a single + * canonical ASCII {@code replacement} code point. 
Whitespace and dashes are the two built-in + * presets ({@link #whitespace()}, {@link #dashes()}); any other class is one more configured + * instance with no new engine code.</p> + * + * <p>Every operation is a single forward pass that reads one code point + * ({@link Character#codePointAt(CharSequence, int)}), tests membership in O(1), acts, and advances + * by {@link Character#charCount(int)}. There is no regular expression, no {@link java.util.regex} + * allocation, and no reliance on {@link Character#isWhitespace(int)} or + * {@link Character#isSpaceChar(int)}, all of which disagree with the Unicode standard.</p> + * + * <p>Instances are immutable and thread-safe.</p> + */ +public final class CharClass { + + private static final CharClass WHITESPACE = + new CharClass(CodePointSet.of(UnicodeWhitespace.codePoints()), 0x0020); + private static final CharClass DASHES = + new CharClass(CodePointSet.of(UnicodeDash.defaultDashCodePoints()), UnicodeDash.HYPHEN_MINUS); + + private final CodePointSet members; + private final int replacement; + + private CharClass(CodePointSet members, int replacement) { + this.members = members; + this.replacement = replacement; + } + + /** + * Creates a class from a member set and a replacement code point. + * + * @param members The member code points. + * @param replacement The canonical code point used by {@link #normalize(CharSequence)} and + * {@link #collapse(CharSequence)}. + * @return The class. + * @throws IllegalArgumentException Thrown if {@code replacement} is not a valid code point. 
+ */ + public static CharClass of(CodePointSet members, int replacement) { + Objects.requireNonNull(members, "members"); + requireValidCodePoint(replacement); + return new CharClass(members, replacement); + } + + /** {@return the whitespace preset: the Unicode {@code White_Space} set, replacement {@code U+0020}} */ + public static CharClass whitespace() { + return WHITESPACE; + } + + /** + * {@return the dash preset: the Unicode {@code Dash} set excluding the mathematical minus signs, + * replacement {@code U+002D}} + */ + public static CharClass dashes() { + return DASHES; + } + + /** + * Returns a copy of this class whose member set is extended with {@code extra} (for example, + * user-defined code points loaded from {@link CodePointSet#fromFile}). + * + * @param extra The additional member code points. + * @return A new {@code CharClass}; this instance is unchanged. + */ + public CharClass withAdditional(CodePointSet extra) { + Objects.requireNonNull(extra, "extra"); + return new CharClass(members.union(extra), replacement); + } + + /** {@return the member code points of this class} */ + public CodePointSet members() { + return members; + } + + /** {@return the canonical replacement code point} */ + public int replacement() { + return replacement; + } + + /** + * Tests membership. + * + * @param codePoint The code point to test. + * @return {@code true} if the code point is a member of this class. + */ + public boolean contains(int codePoint) { + return members.contains(codePoint); + } + + /** + * Splits text into the maximal runs of non-member code points, as character spans into the + * original text. Runs of members are delimiters and produce no empty spans. + * + * @param text The text to split. + * @return The token spans, in order. 
+ */ + public List<Span> splitSpans(CharSequence text) { + Objects.requireNonNull(text, "text"); + final List<Span> spans = new ArrayList<>(); + final int length = text.length(); + int tokenStart = -1; + int i = 0; + while (i < length) { + final int codePoint = Character.codePointAt(text, i); + if (members.contains(codePoint)) { + if (tokenStart >= 0) { + spans.add(new Span(tokenStart, i)); + tokenStart = -1; + } + } else if (tokenStart < 0) { + tokenStart = i; + } + i += Character.charCount(codePoint); + } + if (tokenStart >= 0) { + spans.add(new Span(tokenStart, length)); + } + return spans; + } + + /** + * Splits text into the maximal runs of non-member code points. + * + * @param text The text to split. + * @return The tokens, in order, with no empty entries. + */ + public String[] split(CharSequence text) { + final List<Span> spans = splitSpans(text); + final String[] tokens = new String[spans.size()]; + for (int i = 0; i < spans.size(); i++) { + final Span span = spans.get(i); + tokens[i] = text.subSequence(span.getStart(), span.getEnd()).toString(); + } + return tokens; + } + + /** + * Replaces each member code point with the replacement, one for one. + * + * @param text The text to normalize. + * @return The normalized text. + */ + public String normalize(CharSequence text) { + Objects.requireNonNull(text, "text"); + final StringBuilder out = new StringBuilder(text.length()); + final int length = text.length(); + int i = 0; + while (i < length) { + final int codePoint = Character.codePointAt(text, i); + out.appendCodePoint(members.contains(codePoint) ? replacement : codePoint); + i += Character.charCount(codePoint); + } + return out.toString(); + } + + /** + * Collapses each maximal run of member code points to a single replacement. + * + * @param text The text to collapse. + * @return The collapsed text. 
+ */ + public String collapse(CharSequence text) { + Objects.requireNonNull(text, "text"); + final StringBuilder out = new StringBuilder(text.length()); + final int length = text.length(); + int i = 0; + while (i < length) { + final int codePoint = Character.codePointAt(text, i); + if (members.contains(codePoint)) { + out.appendCodePoint(replacement); + i = skipRun(text, i); + } else { + out.appendCodePoint(codePoint); + i += Character.charCount(codePoint); + } + } + return out.toString(); + } + + /** + * Collapses runs of members like {@link #collapse(CharSequence)}, but emits + * {@code keepReplacement} instead of the usual replacement for any run that contains a code + * point in {@code keep}. The whitespace "squish" that preserves a line break uses this with the + * line-break code points as {@code keep} and {@code '\n'} as {@code keepReplacement}. + * + * @param text The text to collapse. + * @param keep The member code points whose presence in a run preserves structure. + * @param keepReplacement The replacement emitted for a run that contains a {@code keep} member. + * @return The collapsed text. + * @throws IllegalArgumentException Thrown if {@code keepReplacement} is not a valid code point. + */ + public String collapsePreserving(CharSequence text, CodePointSet keep, int keepReplacement) { + Objects.requireNonNull(text, "text"); + Objects.requireNonNull(keep, "keep"); + requireValidCodePoint(keepReplacement); + final StringBuilder out = new StringBuilder(text.length()); + final int length = text.length(); + int i = 0; + while (i < length) { + final int codePoint = Character.codePointAt(text, i); + if (members.contains(codePoint)) { + boolean preserve = keep.contains(codePoint); + int j = i + Character.charCount(codePoint); + while (j < length) { + final int next = Character.codePointAt(text, j); + if (!members.contains(next)) { + break; + } + preserve |= keep.contains(next); + j += Character.charCount(next); + } + out.appendCodePoint(preserve ? 
keepReplacement : replacement); + i = j; + } else { + out.appendCodePoint(codePoint); + i += Character.charCount(codePoint); + } + } + return out.toString(); + } + + /** + * Removes leading and trailing member code points. + * + * @param text The text to trim. + * @return The trimmed text. + */ + public String trim(CharSequence text) { + Objects.requireNonNull(text, "text"); + final int length = text.length(); + int start = 0; + while (start < length) { + final int codePoint = Character.codePointAt(text, start); + if (!members.contains(codePoint)) { + break; + } + start += Character.charCount(codePoint); + } + int end = length; + while (end > start) { + final int codePoint = Character.codePointBefore(text, end); + if (!members.contains(codePoint)) { + break; + } + end -= Character.charCount(codePoint); + } + return text.subSequence(start, end).toString(); + } + + /** + * Removes every member code point. + * + * @param text The text to filter. + * @return The text with all members removed. + */ + public String removeAll(CharSequence text) { + Objects.requireNonNull(text, "text"); + final StringBuilder out = new StringBuilder(text.length()); + final int length = text.length(); + int i = 0; + while (i < length) { + final int codePoint = Character.codePointAt(text, i); + if (!members.contains(codePoint)) { + out.appendCodePoint(codePoint); + } + i += Character.charCount(codePoint); + } + return out.toString(); + } + + /** + * Like {@link #normalize(CharSequence)} but also produces the {@link OffsetMap} back to the + * original text. + * + * @param text The text to normalize. + * @return The normalized text and its offset map. 
+ */ + public NormalizedText normalizeMapped(CharSequence text) { + Objects.requireNonNull(text, "text"); + final StringBuilder out = new StringBuilder(text.length()); + final OffsetMap.Builder offsets = new OffsetMap.Builder(); + final int length = text.length(); + int i = 0; + while (i < length) { + final int codePoint = Character.codePointAt(text, i); + if (members.contains(codePoint)) { + appendMapped(out, replacement, offsets, i, i); + } else { + appendMapped(out, codePoint, offsets, i, i + 1); + } + i += Character.charCount(codePoint); + } + return new NormalizedText(text, out.toString(), offsets.build(length)); + } + + /** + * Like {@link #collapse(CharSequence)} but also produces the {@link OffsetMap} back to the + * original text. Each collapsed run maps to the run's start offset. + * + * @param text The text to collapse. + * @return The collapsed text and its offset map. + */ + public NormalizedText collapseMapped(CharSequence text) { + Objects.requireNonNull(text, "text"); + final StringBuilder out = new StringBuilder(text.length()); + final OffsetMap.Builder offsets = new OffsetMap.Builder(); + final int length = text.length(); + int i = 0; + while (i < length) { + final int codePoint = Character.codePointAt(text, i); + if (members.contains(codePoint)) { + appendMapped(out, replacement, offsets, i, i); + i = skipRun(text, i); + } else { + appendMapped(out, codePoint, offsets, i, i + 1); + i += Character.charCount(codePoint); + } + } + return new NormalizedText(text, out.toString(), offsets.build(length)); + } + + // Appends one code point to the output and records an original offset for each output char. + // firstOffset maps the first (or only) char; secondOffset maps the low surrogate of a + // supplementary code point. 
+ private static void appendMapped(StringBuilder out, int codePoint, OffsetMap.Builder offsets, + int firstOffset, int secondOffset) { + if (Character.isBmpCodePoint(codePoint)) { + out.append((char) codePoint); + offsets.map(firstOffset); + } else { + out.append(Character.highSurrogate(codePoint)); + offsets.map(firstOffset); + out.append(Character.lowSurrogate(codePoint)); + offsets.map(secondOffset); + } + } + + // Returns the offset just past the maximal run of members starting at runStart. + private int skipRun(CharSequence text, int runStart) { + final int length = text.length(); + int i = runStart; + while (i < length) { + final int codePoint = Character.codePointAt(text, i); + if (!members.contains(codePoint)) { + break; + } + i += Character.charCount(codePoint); + } + return i; + } + + private static void requireValidCodePoint(int codePoint) { + if (codePoint < 0 || codePoint > Character.MAX_CODE_POINT) { + throw new IllegalArgumentException("Not a Unicode code point: " + codePoint); + } + } +} diff --git a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/CodePointSet.java b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/CodePointSet.java new file mode 100644 index 000000000..a15b005b0 --- /dev/null +++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/CodePointSet.java @@ -0,0 +1,245 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.BitSet; +import java.util.List; +import java.util.Locale; +import java.util.Objects; + +/** + * An immutable set of Unicode code points with O(1) membership. + * + * <p>Backed by a {@link BitSet} keyed directly by code point, so {@link #contains(int)} is a + * single array-word read with no boxing, hashing, or branching beyond a range check. Memory is + * bounded by the largest member code point (the whole of Unicode would cost about {@code 136 KiB}, + * and the standard whitespace and dash sets are entirely or almost entirely in the Basic + * Multilingual Plane, so a few kilobytes in practice).</p> + * + * <p>This type carries no opinion about what the code points mean. It is the explicit, + * standards-sourced data layer that {@link CharClass} and the reference tables + * ({@link UnicodeWhitespace}, {@link UnicodeDash}) are built from, and that users extend or + * override through {@link #fromFile(Path, String)}.</p> + */ +public final class CodePointSet { + + private final BitSet members; + + private CodePointSet(BitSet members) { + this.members = members; + } + + /** + * Creates a set from explicit code points. + * + * @param codePoints The code points to include. + * @return The set. + * @throws IllegalArgumentException Thrown if any value is not a valid Unicode code point + * (outside {@code [0, U+10FFFF]}). + */ + public static CodePointSet of(int... 
codePoints) { + final BitSet members = new BitSet(); + for (final int codePoint : codePoints) { + requireValid(codePoint); + members.set(codePoint); + } + return new CodePointSet(members); + } + + /** + * Creates a set covering an inclusive code point range. + * + * @param firstInclusive The first code point in the range. + * @param lastInclusive The last code point in the range. + * @return The set. + * @throws IllegalArgumentException Thrown if either bound is invalid or {@code firstInclusive} + * is greater than {@code lastInclusive}. + */ + public static CodePointSet ofRange(int firstInclusive, int lastInclusive) { + requireValid(firstInclusive); + requireValid(lastInclusive); + if (firstInclusive > lastInclusive) { + throw new IllegalArgumentException("Range start " + firstInclusive + + " must not exceed range end " + lastInclusive + "."); + } + final BitSet members = new BitSet(); + members.set(firstInclusive, lastInclusive + 1); + return new CodePointSet(members); + } + + /** + * Loads the code points declared under one section of a user definitions file. + * + * <p>The format is line oriented and parsed with simple cursor scanning, not a regular + * expression: a {@code [name]} line opens a section; a {@code #} begins a comment that runs to + * end of line; each remaining line is a single hex code point ({@code U+00A0}, {@code 0x00A0}, + * or {@code 00A0}) or an inclusive range ({@code U+2000-U+200A}). Section names match case + * insensitively. Only entries under the requested section are returned, so one file can carry, + * for example, both {@code [whitespace]} and {@code [dash]} sections.</p> + * + * @param definitions The file to read (UTF-8). + * @param section The section whose entries should be loaded. + * @return The code points declared under {@code section}, or an empty set if the section is + * absent. + * @throws IOException Thrown if the file cannot be read. 
+ * @throws IllegalArgumentException Thrown if a line is malformed, naming the offending line. + */ + public static CodePointSet fromFile(Path definitions, String section) throws IOException { + Objects.requireNonNull(definitions, "definitions"); + return parse(Files.readAllLines(definitions, StandardCharsets.UTF_8), section); + } + + // Package visible so the parser can be exercised directly, without a temporary file. + static CodePointSet parse(List<String> lines, String section) { + Objects.requireNonNull(section, "section"); + final String wanted = section.trim().toLowerCase(Locale.ROOT); + final BitSet members = new BitSet(); + String current = null; + + for (int i = 0; i < lines.size(); i++) { + final String raw = lines.get(i); + final int lineNumber = i + 1; + final String line = stripComment(raw).strip(); + if (line.isEmpty()) { + continue; + } + if (line.charAt(0) == '[') { + if (line.length() < 3 || line.charAt(line.length() - 1) != ']') { + throw malformed("section header", lineNumber, raw); + } + current = line.substring(1, line.length() - 1).strip().toLowerCase(Locale.ROOT); + continue; + } + if (current == null) { + throw new IllegalArgumentException("Code point entry before any [section] header on line " + + lineNumber + ": " + raw); + } + if (wanted.equals(current)) { + addEntry(members, line, lineNumber, raw); + } + } + + return new CodePointSet(members); + } + + private static void addEntry(BitSet members, String line, int lineNumber, String raw) { + final int separator = line.indexOf('-'); + if (separator < 0) { + members.set(parseCodePoint(line, lineNumber, raw)); + return; + } + final int low = parseCodePoint(line.substring(0, separator).strip(), lineNumber, raw); + final int high = parseCodePoint(line.substring(separator + 1).strip(), lineNumber, raw); + if (low > high) { + throw new IllegalArgumentException("Descending code point range on line " + + lineNumber + ": " + raw); + } + members.set(low, high + 1); + } + + private static int 
parseCodePoint(String token, int lineNumber, String raw) { + String hex = token; + if (hex.length() >= 2) { + final String prefix = hex.substring(0, 2).toLowerCase(Locale.ROOT); + if (prefix.equals("u+") || prefix.equals("0x")) { + hex = hex.substring(2); + } + } + if (hex.isEmpty()) { + throw malformed("code point", lineNumber, raw); + } + final int codePoint; + try { + codePoint = Integer.parseInt(hex, 16); + } catch (NumberFormatException e) { + throw new IllegalArgumentException("Invalid hex code point '" + token + "' on line " + + lineNumber + ": " + raw, e); + } + if (codePoint < 0 || codePoint > Character.MAX_CODE_POINT) { + throw new IllegalArgumentException("Code point out of range on line " + + lineNumber + ": " + raw); + } + return codePoint; + } + + private static String stripComment(String raw) { + final int hash = raw.indexOf('#'); + return hash < 0 ? raw : raw.substring(0, hash); + } + + private static IllegalArgumentException malformed(String what, int lineNumber, String raw) { + return new IllegalArgumentException("Malformed " + what + " on line " + lineNumber + ": " + raw); + } + + private static void requireValid(int codePoint) { + if (codePoint < 0 || codePoint > Character.MAX_CODE_POINT) { + throw new IllegalArgumentException("Not a Unicode code point: " + codePoint); + } + } + + /** + * Tests membership. + * + * @param codePoint The code point to test. Out-of-range values return {@code false}. + * @return {@code true} if the code point is in this set. + */ + public boolean contains(int codePoint) { + return codePoint >= 0 && codePoint <= Character.MAX_CODE_POINT && members.get(codePoint); + } + + /** + * Returns a new set containing every code point in this set or {@code other}. + * + * @param other The set to union with. + * @return The union, a new set; neither input is modified. 
+ */ + public CodePointSet union(CodePointSet other) { + Objects.requireNonNull(other, "other"); + final BitSet merged = (BitSet) members.clone(); + merged.or(other.members); + return new CodePointSet(merged); + } + + /** {@return the number of code points in this set} */ + public int size() { + return members.cardinality(); + } + + /** {@return whether this set is empty} */ + public boolean isEmpty() { + return members.isEmpty(); + } + + /** {@return the member code points, in ascending order} */ + public int[] toArray() { + return members.stream().toArray(); + } + + @Override + public boolean equals(Object o) { + return o instanceof CodePointSet other && members.equals(other.members); + } + + @Override + public int hashCode() { + return members.hashCode(); + } +} diff --git a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/NormalizedText.java b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/NormalizedText.java new file mode 100644 index 000000000..87678d741 --- /dev/null +++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/NormalizedText.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package opennlp.tools.util.normalizer; + +/** + * The result of a normalization that keeps the original text alongside the normalized form. + * + * <p>The original is the source of truth (display, offsets, language-specific analysis); the + * normalized form is a derived view tuned for matching and search. The {@link OffsetMap} ties the + * two together so a position in the normalized text can be reported against the original.</p> + * + * @param original The untouched source text. + * @param normalized The normalized text. + * @param offsets The mapping between normalized and original character offsets. + */ +public record NormalizedText(CharSequence original, String normalized, OffsetMap offsets) { + + /** + * Maps a normalized character offset back to the original text. + * + * @param normalizedOffset An offset in {@code [0, normalized().length()]}. + * @return The corresponding original character offset. + */ + public int toOriginalOffset(int normalizedOffset) { + return offsets.toOriginalOffset(normalizedOffset); + } + + /** + * Maps an original character offset forward to the normalized text. + * + * @param originalOffset An offset in {@code [0, original().length()]}. + * @return The corresponding normalized character offset. + */ + public int toNormalizedOffset(int originalOffset) { + return offsets.toNormalizedOffset(originalOffset); + } +} diff --git a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/OffsetMap.java b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/OffsetMap.java new file mode 100644 index 000000000..24fa558cf --- /dev/null +++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/OffsetMap.java @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import java.util.Arrays; + +/** + * A mapping between character offsets in a normalized string and the original text it came from. + * + * <p>Normalization that collapses runs or substitutes supplementary characters changes string + * length, so an offset into the normalized form no longer lines up with the original. This map + * records, for every normalized character, the original character offset it was produced from, + * which lets a match found in the normalized form be reported in original coordinates.</p> + * + * <p>The internal mapping is non-decreasing, so {@link #toOriginalOffset(int)} is a direct array + * read (O(1)) and {@link #toNormalizedOffset(int)} is a binary search (O(log n)). The map is + * built in the same single cursor pass that produces the normalized text, via {@link Builder}.</p> + */ +public final class OffsetMap { + + // normalizedToOriginal[k] is the original char offset that produced normalized char k. + // It has one extra trailing slot mapping the end of the normalized text to the end of the + // original text, so offsets in [0, normalizedLength] are all valid. 
+ private final int[] normalizedToOriginal; + private final int originalLength; + + private OffsetMap(int[] normalizedToOriginal, int originalLength) { + this.normalizedToOriginal = normalizedToOriginal; + this.originalLength = originalLength; + } + + /** + * Maps a normalized character offset back to the original text. + * + * @param normalizedOffset An offset in {@code [0, normalizedLength]}. + * @return The corresponding original character offset. + * @throws IndexOutOfBoundsException Thrown if {@code normalizedOffset} is out of range. + */ + public int toOriginalOffset(int normalizedOffset) { + if (normalizedOffset < 0 || normalizedOffset >= normalizedToOriginal.length) { + throw new IndexOutOfBoundsException("normalized offset " + normalizedOffset + + " is outside [0, " + normalizedLength() + "]"); + } + return normalizedToOriginal[normalizedOffset]; + } + + /** + * Maps an original character offset forward to the normalized text. + * + * <p>Returns the first normalized offset whose source is at or after {@code originalOffset}. + * When several original characters collapse to one normalized character, they all map to that + * single normalized offset.</p> + * + * @param originalOffset An offset in {@code [0, originalLength]}. + * @return The corresponding normalized character offset. + * @throws IndexOutOfBoundsException Thrown if {@code originalOffset} is out of range. 
+ */ + public int toNormalizedOffset(int originalOffset) { + if (originalOffset < 0 || originalOffset > originalLength) { + throw new IndexOutOfBoundsException("original offset " + originalOffset + + " is outside [0, " + originalLength + "]"); + } + int low = 0; + int high = normalizedToOriginal.length - 1; + int answer = normalizedToOriginal.length - 1; + while (low <= high) { + final int mid = (low + high) >>> 1; + if (normalizedToOriginal[mid] >= originalOffset) { + answer = mid; + high = mid - 1; + } else { + low = mid + 1; + } + } + return answer; + } + + /** {@return the length of the normalized text this map was built for} */ + public int normalizedLength() { + return normalizedToOriginal.length - 1; + } + + /** {@return the length of the original text this map was built for} */ + public int originalLength() { + return originalLength; + } + + /** + * Builds an {@link OffsetMap} incrementally during a normalization pass. Call {@link #map(int)} + * once for each character appended to the normalized output, then {@link #build(int)} once. + */ + public static final class Builder { + + private int[] buffer = new int[16]; + private int length; + + /** + * Records the original character offset that produced the next normalized character. + * + * @param originalOffset The source offset in the original text. + */ + public void map(int originalOffset) { + if (length == buffer.length) { + buffer = Arrays.copyOf(buffer, buffer.length * 2); + } + buffer[length++] = originalOffset; + } + + /** + * Finalizes the map. + * + * @param originalLength The length of the original text (used as the trailing sentinel). + * @return The immutable {@link OffsetMap}. 
/**
 * Static reference table of the Unicode dash characters, with constant-time membership tests.
 *
 * <p>The table lists every code point that has the Unicode {@code Dash} property (Unicode
 * Character Database, {@code PropList.txt}). That property is wider than the {@code Pd} (dash
 * punctuation) general category, because it also covers the swung dash ({@code Po}) and the
 * mathematical minus signs ({@code Sm}). Java has no {@code Dash} predicate, and {@code \p{Pd}}
 * would leave out the {@code Po} and {@code Sm} members, so the set is spelled out here.</p>
 *
 * <p>Two points matter when the table is used for normalization:</p>
 * <ul>
 *   <li>The three mathematical minus signs ({@code U+207B}, {@code U+208B}, {@code U+2212}, all
 *   category {@code Sm}) are left out of {@link #defaultDashCodePoints()}, because rewriting them
 *   to {@code U+002D} can change mathematical meaning. Callers that want them can still obtain
 *   them from {@link #codePoints()}.</li>
 *   <li>{@code U+00AD} SOFT HYPHEN is intentionally not listed: it is a format character
 *   ({@code White_Space=no}, {@code Dash=no}), an invisible line-break hint, and must not become
 *   a visible hyphen.</li>
 * </ul>
 */
public final class UnicodeDash {

  /** {@code U+002D} HYPHEN-MINUS, the ASCII dash that the other dashes are normalized to. */
  public static final int HYPHEN_MINUS = 0x002D;

  /** The Unicode general categories that occur among the dash code points. */
  public enum Category {
    /** {@code Pd} - dash punctuation. */
    Pd,
    /** {@code Po} - other punctuation (the swung dash). */
    Po,
    /** {@code Sm} - math symbol (the minus signs). */
    Sm
  }

  /**
   * A single dash code point together with its reference attributes.
   *
   * @param codePoint The Unicode code point.
   * @param name The Unicode character name, lower cased.
   * @param category The Unicode general {@link Category category}.
   */
  public record DashCharacter(int codePoint, String name, Category category) {

    /** {@return whether this is a mathematical minus sign (category {@code Sm})} */
    public boolean isMathematical() {
      return Category.Sm == category;
    }

    /** {@return whether this code point is outside the Basic Multilingual Plane} */
    public boolean isSupplementary() {
      return codePoint >= Character.MIN_SUPPLEMENTARY_CODE_POINT;
    }

    /** {@return the {@code U+XXXX} notation for this code point} */
    public String toUnicodeNotation() {
      return "U+%04X".formatted(codePoint);
    }
  }

  // Master table, in ascending code point order. Every other table is derived from it.
  private static final List<DashCharacter> DASHES = List.of(
      new DashCharacter(0x002D, "hyphen-minus", Category.Pd),
      new DashCharacter(0x058A, "armenian hyphen", Category.Pd),
      new DashCharacter(0x05BE, "hebrew punctuation maqaf", Category.Pd),
      new DashCharacter(0x1400, "canadian syllabics hyphen", Category.Pd),
      new DashCharacter(0x1806, "mongolian todo soft hyphen", Category.Pd),
      new DashCharacter(0x2010, "hyphen", Category.Pd),
      new DashCharacter(0x2011, "non-breaking hyphen", Category.Pd),
      new DashCharacter(0x2012, "figure dash", Category.Pd),
      new DashCharacter(0x2013, "en dash", Category.Pd),
      new DashCharacter(0x2014, "em dash", Category.Pd),
      new DashCharacter(0x2015, "horizontal bar", Category.Pd),
      new DashCharacter(0x2053, "swung dash", Category.Po),
      new DashCharacter(0x207B, "superscript minus", Category.Sm),
      new DashCharacter(0x208B, "subscript minus", Category.Sm),
      new DashCharacter(0x2212, "minus sign", Category.Sm),
      new DashCharacter(0x2E17, "double oblique hyphen", Category.Pd),
      new DashCharacter(0x2E1A, "hyphen with diaeresis", Category.Pd),
      new DashCharacter(0x2E3A, "two-em dash", Category.Pd),
      new DashCharacter(0x2E3B, "three-em dash", Category.Pd),
      new DashCharacter(0x2E40, "double hyphen", Category.Pd),
      new DashCharacter(0x2E5D, "oblique hyphen", Category.Pd),
      new DashCharacter(0x301C, "wave dash", Category.Pd),
      new DashCharacter(0x3030, "wavy dash", Category.Pd),
      new DashCharacter(0x30A0, "katakana-hiragana double hyphen", Category.Pd),
      new DashCharacter(0xFE31, "presentation form for vertical em dash", Category.Pd),
      new DashCharacter(0xFE32, "presentation form for vertical en dash", Category.Pd),
      new DashCharacter(0xFE58, "small em dash", Category.Pd),
      new DashCharacter(0xFE63, "small hyphen-minus", Category.Pd),
      new DashCharacter(0xFF0D, "fullwidth hyphen-minus", Category.Pd),
      new DashCharacter(0x10D6E, "garay hyphen", Category.Pd),
      new DashCharacter(0x10EAD, "yezidi hyphenation mark", Category.Pd));

  private static final Map<Integer, DashCharacter> BY_CODE_POINT = new HashMap<>();
  private static final BitSet MEMBERSHIP = new BitSet();
  private static final List<DashCharacter> MATHEMATICAL = new ArrayList<>();
  private static final int[] CODE_POINTS;
  private static final int[] DEFAULT_CODE_POINTS;

  static {
    for (final DashCharacter entry : DASHES) {
      final int codePoint = entry.codePoint();
      BY_CODE_POINT.put(codePoint, entry);
      MEMBERSHIP.set(codePoint);
      if (entry.isMathematical()) {
        MATHEMATICAL.add(entry);
      }
    }
    // Both arrays inherit the ascending order of the master table.
    CODE_POINTS = DASHES.stream()
        .mapToInt(DashCharacter::codePoint)
        .toArray();
    DEFAULT_CODE_POINTS = DASHES.stream()
        .filter(entry -> !entry.isMathematical())
        .mapToInt(DashCharacter::codePoint)
        .toArray();
  }

  private UnicodeDash() {
  }

  /**
   * Tests whether a code point carries the Unicode {@code Dash} property.
   *
   * @param codePoint The code point to test. Values outside the valid code point range
   *                  return {@code false}.
   * @return {@code true} if the code point is one of the Unicode dash characters.
   */
  public static boolean isDash(int codePoint) {
    // The range guard keeps negative values away from BitSet.get, which would throw.
    return Character.isValidCodePoint(codePoint) && MEMBERSHIP.get(codePoint);
  }

  /**
   * Looks up the reference entry for a dash code point.
   *
   * @param codePoint The code point.
   * @return The {@link DashCharacter}, or {@link Optional#empty()} if it is not a dash.
   */
  public static Optional<DashCharacter> byCodePoint(int codePoint) {
    final DashCharacter entry = BY_CODE_POINT.get(codePoint);
    return Optional.ofNullable(entry);
  }

  /** {@return all Unicode dash characters, in ascending code point order} */
  public static List<DashCharacter> all() {
    return DASHES;
  }

  /** {@return the mathematical minus signs, excluded from the default normalization set} */
  public static List<DashCharacter> mathematical() {
    return List.copyOf(MATHEMATICAL);
  }

  /** {@return all dash code points, in ascending order, including the mathematical minus signs} */
  public static int[] codePoints() {
    // Hand out a copy so callers cannot modify the shared table.
    return CODE_POINTS.clone();
  }

  /**
   * {@return the dash code points used for normalization by default, in ascending order}
   *
   * <p>This is every dash except the mathematical minus signs, so that flattening to
   * {@link #HYPHEN_MINUS} does not silently rewrite mathematics.</p>
   */
  public static int[] defaultDashCodePoints() {
    return DEFAULT_CODE_POINTS.clone();
  }
}
/**
 * Static reference table of the Unicode whitespace characters, with constant-time membership
 * tests.
 *
 * <p>The table lists the {@code 25} code points that have the Unicode {@code White_Space}
 * property, and separately the {@code 6} related code points that are often taken for
 * whitespace but have {@code White_Space=no} (zero-width and other format characters).
 * The data follows the tables in
 * <a href="https://en.wikipedia.org/wiki/Whitespace_character">Whitespace character</a>
 * and the Unicode Character Database ({@code PropList.txt}).</p>
 *
 * <p>Membership is answered from this explicit table rather than from
 * {@link Character#isWhitespace(int)} or {@link Character#isSpaceChar(int)}, because neither
 * agrees with the {@code White_Space} property. {@code Character.isWhitespace} rejects the
 * non-breaking spaces and {@code NEL} but accepts the information-separator controls
 * {@code U+001C}-{@code U+001F}; {@code Character.isSpaceChar} rejects tab, newline and the
 * other control-character line breaks. {@link #isWhitespace(int)} follows the standard
 * exactly.</p>
 */
public final class UnicodeWhitespace {

  /** The Unicode general categories that occur among the whitespace and related code points. */
  public enum Category {
    /** {@code Cc} - control. */
    Cc,
    /** {@code Zs} - space separator. */
    Zs,
    /** {@code Zl} - line separator. */
    Zl,
    /** {@code Zp} - paragraph separator. */
    Zp,
    /** {@code Cf} - format (the related, non-whitespace code points). */
    Cf
  }

  /** Line-breaking behavior, following the "Notes" column of the reference table. */
  public enum Breaking {
    /** A break opportunity, but not a forced line break (e.g. {@code SPACE}). */
    MAY_BREAK,
    /** A forced line or paragraph break (e.g. {@code LF}, {@code LINE SEPARATOR}). */
    LINE_BREAK,
    /** A space that suppresses line breaking (e.g. {@code NO-BREAK SPACE}). */
    NON_BREAKING
  }

  /**
   * A single Unicode whitespace code point together with its reference attributes.
   *
   * @param codePoint The Unicode code point.
   * @param name The Unicode character name, lower cased as in the reference table.
   * @param abbreviation The common abbreviation (for example {@code NBSP}), or {@code ""} if none.
   * @param category The Unicode general {@link Category category}.
   * @param breaking The line-{@link Breaking breaking} behavior.
   */
  public record WhitespaceCharacter(int codePoint, String name, String abbreviation,
                                    Category category, Breaking breaking) {

    /** {@return whether this code point forces a line or paragraph break} */
    public boolean isLineBreak() {
      return Breaking.LINE_BREAK == breaking;
    }

    /** {@return whether this is a non-breaking space} */
    public boolean isNonBreaking() {
      return Breaking.NON_BREAKING == breaking;
    }

    /** {@return the {@code U+XXXX} notation for this code point} */
    public String toUnicodeNotation() {
      return "U+%04X".formatted(codePoint);
    }
  }

  /**
   * A single related code point that is often confused with whitespace but is not
   * ({@code White_Space=no}). These are format characters and must not be treated as, or
   * normalized like, whitespace.
   *
   * @param codePoint The Unicode code point.
   * @param name The Unicode character name, lower cased as in the reference table.
   * @param abbreviation The common abbreviation (for example {@code BOM}), or {@code ""} if none.
   * @param note A short description of what the character actually does.
   */
  public record RelatedCharacter(int codePoint, String name, String abbreviation, String note) {

    /** {@return the {@code U+XXXX} notation for this code point} */
    public String toUnicodeNotation() {
      return "U+%04X".formatted(codePoint);
    }
  }

  // Master table, in ascending code point order. The lookup structures are derived from it.
  private static final List<WhitespaceCharacter> WHITESPACE = List.of(
      new WhitespaceCharacter(0x0009, "character tabulation", "HT", Category.Cc, Breaking.MAY_BREAK),
      new WhitespaceCharacter(0x000A, "line feed", "LF", Category.Cc, Breaking.LINE_BREAK),
      new WhitespaceCharacter(0x000B, "line tabulation", "VT", Category.Cc, Breaking.LINE_BREAK),
      new WhitespaceCharacter(0x000C, "form feed", "FF", Category.Cc, Breaking.LINE_BREAK),
      new WhitespaceCharacter(0x000D, "carriage return", "CR", Category.Cc, Breaking.LINE_BREAK),
      new WhitespaceCharacter(0x0020, "space", "", Category.Zs, Breaking.MAY_BREAK),
      new WhitespaceCharacter(0x0085, "next line", "NEL", Category.Cc, Breaking.LINE_BREAK),
      new WhitespaceCharacter(0x00A0, "no-break space", "NBSP", Category.Zs, Breaking.NON_BREAKING),
      new WhitespaceCharacter(0x1680, "ogham space mark", "", Category.Zs, Breaking.MAY_BREAK),
      new WhitespaceCharacter(0x2000, "en quad", "", Category.Zs, Breaking.MAY_BREAK),
      new WhitespaceCharacter(0x2001, "em quad", "", Category.Zs, Breaking.MAY_BREAK),
      new WhitespaceCharacter(0x2002, "en space", "", Category.Zs, Breaking.MAY_BREAK),
      new WhitespaceCharacter(0x2003, "em space", "", Category.Zs, Breaking.MAY_BREAK),
      new WhitespaceCharacter(0x2004, "three-per-em space", "", Category.Zs, Breaking.MAY_BREAK),
      new WhitespaceCharacter(0x2005, "four-per-em space", "", Category.Zs, Breaking.MAY_BREAK),
      new WhitespaceCharacter(0x2006, "six-per-em space", "", Category.Zs, Breaking.MAY_BREAK),
      new WhitespaceCharacter(0x2007, "figure space", "", Category.Zs, Breaking.NON_BREAKING),
      new WhitespaceCharacter(0x2008, "punctuation space", "", Category.Zs, Breaking.MAY_BREAK),
      new WhitespaceCharacter(0x2009, "thin space", "", Category.Zs, Breaking.MAY_BREAK),
      new WhitespaceCharacter(0x200A, "hair space", "", Category.Zs, Breaking.MAY_BREAK),
      new WhitespaceCharacter(0x2028, "line separator", "", Category.Zl, Breaking.LINE_BREAK),
      new WhitespaceCharacter(0x2029, "paragraph separator", "", Category.Zp, Breaking.LINE_BREAK),
      new WhitespaceCharacter(0x202F, "narrow no-break space", "NNBSP", Category.Zs,
          Breaking.NON_BREAKING),
      new WhitespaceCharacter(0x205F, "medium mathematical space", "MMSP", Category.Zs,
          Breaking.MAY_BREAK),
      new WhitespaceCharacter(0x3000, "ideographic space", "", Category.Zs, Breaking.MAY_BREAK));

  private static final List<RelatedCharacter> LOOKALIKES = List.of(
      new RelatedCharacter(0x180E, "mongolian vowel separator", "MVS",
          "format character; narrow space for Mongolian"),
      new RelatedCharacter(0x200B, "zero width space", "ZWSP",
          "format; word boundary indicator, no visible width"),
      new RelatedCharacter(0x200C, "zero width non-joiner", "ZWNJ",
          "format; prevents character connection"),
      new RelatedCharacter(0x200D, "zero width joiner", "ZWJ",
          "format; enables character connection"),
      new RelatedCharacter(0x2060, "word joiner", "WJ",
          "format; non-breaking, no line break point"),
      new RelatedCharacter(0xFEFF, "zero width no-break space", "BOM",
          "format; byte order mark"));

  private static final Map<Integer, WhitespaceCharacter> BY_CODE_POINT = new HashMap<>();
  private static final BitSet MEMBERSHIP = new BitSet();
  private static final BitSet LOOKALIKE_MEMBERSHIP = new BitSet();
  private static final List<WhitespaceCharacter> LINE_BREAKS = new ArrayList<>();
  private static final List<WhitespaceCharacter> NON_BREAKING = new ArrayList<>();
  private static final int[] CODE_POINTS;

  static {
    for (final WhitespaceCharacter entry : WHITESPACE) {
      final int codePoint = entry.codePoint();
      BY_CODE_POINT.put(codePoint, entry);
      MEMBERSHIP.set(codePoint);
      // An entry has exactly one breaking behavior, so it lands in at most one of the lists.
      if (entry.isLineBreak()) {
        LINE_BREAKS.add(entry);
      } else if (entry.isNonBreaking()) {
        NON_BREAKING.add(entry);
      }
    }
    // Inherits the ascending order of the master table.
    CODE_POINTS = WHITESPACE.stream()
        .mapToInt(WhitespaceCharacter::codePoint)
        .toArray();
    LOOKALIKES.forEach(related -> LOOKALIKE_MEMBERSHIP.set(related.codePoint()));
  }

  private UnicodeWhitespace() {
  }

  /**
   * Tests whether a code point carries the Unicode {@code White_Space} property.
   *
   * @param codePoint The code point to test. Out-of-range values (negative or beyond
   *                  {@link Character#MAX_CODE_POINT}) simply return {@code false}.
   * @return {@code true} if the code point is one of the {@code 25} Unicode whitespace characters.
   */
  public static boolean isWhitespace(int codePoint) {
    // The range guard keeps negative values away from BitSet.get, which would throw.
    return Character.isValidCodePoint(codePoint) && MEMBERSHIP.get(codePoint);
  }

  /**
   * Tests whether a code point is one of the related, non-whitespace look-alike format characters.
   *
   * @param codePoint The code point to test.
   * @return {@code true} if the code point is in the {@link #lookalikes() look-alike} set.
   */
  public static boolean isLookalike(int codePoint) {
    return Character.isValidCodePoint(codePoint) && LOOKALIKE_MEMBERSHIP.get(codePoint);
  }

  /**
   * Looks up the reference entry for a whitespace code point.
   *
   * @param codePoint The code point.
   * @return The {@link WhitespaceCharacter}, or {@link Optional#empty()} if it is not whitespace.
   */
  public static Optional<WhitespaceCharacter> byCodePoint(int codePoint) {
    final WhitespaceCharacter entry = BY_CODE_POINT.get(codePoint);
    return Optional.ofNullable(entry);
  }

  /** {@return the {@code 25} Unicode whitespace characters, in ascending code point order} */
  public static List<WhitespaceCharacter> all() {
    return WHITESPACE;
  }

  /** {@return the related, non-whitespace look-alike format characters} */
  public static List<RelatedCharacter> lookalikes() {
    return LOOKALIKES;
  }

  /** {@return the whitespace characters that force a line or paragraph break} */
  public static List<WhitespaceCharacter> lineBreaks() {
    return List.copyOf(LINE_BREAKS);
  }

  /** {@return the non-breaking whitespace characters} */
  public static List<WhitespaceCharacter> nonBreaking() {
    return List.copyOf(NON_BREAKING);
  }

  /** {@return the whitespace code points, in ascending order} */
  public static int[] codePoints() {
    // Hand out a copy so callers cannot modify the shared table.
    return CODE_POINTS.clone();
  }
}
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import java.util.List; + +import org.junit.jupiter.api.Test; + +import opennlp.tools.util.Span; +import opennlp.tools.util.normalizer.UnicodeWhitespace.WhitespaceCharacter; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class CharClassTest { + + private static final CharClass WS = CharClass.whitespace(); + private static final CharClass DASH = CharClass.dashes(); + + // Non-ASCII test characters are built from code points (no literal glyphs, no Unicode escapes) + // so the source stays pure ASCII and the intent is explicit. Tab and newline use \t and \n. + private static final String NBSP = cp(0x00A0); + private static final String IDEOGRAPHIC = cp(0x3000); + private static final String EM_DASH = cp(0x2014); + private static final String EN_DASH = cp(0x2013); + private static final String FIGURE_DASH = cp(0x2012); + private static final String MINUS_SIGN = cp(0x2212); + private static final String YEZIDI_HYPHEN = cp(0x10EAD); + private static final String GRINNING_FACE = cp(0x1F600); + + private static String cp(int codePoint) { + return new String(Character.toChars(codePoint)); + } + + private static CodePointSet lineBreaks() { + return CodePointSet.of(UnicodeWhitespace.lineBreaks().stream() + .mapToInt(WhitespaceCharacter::codePoint).toArray()); + } + + // --- membership -------------------------------------------------------------------------- + + @Test + void testWhitespacePresetMembership() { + assertTrue(WS.contains(0x0020)); + assertTrue(WS.contains(0x0009)); + assertTrue(WS.contains(0x00A0)); + assertTrue(WS.contains(0x3000)); + 
assertTrue(WS.contains(0x2028)); + assertFalse(WS.contains('a')); + assertFalse(WS.contains(0x200B), "zero width space is not whitespace"); + } + + @Test + void testDashPresetMembershipExcludesMathMinus() { + assertTrue(DASH.contains(0x2014)); + assertTrue(DASH.contains(0x2013)); + assertTrue(DASH.contains(0xFF0D)); + assertFalse(DASH.contains(0x2212), "math minus is excluded by default"); + assertFalse(DASH.contains('a')); + } + + // --- normalize / collapse ---------------------------------------------------------------- + + @Test + void testNormalizeReplacesEachMemberOneForOne() { + assertEquals("a b", WS.normalize("a" + NBSP + IDEOGRAPHIC + "b")); + assertEquals("well-known", DASH.normalize("well" + EM_DASH + "known")); + assertEquals("a-b-c", DASH.normalize("a" + EN_DASH + "b" + FIGURE_DASH + "c")); + } + + @Test + void testNormalizeLeavesMathMinusUntouched() { + assertEquals("5" + MINUS_SIGN + "3", DASH.normalize("5" + MINUS_SIGN + "3")); + } + + @Test + void testCollapseMergesRuns() { + assertEquals("a b", WS.collapse("a" + NBSP + IDEOGRAPHIC + "b")); + assertEquals(" a b ", WS.collapse(" a\t\tb ")); + assertEquals("a-b", DASH.collapse("a" + EM_DASH + EN_DASH + EM_DASH + "b")); + } + + @Test + void testNormalizeAndCollapseHandleSupplementaryMembers() { + assertEquals("x-y", DASH.normalize("x" + YEZIDI_HYPHEN + "y")); + assertEquals("x-y", DASH.collapse("x" + YEZIDI_HYPHEN + YEZIDI_HYPHEN + "y")); + } + + @Test + void testEmptyAndAllMemberInputs() { + assertEquals("", WS.normalize("")); + assertEquals("", WS.collapse("")); + assertEquals("", WS.trim("")); + assertEquals("", WS.removeAll("")); + assertArrayEquals(new String[0], WS.split("")); + assertArrayEquals(new String[0], WS.split(" " + IDEOGRAPHIC)); + } + + // --- squish (collapsePreserving) --------------------------------------------------------- + + @Test + void testCollapsePreservingKeepsLineBreaks() { + final CodePointSet keep = lineBreaks(); + assertEquals("a\nb", 
WS.collapsePreserving("a\n\n\t\tb", keep, '\n')); + assertEquals("a b", WS.collapsePreserving("a \t b", keep, '\n')); + assertEquals("a\nb\nc", WS.collapsePreserving("a\n \tb \nc", keep, '\n')); + } + + // --- trim / removeAll -------------------------------------------------------------------- + + @Test + void testTrim() { + assertEquals("hello", WS.trim("\t hello" + IDEOGRAPHIC + IDEOGRAPHIC)); + assertEquals("noedge", WS.trim("noedge")); + assertEquals("", WS.trim(" ")); + assertEquals("a b", WS.trim(" a b "), "interior whitespace is preserved"); + } + + @Test + void testRemoveAll() { + assertEquals("abcd", WS.removeAll("a b\tc d")); + } + + // --- split / splitSpans ------------------------------------------------------------------ + + @Test + void testSplitOnUnicodeWhitespace() { + assertArrayEquals(new String[] {"one", "two", "three"}, + WS.split("one two" + IDEOGRAPHIC + IDEOGRAPHIC + "three")); + assertArrayEquals(new String[] {"a", "b"}, WS.split(" a b ")); + } + + @Test + void testSplitSpansCarryOriginalOffsets() { + final String text = "one two"; + final List<Span> spans = WS.splitSpans(text); + assertEquals(2, spans.size()); + assertEquals(0, spans.get(0).getStart()); + assertEquals(3, spans.get(0).getEnd()); + assertEquals("one", spans.get(0).getCoveredText(text).toString()); + assertEquals(4, spans.get(1).getStart()); + assertEquals(7, spans.get(1).getEnd()); + assertEquals("two", spans.get(1).getCoveredText(text).toString()); + } + + @Test + void testSplitSpansWithSupplementaryToken() { + final String text = "a " + GRINNING_FACE + " b"; + final List<Span> spans = WS.splitSpans(text); + assertEquals(3, spans.size()); + assertEquals("a", spans.get(0).getCoveredText(text).toString()); + assertEquals(GRINNING_FACE, spans.get(1).getCoveredText(text).toString()); + assertEquals("b", spans.get(2).getCoveredText(text).toString()); + } + + // --- custom classes ---------------------------------------------------------------------- + + @Test + void 
testCustomClass() { + final CharClass vowelO = CharClass.of(CodePointSet.of('o'), '0'); + assertEquals("f00 bar", vowelO.normalize("foo bar")); + assertEquals("f0", vowelO.collapse("foo")); + } + + @Test + void testWithAdditionalExtendsWithoutMutatingOriginal() { + final CharClass extended = WS.withAdditional(CodePointSet.of('_')); + assertTrue(extended.contains('_')); + assertTrue(extended.contains(0x0020)); + assertEquals("a b c", extended.normalize("a_b c")); + assertFalse(WS.contains('_'), "the preset must be unchanged"); + } + + @Test + void testOfRejectsInvalidReplacement() { + assertThrows(IllegalArgumentException.class, + () -> CharClass.of(CodePointSet.of(0x20), -1)); + assertThrows(IllegalArgumentException.class, + () -> CharClass.of(CodePointSet.of(0x20), Character.MAX_CODE_POINT + 1)); + } + + // --- offset-mapped variants -------------------------------------------------------------- + + @Test + void testCollapseMappedOffsets() { + final NormalizedText nt = WS.collapseMapped("a b"); + assertEquals("a b", nt.normalized()); + assertEquals(3, nt.offsets().normalizedLength()); + assertEquals(4, nt.offsets().originalLength()); + + assertEquals(0, nt.toOriginalOffset(0)); + assertEquals(1, nt.toOriginalOffset(1)); + assertEquals(3, nt.toOriginalOffset(2)); + assertEquals(4, nt.toOriginalOffset(3)); + + assertEquals(0, nt.toNormalizedOffset(0)); + assertEquals(1, nt.toNormalizedOffset(1)); + assertEquals(2, nt.toNormalizedOffset(3)); + assertEquals(3, nt.toNormalizedOffset(4)); + } + + @Test + void testNormalizeMappedIsIdentityWhenNothingMatches() { + final NormalizedText nt = WS.normalizeMapped("abc"); + assertEquals("abc", nt.normalized()); + for (int i = 0; i <= 3; i++) { + assertEquals(i, nt.toOriginalOffset(i)); + } + } + + @Test + void testNormalizeMappedPreservesSupplementaryCopyOffsets() { + final String text = "a" + GRINNING_FACE + "b"; + final NormalizedText nt = WS.normalizeMapped(text); + assertEquals(text, nt.normalized()); + for (int i = 0; i <= 
text.length(); i++) { + assertEquals(i, nt.toOriginalOffset(i)); + } + } + + @Test + void testNormalizeMappedCollapsesSupplementaryMemberToOneChar() { + final String text = "x" + YEZIDI_HYPHEN + "y"; + final NormalizedText nt = DASH.normalizeMapped(text); + assertEquals("x-y", nt.normalized()); + assertEquals(0, nt.toOriginalOffset(0)); + assertEquals(1, nt.toOriginalOffset(1)); + assertEquals(3, nt.toOriginalOffset(2)); + assertEquals(4, nt.toOriginalOffset(3)); + } + + @Test + void testOffsetMapRejectsOutOfRange() { + final OffsetMap map = WS.collapseMapped("ab").offsets(); + assertThrows(IndexOutOfBoundsException.class, () -> map.toOriginalOffset(-1)); + assertThrows(IndexOutOfBoundsException.class, + () -> map.toOriginalOffset(map.normalizedLength() + 1)); + assertThrows(IndexOutOfBoundsException.class, () -> map.toNormalizedOffset(-1)); + assertThrows(IndexOutOfBoundsException.class, + () -> map.toNormalizedOffset(map.originalLength() + 1)); + } + + @Test + void testAccessorsExposeMembersAndReplacement() { + assertEquals(0x0020, WS.replacement()); + assertEquals('-', DASH.replacement()); + assertTrue(WS.members().contains(0x00A0)); + assertFalse(WS.members().contains('a')); + } + + @Test + void testOffsetMapBuilderGrowsBeyondInitialCapacity() { + // 26 output characters force the OffsetMap builder past its initial 16-entry buffer. + final String text = "abcdefghijklmnopqrstuvwxyz"; + final NormalizedText nt = WS.normalizeMapped(text); + assertEquals(text, nt.normalized()); + assertEquals(26, nt.offsets().normalizedLength()); + for (int i = 0; i <= text.length(); i++) { + assertEquals(i, nt.toOriginalOffset(i)); + } + } + + @Test + void testNormalizeMappedWithSupplementaryReplacement() { + // A supplementary replacement exercises the two-char substitution path of the offset map. 
+ final int penguin = 0x1F427; + final CharClass toPenguin = CharClass.of(CodePointSet.of(' '), penguin); + final NormalizedText nt = toPenguin.normalizeMapped("a b"); + assertEquals("a" + new String(Character.toChars(penguin)) + "b", nt.normalized()); + assertEquals(0, nt.toOriginalOffset(0)); + assertEquals(1, nt.toOriginalOffset(1)); + assertEquals(1, nt.toOriginalOffset(2)); + assertEquals(2, nt.toOriginalOffset(3)); + } +} diff --git a/opennlp-api/src/test/java/opennlp/tools/util/normalizer/CodePointSetTest.java b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/CodePointSetTest.java new file mode 100644 index 000000000..769cea71f --- /dev/null +++ b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/CodePointSetTest.java @@ -0,0 +1,241 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package opennlp.tools.util.normalizer; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class CodePointSetTest { + + @Test + void testOfContainsExactlyTheGivenCodePoints() { + final CodePointSet set = CodePointSet.of(0x0041, 0x00A0, 0x1F600); + assertTrue(set.contains(0x0041)); + assertTrue(set.contains(0x00A0)); + assertTrue(set.contains(0x1F600)); + assertFalse(set.contains(0x0042)); + assertEquals(3, set.size()); + assertFalse(set.isEmpty()); + } + + @Test + void testToArrayIsAscending() { + final CodePointSet set = CodePointSet.of(0x3000, 0x0009, 0x00A0); + assertArrayEquals(new int[] {0x0009, 0x00A0, 0x3000}, set.toArray()); + } + + @Test + void testOfRangeIsInclusive() { + final CodePointSet set = CodePointSet.ofRange(0x2000, 0x200A); + assertTrue(set.contains(0x2000)); + assertTrue(set.contains(0x2005)); + assertTrue(set.contains(0x200A)); + assertFalse(set.contains(0x1FFF)); + assertFalse(set.contains(0x200B)); + assertEquals(11, set.size()); + } + + @Test + void testOfRangeRejectsDescending() { + assertThrows(IllegalArgumentException.class, () -> CodePointSet.ofRange(0x200A, 0x2000)); + } + + @ParameterizedTest + @ValueSource(ints = {-1, Integer.MIN_VALUE, Character.MAX_CODE_POINT + 1, Integer.MAX_VALUE}) + void testOfRejectsInvalidCodePoints(int codePoint) { + assertThrows(IllegalArgumentException.class, () -> CodePointSet.of(codePoint)); + } + 
+ @ParameterizedTest + @ValueSource(ints = {-1, Integer.MIN_VALUE, Character.MAX_CODE_POINT + 1, Integer.MAX_VALUE}) + void testContainsIsRangeSafe(int codePoint) { + assertFalse(CodePointSet.of(0x0020).contains(codePoint)); + } + + @Test + void testUnionIsNonDestructive() { + final CodePointSet a = CodePointSet.of(0x0041); + final CodePointSet b = CodePointSet.of(0x0042); + final CodePointSet union = a.union(b); + + assertTrue(union.contains(0x0041)); + assertTrue(union.contains(0x0042)); + assertEquals(2, union.size()); + assertFalse(a.contains(0x0042), "left operand must be unchanged"); + assertFalse(b.contains(0x0041), "right operand must be unchanged"); + } + + @Test + void testEqualsAndHashCode() { + assertEquals(CodePointSet.of(0x01, 0x02), CodePointSet.of(0x02, 0x01)); + assertEquals(CodePointSet.of(0x01, 0x02).hashCode(), CodePointSet.of(0x02, 0x01).hashCode()); + assertFalse(CodePointSet.of(0x01).equals(CodePointSet.of(0x02))); + } + + @Test + void testEqualsAgainstOtherTypesAndNull() { + final CodePointSet set = CodePointSet.of(0x20); + assertFalse(set.equals(null)); + assertFalse(set.equals("not a code point set")); + } + + @Test + void testParseAcceptsSingleHexDigit() { + assertTrue(CodePointSet.parse(List.of("[s]", "9"), "s").contains(0x9)); + } + + @Test + void testParseRejectsEmptyCodePointAfterPrefix() { + assertThrows(IllegalArgumentException.class, + () -> CodePointSet.parse(List.of("[s]", "U+"), "s")); + } + + @Test + void testParseRejectsTooShortSectionHeader() { + assertThrows(IllegalArgumentException.class, + () -> CodePointSet.parse(List.of("[]", "U+0041"), "s")); + } + + @Test + void testParseSingleCodePointsRangesCommentsAndBlankLines() { + final List<String> lines = List.of( + "# a whitespace overlay", + "[whitespace]", + "U+00A0 # no-break space", + "0x2028", + "2029", + "", + "U+2000-U+200A # typographic spaces"); + + final CodePointSet set = CodePointSet.parse(lines, "whitespace"); + + assertTrue(set.contains(0x00A0)); + 
assertTrue(set.contains(0x2028)); + assertTrue(set.contains(0x2029)); + assertTrue(set.contains(0x2000)); + assertTrue(set.contains(0x2007)); + assertTrue(set.contains(0x200A)); + assertFalse(set.contains(0x200B)); + assertEquals(3 + 11, set.size()); + } + + @Test + void testParseReturnsOnlyRequestedSection() { + final List<String> lines = List.of( + "[whitespace]", + "U+00A0", + "[dash]", + "U+2212", + "U+2014"); + + final CodePointSet whitespace = CodePointSet.parse(lines, "whitespace"); + assertTrue(whitespace.contains(0x00A0)); + assertFalse(whitespace.contains(0x2212)); + assertFalse(whitespace.contains(0x2014)); + + final CodePointSet dash = CodePointSet.parse(lines, "dash"); + assertTrue(dash.contains(0x2212)); + assertTrue(dash.contains(0x2014)); + assertFalse(dash.contains(0x00A0)); + } + + @Test + void testParseSectionNameIsCaseInsensitive() { + final List<String> lines = List.of("[WhiteSpace]", "U+00A0"); + assertTrue(CodePointSet.parse(lines, "whitespace").contains(0x00A0)); + assertTrue(CodePointSet.parse(lines, "WHITESPACE").contains(0x00A0)); + } + + @Test + void testParseMissingSectionIsEmpty() { + final List<String> lines = List.of("[whitespace]", "U+00A0"); + assertTrue(CodePointSet.parse(lines, "dash").isEmpty()); + } + + @Test + void testParseRejectsMalformedSectionHeader() { + final List<String> lines = List.of("[whitespace", "U+00A0"); + final IllegalArgumentException e = assertThrows(IllegalArgumentException.class, + () -> CodePointSet.parse(lines, "whitespace")); + assertTrue(e.getMessage().contains("line 1"), e.getMessage()); + } + + @Test + void testParseRejectsInvalidHex() { + final List<String> lines = List.of("[whitespace]", "U+ZZZZ"); + final IllegalArgumentException e = assertThrows(IllegalArgumentException.class, + () -> CodePointSet.parse(lines, "whitespace")); + assertTrue(e.getMessage().contains("line 2"), e.getMessage()); + } + + @Test + void testParseRejectsDescendingRange() { + final List<String> lines = List.of("[whitespace]", 
"U+200A-U+2000"); + assertThrows(IllegalArgumentException.class, () -> CodePointSet.parse(lines, "whitespace")); + } + + @Test + void testParseRejectsOutOfRangeCodePoint() { + final List<String> lines = List.of("[whitespace]", "U+110000"); + assertThrows(IllegalArgumentException.class, () -> CodePointSet.parse(lines, "whitespace")); + } + + @Test + void testParseRejectsEntryBeforeAnySection() { + final List<String> lines = List.of("U+00A0"); + final IllegalArgumentException e = assertThrows(IllegalArgumentException.class, + () -> CodePointSet.parse(lines, "whitespace")); + assertTrue(e.getMessage().contains("before any [section]"), e.getMessage()); + } + + @Test + void testParseAcceptsAllThreeHexPrefixes() { + final List<String> lines = List.of("[s]", "U+0041", "0x0042", "0043"); + final CodePointSet set = CodePointSet.parse(lines, "s"); + assertTrue(set.contains(0x41)); + assertTrue(set.contains(0x42)); + assertTrue(set.contains(0x43)); + } + + @Test + void testFromFileReadsTheNamedSection(@TempDir Path dir) throws IOException { + final Path file = dir.resolve("delimiters.txt"); + Files.writeString(file, String.join("\n", + "[whitespace]", + "U+00A0", + "[dash]", + "U+2E5D"), StandardCharsets.UTF_8); + + assertTrue(CodePointSet.fromFile(file, "whitespace").contains(0x00A0)); + assertTrue(CodePointSet.fromFile(file, "dash").contains(0x2E5D)); + assertFalse(CodePointSet.fromFile(file, "dash").contains(0x00A0)); + } +} diff --git a/opennlp-api/src/test/java/opennlp/tools/util/normalizer/UnicodeDashTest.java b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/UnicodeDashTest.java new file mode 100644 index 000000000..9d547a980 --- /dev/null +++ b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/UnicodeDashTest.java @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.tools.util.normalizer;

import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.MethodSource;
import org.junit.jupiter.params.provider.ValueSource;

import opennlp.tools.util.normalizer.UnicodeDash.Category;
import opennlp.tools.util.normalizer.UnicodeDash.DashCharacter;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;

/**
 * Tests for the {@link UnicodeDash} reference data: the size and ordering of the dash list, the
 * per-character attributes, the mathematical subset, and the immutability of what the accessors
 * return.
 */
public class UnicodeDashTest {

  /** Argument source for the parameterized tests: every dash in the reference list. */
  private static List<DashCharacter> dashes() {
    return UnicodeDash.all();
  }

  /**
   * Translates the running JDK's general category for {@code codePoint} into {@link Category}.
   *
   * @return The matching category, or {@code null} when the JDK reports a category this enum
   *         mapping does not cover (an unassigned code point ends up here as well).
   */
  private static Category jdkCategory(int codePoint) {
    final int generalCategory = Character.getType(codePoint);
    if (generalCategory == Character.DASH_PUNCTUATION) {
      return Category.Pd;
    }
    if (generalCategory == Character.MATH_SYMBOL) {
      return Category.Sm;
    }
    if (generalCategory == Character.OTHER_PUNCTUATION) {
      return Category.Po;
    }
    return null;
  }

  @Test
  void testDashSetHasExactly31() {
    final int dashCount = UnicodeDash.all().size();
    assertEquals(31, dashCount);
  }

  @ParameterizedTest
  @MethodSource("dashes")
  void testEachDashIsSelfConsistent(DashCharacter dash) {
    final int codePoint = dash.codePoint();
    assertTrue(UnicodeDash.isDash(codePoint), dash::toUnicodeNotation);
    assertEquals(dash, UnicodeDash.byCodePoint(codePoint).orElseThrow());
    assertNotNull(dash.category());
    assertFalse(dash.name().isBlank());
  }

  @ParameterizedTest
  @MethodSource("dashes")
  void testCategoryMatchesJdkUnicodeDataWhenAssigned(DashCharacter dash) {
    final Category fromJdk = jdkCategory(dash.codePoint());
    // Nothing to compare when the running JVM's Unicode version does not assign the code point
    // yet (e.g. newer dashes).
    if (Character.getType(dash.codePoint()) == Character.UNASSIGNED) {
      return;
    }
    assertEquals(fromJdk, dash.category(), dash::toUnicodeNotation);
  }

  @Test
  void testCodePointsAreUniqueAndStrictlyAscending() {
    final int[] codePoints = UnicodeDash.codePoints();
    for (int previous = 0; previous + 1 < codePoints.length; previous++) {
      assertTrue(codePoints[previous + 1] > codePoints[previous],
          "dash code points must be unique and ascending");
    }
  }

  @Test
  void testMathematicalAreExactlyTheThreeMinusSigns() {
    final Set<Integer> mathematicalCodePoints = UnicodeDash.mathematical().stream()
        .map(DashCharacter::codePoint)
        .collect(Collectors.toSet());
    assertEquals(Set.of(0x207B, 0x208B, 0x2212), mathematicalCodePoints);

    for (final DashCharacter minus : UnicodeDash.mathematical()) {
      assertTrue(minus.isMathematical());
      assertEquals(Category.Sm, minus.category());
    }
  }

  @Test
  void testDefaultDashSetExcludesMathematicalMinusSigns() {
    final int[] defaultSet = UnicodeDash.defaultDashCodePoints();
    assertEquals(UnicodeDash.all().size() - 3, defaultSet.length);
    for (final int codePoint : defaultSet) {
      final DashCharacter dash = UnicodeDash.byCodePoint(codePoint).orElseThrow();
      assertFalse(dash.isMathematical(),
          () -> String.format("U+%04X must not be a math minus", codePoint));
    }
    assertTrue(Arrays.stream(defaultSet).noneMatch(codePoint -> codePoint == 0x2212));
  }

  @Test
  void testHyphenMinusIsTheCanonicalTarget() {
    assertEquals(0x002D, UnicodeDash.HYPHEN_MINUS);
    assertTrue(UnicodeDash.isDash(0x002D));
    final DashCharacter hyphenMinus = UnicodeDash.byCodePoint(0x002D).orElseThrow();
    assertEquals(Category.Pd, hyphenMinus.category());
  }

  @Test
  void testSupplementaryDashesArePresent() {
    final int[] supplementary = {0x10D6E, 0x10EAD};
    for (final int codePoint : supplementary) {
      assertTrue(UnicodeDash.isDash(codePoint));
      assertTrue(UnicodeDash.byCodePoint(codePoint).orElseThrow().isSupplementary());
    }
  }

  @Test
  void testBmpDashIsNotSupplementary() {
    final DashCharacter emDash = UnicodeDash.byCodePoint(0x2014).orElseThrow();
    assertFalse(emDash.isSupplementary());
  }

  @Test
  void testDashToUnicodeNotation() {
    final DashCharacter bmpDash = UnicodeDash.byCodePoint(0x2014).orElseThrow();
    final DashCharacter supplementaryDash = UnicodeDash.byCodePoint(0x10EAD).orElseThrow();
    assertEquals("U+2014", bmpDash.toUnicodeNotation());
    assertEquals("U+10EAD", supplementaryDash.toUnicodeNotation());
  }

  @ParameterizedTest
  @ValueSource(ints = {0x00AD, 0x002E, 0x0041, 0x0020, 0x007E, 0x1F600})
  void testNonDashesAreNotDashes(int codePoint) {
    // U+00AD SOFT HYPHEN is listed deliberately: it is a format character, not a dash, and must
    // not be treated as one.
    final boolean dash = UnicodeDash.isDash(codePoint);
    assertFalse(dash);
  }

  @ParameterizedTest
  @ValueSource(ints = {-1, Integer.MIN_VALUE, Character.MAX_CODE_POINT + 1, Integer.MAX_VALUE})
  void testIsDashIsRangeSafe(int codePoint) {
    final boolean dash = UnicodeDash.isDash(codePoint);
    assertFalse(dash);
  }

  @Test
  void testByCodePointUnknownIsEmpty() {
    assertTrue(UnicodeDash.byCodePoint('A').isEmpty());
    assertTrue(UnicodeDash.byCodePoint(0x00AD).isEmpty());
  }

  @Test
  void testReferenceListIsImmutable() {
    assertThrows(UnsupportedOperationException.class, () -> UnicodeDash.all().add(null));
    assertThrows(UnsupportedOperationException.class, () -> UnicodeDash.mathematical().add(null));
  }

  @Test
  void testArrayAccessorsReturnDefensiveCopies() {
    // Scribbling on a returned array must not show up in the next call's result.
    final int[] everyDash = UnicodeDash.codePoints();
    everyDash[0] = -1;
    assertEquals(0x002D, UnicodeDash.codePoints()[0]);

    final int[] defaultDashes = UnicodeDash.defaultDashCodePoints();
    defaultDashes[0] = -1;
    assertEquals(0x002D, UnicodeDash.defaultDashCodePoints()[0]);
  }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.tools.util.normalizer;

import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.MethodSource;
import org.junit.jupiter.params.provider.ValueSource;

import opennlp.tools.util.normalizer.UnicodeWhitespace.Category;
import opennlp.tools.util.normalizer.UnicodeWhitespace.RelatedCharacter;
import opennlp.tools.util.normalizer.UnicodeWhitespace.WhitespaceCharacter;

import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;

/**
 * Tests for the {@link UnicodeWhitespace} reference data: the whitespace and look-alike lists,
 * their per-character attributes, the line-break and non-breaking subsets, and the documented
 * differences from {@link Character#isWhitespace(int)} and {@link Character#isSpaceChar(int)}.
 */
public class UnicodeWhitespaceTest {

  /** Argument source: every whitespace character in the reference list. */
  private static List<WhitespaceCharacter> whitespace() {
    return UnicodeWhitespace.all();
  }

  /** Argument source: every look-alike character in the reference list. */
  private static List<RelatedCharacter> lookalikes() {
    return UnicodeWhitespace.lookalikes();
  }

  /**
   * Maps the JDK's Unicode general category to our enum, used as an independent oracle.
   *
   * @return The matching category, or {@code null} for a general category this mapping does not
   *         cover.
   */
  private static Category jdkCategory(int codePoint) {
    return switch (Character.getType(codePoint)) {
      case Character.CONTROL -> Category.Cc;
      case Character.SPACE_SEPARATOR -> Category.Zs;
      case Character.LINE_SEPARATOR -> Category.Zl;
      case Character.PARAGRAPH_SEPARATOR -> Category.Zp;
      case Character.FORMAT -> Category.Cf;
      default -> null;
    };
  }

  @Test
  void testWhitespaceSetHasExactly25() {
    assertEquals(25, UnicodeWhitespace.all().size());
  }

  @Test
  void testLookalikeSetHasExactly6() {
    assertEquals(6, UnicodeWhitespace.lookalikes().size());
  }

  @Test
  void testRelatedCharacterExposesAttributes() {
    final var bom = UnicodeWhitespace.lookalikes().stream()
        .filter(r -> r.codePoint() == 0xFEFF).findFirst().orElseThrow();
    assertEquals("zero width no-break space", bom.name());
    assertEquals("BOM", bom.abbreviation());
    assertFalse(bom.note().isBlank());
    assertEquals("U+FEFF", bom.toUnicodeNotation());
  }

  @ParameterizedTest
  @MethodSource("whitespace")
  void testEachWhitespaceCharIsSelfConsistent(WhitespaceCharacter ws) {
    assertTrue(UnicodeWhitespace.isWhitespace(ws.codePoint()),
        () -> ws.toUnicodeNotation() + " should be whitespace");
    assertEquals(ws, UnicodeWhitespace.byCodePoint(ws.codePoint()).orElseThrow());
    assertFalse(UnicodeWhitespace.isLookalike(ws.codePoint()),
        () -> ws.toUnicodeNotation() + " must not also be a look-alike");
    assertNotNull(ws.category());
    assertNotNull(ws.breaking());
    assertNotNull(ws.abbreviation());
    assertFalse(ws.name().isBlank());
  }

  @ParameterizedTest
  @MethodSource("whitespace")
  void testAllWhitespaceIsInTheBmp(WhitespaceCharacter ws) {
    // Every Unicode White_Space code point is in the Basic Multilingual Plane (one char).
    assertTrue(ws.codePoint() <= 0xFFFF, ws::toUnicodeNotation);
    assertEquals(1, Character.charCount(ws.codePoint()));
  }

  @ParameterizedTest
  @MethodSource("whitespace")
  void testCategoryMatchesJdkUnicodeData(WhitespaceCharacter ws) {
    // Independent cross-check: our hand-entered category must agree with the JDK's UCD.
    assertEquals(jdkCategory(ws.codePoint()), ws.category(), ws::toUnicodeNotation);
  }

  @Test
  void testCodePointsAreUniqueAndStrictlyAscending() {
    final int[] cps = UnicodeWhitespace.codePoints();
    for (int i = 1; i < cps.length; i++) {
      assertTrue(cps[i] > cps[i - 1],
          "code points must be unique and ascending at index " + i);
    }
  }

  @Test
  void testCodePointsMatchAllOrder() {
    final int[] fromRecords = whitespace().stream().mapToInt(WhitespaceCharacter::codePoint).toArray();
    // assertArrayEquals checks length and every element and reports the first differing index.
    assertArrayEquals(fromRecords, UnicodeWhitespace.codePoints());
  }

  @Test
  void testCodePointsReturnsDefensiveCopy() {
    final int[] first = UnicodeWhitespace.codePoints();
    first[0] = -999;
    assertEquals(0x0009, UnicodeWhitespace.codePoints()[0]);
  }

  @ParameterizedTest
  @MethodSource("lookalikes")
  void testLookalikesAreNotWhitespace(RelatedCharacter related) {
    assertFalse(UnicodeWhitespace.isWhitespace(related.codePoint()),
        () -> related.toUnicodeNotation() + " is White_Space=no");
    assertTrue(UnicodeWhitespace.byCodePoint(related.codePoint()).isEmpty());
    assertTrue(UnicodeWhitespace.isLookalike(related.codePoint()));
    // Every look-alike is a format character in the UCD.
    assertEquals(Category.Cf, jdkCategory(related.codePoint()), related::toUnicodeNotation);
  }

  @Test
  void testLineBreaksAreExactlyTheSeven() {
    final Set<Integer> expected = Set.of(0x000A, 0x000B, 0x000C, 0x000D, 0x0085, 0x2028, 0x2029);
    assertEquals(expected, UnicodeWhitespace.lineBreaks().stream()
        .map(WhitespaceCharacter::codePoint).collect(Collectors.toSet()));
  }

  @Test
  void testNonBreakingAreExactlyTheThree() {
    final Set<Integer> expected = Set.of(0x00A0, 0x2007, 0x202F);
    assertEquals(expected, UnicodeWhitespace.nonBreaking().stream()
        .map(WhitespaceCharacter::codePoint).collect(Collectors.toSet()));
  }

  @ParameterizedTest
  @ValueSource(ints = {0x0008, 0x000E, 0x001F, 0x0021, 0x1FFF, 0x200B, 0x202A, 0x2FFF, 0x3001})
  void testNeighboringCodePointsAreNotWhitespace(int codePoint) {
    assertFalse(UnicodeWhitespace.isWhitespace(codePoint),
        () -> String.format("U+%04X must not be whitespace", codePoint));
  }

  @Test
  void testIncludesNbspAndNelThatJavaIsWhitespaceOmits() {
    // Documents the deliberate divergence from Character.isWhitespace.
    assertTrue(UnicodeWhitespace.isWhitespace(0x00A0));
    assertFalse(Character.isWhitespace(0x00A0));
    assertTrue(UnicodeWhitespace.isWhitespace(0x0085));
    assertFalse(Character.isWhitespace(0x0085));
  }

  @ParameterizedTest
  @ValueSource(ints = {0x001C, 0x001D, 0x001E, 0x001F})
  void testExcludesInfoSeparatorsThatJavaIsWhitespaceIncludes(int codePoint) {
    assertFalse(UnicodeWhitespace.isWhitespace(codePoint));
    assertTrue(Character.isWhitespace(codePoint));
  }

  @Test
  void testIncludesTabThatIsSpaceCharOmits() {
    // Character.isSpaceChar excludes the control whitespace; ours includes it.
    assertTrue(UnicodeWhitespace.isWhitespace(0x0009));
    assertFalse(Character.isSpaceChar(0x0009));
  }

  @Test
  void testByCodePointUnknownIsEmpty() {
    assertTrue(UnicodeWhitespace.byCodePoint('A').isEmpty());
    assertTrue(UnicodeWhitespace.byCodePoint(0x200B).isEmpty(), "a look-alike is not whitespace");
  }

  @ParameterizedTest
  @ValueSource(ints = {Integer.MIN_VALUE, -1, Character.MAX_CODE_POINT + 1, Integer.MAX_VALUE})
  void testIsWhitespaceHandlesOutOfRangeSafely(int codePoint) {
    assertFalse(UnicodeWhitespace.isWhitespace(codePoint));
    assertFalse(UnicodeWhitespace.isLookalike(codePoint));
  }

  @Test
  void testReferenceListsAreImmutable() {
    assertThrows(UnsupportedOperationException.class,
        () -> UnicodeWhitespace.all().add(null));
    assertThrows(UnsupportedOperationException.class,
        () -> UnicodeWhitespace.lookalikes().add(null));
    assertThrows(UnsupportedOperationException.class,
        () -> UnicodeWhitespace.lineBreaks().add(null));
    assertThrows(UnsupportedOperationException.class,
        () -> UnicodeWhitespace.nonBreaking().add(null));
  }

  @Test
  void testToUnicodeNotationIsZeroPadded() {
    assertEquals("U+0009", UnicodeWhitespace.byCodePoint(0x0009).orElseThrow().toUnicodeNotation());
    assertEquals("U+00A0", UnicodeWhitespace.byCodePoint(0x00A0).orElseThrow().toUnicodeNotation());
    assertEquals("U+3000", UnicodeWhitespace.byCodePoint(0x3000).orElseThrow().toUnicodeNotation());
  }

  @Test
  void testLineBreakAndNonBreakingFlagsAgreeWithBreaking() {
    final WhitespaceCharacter lf = UnicodeWhitespace.byCodePoint(0x000A).orElseThrow();
    assertTrue(lf.isLineBreak());
    assertFalse(lf.isNonBreaking());

    final WhitespaceCharacter nbsp = UnicodeWhitespace.byCodePoint(0x00A0).orElseThrow();
    assertTrue(nbsp.isNonBreaking());
    assertFalse(nbsp.isLineBreak());

    final WhitespaceCharacter space = UnicodeWhitespace.byCodePoint(0x0020).orElseThrow();
    assertFalse(space.isLineBreak());
    assertFalse(space.isNonBreaking());
  }
}
+ * + * @param text The input text. + * @param documentSplitSize The maximum number of whitespace tokens per chunk. + * @param splitOverlapSize The number of tokens shared between consecutive chunks. + * @return The chunk strings, in order. + */ + protected static List<String> whitespaceChunks(final String text, final int documentSplitSize, + final int splitOverlapSize) { + final String[] whitespaceTokenized = WHITESPACE.split(text); + final List<String> groups = new ArrayList<>(); + for (final ChunkRange chunkRange : chunkRanges( + whitespaceTokenized.length, documentSplitSize, splitOverlapSize)) { + groups.add(String.join(" ", + Arrays.copyOfRange(whitespaceTokenized, chunkRange.start(), chunkRange.end()))); + } + return groups; + } + /** * Splits a token sequence into overlapping chunk ranges. * diff --git a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java index 7aa36e494..c7293fc8b 100644 --- a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java +++ b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java @@ -331,17 +331,10 @@ public class DocumentCategorizerDL extends AbstractDL implements DocumentCategor final List<Tokens> t = new LinkedList<>(); - // Segment long input text into overlapping chunks configured by InferenceOptions before - // feeding each chunk into BERT. + // Segment long input text into overlapping chunks (split on Unicode whitespace) configured by + // InferenceOptions before feeding each chunk into BERT. 
// https://medium.com/analytics-vidhya/text-classification-with-bert-using-transformers-for-long-text-inputs-f54833994dfd - final String[] whitespaceTokenized = text.split("\\s+"); - - for (ChunkRange chunkRange : chunkRanges( - whitespaceTokenized.length, documentSplitSize, splitOverlapSize)) { - - // The group is that subsection of string. - final String group = String.join(" ", - Arrays.copyOfRange(whitespaceTokenized, chunkRange.start(), chunkRange.end())); + for (final String group : whitespaceChunks(text, documentSplitSize, splitOverlapSize)) { // Now we can tokenize the group and continue. final String[] tokens = tokenizer.tokenize(group); diff --git a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java index e5b5c89b5..eff6b87d5 100644 --- a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java +++ b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java @@ -28,8 +28,6 @@ import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import ai.onnxruntime.OnnxTensor; import ai.onnxruntime.OrtException; @@ -356,7 +354,7 @@ public class NameFinderDL extends AbstractDL implements TokenNameFinder { continue; } - final SpanMatch match = findByRegex(text, spanText, characterStart, searchEnd); + final SpanMatch match = findInSource(text, spanText, characterStart, searchEnd); if (match.start() != -1) { spans.add(new Span(match.start(), match.end(), entityType, entity.probability())); characterStart = match.end(); @@ -567,35 +565,82 @@ public class NameFinderDL extends AbstractDL implements TokenNameFinder { /** * Locates reconstructed span text in a bounded region of the original input text. * + * <p>Matching is a single forward cursor scan, not a regular expression. 
Each space in the + * reconstructed span matches a run of zero or more Unicode whitespace characters in the source + * (so an entity whose WordPiece pieces were rejoined with spaces, such as {@code "AT & T"} for + * {@code "AT&T"}, is still located), and every other code point matches case-insensitively. + * Using a cursor avoids {@link java.util.regex.Pattern}/{@link java.util.regex.Matcher} + * allocation and the ReDoS surface of regular expressions, and recognizes Unicode whitespace + * that Java's {@code \s} does not.</p> + * * @param text The original text. - * @param span The reconstructed span text. + * @param span The reconstructed span text, with sub-tokens separated by single ASCII spaces. * @param searchStart The first character offset to search from. * @param searchEnd The exclusive upper bound of the region to search. * @return The matched character offsets, or {@code (-1, -1)} when the reconstructed text * cannot be found in the requested region. */ - private static SpanMatch findByRegex(String text, String span, int searchStart, int searchEnd) { - - // Reconstructed span text normalizes whitespace, so match flexibly: a space in the span may - // map to any run of whitespace OR none in the source (e.g. punctuation/'&' inside "U.S.A", - // "AT&T" that wordpiece tokenization split apart). Use \s* rather than \s+ so such entities - // are still located instead of being silently dropped. 
- final String regex = Pattern.quote(span).replace(" ", "\\E\\s*\\Q"); + private static SpanMatch findInSource(String text, String span, int searchStart, int searchEnd) { - final Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE); - final Matcher matcher = pattern.matcher(text); final int regionStart = Math.min(Math.max(searchStart, 0), text.length()); final int regionEnd = Math.min(Math.max(searchEnd, regionStart), text.length()); - matcher.region(regionStart, regionEnd); - if (matcher.find()) { - return new SpanMatch(matcher.start(), matcher.end()); + int start = regionStart; + while (start < regionEnd) { + final int end = matchAt(text, span, start, regionEnd); + if (end != -1) { + return new SpanMatch(start, end); + } + start += Character.charCount(text.codePointAt(start)); } return new SpanMatch(-1, -1); } + /** + * Attempts to match {@code span} against {@code text} beginning at {@code start} and bounded by + * {@code regionEnd}. A space in {@code span} consumes a run of zero or more Unicode whitespace + * code points in the source; every other code point must match case-insensitively. + * + * @return The exclusive end offset of the match in {@code text}, or {@code -1} if no match + * begins at {@code start}. 
+ */ + private static int matchAt(String text, String span, int start, int regionEnd) { + + int t = start; + int s = 0; + + while (s < span.length()) { + final int spanCp = span.codePointAt(s); + if (spanCp == ' ') { + while (t < regionEnd && WHITESPACE.contains(text.codePointAt(t))) { + t += Character.charCount(text.codePointAt(t)); + } + s += 1; + } else { + if (t >= regionEnd) { + return -1; + } + final int textCp = text.codePointAt(t); + if (!equalsIgnoreCase(spanCp, textCp)) { + return -1; + } + t += Character.charCount(textCp); + s += Character.charCount(spanCp); + } + } + + return t; + + } + + private static boolean equalsIgnoreCase(int a, int b) { + return a == b + || Character.toLowerCase(a) == Character.toLowerCase(b) + || Character.toUpperCase(a) == Character.toUpperCase(b); + } + private record LabelPrediction(String label, double probability) { } @@ -613,17 +658,10 @@ public class NameFinderDL extends AbstractDL implements TokenNameFinder { final List<Tokens> t = new LinkedList<>(); - // Segment long input text into overlapping chunks configured by InferenceOptions before - // feeding each chunk into BERT. + // Segment long input text into overlapping chunks (split on Unicode whitespace) configured by + // InferenceOptions before feeding each chunk into BERT. // https://medium.com/analytics-vidhya/text-classification-with-bert-using-transformers-for-long-text-inputs-f54833994dfd - final String[] whitespaceTokenized = text.split("\\s+"); - - for (ChunkRange chunkRange : chunkRanges( - whitespaceTokenized.length, documentSplitSize, splitOverlapSize)) { - - // The group is that subsection of string. - final String group = String.join(" ", - Arrays.copyOfRange(whitespaceTokenized, chunkRange.start(), chunkRange.end())); + for (final String group : whitespaceChunks(text, documentSplitSize, splitOverlapSize)) { // Now we can tokenize the group and continue. 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.dl;

import java.util.List;

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Model-free tests for {@link AbstractDL#whitespaceChunks(String, int, int)}, the shared
 * tokenize-and-chunk seam used by both {@code NameFinderDL} and {@code DocumentCategorizerDL}.
 */
public class AbstractDLChunkingTest {

  @Test
  void testSplitsOnUnicodeWhitespaceNotJustAscii() {
    // A no-break space (U+00A0) and an ideographic space (U+3000) are not matched by Java's \s
    // but must still separate tokens; the chunk is rejoined with single ASCII spaces.
    final String nbsp = new String(Character.toChars(0x00A0));
    final String ideographic = new String(Character.toChars(0x3000));
    assertEquals(List.of("alpha beta gamma"),
        AbstractDL.whitespaceChunks("alpha" + nbsp + "beta" + ideographic + "gamma", 100, 0));
  }

  @Test
  void testDropsEmptyTokensFromLeadingTrailingAndRepeatedWhitespace() {
    // The input has a leading run, a trailing run, and repeated separators between tokens, so
    // every case in the test name is exercised. String.split("\\s+") would return an empty
    // first token for text that starts with whitespace; the Unicode-aware split must not
    // produce an empty token anywhere.
    assertEquals(List.of("a b c"), AbstractDL.whitespaceChunks("  a   b\t\tc  ", 100, 0));
  }

  @Test
  void testAppliesChunkSizeWithoutOverlap() {
    assertEquals(List.of("a b", "c d"), AbstractDL.whitespaceChunks("a b c d", 2, 0));
  }

  @Test
  void testAppliesChunkOverlap() {
    // With an overlap of one, each chunk starts on the last token of the previous chunk.
    assertEquals(List.of("a b", "b c", "c d"), AbstractDL.whitespaceChunks("a b c d", 2, 1));
  }

  @Test
  void testEmptyTextYieldsNoChunks() {
    assertEquals(List.of(), AbstractDL.whitespaceChunks("", 100, 0));
  }
}
final String text = "Buy AT&T stock"; final String[] tokens = {"[CLS]", "Buy", "AT", "&", "T", "stock", "[SEP]"}; final float[][] scores = { @@ -184,6 +184,37 @@ public class NameFinderDLTest { assertEquals("AT&T", spans.get(0).getCoveredText(text)); } + @Test + void testDecodeSpansMatchesEntitySeparatedByNoBreakSpace() { + // The source separates "New" and "York" with a no-break space (U+00A0). Java's \s does not + // match it, so the previous regex matcher would have dropped this LOC span; the Unicode-aware + // cursor matcher locates it and the covered text includes the no-break space. + final String nbsp = new String(Character.toChars(0x00A0)); + final String text = "Visit New" + nbsp + "York today"; + final String[] tokens = {"[CLS]", "New", "York", "[SEP]"}; + final float[][] scores = {scoresFor(0), scoresFor(3), scoresFor(4), scoresFor(0)}; + + final List<Span> spans = NameFinderDL.decodeSpans(text, tokens, scores, ID_TO_LABELS); + + assertEquals(1, spans.size()); + assertEquals("LOC", spans.get(0).getType()); + assertEquals("New" + nbsp + "York", spans.get(0).getCoveredText(text)); + } + + @Test + void testDecodeSpansMatchesEntitySeparatedByIdeographicSpace() { + // Same idea with the CJK ideographic space (U+3000), another character outside Java's \s. + final String ideographic = new String(Character.toChars(0x3000)); + final String text = "from New" + ideographic + "York city"; + final String[] tokens = {"[CLS]", "New", "York", "[SEP]"}; + final float[][] scores = {scoresFor(0), scoresFor(3), scoresFor(4), scoresFor(0)}; + + final List<Span> spans = NameFinderDL.decodeSpans(text, tokens, scores, ID_TO_LABELS); + + assertEquals(1, spans.size()); + assertEquals("New" + ideographic + "York", spans.get(0).getCoveredText(text)); + } + @Test void testDecodeSpansDoesNotMatchBeyondSearchEnd() { final String text = "London was quiet. 
Later Paris was loud."; diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizer.java new file mode 100644 index 000000000..3a940b1b8 --- /dev/null +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizer.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import java.text.Normalizer; +import java.util.Set; + +/** + * A {@link CharSequenceNormalizer} that folds diacritics for search and matching, the + * multilingual-safe counterpart to a Latin-only ASCII folding filter. + * + * <p>Folding decomposes the text (NFD) and drops nonspacing combining marks, but only for base + * characters whose script is in {@code foldScripts} (Latin, Greek, and Cyrillic by default). Marks + * on other scripts are left untouched, because there they are essential orthography rather than + * decoration: stripping an Indic vowel sign or a virama, an Arabic harakat, a Hebrew point, or a + * Thai vowel changes the word. 
This script gating is the key correctness rule; never strip all + * nonspacing marks globally.</p> + * + * <p>Many "accented" Latin letters are atomic and do not decompose ({@code o} with stroke, the + * {@code ae}/{@code oe} ligatures, eszett, thorn, and so on). When {@code foldStrokeLetters} is + * enabled (the default) these are mapped to an ASCII approximation. Folding is a recall + * optimization, not a linguistically correct transform, so it is intended for a search/matching + * token rather than for display or language-specific analysis.</p> + * + * <p>Scanning is a single cursor pass over the decomposed text; no regular expression is used, and + * no global {@code \p{Mn}} strip is performed.</p> + */ +public class AccentFoldCharSequenceNormalizer implements CharSequenceNormalizer { + + private static final long serialVersionUID = 1L; + + private static final Set<Character.UnicodeScript> DEFAULT_SCRIPTS = Set.of( + Character.UnicodeScript.LATIN, + Character.UnicodeScript.GREEK, + Character.UnicodeScript.CYRILLIC); + + private static final AccentFoldCharSequenceNormalizer INSTANCE = + new AccentFoldCharSequenceNormalizer(DEFAULT_SCRIPTS, true); + + private final Set<Character.UnicodeScript> foldScripts; + private final boolean foldStrokeLetters; + + /** + * Creates a folder. + * + * @param foldScripts The scripts whose base characters' diacritics are folded; marks on every + * other script are preserved. + * @param foldStrokeLetters Whether atomic Latin letters such as the stroke letters and ligatures + * are mapped to an ASCII approximation. 
+ */ + public AccentFoldCharSequenceNormalizer(Set<Character.UnicodeScript> foldScripts, + boolean foldStrokeLetters) { + this.foldScripts = Set.copyOf(foldScripts); + this.foldStrokeLetters = foldStrokeLetters; + } + + /** {@return the shared instance with the safe defaults: Latin, Greek, and Cyrillic plus the + * stroke-letter map} */ + public static AccentFoldCharSequenceNormalizer getInstance() { + return INSTANCE; + } + + @Override + public CharSequence normalize(CharSequence text) { + final String decomposed = Normalizer.normalize(text, Normalizer.Form.NFD); + final StringBuilder out = new StringBuilder(decomposed.length()); + + Character.UnicodeScript baseScript = null; + int i = 0; + final int length = decomposed.length(); + while (i < length) { + final int codePoint = decomposed.codePointAt(i); + if (Character.getType(codePoint) == Character.NON_SPACING_MARK) { + // Drop the mark only when its base character belongs to a folded script. + if (baseScript == null || !foldScripts.contains(baseScript)) { + out.appendCodePoint(codePoint); + } + } else { + final String mapped = foldStrokeLetters ? strokeLetter(codePoint) : null; + if (mapped != null) { + out.append(mapped); + baseScript = Character.UnicodeScript.LATIN; + } else { + out.appendCodePoint(codePoint); + baseScript = Character.UnicodeScript.of(codePoint); + } + } + i += Character.charCount(codePoint); + } + + return Normalizer.normalize(out, Normalizer.Form.NFC); + } + + // Atomic Latin letters that NFD does not decompose, mapped to an ASCII approximation. 
+ private static String strokeLetter(int codePoint) { + return switch (codePoint) { + case 0x00F8 -> "o"; // o with stroke + case 0x00D8 -> "O"; // O with stroke + case 0x00E6 -> "ae"; // ae ligature + case 0x00C6 -> "AE"; // AE ligature + case 0x0153 -> "oe"; // oe ligature + case 0x0152 -> "OE"; // OE ligature + case 0x00DF -> "ss"; // eszett + case 0x1E9E -> "SS"; // capital eszett + case 0x00FE -> "th"; // thorn + case 0x00DE -> "TH"; // capital thorn + case 0x00F0 -> "d"; // eth + case 0x00D0 -> "D"; // capital eth + case 0x0111 -> "d"; // d with stroke + case 0x0110 -> "D"; // D with stroke + case 0x0142 -> "l"; // l with stroke + case 0x0141 -> "L"; // L with stroke + case 0x0127 -> "h"; // h with stroke + case 0x0126 -> "H"; // H with stroke + case 0x0131 -> "i"; // dotless i + default -> null; + }; + } +} diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/CaseFoldCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/CaseFoldCharSequenceNormalizer.java new file mode 100644 index 000000000..176dd108b --- /dev/null +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/CaseFoldCharSequenceNormalizer.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import java.util.Locale; + +/** + * A {@link CharSequenceNormalizer} that lower cases text for case-insensitive matching, using + * {@link Locale#ROOT} so the result does not depend on the JVM's default locale. + * + * <p>This is the case-folding step of a search / BM25 analysis chain (the counterpart to Lucene's + * lower-case filter). {@code Locale.ROOT} avoids locale surprises such as the Turkish dotless-i + * mapping; callers that need language-specific case rules should fold with an explicit locale + * upstream. Full Unicode case folding (for example German eszett, {@code U+00DF}, to {@code ss}) + * is a distinct, heavier transform and is intentionally out of scope here.</p> + */ +public class CaseFoldCharSequenceNormalizer implements CharSequenceNormalizer { + + private static final long serialVersionUID = 1L; + + private static final CaseFoldCharSequenceNormalizer INSTANCE = + new CaseFoldCharSequenceNormalizer(); + + /** {@return the shared, stateless instance} */ + public static CaseFoldCharSequenceNormalizer getInstance() { + return INSTANCE; + } + + @Override + public CharSequence normalize(CharSequence text) { + return text.toString().toLowerCase(Locale.ROOT); + } +} diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DashCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DashCharSequenceNormalizer.java new file mode 100644 index 000000000..31237e73f --- /dev/null +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DashCharSequenceNormalizer.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +/** + * A {@link CharSequenceNormalizer} that maps every Unicode dash to an ASCII hyphen-minus + * ({@code U+002D}), reusing the cursor based {@link CharClass#dashes()} engine. + * + * <p>This folds the many dash code points (en dash, em dash, figure dash, non-breaking hyphen, + * fullwidth hyphen, and so on) to a single form so that {@code "state-of-the-art"} matches + * regardless of which dash the source used. 
The mathematical minus signs are left untouched by + * default, and {@code U+00AD} SOFT HYPHEN (a format character) is not treated as a dash.</p> + */ +public class DashCharSequenceNormalizer implements CharSequenceNormalizer { + + private static final long serialVersionUID = 1L; + + private static final CharClass DASHES = CharClass.dashes(); + + private static final DashCharSequenceNormalizer INSTANCE = new DashCharSequenceNormalizer(); + + /** {@return the shared, stateless instance} */ + public static DashCharSequenceNormalizer getInstance() { + return INSTANCE; + } + + @Override + public CharSequence normalize(CharSequence text) { + return DASHES.normalize(text); + } +} diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/NfcCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/NfcCharSequenceNormalizer.java new file mode 100644 index 000000000..72d25d93b --- /dev/null +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/NfcCharSequenceNormalizer.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package opennlp.tools.util.normalizer; + +import java.text.Normalizer; + +/** + * A {@link CharSequenceNormalizer} that applies Unicode Normalization Form C (canonical + * composition, UAX #15). + * + * <p>NFC is the safe, lossless (under canonical equivalence) baseline for matching: precomposed + * and decomposed spellings of the same text (for example {@code U+00E9} versus {@code e} plus a + * combining acute accent) become identical, so equal text compares equal regardless of how it was + * encoded. It changes no characters' meaning and is the W3C-recommended interchange form.</p> + */ +public class NfcCharSequenceNormalizer implements CharSequenceNormalizer { + + private static final long serialVersionUID = 1L; + + private static final NfcCharSequenceNormalizer INSTANCE = new NfcCharSequenceNormalizer(); + + /** {@return the shared, stateless instance} */ + public static NfcCharSequenceNormalizer getInstance() { + return INSTANCE; + } + + @Override + public CharSequence normalize(CharSequence text) { + return Normalizer.normalize(text, Normalizer.Form.NFC); + } +} diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/NfkcCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/NfkcCharSequenceNormalizer.java new file mode 100644 index 000000000..c95568fab --- /dev/null +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/NfkcCharSequenceNormalizer.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import java.text.Normalizer; + +/** + * A {@link CharSequenceNormalizer} that applies Unicode Normalization Form KC (compatibility + * composition, UAX #15). + * + * <p>NFKC folds compatibility variants to their canonical form: fullwidth and halfwidth letters, + * the {@code U+FB01} ligature to {@code fi}, and super/subscript digits to plain digits. It is + * more aggressive than {@link NfcCharSequenceNormalizer NFC} and is lossy (it can change a + * character's appearance or meaning, e.g. a squared numeral to a plain one), so it is a deliberate + * choice for search/recall rather than a safe default.</p> + */ +public class NfkcCharSequenceNormalizer implements CharSequenceNormalizer { + + private static final long serialVersionUID = 1L; + + private static final NfkcCharSequenceNormalizer INSTANCE = new NfkcCharSequenceNormalizer(); + + /** {@return the shared, stateless instance} */ + public static NfkcCharSequenceNormalizer getInstance() { + return INSTANCE; + } + + @Override + public CharSequence normalize(CharSequence text) { + return Normalizer.normalize(text, Normalizer.Form.NFKC); + } +} diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/WhitespaceCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/WhitespaceCharSequenceNormalizer.java new file mode 100644 index 000000000..affa82745 --- /dev/null +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/WhitespaceCharSequenceNormalizer.java @@ 
-0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +/** + * A {@link CharSequenceNormalizer} that collapses each run of Unicode whitespace to a single ASCII + * space and trims the edges, reusing the cursor based {@link CharClass#whitespace()} engine. + * + * <p>Unlike a {@code \s} regular expression, this recognizes the full Unicode {@code White_Space} + * set (no-break space, ideographic space, the typographic spaces, line and paragraph separators, + * and so on), so spacing copied from the web, PDFs, or non-Latin sources normalizes consistently. 
+ * It is the Unicode-aware, regex-free counterpart to {@link ShrinkCharSequenceNormalizer}.</p> + */ +public class WhitespaceCharSequenceNormalizer implements CharSequenceNormalizer { + + private static final long serialVersionUID = 1L; + + private static final CharClass WHITESPACE = CharClass.whitespace(); + + private static final WhitespaceCharSequenceNormalizer INSTANCE = + new WhitespaceCharSequenceNormalizer(); + + /** {@return the shared, stateless instance} */ + public static WhitespaceCharSequenceNormalizer getInstance() { + return INSTANCE; + } + + @Override + public CharSequence normalize(CharSequence text) { + return WHITESPACE.trim(WHITESPACE.collapse(text)); + } +} diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizerTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizerTest.java new file mode 100644 index 000000000..ba4a6ea4b --- /dev/null +++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizerTest.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package opennlp.tools.util.normalizer; + +import java.util.Set; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertSame; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class AccentFoldCharSequenceNormalizerTest { + + private static String cp(int codePoint) { + return new String(Character.toChars(codePoint)); + } + + private static String fold(String text) { + return AccentFoldCharSequenceNormalizer.getInstance().normalize(text).toString(); + } + + @Test + void testFoldsLatinAccents() { + assertEquals("cafe", fold("caf" + cp(0x00E9))); // cafe with acute e + assertEquals("naive", fold("na" + cp(0x00EF) + "ve")); // naive with diaeresis i + assertEquals("Muller", fold("M" + cp(0x00FC) + "ller")); // Muller with umlaut u + assertEquals("anos", fold("a" + cp(0x00F1) + "os")); // anos with tilde n + } + + @Test + void testMapsStrokeAndLigatureLetters() { + assertEquals("o", fold(cp(0x00F8))); // o with stroke + assertEquals("ae", fold(cp(0x00E6))); // ae ligature + assertEquals("oe", fold(cp(0x0153))); // oe ligature + assertEquals("Strasse", fold("Stra" + cp(0x00DF) + "e")); // eszett + assertEquals("th", fold(cp(0x00FE))); // thorn + assertEquals("l", fold(cp(0x0142))); // l with stroke + assertEquals("i", fold(cp(0x0131))); // dotless i + } + + @Test + void testFoldsGreekAndCyrillicAccents() { + assertEquals(cp(0x03B1), fold(cp(0x03AC))); // Greek alpha with tonos -> alpha + assertEquals(cp(0x0438), fold(cp(0x0439))); // Cyrillic short i -> i + } + + @Test + void testLeavesAsciiUnchanged() { + assertEquals("hello world", fold("hello world")); + } + + @Test + void testDoesNotTouchDevanagariArabicOrHebrewMarks() { + // The critical guard: marks on non-folded scripts are essential orthography and must survive. 
+ final String devanagari = cp(0x0915) + cp(0x093E); // ka + aa vowel sign + assertEquals(devanagari, fold(devanagari)); + + final String arabic = cp(0x0628) + cp(0x064E); // beh + fatha (a nonspacing mark) + assertEquals(arabic, fold(arabic)); + assertTrue(fold(arabic).indexOf(0x064E) >= 0, "the Arabic fatha must not be stripped"); + + final String hebrew = cp(0x05D0) + cp(0x05B8); // alef + qamats (a nonspacing mark) + assertEquals(hebrew, fold(hebrew)); + assertTrue(fold(hebrew).indexOf(0x05B8) >= 0, "the Hebrew point must not be stripped"); + } + + @Test + void testScriptScopeIsConfigurable() { + // With no folded scripts, Latin accents are preserved. + final AccentFoldCharSequenceNormalizer none = + new AccentFoldCharSequenceNormalizer(Set.of(), false); + assertEquals("caf" + cp(0x00E9), none.normalize("caf" + cp(0x00E9)).toString()); + + // Widening the scope to Arabic folds an Arabic mark that the default leaves untouched. + final AccentFoldCharSequenceNormalizer arabicToo = + new AccentFoldCharSequenceNormalizer(Set.of(Character.UnicodeScript.ARABIC), false); + assertEquals(cp(0x0628), arabicToo.normalize(cp(0x0628) + cp(0x064E)).toString()); + } + + @Test + void testStrokeLetterMappingIsConfigurable() { + final AccentFoldCharSequenceNormalizer noStroke = + new AccentFoldCharSequenceNormalizer(Set.of(Character.UnicodeScript.LATIN), false); + assertEquals(cp(0x00DF), noStroke.normalize(cp(0x00DF)).toString()); // eszett kept as-is + } + + @Test + void testComposesAfterCaseFold() { + final CharSequenceNormalizer pipeline = new AggregateCharSequenceNormalizer( + CaseFoldCharSequenceNormalizer.getInstance(), + AccentFoldCharSequenceNormalizer.getInstance()); + assertEquals("cafe", pipeline.normalize("CAF" + cp(0x00C9)).toString()); // CAFE with acute E + } + + @Test + void testInstanceIsSharedSingleton() { + assertSame(AccentFoldCharSequenceNormalizer.getInstance(), + AccentFoldCharSequenceNormalizer.getInstance()); + } +} diff --git 
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/UnicodeCharSequenceNormalizerTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/UnicodeCharSequenceNormalizerTest.java new file mode 100644 index 000000000..7a700739f --- /dev/null +++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/UnicodeCharSequenceNormalizerTest.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertSame; + +/** + * Tests for the {@link CharClass}-backed and Unicode-normalization {@link CharSequenceNormalizer} + * implementations, and their composition through {@link AggregateCharSequenceNormalizer}. 
+ */ +public class UnicodeCharSequenceNormalizerTest { + + private static String cp(int codePoint) { + return new String(Character.toChars(codePoint)); + } + + @Test + void testWhitespaceCollapsesUnicodeRunsAndTrims() { + final String input = " a" + cp(0x00A0) + cp(0x00A0) + "b" + cp(0x3000) + " "; + assertEquals("a b", + WhitespaceCharSequenceNormalizer.getInstance().normalize(input).toString()); + } + + @Test + void testDashFoldsUnicodeDashesButNotMathMinus() { + assertEquals("a-b", + DashCharSequenceNormalizer.getInstance().normalize("a" + cp(0x2014) + "b").toString()); + final String math = "5" + cp(0x2212) + "3"; + assertEquals(math, DashCharSequenceNormalizer.getInstance().normalize(math).toString()); + } + + @Test + void testNfcComposesDecomposedSequences() { + // "e" + combining acute accent -> the precomposed letter U+00E9. + assertEquals(cp(0x00E9), + NfcCharSequenceNormalizer.getInstance().normalize("e" + cp(0x0301)).toString()); + } + + @Test + void testNfkcFoldsCompatibilityForms() { + assertEquals("A", + NfkcCharSequenceNormalizer.getInstance().normalize(cp(0xFF21)).toString()); + assertEquals("fi", + NfkcCharSequenceNormalizer.getInstance().normalize(cp(0xFB01)).toString()); + } + + @Test + void testCaseFoldLowercasesIndependentOfLocale() { + assertEquals("abc", CaseFoldCharSequenceNormalizer.getInstance().normalize("ABC").toString()); + // Accents are preserved; only case changes (CAFE-acute -> cafe-acute). 
+ assertEquals("caf" + cp(0x00E9), + CaseFoldCharSequenceNormalizer.getInstance().normalize("CAF" + cp(0x00C9)).toString()); + } + + @Test + void testInstancesAreSharedSingletons() { + assertSame(WhitespaceCharSequenceNormalizer.getInstance(), + WhitespaceCharSequenceNormalizer.getInstance()); + assertSame(DashCharSequenceNormalizer.getInstance(), + DashCharSequenceNormalizer.getInstance()); + assertSame(NfcCharSequenceNormalizer.getInstance(), + NfcCharSequenceNormalizer.getInstance()); + assertSame(NfkcCharSequenceNormalizer.getInstance(), + NfkcCharSequenceNormalizer.getInstance()); + assertSame(CaseFoldCharSequenceNormalizer.getInstance(), + CaseFoldCharSequenceNormalizer.getInstance()); + } + + @Test + void testComposeIntoAUnifiedPipeline() { + // NFC, then Unicode whitespace, then dash folding, applied in order through the aggregate. + final CharSequenceNormalizer pipeline = new AggregateCharSequenceNormalizer( + NfcCharSequenceNormalizer.getInstance(), + WhitespaceCharSequenceNormalizer.getInstance(), + DashCharSequenceNormalizer.getInstance()); + + final String input = cp(0x00A0) + "a" + cp(0x2014) + "b" + cp(0x00A0); + assertEquals("a-b", pipeline.normalize(input).toString()); + } +}
