This is an automated email from the ASF dual-hosted git repository.
krickert pushed a change to branch OPENNLP-1850_Whitespace-UTF-Normalizae
in repository https://gitbox.apache.org/repos/asf/opennlp.git
from 39d8cb6b9 Bump actions/setup-java from 5.2.0 to 5.3.0 (#1096)
add c22a7f582 OPENNLP-1846 - Recognize all entity types in NameFinderDL
and harden decoding
add a3c423a2f OPENNLP-1846 - Address NameFinderDL review feedback
add 0ada8a835 OPENNLP-1846 - Harden NameFinderDL constants and fail loud
on unmapped labels
new d17eb8435 Merge branch 'OPENNLP-1846' into OPENNLP-1850_Whitespace-UTF-Normalizae
new 0d53e31bb OPENNLP-1850 - Add robust character sequence normalization
utilities and tests
new ab15d7523 OPENNLP-1850 - Add quote/digit/invisible/ellipsis/bullet
normalizers, the TextNormalizer pipeline, and offset-preserving TextAnalyzer
new 19fb1b630 OPENNLP-1850 - Add offset-safe input normalization opt-ins
to the DL components
new 858fb7f57 OPENNLP-1850 - Document text normalization in the manual
The 5 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
Summary of changes:
opennlp-api/pom.xml | 6 +
...rSequenceNormalizer.java => AnalyzedToken.java} | 24 +-
.../opennlp/tools/util/normalizer/CharClass.java | 383 +++++++++++++
.../tools/util/normalizer/CodePointSet.java | 245 +++++++++
.../tools/util/normalizer/NormalizedText.java | 51 ++
.../opennlp/tools/util/normalizer/OffsetMap.java | 135 +++++
.../tools/util/normalizer/TextAnalyzer.java | 93 ++++
.../opennlp/tools/util/normalizer/UnicodeDash.java | 189 +++++++
.../tools/util/normalizer/UnicodeWhitespace.java | 242 ++++++++
.../tools/util/normalizer/CharClassTest.java | 292 ++++++++++
.../tools/util/normalizer/CodePointSetTest.java | 241 ++++++++
.../tools/util/normalizer/TextAnalyzerTest.java | 102 ++++
.../tools/util/normalizer/UnicodeDashTest.java | 170 ++++++
.../util/normalizer/UnicodeWhitespaceTest.java | 239 ++++++++
opennlp-core/opennlp-ml/opennlp-dl/README.md | 16 +-
.../src/main/java/opennlp/dl/AbstractDL.java | 59 ++
.../src/main/java/opennlp/dl/InferenceOptions.java | 34 ++
.../src/main/java/opennlp/dl/SpanEnd.java | 27 -
.../opennlp/dl/doccat/DocumentCategorizerDL.java | 24 +-
.../java/opennlp/dl/namefinder/NameFinderDL.java | 606 +++++++++++++++------
.../java/opennlp/dl/AbstractDLChunkingTest.java | 82 +++
.../opennlp/dl/namefinder/NameFinderDLTest.java | 399 ++++++++++++++
.../AccentFoldCharSequenceNormalizer.java | 133 +++++
.../normalizer/BulletCharSequenceNormalizer.java | 51 ++
.../normalizer/CaseFoldCharSequenceNormalizer.java | 47 ++
.../normalizer/DashCharSequenceNormalizer.java | 45 ++
.../normalizer/DigitCharSequenceNormalizer.java | 57 ++
.../normalizer/EllipsisCharSequenceNormalizer.java | 60 ++
.../InvisibleCharSequenceNormalizer.java | 71 +++
.../util/normalizer/NfcCharSequenceNormalizer.java | 45 ++
.../normalizer/NfkcCharSequenceNormalizer.java | 46 ++
.../normalizer/QuoteCharSequenceNormalizer.java | 69 +++
.../tools/util/normalizer/TextNormalizer.java | 142 +++++
.../WhitespaceCharSequenceNormalizer.java | 46 ++
.../AccentFoldCharSequenceNormalizerTest.java | 145 +++++
.../util/normalizer/SetBasedNormalizerTest.java | 163 ++++++
.../tools/util/normalizer/TextNormalizerTest.java | 77 +++
.../UnicodeCharSequenceNormalizerTest.java | 97 ++++
opennlp-docs/src/docbkx/normalizer.xml | 317 +++++++++++
opennlp-docs/src/docbkx/opennlp.xml | 1 +
.../opennlp/dl/namefinder/NameFinderDLEval.java | 60 +-
41 files changed, 5099 insertions(+), 232 deletions(-)
copy opennlp-api/src/main/java/opennlp/tools/util/normalizer/{CharSequenceNormalizer.java => AnalyzedToken.java} (54%)
create mode 100644
opennlp-api/src/main/java/opennlp/tools/util/normalizer/CharClass.java
create mode 100644
opennlp-api/src/main/java/opennlp/tools/util/normalizer/CodePointSet.java
create mode 100644
opennlp-api/src/main/java/opennlp/tools/util/normalizer/NormalizedText.java
create mode 100644
opennlp-api/src/main/java/opennlp/tools/util/normalizer/OffsetMap.java
create mode 100644
opennlp-api/src/main/java/opennlp/tools/util/normalizer/TextAnalyzer.java
create mode 100644
opennlp-api/src/main/java/opennlp/tools/util/normalizer/UnicodeDash.java
create mode 100644
opennlp-api/src/main/java/opennlp/tools/util/normalizer/UnicodeWhitespace.java
create mode 100644
opennlp-api/src/test/java/opennlp/tools/util/normalizer/CharClassTest.java
create mode 100644
opennlp-api/src/test/java/opennlp/tools/util/normalizer/CodePointSetTest.java
create mode 100644
opennlp-api/src/test/java/opennlp/tools/util/normalizer/TextAnalyzerTest.java
create mode 100644
opennlp-api/src/test/java/opennlp/tools/util/normalizer/UnicodeDashTest.java
create mode 100644
opennlp-api/src/test/java/opennlp/tools/util/normalizer/UnicodeWhitespaceTest.java
delete mode 100644
opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/SpanEnd.java
create mode 100644
opennlp-core/opennlp-ml/opennlp-dl/src/test/java/opennlp/dl/AbstractDLChunkingTest.java
create mode 100644
opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizer.java
create mode 100644
opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/BulletCharSequenceNormalizer.java
create mode 100644
opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/CaseFoldCharSequenceNormalizer.java
create mode 100644
opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DashCharSequenceNormalizer.java
create mode 100644
opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DigitCharSequenceNormalizer.java
create mode 100644
opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/EllipsisCharSequenceNormalizer.java
create mode 100644
opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/InvisibleCharSequenceNormalizer.java
create mode 100644
opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/NfcCharSequenceNormalizer.java
create mode 100644
opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/NfkcCharSequenceNormalizer.java
create mode 100644
opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/QuoteCharSequenceNormalizer.java
create mode 100644
opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TextNormalizer.java
create mode 100644
opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/WhitespaceCharSequenceNormalizer.java
create mode 100644
opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizerTest.java
create mode 100644
opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/SetBasedNormalizerTest.java
create mode 100644
opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TextNormalizerTest.java
create mode 100644
opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/UnicodeCharSequenceNormalizerTest.java
create mode 100644 opennlp-docs/src/docbkx/normalizer.xml