This is an automated email from the ASF dual-hosted git repository.

krickert pushed a change to branch OPENNLP-1850_Whitespace-UTF-Normalizae
in repository https://gitbox.apache.org/repos/asf/opennlp.git


    from 39d8cb6b9 Bump actions/setup-java from 5.2.0 to 5.3.0 (#1096)
     add c22a7f582 OPENNLP-1846 - Recognize all entity types in NameFinderDL and harden decoding
     add a3c423a2f OPENNLP-1846 - Address NameFinderDL review feedback
     add 0ada8a835 OPENNLP-1846 - Harden NameFinderDL constants and fail loud on unmapped labels
     new d17eb8435 Merge branch 'OPENNLP-1846' into OPENNLP-1850_Whitespace-UTF-Normalizae
     new 0d53e31bb OPENNLP-1850 - Add robust character sequence normalization utilities and tests
     new ab15d7523 OPENNLP-1850 - Add quote/digit/invisible/ellipsis/bullet normalizers, the TextNormalizer pipeline, and offset-preserving TextAnalyzer
     new 19fb1b630 OPENNLP-1850 - Add offset-safe input normalization opt-ins to the DL components
     new 858fb7f57 OPENNLP-1850 - Document text normalization in the manual

The 5 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 opennlp-api/pom.xml                                |   6 +
 ...rSequenceNormalizer.java => AnalyzedToken.java} |  24 +-
 .../opennlp/tools/util/normalizer/CharClass.java   | 383 +++++++++++++
 .../tools/util/normalizer/CodePointSet.java        | 245 +++++++++
 .../tools/util/normalizer/NormalizedText.java      |  51 ++
 .../opennlp/tools/util/normalizer/OffsetMap.java   | 135 +++++
 .../tools/util/normalizer/TextAnalyzer.java        |  93 ++++
 .../opennlp/tools/util/normalizer/UnicodeDash.java | 189 +++++++
 .../tools/util/normalizer/UnicodeWhitespace.java   | 242 ++++++++
 .../tools/util/normalizer/CharClassTest.java       | 292 ++++++++++
 .../tools/util/normalizer/CodePointSetTest.java    | 241 ++++++++
 .../tools/util/normalizer/TextAnalyzerTest.java    | 102 ++++
 .../tools/util/normalizer/UnicodeDashTest.java     | 170 ++++++
 .../util/normalizer/UnicodeWhitespaceTest.java     | 239 ++++++++
 opennlp-core/opennlp-ml/opennlp-dl/README.md       |  16 +-
 .../src/main/java/opennlp/dl/AbstractDL.java       |  59 ++
 .../src/main/java/opennlp/dl/InferenceOptions.java |  34 ++
 .../src/main/java/opennlp/dl/SpanEnd.java          |  27 -
 .../opennlp/dl/doccat/DocumentCategorizerDL.java   |  24 +-
 .../java/opennlp/dl/namefinder/NameFinderDL.java   | 606 +++++++++++++++------
 .../java/opennlp/dl/AbstractDLChunkingTest.java    |  82 +++
 .../opennlp/dl/namefinder/NameFinderDLTest.java    | 399 ++++++++++++++
 .../AccentFoldCharSequenceNormalizer.java          | 133 +++++
 .../normalizer/BulletCharSequenceNormalizer.java   |  51 ++
 .../normalizer/CaseFoldCharSequenceNormalizer.java |  47 ++
 .../normalizer/DashCharSequenceNormalizer.java     |  45 ++
 .../normalizer/DigitCharSequenceNormalizer.java    |  57 ++
 .../normalizer/EllipsisCharSequenceNormalizer.java |  60 ++
 .../InvisibleCharSequenceNormalizer.java           |  71 +++
 .../util/normalizer/NfcCharSequenceNormalizer.java |  45 ++
 .../normalizer/NfkcCharSequenceNormalizer.java     |  46 ++
 .../normalizer/QuoteCharSequenceNormalizer.java    |  69 +++
 .../tools/util/normalizer/TextNormalizer.java      | 142 +++++
 .../WhitespaceCharSequenceNormalizer.java          |  46 ++
 .../AccentFoldCharSequenceNormalizerTest.java      | 145 +++++
 .../util/normalizer/SetBasedNormalizerTest.java    | 163 ++++++
 .../tools/util/normalizer/TextNormalizerTest.java  |  77 +++
 .../UnicodeCharSequenceNormalizerTest.java         |  97 ++++
 opennlp-docs/src/docbkx/normalizer.xml             | 317 +++++++++++
 opennlp-docs/src/docbkx/opennlp.xml                |   1 +
 .../opennlp/dl/namefinder/NameFinderDLEval.java    |  60 +-
 41 files changed, 5099 insertions(+), 232 deletions(-)
 copy opennlp-api/src/main/java/opennlp/tools/util/normalizer/{CharSequenceNormalizer.java => AnalyzedToken.java} (54%)
 create mode 100644 opennlp-api/src/main/java/opennlp/tools/util/normalizer/CharClass.java
 create mode 100644 opennlp-api/src/main/java/opennlp/tools/util/normalizer/CodePointSet.java
 create mode 100644 opennlp-api/src/main/java/opennlp/tools/util/normalizer/NormalizedText.java
 create mode 100644 opennlp-api/src/main/java/opennlp/tools/util/normalizer/OffsetMap.java
 create mode 100644 opennlp-api/src/main/java/opennlp/tools/util/normalizer/TextAnalyzer.java
 create mode 100644 opennlp-api/src/main/java/opennlp/tools/util/normalizer/UnicodeDash.java
 create mode 100644 opennlp-api/src/main/java/opennlp/tools/util/normalizer/UnicodeWhitespace.java
 create mode 100644 opennlp-api/src/test/java/opennlp/tools/util/normalizer/CharClassTest.java
 create mode 100644 opennlp-api/src/test/java/opennlp/tools/util/normalizer/CodePointSetTest.java
 create mode 100644 opennlp-api/src/test/java/opennlp/tools/util/normalizer/TextAnalyzerTest.java
 create mode 100644 opennlp-api/src/test/java/opennlp/tools/util/normalizer/UnicodeDashTest.java
 create mode 100644 opennlp-api/src/test/java/opennlp/tools/util/normalizer/UnicodeWhitespaceTest.java
 delete mode 100644 opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/SpanEnd.java
 create mode 100644 opennlp-core/opennlp-ml/opennlp-dl/src/test/java/opennlp/dl/AbstractDLChunkingTest.java
 create mode 100644 opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizer.java
 create mode 100644 opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/BulletCharSequenceNormalizer.java
 create mode 100644 opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/CaseFoldCharSequenceNormalizer.java
 create mode 100644 opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DashCharSequenceNormalizer.java
 create mode 100644 opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DigitCharSequenceNormalizer.java
 create mode 100644 opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/EllipsisCharSequenceNormalizer.java
 create mode 100644 opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/InvisibleCharSequenceNormalizer.java
 create mode 100644 opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/NfcCharSequenceNormalizer.java
 create mode 100644 opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/NfkcCharSequenceNormalizer.java
 create mode 100644 opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/QuoteCharSequenceNormalizer.java
 create mode 100644 opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TextNormalizer.java
 create mode 100644 opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/WhitespaceCharSequenceNormalizer.java
 create mode 100644 opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizerTest.java
 create mode 100644 opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/SetBasedNormalizerTest.java
 create mode 100644 opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TextNormalizerTest.java
 create mode 100644 opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/UnicodeCharSequenceNormalizerTest.java
 create mode 100644 opennlp-docs/src/docbkx/normalizer.xml

Reply via email to