This is an automated email from the ASF dual-hosted git repository.
tallison pushed a change to branch charset-detection-improvements
in repository https://gitbox.apache.org/repos/asf/tika.git
omit d48324fc75 git add
omit 9cce5a2613 git add
omit 0b88eb8603 checkpoint commit
This update removed existing revisions from the reference, leaving the
reference pointing at a previous point in the repository history.
 * -- * -- N   refs/heads/charset-detection-improvements (768f44e774)
            \
             O -- O -- O   (d48324fc75)
Any revisions marked "omit" are not gone; other references still
refer to them. Any revisions marked "discard" are gone forever.
No new revisions were added by this update.
Summary of changes:
.../org/apache/tika/detect/AutoDetectReader.java | 9 +-
.../org/apache/tika/detect/CharsetSupersets.java | 89 ----
.../apache/tika/metadata/TikaCoreProperties.java | 12 -
.../ml/chardetect/ByteNgramFeatureExtractor.java | 122 ++++--
.../tika/ml/chardetect/CjkEncodingRules.java | 461 +++++++++++++++++++++
.../ml/chardetect/MojibusterEncodingDetector.java | 273 ++++++------
.../tika/ml/chardetect/chardetect-v6-no-utf32.bin | Bin 0 -> 574128 bytes
.../org/apache/tika/ml/chardetect/chardetect.bin | Bin 623569 -> 0 bytes
.../tika/ml/chardetect/EbcdicRoutingTest.java | 80 +---
.../ml/chardetect/tools/BucketCollisionAudit.java | 459 ++++++++++++++++++++
.../chardetect/tools/BuildCharsetTrainingData.java | 256 ++++--------
.../ConfigurableByteNgramFeatureExtractor.java | 416 +++++++++++++++++++
.../ml/chardetect/tools/EvalCharsetDetectors.java | 17 +-
.../ml/chardetect/tools/TraceCharsetLogits.java | 15 +-
.../ml/chardetect/tools/TrainCharsetModel.java | 86 +++-
.../chardetect/ByteNgramFeatureExtractorTest.java | 19 +-
.../chardetect/ConfigurableGlobalFeatureTest.java | 233 +++++++++++
.../ml/chardetect/FeatureExtractorParityTest.java | 354 ++++++++++++++++
18 files changed, 2331 insertions(+), 570 deletions(-)
delete mode 100644 tika-core/src/main/java/org/apache/tika/detect/CharsetSupersets.java
create mode 100644 tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CjkEncodingRules.java
create mode 100644 tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect-v6-no-utf32.bin
delete mode 100644 tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect.bin
create mode 100644 tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BucketCollisionAudit.java
create mode 100644 tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java
create mode 100644 tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ConfigurableGlobalFeatureTest.java
create mode 100644 tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/FeatureExtractorParityTest.java