This is an automated email from the ASF dual-hosted git repository.
tallison pushed a change to branch charset-detection-improvements
in repository https://gitbox.apache.org/repos/asf/tika.git
from 768f44e774 split strides into separate model space
add 0b88eb8603 checkpoint commit
add 9cce5a2613 git add
add d48324fc75 git add
No new revisions were added by this update.
Summary of changes:
.../org/apache/tika/detect/AutoDetectReader.java | 9 +-
.../org/apache/tika/detect/CharsetSupersets.java | 89 ++++
.../apache/tika/metadata/TikaCoreProperties.java | 12 +
.../ml/chardetect/ByteNgramFeatureExtractor.java | 122 ++----
.../tika/ml/chardetect/CjkEncodingRules.java | 461 ---------------------
.../ml/chardetect/MojibusterEncodingDetector.java | 273 ++++++------
.../tika/ml/chardetect/chardetect-v6-no-utf32.bin | Bin 574128 -> 0 bytes
.../org/apache/tika/ml/chardetect/chardetect.bin | Bin 0 -> 623569 bytes
.../tika/ml/chardetect/EbcdicRoutingTest.java | 80 +++-
.../ml/chardetect/tools/BucketCollisionAudit.java | 459 --------------------
.../chardetect/tools/BuildCharsetTrainingData.java | 256 ++++++++----
.../ConfigurableByteNgramFeatureExtractor.java | 416 -------------------
.../ml/chardetect/tools/EvalCharsetDetectors.java | 17 +-
.../ml/chardetect/tools/TraceCharsetLogits.java | 15 +-
.../ml/chardetect/tools/TrainCharsetModel.java | 86 +---
.../chardetect/ByteNgramFeatureExtractorTest.java | 19 +-
.../chardetect/ConfigurableGlobalFeatureTest.java | 233 -----------
.../ml/chardetect/FeatureExtractorParityTest.java | 354 ----------------
18 files changed, 570 insertions(+), 2331 deletions(-)
create mode 100644 tika-core/src/main/java/org/apache/tika/detect/CharsetSupersets.java
delete mode 100644 tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CjkEncodingRules.java
delete mode 100644 tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect-v6-no-utf32.bin
create mode 100644 tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect.bin
delete mode 100644 tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BucketCollisionAudit.java
delete mode 100644 tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java
delete mode 100644 tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ConfigurableGlobalFeatureTest.java
delete mode 100644 tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/FeatureExtractorParityTest.java