This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch charset-detection-improvements
in repository https://gitbox.apache.org/repos/asf/tika.git


    from 768f44e774 split strides into separate model space
     add 0b88eb8603 checkpoint commit
     add 9cce5a2613 git add
     add d48324fc75 git add

No new revisions were added by this update.

Summary of changes:
 .../org/apache/tika/detect/AutoDetectReader.java   |   9 +-
 .../org/apache/tika/detect/CharsetSupersets.java   |  89 ++++
 .../apache/tika/metadata/TikaCoreProperties.java   |  12 +
 .../ml/chardetect/ByteNgramFeatureExtractor.java   | 122 ++----
 .../tika/ml/chardetect/CjkEncodingRules.java       | 461 ---------------------
 .../ml/chardetect/MojibusterEncodingDetector.java  | 273 ++++++------
 .../tika/ml/chardetect/chardetect-v6-no-utf32.bin  | Bin 574128 -> 0 bytes
 .../org/apache/tika/ml/chardetect/chardetect.bin   | Bin 0 -> 623569 bytes
 .../tika/ml/chardetect/EbcdicRoutingTest.java      |  80 +++-
 .../ml/chardetect/tools/BucketCollisionAudit.java  | 459 --------------------
 .../chardetect/tools/BuildCharsetTrainingData.java | 256 ++++++++----
 .../ConfigurableByteNgramFeatureExtractor.java     | 416 -------------------
 .../ml/chardetect/tools/EvalCharsetDetectors.java  |  17 +-
 .../ml/chardetect/tools/TraceCharsetLogits.java    |  15 +-
 .../ml/chardetect/tools/TrainCharsetModel.java     |  86 +---
 .../chardetect/ByteNgramFeatureExtractorTest.java  |  19 +-
 .../chardetect/ConfigurableGlobalFeatureTest.java  | 233 -----------
 .../ml/chardetect/FeatureExtractorParityTest.java  | 354 ----------------
 18 files changed, 570 insertions(+), 2331 deletions(-)
 create mode 100644 tika-core/src/main/java/org/apache/tika/detect/CharsetSupersets.java
 delete mode 100644 tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CjkEncodingRules.java
 delete mode 100644 tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect-v6-no-utf32.bin
 create mode 100644 tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect.bin
 delete mode 100644 tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BucketCollisionAudit.java
 delete mode 100644 tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java
 delete mode 100644 tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ConfigurableGlobalFeatureTest.java
 delete mode 100644 tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/FeatureExtractorParityTest.java

Reply via email to