[
https://issues.apache.org/jira/browse/TIKA-4731?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=18083906#comment-18083906
]
Hudson commented on TIKA-4731:
------------------------------
SUCCESS: Integrated in Jenkins build Tika » tika-main-jdk17 #1389 (See
[https://ci-builds.apache.org/job/Tika/job/tika-main-jdk17/1389/])
TIKA-4731 - improve charset detection and junk detection (#2839) (github:
[https://github.com/apache/tika/commit/a2bc3513ac9c2dd6d0f63ea37363c3a3ec172cb4])
* (edit)
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfig.java
* (edit) tika-ml/tika-ml-junkdetect/pom.xml
* (edit)
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
* (edit)
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
* (add)
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/HtmlByteStripperTest.java
* (edit)
tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
* (add)
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationData.java
* (add)
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/DiagnoseDiscrimination.java
* (add)
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/HtmlContentCleaner.java
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
* (edit)
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java
* (add)
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/BigramTables.java
* (add)
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java
* (delete)
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/CountPerScriptBigrams.java
* (add)
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/TraceMojibuster.java
* (delete)
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/AnalyzeHanByBlock.java
* (add)
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/AdaptiveProbe.java
* (delete)
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorV7Test.java
* (edit)
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin
* (edit)
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
* (add) .skills/tika-eval-h2-query.md
* (edit)
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfigTest.java
* (add)
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationDataTest.java
* (delete)
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/PrototypeCodepointHash.java
* (add)
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/CalibrateTopK.java
* (add)
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
* (edit)
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CharsetConfusables.java
* (delete)
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/EntityRefProbe.java
* (delete)
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkOnCharsetDevtest.java
* (edit)
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BuildCharsetTrainingData.java
* (add)
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/InspectBigramContributions.java
* (add)
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/TextQualityFeaturesTest.java
* (add)
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/CheckUtf8OnFile.java
* (delete)
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkDetector.java
* (edit)
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java
* (delete)
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/V7Tables.java
* (add)
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/RebalanceCharsetTraining.java
* (edit)
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
* (add)
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/LatinSiblingComparisonTest.java
* (add)
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/TraceJunkFilter.java
* (edit)
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
* (edit)
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
* (edit)
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetectorTest.java
* (add)
docs/modules/ROOT/pages/advanced/integration-testing/tika-eval-regression.adoc
* (add)
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/AdaptiveProbeTest.java
* (edit)
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java
> Ongoing improvements to the junk detector
> -----------------------------------------
>
> Key: TIKA-4731
> URL: https://issues.apache.org/jira/browse/TIKA-4731
> Project: Tika
> Issue Type: Task
> Reporter: Tim Allison
> Priority: Minor
> Fix For: 4.0.0
>
>
> With [https://github.com/apache/tika/pull/2818], I think we have a decent
> shape for the junk detector.
> There are several areas for improvement, but I think it is ready to go.
> This ticket tracks follow-on work, including:
> * Smaller model
> * Handling pathological code block changes
> * Handling candidates with different character counts
> * Other items to be discovered in our commoncrawl/govdocs1 corpus?
> We have some coverage for the middle two items, but need to address those more
> directly.
> This work is not a blocker on the 4.0.0-beta-1 release.
--
This message was sent by Atlassian Jira
(v8.20.10#820010)