This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch TIKA-4731-common-script
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 63dc5ed7066dca81add348fe8656d7baae49f7c2
Author: tallison <[email protected]>
AuthorDate: Wed May 20 08:04:32 2026 -0400

    TIKA-4731 - further refinements
---
 tika-ml/tika-ml-junkdetect/pom.xml                 |  12 ++++++++++++
 .../tika/ml/junkdetect/tools/TrainJunkModel.java   |   6 ++++++
 .../org/apache/tika/ml/junkdetect/junkdetect.bin   | Bin 2784427 -> 2901358 bytes
 3 files changed, 18 insertions(+)

diff --git a/tika-ml/tika-ml-junkdetect/pom.xml b/tika-ml/tika-ml-junkdetect/pom.xml
index a10d73ad64..fe717998cf 100644
--- a/tika-ml/tika-ml-junkdetect/pom.xml
+++ b/tika-ml/tika-ml-junkdetect/pom.xml
@@ -61,6 +61,18 @@
     </dependency>
 
     <!-- Test dependencies -->
+    <!--
+      tika-serialization is test-scope only because the one consumer
+      (BuildJunkAugmentationData) lives in src/test/java — it's a corpus-prep
+      tool, not part of the runtime detector. Keeps the production classpath of
+      tika-ml-junkdetect free of the serialization dep.
+    -->
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-serialization</artifactId>
+      <version>${revision}</version>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>org.junit.jupiter</groupId>
       <artifactId>junit-jupiter-api</artifactId>
diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
index 88807c1cdb..b52e185eff 100644
--- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
+++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
@@ -226,6 +226,12 @@ public class TrainJunkModel {
         {"ISO-8859-1", "windows-1252"},
         {"windows-1252", "ISO-8859-1"},
         {"x-MacRoman", "windows-1252"},
+        // The exact win-1252/ISO-8859-2 sibling pathology: a win-1252 page with
+        // ©/®/£ symbols read as ISO-8859-2 yields isolated Latin-Extended-A
+        // letters (Š/Ž/Ł). Included as classifier negatives so the LR trains
+        // against this pattern directly.
+        {"windows-1252", "ISO-8859-2"},
+        {"ISO-8859-2", "windows-1252"},
         // SBCS Cyrillic / Greek / RTL
         {"windows-1251", "windows-1252"},
         {"windows-1252", "windows-1251"},
diff --git a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
index 50f7dfe2e6..af491ba162 100644
Binary files a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin and b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin differ

Reply via email to