This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 48e4ecc126 strip charset from mimes (#2775)
48e4ecc126 is described below

commit 48e4ecc126fe762eca7a59a4992439d18c0d4ef5
Author: Tim Allison <[email protected]>
AuthorDate: Fri Apr 17 09:31:19 2026 -0400

    strip charset from mimes (#2775)
---
 .../org/apache/tika/eval/app/ProfilerBase.java     |  1 +
 .../java/org/apache/tika/eval/app/db/Cols.java     |  1 +
 .../org/apache/tika/eval/app/db/MimeBuffer.java    | 13 ++++++++----
 .../src/main/resources/comparison-reports-pg.xml   | 24 +++++++++++-----------
 .../src/main/resources/comparison-reports.xml      | 24 +++++++++++-----------
 5 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java
index 9e083b4057..6df2641a2d 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java
@@ -102,6 +102,7 @@ public abstract class ProfilerBase {
     public static TableInfo REF_PARSE_EXCEPTION_TYPES =
             new TableInfo("ref_parse_exception_types", new ColInfo(Cols.PARSE_EXCEPTION_ID, Types.INTEGER), new ColInfo(Cols.PARSE_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128));
     public static TableInfo MIME_TABLE = new TableInfo("mimes", new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"), new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256),
+            new ColInfo(Cols.BASE_MIME, Types.VARCHAR, 256),
             new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12));
     private static CommonTokenCountManager COMMON_TOKEN_COUNT_MANAGER;
     private static Pattern FILE_NAME_CLEANER = Pattern.compile("\\.(json|txt)(\\.(bz2|gz|zip))?$");
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java
index 55b78423a8..0724f7f16e 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java
@@ -51,6 +51,7 @@ public enum Cols {
 
 
     MIME_STRING,//string representation of mime type
+    BASE_MIME,//mime type without parameters (charset, delimiter, etc.)
 
     DIR_NAME_A,//for comparisons in REF_PAIR_NAMES
     DIR_NAME_B,
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/MimeBuffer.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/MimeBuffer.java
index e03a63a3ed..9541020541 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/MimeBuffer.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/MimeBuffer.java
@@ -36,7 +36,10 @@ public class MimeBuffer extends AbstractDBBuffer {
 
     public MimeBuffer(Connection connection, TableInfo mimeTable, MimeTypes mimeTypes) throws SQLException {
         st = connection.prepareStatement(
-                "insert into " + mimeTable.getName() + "( " + Cols.MIME_ID.name() + ", " + Cols.MIME_STRING.name() + ", " + Cols.FILE_EXTENSION.name() + ") values (?,?,?)");
+                "insert into " + mimeTable.getName() + "( " +
+                        Cols.MIME_ID.name() + ", " + Cols.MIME_STRING.name() +
+                        ", " + Cols.BASE_MIME.name() + ", " +
+                        Cols.FILE_EXTENSION.name() + ") values (?,?,?,?)");
         this.mimeTypes = mimeTypes;
         this.connection = connection;
     }
@@ -47,15 +50,17 @@ public class MimeBuffer extends AbstractDBBuffer {
             st.clearParameters();
             st.setInt(1, id);
             st.setString(2, value);
+            int semi = value.indexOf(';');
+            st.setString(3, semi > 0 ? value.substring(0, semi).trim() : value);
             try {
                 String ext = MimeUtil.getExtension(value, mimeTypes);
                 if (ext == null || ext.isEmpty()) {
-                    st.setNull(3, Types.VARCHAR);
+                    st.setNull(4, Types.VARCHAR);
                 } else {
-                    st.setString(3, ext);
+                    st.setString(4, ext);
                 }
             } catch (MimeTypeException e) {
-                st.setNull(3, Types.VARCHAR);
+                st.setNull(4, Types.VARCHAR);
             }
             st.execute();
 
diff --git a/tika-eval/tika-eval-app/src/main/resources/comparison-reports-pg.xml b/tika-eval/tika-eval-app/src/main/resources/comparison-reports-pg.xml
index b4c3cf0a1b..0773e51dd3 100644
--- a/tika-eval/tika-eval-app/src/main/resources/comparison-reports-pg.xml
+++ b/tika-eval/tika-eval-app/src/main/resources/comparison-reports-pg.xml
@@ -1510,12 +1510,12 @@
           includeSql="true">
 
     <sql>
-      select mime_string, count(1) as cnt
+      select base_mime, count(1) as cnt
       from profiles_a pa
       left join profiles_b pb on pa.id=pb.id
       join mimes m on pa.mime_id=m.mime_id
       where pb.id is null
-      group by mime_string
+      group by base_mime
       order by cnt desc
     </sql>
   </report>
@@ -1525,12 +1525,12 @@
           includeSql="true">
 
     <sql>
-      select mime_string, count(1) as cnt
+      select base_mime, count(1) as cnt
       from profiles_a pa
       left join profiles_b pb on pa.id=pb.id
       join mimes m on pa.mime_id=m.mime_id
       where pb.id is null and pa.is_embedded=false
-      group by mime_string
+      group by base_mime
       order by cnt desc
     </sql>
   </report>
@@ -1540,12 +1540,12 @@
           includeSql="true">
 
     <sql>
-      select mime_string, count(1) as cnt
+      select base_mime, count(1) as cnt
       from profiles_a pa
       left join profiles_b pb on pa.id=pb.id
       join mimes m on pa.mime_id=m.mime_id
       where pb.id is null and pa.is_embedded=true
-      group by mime_string
+      group by base_mime
       order by cnt desc
     </sql>
   </report>
@@ -1555,12 +1555,12 @@
           includeSql="true">
 
     <sql>
-      select mime_string, count(1) as cnt
+      select base_mime, count(1) as cnt
       from profiles_b pb
       left join profiles_a pa on pb.id=pa.id
       join mimes m on pb.mime_id=m.mime_id
       where pa.id is null
-      group by mime_string
+      group by base_mime
       order by cnt desc
     </sql>
   </report>
@@ -1570,12 +1570,12 @@
           includeSql="true">
 
     <sql>
-      select mime_string, count(1) as cnt
+      select base_mime, count(1) as cnt
       from profiles_b pb
       left join profiles_a pa on pb.id=pa.id
       join mimes m on pb.mime_id=m.mime_id
       where pa.id is null and pb.is_embedded=false
-      group by mime_string
+      group by base_mime
       order by cnt desc
     </sql>
   </report>
@@ -1585,12 +1585,12 @@
           includeSql="true">
 
     <sql>
-      select mime_string, count(1) as cnt
+      select base_mime, count(1) as cnt
       from profiles_b pb
       left join profiles_a pa on pb.id=pa.id
       join mimes m on pb.mime_id=m.mime_id
       where pa.id is null and pb.is_embedded=true
-      group by mime_string
+      group by base_mime
       order by cnt desc
     </sql>
   </report>
diff --git a/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml b/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
index 082949e9cf..cf2aca82b4 100644
--- a/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
+++ b/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
@@ -1318,12 +1318,12 @@
           includeSql="true">
 
     <sql>
-      select mime_string, count(1) as cnt
+      select base_mime, count(1) as cnt
       from profiles_a pa
       left join profiles_b pb on pa.id=pb.id
       join mimes m on pa.mime_id=m.mime_id
       where pb.id is null
-      group by mime_string
+      group by base_mime
       order by cnt desc
     </sql>
   </report>
@@ -1333,12 +1333,12 @@
           includeSql="true">
 
     <sql>
-      select mime_string, count(1) as cnt
+      select base_mime, count(1) as cnt
       from profiles_a pa
       left join profiles_b pb on pa.id=pb.id
       join mimes m on pa.mime_id=m.mime_id
       where pb.id is null and pa.is_embedded=false
-      group by mime_string
+      group by base_mime
       order by cnt desc
     </sql>
   </report>
@@ -1348,12 +1348,12 @@
           includeSql="true">
 
     <sql>
-      select mime_string, count(1) as cnt
+      select base_mime, count(1) as cnt
       from profiles_a pa
       left join profiles_b pb on pa.id=pb.id
       join mimes m on pa.mime_id=m.mime_id
       where pb.id is null and pa.is_embedded=true
-      group by mime_string
+      group by base_mime
       order by cnt desc
     </sql>
   </report>
@@ -1363,12 +1363,12 @@
           includeSql="true">
 
     <sql>
-      select mime_string, count(1) as cnt
+      select base_mime, count(1) as cnt
       from profiles_b pb
       left join profiles_a pa on pb.id=pa.id
       join mimes m on pb.mime_id=m.mime_id
       where pa.id is null
-      group by mime_string
+      group by base_mime
       order by cnt desc
     </sql>
   </report>
@@ -1378,12 +1378,12 @@
           includeSql="true">
 
     <sql>
-      select mime_string, count(1) as cnt
+      select base_mime, count(1) as cnt
       from profiles_b pb
       left join profiles_a pa on pb.id=pa.id
       join mimes m on pb.mime_id=m.mime_id
       where pa.id is null and pb.is_embedded=false
-      group by mime_string
+      group by base_mime
       order by cnt desc
     </sql>
   </report>
@@ -1393,12 +1393,12 @@
           includeSql="true">
 
     <sql>
-      select mime_string, count(1) as cnt
+      select base_mime, count(1) as cnt
       from profiles_b pb
       left join profiles_a pa on pb.id=pa.id
       join mimes m on pb.mime_id=m.mime_id
       where pa.id is null and pb.is_embedded=true
-      group by mime_string
+      group by base_mime
       order by cnt desc
     </sql>
   </report>

Reply via email to