This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4387
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d6bd747d557bf2172b2393b197dacbc33fc97632
Author: tallison <[email protected]>
AuthorDate: Tue Feb 25 16:30:24 2025 -0500

    TIKA-4387 -- FilenameUtils should require that file extensions be ascii alphanumerics only.
---
 tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java | 11 +++++++++--
 .../src/test/java/org/apache/tika/io/FilenameUtilsTest.java   |  8 ++++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
index 17bc9e920..efaf07ca3 100644
--- a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
@@ -18,6 +18,7 @@ package org.apache.tika.io;
 
 import java.util.HashSet;
 import java.util.Locale;
+import java.util.regex.Pattern;
 
 import org.apache.tika.utils.StringUtils;
 
@@ -42,6 +43,7 @@ public class FilenameUtils {
         }
     }
 
+    private final static Pattern ASCII_NUMERIC = Pattern.compile("\\A\\.(?i)[a-z0-9]{1,5}\\Z");
 
     /**
      * Scans the given file name for reserved characters on different OSs and
@@ -110,7 +112,9 @@ public class FilenameUtils {
     }
 
     /**
-     * This includes the period, e.g. ".pdf"
+     * This includes the period, e.g. ".pdf".
+     * This requires that an extension contain only ascii alphanumerics
+     * and it requires that an extension length be 5 or less.
      * @param path
      * @return the suffix or an empty string if one could not be found
      */
@@ -119,7 +123,10 @@ public class FilenameUtils {
         int i = n.lastIndexOf(".");
         //arbitrarily sets max extension length
         if (i > -1 && n.length() - i < 6) {
-            return n.substring(i);
+            String suffix = n.substring(i);
+            if (ASCII_NUMERIC.matcher(suffix).matches()) {
+                return suffix;
+            }
         }
         return StringUtils.EMPTY;
     }
diff --git a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
index 0cc869aa1..39f0ae757 100644
--- a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
@@ -23,6 +23,8 @@ import static org.junit.jupiter.api.Assertions.fail;
 
 import org.junit.jupiter.api.Test;
 
+import org.apache.tika.utils.StringUtils;
+
 public class FilenameUtilsTest {
 
     /**
@@ -101,6 +103,12 @@ public class FilenameUtilsTest {
         testFilenameEquality("HW.txt", "_1457338542/HW.txt");
     }
 
+    @Test
+    public void testExtension() throws Exception {
+        assertEquals(".pdf", FilenameUtils.getSuffixFromPath("blah/blah/or/something.pdf"));
+        assertEquals(StringUtils.EMPTY, FilenameUtils.getSuffixFromPath("blah \" blaoh .5\""));
+    }
+
     private void testFilenameEquality(String expected, String path) {
         assertEquals(expected, FilenameUtils.getName(path));
     }

Reply via email to