This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new e34aa7d51 TIKA-4387 -- FilenameUtils should require that file
extensions be ascii alphanumerics only. (#2143)
e34aa7d51 is described below
commit e34aa7d51ca8eaa13d45d95411eedbe778895555
Author: Tim Allison <[email protected]>
AuthorDate: Tue Feb 25 17:04:28 2025 -0500
TIKA-4387 -- FilenameUtils should require that file extensions be ascii
alphanumerics only. (#2143)
---
tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java | 11 +++++++++--
.../src/test/java/org/apache/tika/io/FilenameUtilsTest.java | 8 ++++++++
2 files changed, 17 insertions(+), 2 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
index 17bc9e920..efaf07ca3 100644
--- a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
@@ -18,6 +18,7 @@ package org.apache.tika.io;
import java.util.HashSet;
import java.util.Locale;
+import java.util.regex.Pattern;
import org.apache.tika.utils.StringUtils;
@@ -42,6 +43,7 @@ public class FilenameUtils {
}
}
+ private final static Pattern ASCII_NUMERIC = Pattern.compile("\\A\\.(?i)[a-z0-9]{1,5}\\Z");
/**
* Scans the given file name for reserved characters on different OSs and
@@ -110,7 +112,9 @@ public class FilenameUtils {
}
/**
- * This includes the period, e.g. ".pdf"
+ * This includes the period, e.g. ".pdf".
+ * This requires that an extension contain only ascii alphanumerics
+ * and it requires that an extension length be 5 or less.
* @param path
* @return the suffix or an empty string if one could not be found
*/
@@ -119,7 +123,10 @@ public class FilenameUtils {
int i = n.lastIndexOf(".");
//arbitrarily sets max extension length
if (i > -1 && n.length() - i < 6) {
- return n.substring(i);
+ String suffix = n.substring(i);
+ if (ASCII_NUMERIC.matcher(suffix).matches()) {
+ return suffix;
+ }
}
return StringUtils.EMPTY;
}
diff --git a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
index 0cc869aa1..39f0ae757 100644
--- a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
@@ -23,6 +23,8 @@ import static org.junit.jupiter.api.Assertions.fail;
import org.junit.jupiter.api.Test;
+import org.apache.tika.utils.StringUtils;
+
public class FilenameUtilsTest {
/**
@@ -101,6 +103,12 @@ public class FilenameUtilsTest {
testFilenameEquality("HW.txt", "_1457338542/HW.txt");
}
+ @Test
+ public void testExtension() throws Exception {
+ assertEquals(".pdf", FilenameUtils.getSuffixFromPath("blah/blah/or/something.pdf"));
+ assertEquals(StringUtils.EMPTY, FilenameUtils.getSuffixFromPath("blah \" blaoh .5\""));
+ }
+
private void testFilenameEquality(String expected, String path) {
assertEquals(expected, FilenameUtils.getName(path));
}