This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4387 in repository https://gitbox.apache.org/repos/asf/tika.git
commit d6bd747d557bf2172b2393b197dacbc33fc97632 Author: tallison <[email protected]> AuthorDate: Tue Feb 25 16:30:24 2025 -0500 TIKA-4387 -- FilenameUtils should require that file extensions be ascii alphanumerics only. --- tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java | 11 +++++++++-- .../src/test/java/org/apache/tika/io/FilenameUtilsTest.java | 8 ++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java index 17bc9e920..efaf07ca3 100644 --- a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java +++ b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java @@ -18,6 +18,7 @@ package org.apache.tika.io; import java.util.HashSet; import java.util.Locale; +import java.util.regex.Pattern; import org.apache.tika.utils.StringUtils; @@ -42,6 +43,7 @@ public class FilenameUtils { } } + private final static Pattern ASCII_NUMERIC = Pattern.compile("\\A\\.(?i)[a-z0-9]{1,5}\\Z"); /** * Scans the given file name for reserved characters on different OSs and @@ -110,7 +112,9 @@ public class FilenameUtils { } /** - * This includes the period, e.g. ".pdf" + * This includes the period, e.g. ".pdf". + * This requires that an extension contain only ascii alphanumerics + * and it requires that an extension length be 5 or less. * @param path * @return the suffix or an empty string if one could not be found */ @@ -119,7 +123,10 @@ public class FilenameUtils { int i = n.lastIndexOf("."); //arbitrarily sets max extension length if (i > -1 && n.length() - i < 6) { - return n.substring(i); + String suffix = n.substring(i); + if (ASCII_NUMERIC.matcher(suffix).matches()) { + return suffix; + } } return StringUtils.EMPTY; } diff --git a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java index 0cc869aa1..39f0ae757 100644 --- a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java +++ b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java @@ -23,6 +23,8 @@ import static org.junit.jupiter.api.Assertions.fail; import org.junit.jupiter.api.Test; +import org.apache.tika.utils.StringUtils; + public class FilenameUtilsTest { /** @@ -101,6 +103,12 @@ public class FilenameUtilsTest { testFilenameEquality("HW.txt", "_1457338542/HW.txt"); } + @Test + public void testExtension() throws Exception { + assertEquals(".pdf", FilenameUtils.getSuffixFromPath("blah/blah/or/something.pdf")); + assertEquals(StringUtils.EMPTY, FilenameUtils.getSuffixFromPath("blah \" blaoh .5\"")); + } + private void testFilenameEquality(String expected, String path) { assertEquals(expected, FilenameUtils.getName(path)); }
