This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 67ada354f0 improve isatab parsing (#2875)
67ada354f0 is described below

commit 67ada354f0eec3bada1b732ffff4fd23a0d8b235
Author: Tim Allison <[email protected]>
AuthorDate: Fri Jun 5 12:50:50 2026 -0400

    improve isatab parsing (#2875)
    
    Co-authored-by: Copilot Autofix powered by AI 
<[email protected]>
---
 .../java/org/apache/tika/io/FilenameUtils.java     | 47 ++++++++++++++++++++++
 .../java/org/apache/tika/io/FilenameUtilsTest.java | 31 ++++++++++++++
 .../apache/tika/parser/isatab/ISArchiveParser.java | 11 +++--
 .../tika/parser/isatab/ISArchiveParserTest.java    | 39 ++++++++++++++++++
 4 files changed, 125 insertions(+), 3 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java 
b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
index 777f8482fb..db299ef0d8 100644
--- a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
@@ -16,6 +16,8 @@
  */
 package org.apache.tika.io;
 
+import java.io.IOException;
+import java.nio.file.Path;
 import java.util.HashSet;
 import java.util.Locale;
 import java.util.regex.Matcher;
@@ -268,6 +270,51 @@ public class FilenameUtils {
         return retPath;
     }
 
+    /**
+     * Resolves {@code name} against {@code dir} and returns the result only 
if it stays
+     * within {@code dir}.
+     * <p>
+     * The result is {@link Path#normalize() normalized} before it is checked. 
This is the
+     * load-bearing step: without it, resolving a name such as {@code ../x} 
yields the literal
+     * path {@code dir/../x}, whose path elements still begin with {@code 
dir}, so the
+     * {@link Path#startsWith(Path)} check below would wrongly accept it. 
Normalizing first
+     * collapses the {@code ..} so the check sees the real location.
+     * <p>
+     * Containment is tested with {@link Path#startsWith(Path)} (element by 
element) rather
+     * than by comparing path strings, so a sibling whose name merely shares a 
textual prefix
+     * with {@code dir} (for example {@code /a/bc} against {@code /a/b}) is 
correctly treated
+     * as being outside {@code dir}.
+     *
+     * @param dir  the directory the resolved path must stay within
+     * @param name the child name to resolve against {@code dir}; may contain 
relative
+     *             segments such as {@code ..}
+     * @return the resolved, normalized path, guaranteed to start with the 
normalized
+     *         {@code dir}
+     * @throws IOException if {@code name} resolves to a location outside 
{@code dir}
+     */
+    public static Path resolveWithin(Path dir, String name) throws IOException 
{
+        Path normalizedDir = dir.normalize();
+        Path resolved = normalizedDir.resolve(name).normalize();
+        if (!resolved.startsWith(normalizedDir)) {
+            throw new IOException(
+                    "'" + name + "' resolves to '" + resolved + "', which is 
outside of '" +
+                            normalizedDir + "'");
+        }
+
+        // Defense in depth against symlink traversal (only possible if the 
paths exist).
+        if (java.nio.file.Files.exists(resolved) && 
java.nio.file.Files.exists(normalizedDir)) {
+            Path realDir = normalizedDir.toRealPath();
+            Path realResolved = resolved.toRealPath();
+            if (!realResolved.startsWith(realDir)) {
+                throw new IOException(
+                        "'" + name + "' resolves to '" + realResolved + "', 
which is outside of '" +
+                                realDir + "'");
+            }
+        }
+
+        return resolved;
+    }
+
     private static int getPrefixLength(String path) {
         int prefixLength = 
org.apache.commons.io.FilenameUtils.getPrefixLength(path);
         if (prefixLength > 0) {
diff --git a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java 
b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
index 0984f4fea0..4a0a00dca7 100644
--- a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
@@ -18,9 +18,14 @@ package org.apache.tika.io;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.junit.jupiter.api.Assertions.fail;
 
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
 import org.junit.jupiter.api.Test;
 
 import org.apache.tika.metadata.Metadata;
@@ -105,6 +110,32 @@ public class FilenameUtilsTest {
         testFilenameEquality("HW.txt", "_1457338542/HW.txt");
     }
 
+    @Test
+    public void testResolveWithin() throws Exception {
+        Path dir = Paths.get("base", "isa");
+
+        // plain and nested children resolve to a path under dir
+        assertEquals(Paths.get("base", "isa", "a_assay.txt"),
+                FilenameUtils.resolveWithin(dir, "a_assay.txt"));
+        assertEquals(Paths.get("base", "isa", "sub", "a_assay.txt"),
+                FilenameUtils.resolveWithin(dir, "sub/a_assay.txt"));
+
+        // resolving the directory itself stays within it
+        assertEquals(dir.normalize(), FilenameUtils.resolveWithin(dir, "."));
+
+        // names that resolve out of dir are rejected. These also pin the 
normalize() call:
+        // without it the resolved path would still textually begin with dir 
and be accepted.
+        assertThrows(IOException.class, () -> FilenameUtils.resolveWithin(dir, 
"../outside.txt"));
+        assertThrows(IOException.class, () -> FilenameUtils.resolveWithin(dir, 
"../../outside.txt"));
+        assertThrows(IOException.class,
+                () -> FilenameUtils.resolveWithin(dir, 
"sub/../../outside.txt"));
+
+        // element-wise (not string-prefix) containment: a/bc is not within 
a/b even though
+        // the string "a/bc" starts with the string "a/b".
+        assertThrows(IOException.class,
+                () -> FilenameUtils.resolveWithin(Paths.get("a", "b"), 
"../bc"));
+    }
+
     @Test
     public void testExtension() throws Exception {
         assertEquals(".pdf", 
FilenameUtils.getSuffixFromPath("blah/blah/or/something.pdf"));
diff --git 
a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java
 
b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java
index 85baa72d03..b7bce35af2 100644
--- 
a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java
+++ 
b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.isatab;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.file.Path;
 import java.util.Collections;
 import java.util.Set;
 
@@ -27,6 +28,7 @@ import org.xml.sax.SAXException;
 
 import org.apache.tika.config.TikaComponent;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.FilenameUtils;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -134,12 +136,15 @@ public class ISArchiveParser implements Parser {
 
     private void parseAssay(XHTMLContentHandler xhtml, Metadata metadata, 
ParseContext context)
             throws IOException, SAXException, TikaException {
+        // location starts with "/C:" on windows, so build the directory Path 
from a File
+        // rather than Paths.get(). The assay file names come from the 
investigation file and
+        // are resolved within this directory.
+        Path locationDir = new File(this.location).toPath();
         for (String assayFileName : 
metadata.getValues(studyAssayFileNameField)) {
             xhtml.startElement("div");
             xhtml.element("h3", "ASSAY " + assayFileName);
-            // location starts with "/C:" on windows, can't use Paths.get()
-            try (InputStream stream = TikaInputStream.get(new 
File(this.location + assayFileName).toPath()))
-            {
+            Path assayFile = FilenameUtils.resolveWithin(locationDir, 
assayFileName);
+            try (InputStream stream = TikaInputStream.get(assayFile)) {
                 ISATabUtils.parseAssay(stream, xhtml, metadata, context);
             }
             xhtml.endElement("div");
diff --git 
a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java
 
b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java
index 0ca0325902..cec574f879 100644
--- 
a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java
+++ 
b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java
@@ -17,8 +17,16 @@
 package org.apache.tika.parser.isatab;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
 
 import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
 import org.xml.sax.ContentHandler;
 
 import org.apache.tika.TikaTest;
@@ -68,4 +76,35 @@ public class ISArchiveParserTest extends TikaTest {
         assertEquals("Stephen", metadata.get("Investigation Person First 
Name"),
                 "Invalid Investigation Person First Name");
     }
+
+    @Test
+    public void testAssayFileNameResolvedWithinDirectory(@TempDir Path root) 
throws Exception {
+        // A file sitting next to (but outside of) the ISA-Tab directory.
+        Path outside = root.resolve("outside.txt");
+        Files.write(outside, 
"OUTSIDE_DIRECTORY_CONTENT".getBytes(StandardCharsets.UTF_8));
+
+        // An ISA-Tab directory whose investigation file points an assay at 
the sibling file
+        // using a relative name.
+        Path isaDir = Files.createDirectory(root.resolve("isa"));
+        Files.write(isaDir.resolve("i_test.txt"),
+                ("STUDY\n"
+                        + "Study File Name\t\"s_test.txt\"\n"
+                        + "Study Assay File Name\t\"../outside.txt\"\n")
+                        .getBytes(StandardCharsets.UTF_8));
+        Path study = isaDir.resolve("s_test.txt");
+        Files.write(study, "\"Source 
Name\"\n\"culture1\"\n".getBytes(StandardCharsets.UTF_8));
+
+        Parser parser = new ISArchiveParser(isaDir.toString());
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "s_test.txt");
+        ParseContext context = new ParseContext();
+
+        try (TikaInputStream tis = TikaInputStream.get(study)) {
+            assertThrows(IOException.class,
+                    () -> parser.parse(tis, handler, metadata, context));
+        }
+        assertFalse(handler.toString().contains("OUTSIDE_DIRECTORY_CONTENT"),
+                "assay reader read a file outside the ISA-Tab directory");
+    }
 }

Reply via email to