This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 67ada354f0 improve isatab parsing (#2875)
67ada354f0 is described below
commit 67ada354f0eec3bada1b732ffff4fd23a0d8b235
Author: Tim Allison <[email protected]>
AuthorDate: Fri Jun 5 12:50:50 2026 -0400
improve isatab parsing (#2875)
Co-authored-by: Copilot Autofix powered by AI
<[email protected]>
---
.../java/org/apache/tika/io/FilenameUtils.java | 47 ++++++++++++++++++++++
.../java/org/apache/tika/io/FilenameUtilsTest.java | 31 ++++++++++++++
.../apache/tika/parser/isatab/ISArchiveParser.java | 11 +++--
.../tika/parser/isatab/ISArchiveParserTest.java | 39 ++++++++++++++++++
4 files changed, 125 insertions(+), 3 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
index 777f8482fb..db299ef0d8 100644
--- a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
@@ -16,6 +16,8 @@
*/
package org.apache.tika.io;
+import java.io.IOException;
+import java.nio.file.Path;
import java.util.HashSet;
import java.util.Locale;
import java.util.regex.Matcher;
@@ -268,6 +270,51 @@ public class FilenameUtils {
return retPath;
}
+ /**
+ * Resolves {@code name} against {@code dir} and returns the result only
if it stays
+ * within {@code dir}.
+ * <p>
+ * The result is {@link Path#normalize() normalized} before it is checked.
This is the
+ * load-bearing step: without it, resolving a name such as {@code ../x}
yields the literal
+ * path {@code dir/../x}, whose path elements still begin with {@code
dir}, so the
+ * {@link Path#startsWith(Path)} check below would wrongly accept it.
Normalizing first
+ * collapses the {@code ..} so the check sees the real location.
+ * <p>
+ * Containment is tested with {@link Path#startsWith(Path)} (element by
element) rather
+ * than by comparing path strings, so a sibling whose name merely shares a
textual prefix
+ * with {@code dir} (for example {@code /a/bc} against {@code /a/b}) is
correctly treated
+ * as being outside {@code dir}.
+ *
+ * @param dir the directory the resolved path must stay within
+ * @param name the child name to resolve against {@code dir}; may contain
relative
+ * segments such as {@code ..}
+ * @return the resolved, normalized path, guaranteed to start with the
normalized
+ * {@code dir}
+ * @throws IOException if {@code name} resolves to a location outside
{@code dir}
+ */
+ public static Path resolveWithin(Path dir, String name) throws IOException
{
+ Path normalizedDir = dir.normalize();
+ Path resolved = normalizedDir.resolve(name).normalize();
+ if (!resolved.startsWith(normalizedDir)) {
+ throw new IOException(
+ "'" + name + "' resolves to '" + resolved + "', which is
outside of '" +
+ normalizedDir + "'");
+ }
+
+ // Defense in depth against symlink traversal (only possible if the
paths exist).
+ if (java.nio.file.Files.exists(resolved) &&
java.nio.file.Files.exists(normalizedDir)) {
+ Path realDir = normalizedDir.toRealPath();
+ Path realResolved = resolved.toRealPath();
+ if (!realResolved.startsWith(realDir)) {
+ throw new IOException(
+ "'" + name + "' resolves to '" + realResolved + "',
which is outside of '" +
+ realDir + "'");
+ }
+ }
+
+ return resolved;
+ }
+
private static int getPrefixLength(String path) {
int prefixLength =
org.apache.commons.io.FilenameUtils.getPrefixLength(path);
if (prefixLength > 0) {
diff --git a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
index 0984f4fea0..4a0a00dca7 100644
--- a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
@@ -18,9 +18,14 @@ package org.apache.tika.io;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
import org.junit.jupiter.api.Test;
import org.apache.tika.metadata.Metadata;
@@ -105,6 +110,32 @@ public class FilenameUtilsTest {
testFilenameEquality("HW.txt", "_1457338542/HW.txt");
}
+ @Test
+ public void testResolveWithin() throws Exception {
+ Path dir = Paths.get("base", "isa");
+
+ // plain and nested children resolve to a path under dir
+ assertEquals(Paths.get("base", "isa", "a_assay.txt"),
+ FilenameUtils.resolveWithin(dir, "a_assay.txt"));
+ assertEquals(Paths.get("base", "isa", "sub", "a_assay.txt"),
+ FilenameUtils.resolveWithin(dir, "sub/a_assay.txt"));
+
+ // resolving the directory itself stays within it
+ assertEquals(dir.normalize(), FilenameUtils.resolveWithin(dir, "."));
+
+ // names that resolve out of dir are rejected. These also pin the
normalize() call:
+ // without it the resolved path would still textually begin with dir
and be accepted.
+ assertThrows(IOException.class, () -> FilenameUtils.resolveWithin(dir,
"../outside.txt"));
+ assertThrows(IOException.class, () -> FilenameUtils.resolveWithin(dir,
"../../outside.txt"));
+ assertThrows(IOException.class,
+ () -> FilenameUtils.resolveWithin(dir,
"sub/../../outside.txt"));
+
+ // element-wise (not string-prefix) containment: a/bc is not within
a/b even though
+ // the string "a/bc" starts with the string "a/b".
+ assertThrows(IOException.class,
+ () -> FilenameUtils.resolveWithin(Paths.get("a", "b"),
"../bc"));
+ }
+
@Test
public void testExtension() throws Exception {
assertEquals(".pdf",
FilenameUtils.getSuffixFromPath("blah/blah/or/something.pdf"));
diff --git
a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java
b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java
index 85baa72d03..b7bce35af2 100644
---
a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java
+++
b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.isatab;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
+import java.nio.file.Path;
import java.util.Collections;
import java.util.Set;
@@ -27,6 +28,7 @@ import org.xml.sax.SAXException;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.FilenameUtils;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -134,12 +136,15 @@ public class ISArchiveParser implements Parser {
private void parseAssay(XHTMLContentHandler xhtml, Metadata metadata,
ParseContext context)
throws IOException, SAXException, TikaException {
+ // location starts with "/C:" on windows, so build the directory Path
from a File
+ // rather than Paths.get(). The assay file names come from the
investigation file and
+ // are resolved within this directory.
+ Path locationDir = new File(this.location).toPath();
for (String assayFileName :
metadata.getValues(studyAssayFileNameField)) {
xhtml.startElement("div");
xhtml.element("h3", "ASSAY " + assayFileName);
- // location starts with "/C:" on windows, can't use Paths.get()
- try (InputStream stream = TikaInputStream.get(new
File(this.location + assayFileName).toPath()))
- {
+ Path assayFile = FilenameUtils.resolveWithin(locationDir,
assayFileName);
+ try (InputStream stream = TikaInputStream.get(assayFile)) {
ISATabUtils.parseAssay(stream, xhtml, metadata, context);
}
xhtml.endElement("div");
diff --git
a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java
b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java
index 0ca0325902..cec574f879 100644
---
a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java
+++
b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java
@@ -17,8 +17,16 @@
package org.apache.tika.parser.isatab;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
import org.xml.sax.ContentHandler;
import org.apache.tika.TikaTest;
@@ -68,4 +76,35 @@ public class ISArchiveParserTest extends TikaTest {
assertEquals("Stephen", metadata.get("Investigation Person First
Name"),
"Invalid Investigation Person First Name");
}
+
+ @Test
+ public void testAssayFileNameResolvedWithinDirectory(@TempDir Path root)
throws Exception {
+ // A file sitting next to (but outside of) the ISA-Tab directory.
+ Path outside = root.resolve("outside.txt");
+ Files.write(outside,
"OUTSIDE_DIRECTORY_CONTENT".getBytes(StandardCharsets.UTF_8));
+
+ // An ISA-Tab directory whose investigation file points an assay at
the sibling file
+ // using a relative name.
+ Path isaDir = Files.createDirectory(root.resolve("isa"));
+ Files.write(isaDir.resolve("i_test.txt"),
+ ("STUDY\n"
+ + "Study File Name\t\"s_test.txt\"\n"
+ + "Study Assay File Name\t\"../outside.txt\"\n")
+ .getBytes(StandardCharsets.UTF_8));
+ Path study = isaDir.resolve("s_test.txt");
+ Files.write(study, "\"Source
Name\"\n\"culture1\"\n".getBytes(StandardCharsets.UTF_8));
+
+ Parser parser = new ISArchiveParser(isaDir.toString());
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "s_test.txt");
+ ParseContext context = new ParseContext();
+
+ try (TikaInputStream tis = TikaInputStream.get(study)) {
+ assertThrows(IOException.class,
+ () -> parser.parse(tis, handler, metadata, context));
+ }
+ assertFalse(handler.toString().contains("OUTSIDE_DIRECTORY_CONTENT"),
+ "assay reader read a file outside the ISA-Tab directory");
+ }
}