This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch improve-isatab in repository https://gitbox.apache.org/repos/asf/tika.git
commit d2d8ad228269516674e9eb5f6a4a01c87d0ad4f0 Author: tallison <[email protected]> AuthorDate: Fri Jun 5 11:43:10 2026 -0400 improve isatab parsing --- .../java/org/apache/tika/io/FilenameUtils.java | 35 +++++++++++++++++++ .../java/org/apache/tika/io/FilenameUtilsTest.java | 31 +++++++++++++++++ .../apache/tika/parser/isatab/ISArchiveParser.java | 11 ++++-- .../tika/parser/isatab/ISArchiveParserTest.java | 39 ++++++++++++++++++++++ 4 files changed, 113 insertions(+), 3 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java index 777f8482fb..9e764ad36b 100644 --- a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java +++ b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java @@ -16,6 +16,8 @@ */ package org.apache.tika.io; +import java.io.IOException; +import java.nio.file.Path; import java.util.HashSet; import java.util.Locale; import java.util.regex.Matcher; @@ -268,6 +270,39 @@ public class FilenameUtils { return retPath; } + /** + * Resolves {@code name} against {@code dir} and returns the result only if it stays + * within {@code dir}. + * <p> + * The result is {@link Path#normalize() normalized} before it is checked. This is the + * load-bearing step: without it, resolving a name such as {@code ../x} yields the literal + * path {@code dir/../x}, whose path elements still begin with {@code dir}, so the + * {@link Path#startsWith(Path)} check below would wrongly accept it. Normalizing first + * collapses the {@code ..} so the check sees the real location. + * <p> + * Containment is tested with {@link Path#startsWith(Path)} (element by element) rather + * than by comparing path strings, so a sibling whose name merely shares a textual prefix + * with {@code dir} (for example {@code /a/bc} against {@code /a/b}) is correctly treated + * as being outside {@code dir}. + * + * @param dir the directory the resolved path must stay within + * @param name the child name to resolve against {@code dir}; may contain relative + * segments such as {@code ..} + * @return the resolved, normalized path, guaranteed to start with the normalized + * {@code dir} + * @throws IOException if {@code name} resolves to a location outside {@code dir} + */ + public static Path resolveWithin(Path dir, String name) throws IOException { + Path normalizedDir = dir.normalize(); + Path resolved = normalizedDir.resolve(name).normalize(); + if (!resolved.startsWith(normalizedDir)) { + throw new IOException( + "'" + name + "' resolves to '" + resolved + "', which is outside of '" + + normalizedDir + "'"); + } + return resolved; + } + private static int getPrefixLength(String path) { int prefixLength = org.apache.commons.io.FilenameUtils.getPrefixLength(path); if (prefixLength > 0) { diff --git a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java index 0984f4fea0..4a0a00dca7 100644 --- a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java +++ b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java @@ -18,9 +18,14 @@ package org.apache.tika.io; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; +import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.Paths; + import org.junit.jupiter.api.Test; import org.apache.tika.metadata.Metadata; @@ -105,6 +110,32 @@ public class FilenameUtilsTest { testFilenameEquality("HW.txt", "_1457338542/HW.txt"); } + @Test + public void testResolveWithin() throws Exception { + Path dir = Paths.get("base", "isa"); + + // plain and nested children resolve to a path under dir + assertEquals(Paths.get("base", "isa", "a_assay.txt"), + FilenameUtils.resolveWithin(dir, "a_assay.txt")); + assertEquals(Paths.get("base", "isa", "sub", "a_assay.txt"), + FilenameUtils.resolveWithin(dir, "sub/a_assay.txt")); + + // resolving the directory itself stays within it + assertEquals(dir.normalize(), FilenameUtils.resolveWithin(dir, ".")); + + // names that resolve out of dir are rejected. These also pin the normalize() call: + // without it the resolved path would still textually begin with dir and be accepted. + assertThrows(IOException.class, () -> FilenameUtils.resolveWithin(dir, "../outside.txt")); + assertThrows(IOException.class, () -> FilenameUtils.resolveWithin(dir, "../../outside.txt")); + assertThrows(IOException.class, + () -> FilenameUtils.resolveWithin(dir, "sub/../../outside.txt")); + + // element-wise (not string-prefix) containment: a/bc is not within a/b even though + // the string "a/bc" starts with the string "a/b". + assertThrows(IOException.class, + () -> FilenameUtils.resolveWithin(Paths.get("a", "b"), "../bc")); + } + @Test public void testExtension() throws Exception { assertEquals(".pdf", FilenameUtils.getSuffixFromPath("blah/blah/or/something.pdf")); diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java index 85baa72d03..b7bce35af2 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java @@ -19,6 +19,7 @@ package org.apache.tika.parser.isatab; import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.nio.file.Path; import java.util.Collections; import java.util.Set; @@ -27,6 +28,7 @@ import org.xml.sax.SAXException; import org.apache.tika.config.TikaComponent; import org.apache.tika.exception.TikaException; +import org.apache.tika.io.FilenameUtils; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -134,12 +136,15 @@ public class ISArchiveParser implements Parser { private void parseAssay(XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { + // location starts with "/C:" on windows, so build the directory Path from a File + // rather than Paths.get(). The assay file names come from the investigation file and + // are resolved within this directory. + Path locationDir = new File(this.location).toPath(); for (String assayFileName : metadata.getValues(studyAssayFileNameField)) { xhtml.startElement("div"); xhtml.element("h3", "ASSAY " + assayFileName); - // location starts with "/C:" on windows, can't use Paths.get() - try (InputStream stream = TikaInputStream.get(new File(this.location + assayFileName).toPath())) - { + Path assayFile = FilenameUtils.resolveWithin(locationDir, assayFileName); + try (InputStream stream = TikaInputStream.get(assayFile)) { ISATabUtils.parseAssay(stream, xhtml, metadata, context); } xhtml.endElement("div"); diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java index 0ca0325902..cec574f879 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java @@ -17,8 +17,16 @@ package org.apache.tika.parser.isatab; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; import org.xml.sax.ContentHandler; import org.apache.tika.TikaTest; @@ -68,4 +76,35 @@ public class ISArchiveParserTest extends TikaTest { assertEquals("Stephen", metadata.get("Investigation Person First Name"), "Invalid Investigation Person First Name"); } + + @Test + public void testAssayFileNameResolvedWithinDirectory(@TempDir Path root) throws Exception { + // A file sitting next to (but outside of) the ISA-Tab directory. + Path outside = root.resolve("outside.txt"); + Files.write(outside, "OUTSIDE_DIRECTORY_CONTENT".getBytes(StandardCharsets.UTF_8)); + + // An ISA-Tab directory whose investigation file points an assay at the sibling file + // using a relative name. + Path isaDir = Files.createDirectory(root.resolve("isa")); + Files.write(isaDir.resolve("i_test.txt"), + ("STUDY\n" + + "Study File Name\t\"s_test.txt\"\n" + + "Study Assay File Name\t\"../outside.txt\"\n") + .getBytes(StandardCharsets.UTF_8)); + Path study = isaDir.resolve("s_test.txt"); + Files.write(study, "\"Source Name\"\n\"culture1\"\n".getBytes(StandardCharsets.UTF_8)); + + Parser parser = new ISArchiveParser(isaDir.toString()); + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "s_test.txt"); + ParseContext context = new ParseContext(); + + try (TikaInputStream tis = TikaInputStream.get(study)) { + assertThrows(IOException.class, + () -> parser.parse(tis, handler, metadata, context)); + } + assertFalse(handler.toString().contains("OUTSIDE_DIRECTORY_CONTENT"), + "assay reader read a file outside the ISA-Tab directory"); + } }
