This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch TIKA-4221-fix-unpack200
in repository https://gitbox.apache.org/repos/asf/tika.git
commit d6e1d684c2ae264bd2f6ff4d523fa0301bd57781 Author: tallison <[email protected]> AuthorDate: Wed Jun 3 13:02:22 2026 -0400 TIKA-4221 - tmp workaround for pack200 --- .../apache/tika/parser/pkg/CompressorParser.java | 48 ++++++++++++++++++++- .../tika/parser/pkg/CompressorParserTest.java | 22 ++++++++++ .../test/resources/test-documents/testPACK200.pack | Bin 0 -> 530 bytes 3 files changed, 69 insertions(+), 1 deletion(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java index 8535d304b8..ea4d8411c0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java @@ -34,6 +34,8 @@ import static org.apache.tika.detect.zip.CompressorConstants.ZSTD; import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE; import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; @@ -233,7 +235,30 @@ public class CompressorParser implements Parser { //trust that and go with the appropriate name //to avoid calling CompressorStreamFactory.detect() twice String name = getStreamName(metadata); - if (name != null) { + boolean pack200 = CompressorStreamFactory.PACK200.equals(name); + if (name == null) { + //No content-type hint: peek to see whether this is pack200 so we can route it + //through the workaround below. Anything else falls through to autodetect unchanged. 
+ pack200 = isPack200(tis); + } + if (pack200) { + // TIKA-4221 / COMPRESS-721 workaround: commons-compress' Pack200CompressorInputStream + // reflects into java.io internals (FilterInputStream.in / FileInputStream.path) to + // bound its input, which throws InaccessibleObjectException on Java 17+. A + // TikaInputStream is a FilterInputStream, so it triggers this. Spool to a file and + // reopen via Files.newInputStream (a ChannelInputStream) -- the one input type + // commons-compress does not reflect into. Pack200CompressorInputStream reads its + // input fully in the constructor (IN_MEMORY) and then serves bytes from an in-memory + // buffer, so the channel stream can be closed immediately afterward. Remove this once + // Tika depends on a commons-compress release that contains the COMPRESS-721 fix. + try (InputStream packStream = Files.newInputStream(tis.getPath())) { + cis = factory.createCompressorInputStream(CompressorStreamFactory.PACK200, + packStream); + } + if (name == null) { + metadata.set(CONTENT_TYPE, PACK.toString()); + } + } else if (name != null) { cis = factory.createCompressorInputStream(name, tis); } else { cis = factory.createCompressorInputStream(tis); @@ -248,6 +273,11 @@ public class CompressorParser implements Parser { throw new TikaMemoryLimitException(e.getMessage()); } throw new TikaException("Unable to uncompress document stream", e); + } catch (IOException e) { + //the pack200 workaround (getPath()/Files.newInputStream) can throw IOException; + //make sure the close shield is removed before propagating + tis.removeCloseShield(); + throw e; } @@ -328,6 +358,22 @@ public class CompressorParser implements Parser { return MIMES_TO_NAME.get(mimeString); } + /** + * Peeks at the stream signature to determine whether it is a pack200 archive, without + * consuming the stream. Used so pack200 can be routed through the COMPRESS-721 workaround in + * {@link #parse}. 
+ * + * @param tis the input, which must support mark/reset (a TikaInputStream always does) + * @return {@code true} if the signature matches pack200 + */ + private static boolean isPack200(TikaInputStream tis) { + try { + return CompressorStreamFactory.PACK200.equals(CompressorStreamFactory.detect(tis)); + } catch (CompressorException e) { + return false; + } + } + public Config getDefaultConfig() { return defaultConfig; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java index 58dcb0a0d6..f3def60901 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java @@ -17,11 +17,15 @@ package org.apache.tika.parser.pkg; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; import static org.junit.jupiter.api.Assumptions.assumeTrue; import java.util.HashSet; +import java.util.List; import java.util.Set; import org.apache.commons.compress.compressors.CompressorStreamFactory; @@ -31,6 +35,8 @@ import org.junit.jupiter.api.Test; import org.apache.tika.TikaTest; import org.apache.tika.detect.zip.CompressorConstants; import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import 
org.apache.tika.parser.ParseContext; @@ -71,6 +77,22 @@ public class CompressorParserTest extends TikaTest { } } + @Test + public void testPack200() throws Exception { + //TIKA-4221: commons-compress' Pack200CompressorInputStream throws an + //InaccessibleObjectException on Java 17+ when handed a FilterInputStream or a + //FileInputStream (a TikaInputStream is a FilterInputStream). CompressorParser must route + //pack200 through the spool-to-file workaround so it unpacks cleanly. + //testPACK200.pack is borrowed from Apache Commons Compress (HelloWorld.pack). + List<Metadata> metadataList = getRecursiveMetadata("testPACK200.pack"); + assertEquals("application/x-java-pack200", metadataList.get(0).get(Metadata.CONTENT_TYPE)); + assertNull(metadataList.get(0).get(TikaCoreProperties.CONTAINER_EXCEPTION), + "pack200 should unpack without an exception"); + //the pack200 archive must have been unpacked into at least one embedded document + assertTrue(metadataList.size() > 1, + "pack200 should have been unpacked into embedded content"); + } + @Test public void testQuineXHTML() throws Exception { //Anti-virus can surreptitiously remove this file diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/testPACK200.pack b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/testPACK200.pack new file mode 100644 index 0000000000..7445d85a49 Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/testPACK200.pack differ
