This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4732 in repository https://gitbox.apache.org/repos/asf/tika.git
commit c0de3596e2250c4b756f0d1aab83cbfb451dbe55 Author: Lawrence Moorehead <[email protected]> AuthorDate: Fri May 15 11:47:08 2026 -0400 Removing duplicate file from unpack/all endpoint. --- .../apache/tika/pipes/core/server/PipesWorker.java | 76 ------- .../tika/pipes/core/FrictionlessUnpackTest.java | 66 +++--- .../tika/server/standard/UnpackerResourceTest.java | 248 +++++++++++++++++++++ 3 files changed, 284 insertions(+), 106 deletions(-) diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java index a76defc641..fb7553fee0 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java @@ -489,72 +489,6 @@ class PipesWorker implements Callable<PipesResult> { mapper.writeValue(os, metadataMap); } - /** - * Stores the original document to the temp handler for inclusion in the zip. - * Uses TikaInputStream's internal file caching to avoid consuming the stream. - */ - private void storeOriginalDocument(TikaInputStream tis, TempFileUnpackHandler tempHandler) - throws IOException { - String fileName = getFileNameFromFetchKey(); - - // TikaInputStream caches to a temp file internally - get that file - Path originalPath = tis.getPath(); - if (originalPath != null && Files.exists(originalPath)) { - // Copy from the cached file - try (InputStream is = Files.newInputStream(originalPath)) { - tempHandler.storeOriginalDocument(is, fileName); - } - } else { - // Stream hasn't been cached yet - we need to read and reset - tis.mark(Integer.MAX_VALUE); - try { - tempHandler.storeOriginalDocument(tis, fileName); - } finally { - tis.reset(); - } - } - } - - /** - * Stores the original document to the frictionless handler for inclusion in output. - * Uses TikaInputStream's internal file caching to avoid consuming the stream. - */ - private void storeOriginalDocumentForFrictionless(TikaInputStream tis, - FrictionlessUnpackHandler frictionlessHandler) - throws IOException { - String fileName = getFileNameFromFetchKey(); - - // TikaInputStream caches to a temp file internally - get that file - Path originalPath = tis.getPath(); - if (originalPath != null && Files.exists(originalPath)) { - // Copy from the cached file - try (InputStream is = Files.newInputStream(originalPath)) { - frictionlessHandler.storeOriginalDocument(is, fileName); - } - } else { - // Stream hasn't been cached yet - we need to read and reset - tis.mark(Integer.MAX_VALUE); - try { - frictionlessHandler.storeOriginalDocument(tis, fileName); - } finally { - tis.reset(); - } - } - } - - /** - * Extracts the file name from the fetch key. - */ - private String getFileNameFromFetchKey() { - String fetchKey = fetchEmitTuple.getFetchKey().getFetchKey(); - String fileName = fetchKey; - int lastSlash = Math.max(fetchKey.lastIndexOf('/'), fetchKey.lastIndexOf('\\')); - if (lastSlash >= 0 && lastSlash < fetchKey.length() - 1) { - fileName = fetchKey.substring(lastSlash + 1); - } - return fileName; - } - protected ParseDataOrPipesResult parseFromTuple() throws TikaException, InterruptedException { //start a new metadata object to gather info from the fetch process //we want to isolate and not touch the metadata sent into the fetchEmitTuple @@ -575,16 +509,6 @@ class PipesWorker implements Callable<PipesResult> { } try (TikaInputStream tis = tisOrResult.tis()) { - // Store original document for zipping/frictionless if requested - UnpackHandler handler = localContext.get(UnpackHandler.class); - UnpackConfig uc = localContext.get(UnpackConfig.class); - if (uc != null && uc.isIncludeOriginal()) { - if (handler instanceof FrictionlessUnpackHandler frictionlessHandler) { - storeOriginalDocumentForFrictionless(tis, frictionlessHandler); - } else if (handler instanceof TempFileUnpackHandler tempHandler) { - storeOriginalDocument(tis, tempHandler); - } - } return parseHandler.parseWithStream(fetchEmitTuple, tis, metadata, localContext); } catch (SecurityException e) { LOG.error("security exception id={}", fetchEmitTuple.getId(), e); diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/FrictionlessUnpackTest.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/FrictionlessUnpackTest.java index deaf5e3561..14a05aad70 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/FrictionlessUnpackTest.java +++ b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/FrictionlessUnpackTest.java @@ -635,28 +635,30 @@ public class FrictionlessUnpackTest { @Test public void testFrictionlessWithIncludeOriginal(@TempDir Path tmp) throws Exception { - // Test that includeOriginal works with Frictionless format + // includeOriginal=true causes the container to appear in the Frictionless + // package as "unpacked/0.<ext>" (added by ParseHandler._preParse via + // unpackHandler.add(0, ...)) and to be listed once in datapackage.json. Path outputDir = tmp.resolve("output"); Files.createDirectories(outputDir); try (PipesClient pipesClient = init(tmp, TEST_DOC_WITH_EMBEDDED)) { ParseContext parseContext = new ParseContext(); parseContext.set(ParseMode.class, ParseMode.UNPACK); - + UnpackConfig unpackConfig = new UnpackConfig(); unpackConfig.setEmitter(EMITTER_NAME); unpackConfig.setOutputFormat(UnpackConfig.OUTPUT_FORMAT.FRICTIONLESS); unpackConfig.setOutputMode(UnpackConfig.OUTPUT_MODE.ZIPPED); - unpackConfig.setIncludeOriginal(true); // Include container document + unpackConfig.setIncludeOriginal(true); parseContext.set(UnpackConfig.class, unpackConfig); - + PipesResult pipesResult = pipesClient.process( new FetchEmitTuple(TEST_DOC_WITH_EMBEDDED, new FetchKey(FETCHER_NAME, TEST_DOC_WITH_EMBEDDED), new EmitKey(EMITTER_NAME, TEST_DOC_WITH_EMBEDDED), new Metadata(), parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT)); - + assertTrue(pipesResult.isSuccess(), "Frictionless with includeOriginal should succeed"); } @@ -664,39 +666,43 @@ public class FrictionlessUnpackTest { List<Path> zipFiles = Files.list(outputDir) .filter(p -> p.toString().endsWith("-frictionless.zip")) .toList(); + assertEquals(1, zipFiles.size(), "Should create exactly one frictionless zip"); try (ZipFile zip = new ZipFile(zipFiles.get(0).toFile())) { - // Original file should be at root level or in a specific location - boolean hasOriginal = false; + // The container itself should be present as the id-0 entry under + // unpacked/. Match "unpacked/0" exactly or "unpacked/0.<ext>" + // (whichever the active SUFFIX_STRATEGY produces). + Set<String> allEntries = new HashSet<>(); Enumeration<? extends ZipEntry> entries = zip.entries(); while (entries.hasMoreElements()) { - ZipEntry entry = entries.nextElement(); - // Original could be at root or documented location - if (entry.getName().contains(TEST_DOC_WITH_EMBEDDED) || - entry.getName().equals("original/" + TEST_DOC_WITH_EMBEDDED)) { - hasOriginal = true; - break; - } + allEntries.add(entries.nextElement().getName()); } - - // Also check datapackage.json for original in resources + boolean hasContainerAsId0 = allEntries.stream() + .anyMatch(n -> n.equals("unpacked/0") || n.startsWith("unpacked/0.")); + assertTrue(hasContainerAsId0, + "With includeOriginal=true, the container should appear as the " + + "unpacked/0 entry. Entries: " + allEntries); + + // And the manifest's resources should list the container at unpacked/0; + // no resource path should escape the unpacked/ prefix (no separate + // root-level "original" entry should exist). ZipEntry dpEntry = zip.getEntry("datapackage.json"); - if (dpEntry != null) { - JsonNode dataPackage; - try (InputStream is = zip.getInputStream(dpEntry)) { - dataPackage = OBJECT_MAPPER.readTree(is); - } - for (JsonNode resource : dataPackage.get("resources")) { - String path = resource.get("path").asText(); - if (!path.startsWith("unpacked/")) { - hasOriginal = true; - break; - } + assertNotNull(dpEntry, "datapackage.json should be present"); + JsonNode dataPackage; + try (InputStream is = zip.getInputStream(dpEntry)) { + dataPackage = OBJECT_MAPPER.readTree(is); + } + boolean manifestListsContainer = false; + for (JsonNode resource : dataPackage.get("resources")) { + String path = resource.get("path").asText(); + assertTrue(path.startsWith("unpacked/"), + "Manifest resources should only list unpacked/ paths; got " + path); + if (path.equals("unpacked/0") || path.startsWith("unpacked/0.")) { + manifestListsContainer = true; } } - - assertTrue(hasOriginal, - "With includeOriginal=true, original document should be in package"); + assertTrue(manifestListsContainer, + "Manifest should list the container at unpacked/0"); } } diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java index 21443d5795..ae200959b0 100644 --- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java +++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java @@ -33,8 +33,10 @@ import java.nio.file.Paths; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import javax.imageio.ImageIO; import com.fasterxml.jackson.databind.JsonNode; @@ -494,6 +496,252 @@ public class UnpackerResourceTest extends CXFTestBase { assertTrue(hasZeroPaddedName, "Should have zero-padded file names (e.g., 0000.jpeg)"); } + /** + * The datapackage.json "resources" array is the manifest for the Frictionless + * package. Verifies it lists exactly the data files present in the zip, with + * no missing or extraneous entries. The package envelope (datapackage.json + * and the optional metadata.json) is not itself a "resource" and is excluded + * from the comparison. + */ + @Test + public void testFrictionlessDataPackageMatchesArchiveContents() throws Exception { + String configJson = """ + { + "parse-context": { + "unpack-config": { + "outputFormat": "FRICTIONLESS", + "outputMode": "ZIPPED", + "includeFullMetadata": true, + "includeOriginal": true + } + } + } + """; + ContentDisposition fileCd = new ContentDisposition("form-data; name=\"file\"; filename=\"Doc1_ole.doc\""); + Attachment fileAtt = new Attachment("file", + ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd); + Attachment configAtt = new Attachment("config", "application/json", + new ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8))); + + Response response = WebClient + .create(endPoint + ALL_PATH) + .type("multipart/form-data") + .accept("application/zip") + .post(new MultipartBody(Arrays.asList(fileAtt, configAtt))); + + assertEquals(200, response.getStatus()); + Map<String, byte[]> data = readZipArchiveBytes((InputStream) response.getEntity()); + + byte[] dpBytes = data.get("datapackage.json"); + assertNotNull(dpBytes, "datapackage.json should be present"); + + JsonNode dataPackage = MAPPER.readTree(dpBytes); + JsonNode resources = dataPackage.get("resources"); + assertNotNull(resources, "datapackage.json should have a 'resources' array"); + assertTrue(resources.isArray() && resources.size() > 0, + "resources array should be non-empty"); + + Set<String> manifestPaths = new HashSet<>(); + for (JsonNode resource : resources) { + manifestPaths.add(resource.get("path").asText()); + } + + Set<String> archiveDataFiles = new HashSet<>(data.keySet()); + archiveDataFiles.remove("datapackage.json"); + archiveDataFiles.remove("metadata.json"); + + assertEquals(archiveDataFiles, manifestPaths, + "datapackage.json 'resources' must list exactly the data files in the zip. " + + "Only-in-manifest: " + difference(manifestPaths, archiveDataFiles) + + ", only-in-archive: " + difference(archiveDataFiles, manifestPaths)); + } + + private static Set<String> difference(Set<String> a, Set<String> b) { + Set<String> diff = new HashSet<>(a); + diff.removeAll(b); + return diff; + } + + /** + * /unpack/all forces unpack-config.includeOriginal=true. In REGULAR mode, + * the container must appear exactly once in the archive — at "0.<ext>" + * at the zip root — with no additional copy elsewhere. + */ + @Test + public void testRegularAllContainerAppearsOnce() throws Exception { + ContentDisposition fileCd = new ContentDisposition("form-data; name=\"file\"; filename=\"Doc1_ole.doc\""); + Attachment fileAtt = new Attachment("file", + ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd); + + Response response = WebClient + .create(endPoint + ALL_PATH) + .type("multipart/form-data") + .accept("application/zip") + .post(new MultipartBody(Arrays.asList(fileAtt))); + + assertEquals(200, response.getStatus()); + Map<String, String> data = readZipArchive((InputStream) response.getEntity()); + + long containerEntries = data.keySet().stream() + .filter(k -> !k.endsWith(".metadata.json")) + .filter(k -> k.equals("0") || k.startsWith("0.")) + .count(); + assertEquals(1, containerEntries, + "Container should appear exactly once at the zip root as 0.<ext>. " + + "Entries: " + data.keySet()); + } + + /** + * Documents the shape of /unpack output in FRICTIONLESS mode (no /all, + * default config): a datapackage.json manifest plus the unpacked/<id> + * children, and no metadata.json envelope. + */ + @Test + public void testFrictionlessUnpackShape() throws Exception { + String configJson = """ + { + "parse-context": { + "unpack-config": { + "outputFormat": "FRICTIONLESS", + "outputMode": "ZIPPED" + } + } + } + """; + ContentDisposition fileCd = new ContentDisposition("form-data; name=\"file\"; filename=\"Doc1_ole.doc\""); + Attachment fileAtt = new Attachment("file", + ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd); + Attachment configAtt = new Attachment("config", "application/json", + new ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8))); + + Response response = WebClient + .create(endPoint + UNPACKER_PATH) + .type("multipart/form-data") + .accept("application/zip") + .post(new MultipartBody(Arrays.asList(fileAtt, configAtt))); + + assertEquals(200, response.getStatus()); + Map<String, String> data = readZipArchive((InputStream) response.getEntity()); + + assertTrue(data.containsKey("datapackage.json"), + "Should contain datapackage.json manifest. Entries: " + data.keySet()); + assertFalse(data.containsKey("metadata.json"), + "Should not contain metadata.json without includeFullMetadata. Entries: " + data.keySet()); + boolean hasUnpacked = data.keySet().stream().anyMatch(k -> k.startsWith("unpacked/")); + assertTrue(hasUnpacked, "Should contain unpacked/ entries. Entries: " + data.keySet()); + } + + /** + * Documents the difference between /unpack and /unpack/all in FRICTIONLESS: + * /unpack/all forces unpack-config.includeOriginal=true, which causes the + * container itself to be added to the unpack output as id 0 + * ("unpacked/0.<ext>"). /unpack with no extra config does not. + */ + @Test + public void testFrictionlessUnpackAllAddsContainerAsUnpackedZero() throws Exception { + String configJson = """ + { + "parse-context": { + "unpack-config": { + "outputFormat": "FRICTIONLESS", + "outputMode": "ZIPPED" + } + } + } + """; + ContentDisposition fileCd = new ContentDisposition("form-data; name=\"file\"; filename=\"Doc1_ole.doc\""); + Attachment unpackFile = new Attachment("file", + ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd); + Attachment unpackConfig = new Attachment("config", "application/json", + new ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8))); + + Response unpackResponse = WebClient + .create(endPoint + UNPACKER_PATH) + .type("multipart/form-data") + .accept("application/zip") + .post(new MultipartBody(Arrays.asList(unpackFile, unpackConfig))); + assertEquals(200, unpackResponse.getStatus()); + Set<String> unpackEntries = new HashSet<>(readZipArchive( + (InputStream) unpackResponse.getEntity()).keySet()); + + Attachment allFile = new Attachment("file", + ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd); + Attachment allConfig = new Attachment("config", "application/json", + new ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8))); + Response allResponse = WebClient + .create(endPoint + ALL_PATH) + .type("multipart/form-data") + .accept("application/zip") + .post(new MultipartBody(Arrays.asList(allFile, allConfig))); + assertEquals(200, allResponse.getStatus()); + Set<String> allEntries = new HashSet<>(readZipArchive( + (InputStream) allResponse.getEntity()).keySet()); + + boolean unpackHasContainer = unpackEntries.stream().anyMatch(k -> k.equals("unpacked/0.doc")); + boolean allHasContainer = allEntries.stream().anyMatch(k -> k.equals("unpacked/0.doc")); + assertFalse(unpackHasContainer, + "/unpack alone should not include the container. Entries: " + unpackEntries); + assertTrue(allHasContainer, + "/unpack/all should include the container as unpacked/0.<ext>. " + + "Entries: " + allEntries); + + Set<String> onlyInAll = difference(allEntries, unpackEntries); + assertEquals(Set.of("unpacked/0.doc"), onlyInAll, + "Only the container (unpacked/0.<ext>) should distinguish /unpack/all " + + "from /unpack. Difference: " + onlyInAll); + } + + /** + * Documents that includeFullMetadata=true in FRICTIONLESS adds a + * metadata.json envelope whose entries carry X-TIKA:content (the extracted + * text) and Content-Type alongside the per-document metadata. + */ + @Test + public void testFrictionlessIncludeFullMetadataAddsMetadataJson() throws Exception { + String configJson = """ + { + "parse-context": { + "unpack-config": { + "outputFormat": "FRICTIONLESS", + "outputMode": "ZIPPED", + "includeFullMetadata": true + } + } + } + """; + ContentDisposition fileCd = new ContentDisposition("form-data; name=\"file\"; filename=\"Doc1_ole.doc\""); + Attachment fileAtt = new Attachment("file", + ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd); + Attachment configAtt = new Attachment("config", "application/json", + new ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8))); + + Response response = WebClient + .create(endPoint + ALL_PATH) + .type("multipart/form-data") + .accept("application/zip") + .post(new MultipartBody(Arrays.asList(fileAtt, configAtt))); + assertEquals(200, response.getStatus()); + + Map<String, byte[]> data = readZipArchiveBytes((InputStream) response.getEntity()); + byte[] metadataBytes = data.get("metadata.json"); + assertNotNull(metadataBytes, "metadata.json should be present when includeFullMetadata=true. " + + "Entries: " + data.keySet()); + + JsonNode metadata = MAPPER.readTree(metadataBytes); + assertTrue(metadata.isArray() && metadata.size() > 0, + "metadata.json should be a non-empty array"); + + JsonNode container = metadata.get(0); + assertTrue(container.has("Content-Type"), + "Container metadata entry should carry Content-Type. Entry: " + container); + assertEquals("application/msword", container.get("Content-Type").asText(), + "Container metadata entry should describe the submitted .doc"); + assertTrue(container.has("X-TIKA:content"), + "Container metadata entry should carry X-TIKA:content (extracted text). Entry: " + container); + assertTrue(container.get("X-TIKA:content").asText().length() > 0, + "X-TIKA:content for the container should be non-empty"); + } + /** * Tests UnpackSelector filtering by mime type. */
