This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new a803c16e7d TIKA-4732 (#2820)
a803c16e7d is described below

commit a803c16e7dbea2969ab6d6efdd959c72f14881f5
Author: Tim Allison <[email protected]>
AuthorDate: Mon May 18 11:55:06 2026 -0400

    TIKA-4732 (#2820)
    
    ---------
    
    Co-authored-by: Lawrence Moorehead <[email protected]>
---
 .../core/extractor/FrictionlessUnpackHandler.java  | 111 --------
 .../core/extractor/TempFileUnpackHandler.java      |  38 ---
 .../apache/tika/pipes/core/server/PipesWorker.java | 123 ++-------
 .../tika/pipes/core/FrictionlessUnpackTest.java    |  66 +++--
 .../tika/server/standard/UnpackerResourceTest.java | 307 +++++++++++++++++++++
 5 files changed, 364 insertions(+), 281 deletions(-)

diff --git 
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/FrictionlessUnpackHandler.java
 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/FrictionlessUnpackHandler.java
index 705288bcb0..e1b89e9869 100644
--- 
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/FrictionlessUnpackHandler.java
+++ 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/FrictionlessUnpackHandler.java
@@ -61,10 +61,6 @@ public class FrictionlessUnpackHandler extends 
AbstractUnpackHandler implements
     private final EmitKey containerEmitKey;
     private final UnpackConfig unpackConfig;
     private final List<FrictionlessFileInfo> embeddedFiles = new ArrayList<>();
-    private Path originalDocumentPath;
-    private String originalDocumentName;
-    private String originalDocumentHash;
-    private long originalDocumentBytes;
     private boolean closed = false;
 
     /**
@@ -153,39 +149,6 @@ public class FrictionlessUnpackHandler extends 
AbstractUnpackHandler implements
         return emitKey;
     }
 
-    /**
-     * Stores the original container document for optional inclusion.
-     *
-     * @param inputStream the original document input stream
-     * @param fileName    the file name for the original document
-     * @throws IOException if storing fails
-     */
-    public void storeOriginalDocument(InputStream inputStream, String 
fileName) throws IOException {
-        this.originalDocumentName = fileName;
-        this.originalDocumentPath = tempDirectory.resolve(fileName);
-
-        MessageDigest digest;
-        try {
-            digest = MessageDigest.getInstance("SHA-256");
-        } catch (NoSuchAlgorithmException e) {
-            throw new IOException("SHA-256 algorithm not available", e);
-        }
-
-        long bytes = 0;
-        try (DigestInputStream dis = new DigestInputStream(inputStream, 
digest);
-             OutputStream os = Files.newOutputStream(originalDocumentPath)) {
-            byte[] buffer = new byte[8192];
-            int read;
-            while ((read = dis.read(buffer)) != -1) {
-                os.write(buffer, 0, read);
-                bytes += read;
-            }
-        }
-
-        this.originalDocumentHash = 
FrictionlessResource.formatHash(digest.digest());
-        this.originalDocumentBytes = bytes;
-    }
-
     /**
      * Builds the DataPackage manifest from collected files.
      *
@@ -195,17 +158,6 @@ public class FrictionlessUnpackHandler extends 
AbstractUnpackHandler implements
     public DataPackage buildDataPackage(String containerName) {
         DataPackage dataPackage = new DataPackage(containerName);
 
-        // Add original document if included
-        if (unpackConfig.isIncludeOriginal() && hasOriginalDocument()) {
-            dataPackage.addResource(FrictionlessResource.create(
-                    originalDocumentName,
-                    detectMediatypeFromFilename(originalDocumentName),
-                    originalDocumentBytes,
-                    originalDocumentHash,
-                    originalDocumentName
-            ));
-        }
-
         // Add all embedded files with unpacked/ prefix
         for (FrictionlessFileInfo fileInfo : embeddedFiles) {
             String path = UNPACKED_DIR + "/" + fileInfo.fileName();
@@ -222,48 +174,6 @@ public class FrictionlessUnpackHandler extends 
AbstractUnpackHandler implements
         return dataPackage;
     }
 
-    /**
-     * Simple mediatype detection from filename extension.
-     */
-    private String detectMediatypeFromFilename(String filename) {
-        if (filename == null) {
-            return "application/octet-stream";
-        }
-        String lower = filename.toLowerCase(java.util.Locale.ROOT);
-        if (lower.endsWith(".pdf")) {
-            return "application/pdf";
-        } else if (lower.endsWith(".xml")) {
-            return "application/xml";
-        } else if (lower.endsWith(".doc")) {
-            return "application/msword";
-        } else if (lower.endsWith(".docx")) {
-            return 
"application/vnd.openxmlformats-officedocument.wordprocessingml.document";
-        } else if (lower.endsWith(".xls")) {
-            return "application/vnd.ms-excel";
-        } else if (lower.endsWith(".xlsx")) {
-            return 
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
-        } else if (lower.endsWith(".ppt")) {
-            return "application/vnd.ms-powerpoint";
-        } else if (lower.endsWith(".pptx")) {
-            return 
"application/vnd.openxmlformats-officedocument.presentationml.presentation";
-        } else if (lower.endsWith(".txt")) {
-            return "text/plain";
-        } else if (lower.endsWith(".html") || lower.endsWith(".htm")) {
-            return "text/html";
-        } else if (lower.endsWith(".json")) {
-            return "application/json";
-        } else if (lower.endsWith(".png")) {
-            return "image/png";
-        } else if (lower.endsWith(".jpg") || lower.endsWith(".jpeg")) {
-            return "image/jpeg";
-        } else if (lower.endsWith(".gif")) {
-            return "image/gif";
-        } else if (lower.endsWith(".zip")) {
-            return "application/zip";
-        }
-        return "application/octet-stream";
-    }
-
     /**
      * Returns the temporary directory where files are stored.
      */
@@ -292,27 +202,6 @@ public class FrictionlessUnpackHandler extends 
AbstractUnpackHandler implements
         return !embeddedFiles.isEmpty();
     }
 
-    /**
-     * Returns the path to the original document if stored.
-     */
-    public Path getOriginalDocumentPath() {
-        return originalDocumentPath;
-    }
-
-    /**
-     * Returns the name of the original document if stored.
-     */
-    public String getOriginalDocumentName() {
-        return originalDocumentName;
-    }
-
-    /**
-     * Returns true if the original document was stored.
-     */
-    public boolean hasOriginalDocument() {
-        return originalDocumentPath != null && 
Files.exists(originalDocumentPath);
-    }
-
     /**
      * Returns the UnpackConfig used by this handler.
      */
diff --git 
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/TempFileUnpackHandler.java
 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/TempFileUnpackHandler.java
index 6f665f27d3..ada8a2daa7 100644
--- 
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/TempFileUnpackHandler.java
+++ 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/TempFileUnpackHandler.java
@@ -42,8 +42,6 @@ public class TempFileUnpackHandler extends 
AbstractUnpackHandler
     private final EmitKey containerEmitKey;
     private final UnpackConfig unpackConfig;
     private final List<EmbeddedFileInfo> embeddedFiles = new ArrayList<>();
-    private Path originalDocumentPath;
-    private String originalDocumentName;
     private boolean closed = false;
 
     /**
@@ -112,42 +110,6 @@ public class TempFileUnpackHandler extends 
AbstractUnpackHandler
         return !embeddedFiles.isEmpty();
     }
 
-    /**
-     * Stores the original container document for inclusion in the zip.
-     * Call this before parsing if includeOriginal is enabled.
-     *
-     * @param inputStream the original document input stream
-     * @param fileName the file name for the original document
-     */
-    public void storeOriginalDocument(InputStream inputStream, String 
fileName) throws IOException {
-        this.originalDocumentName = fileName;
-        this.originalDocumentPath = tempDirectory.resolve("_original_" + 
fileName);
-        try (OutputStream os = Files.newOutputStream(originalDocumentPath)) {
-            inputStream.transferTo(os);
-        }
-    }
-
-    /**
-     * Returns the path to the original document if stored.
-     */
-    public Path getOriginalDocumentPath() {
-        return originalDocumentPath;
-    }
-
-    /**
-     * Returns the name of the original document if stored.
-     */
-    public String getOriginalDocumentName() {
-        return originalDocumentName;
-    }
-
-    /**
-     * Returns true if the original document was stored.
-     */
-    public boolean hasOriginalDocument() {
-        return originalDocumentPath != null && 
Files.exists(originalDocumentPath);
-    }
-
     @Override
     public void close() throws IOException {
         if (!closed) {
diff --git 
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
index a76defc641..b60ed056c3 100644
--- 
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
+++ 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
@@ -37,6 +37,7 @@ import 
org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
 import org.apache.tika.extractor.UnpackHandler;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.metadata.writefilter.MetadataWriteLimiterFactory;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
@@ -319,15 +320,9 @@ class PipesWorker implements Callable<PipesResult> {
         DataPackage dataPackage = 
frictionlessHandler.buildDataPackage(containerName);
 
         try {
-            // Emit original document if included
-            if (unpackConfig.isIncludeOriginal() && 
frictionlessHandler.hasOriginalDocument()) {
-                String originalEmitKey = baseEmitKey + "/" + 
frictionlessHandler.getOriginalDocumentName();
-                try (InputStream is = 
Files.newInputStream(frictionlessHandler.getOriginalDocumentPath())) {
-                    streamEmitter.emit(originalEmitKey, is, new Metadata(), 
parseContext);
-                }
-            }
-
-            // Emit each embedded file under unpacked/
+            // Emit each embedded file under unpacked/.
+            // When includeOriginal=true the container itself is added as id 0 by
+            // ParseHandler._preParse, so it appears here as one of the embedded entries.
             for (FrictionlessUnpackHandler.FrictionlessFileInfo fileInfo : 
frictionlessHandler.getEmbeddedFiles()) {
                 String fileEmitKey = baseEmitKey + "/unpacked/" + 
fileInfo.fileName();
                 try (InputStream is = 
Files.newInputStream(fileInfo.filePath())) {
@@ -384,15 +379,9 @@ class PipesWorker implements Callable<PipesResult> {
                 zos.closeEntry();
             }
 
-            // Add original document if included (at root level)
-            if (unpackConfig.isIncludeOriginal() && 
frictionlessHandler.hasOriginalDocument()) {
-                ZipEntry originalEntry = new 
ZipEntry(frictionlessHandler.getOriginalDocumentName());
-                zos.putNextEntry(originalEntry);
-                Files.copy(frictionlessHandler.getOriginalDocumentPath(), zos);
-                zos.closeEntry();
-            }
-
-            // Add all embedded files under unpacked/
+            // Add all embedded files under unpacked/.
+            // When includeOriginal=true the container itself is added as id 0 by
+            // ParseHandler._preParse, so it appears here as one of the embedded entries.
             for (FrictionlessUnpackHandler.FrictionlessFileInfo fileInfo : 
frictionlessHandler.getEmbeddedFiles()) {
                 ZipEntry fileEntry = new ZipEntry("unpacked/" + 
fileInfo.fileName());
                 zos.putNextEntry(fileEntry);
@@ -441,14 +430,8 @@ class PipesWorker implements Callable<PipesResult> {
     private void createZipFile(Path zipFile, TempFileUnpackHandler tempHandler,
                                UnpackConfig unpackConfig) throws IOException {
         try (ZipOutputStream zos = new 
ZipOutputStream(Files.newOutputStream(zipFile))) {
-            // Include original document if requested
-            if (unpackConfig.isIncludeOriginal() && 
tempHandler.hasOriginalDocument()) {
-                ZipEntry originalEntry = new 
ZipEntry(tempHandler.getOriginalDocumentName());
-                zos.putNextEntry(originalEntry);
-                Files.copy(tempHandler.getOriginalDocumentPath(), zos);
-                zos.closeEntry();
-            }
-
+            // When includeOriginal=true the container itself is added as id 0 by
+            // ParseHandler._preParse, so it appears here as one of the embedded entries.
             for (TempFileUnpackHandler.EmbeddedFileInfo fileInfo : 
tempHandler.getEmbeddedFiles()) {
                 // Add the embedded file
                 ZipEntry fileEntry = new ZipEntry(fileInfo.fileName());
@@ -489,72 +472,6 @@ class PipesWorker implements Callable<PipesResult> {
         mapper.writeValue(os, metadataMap);
     }
 
-    /**
-     * Stores the original document to the temp handler for inclusion in the 
zip.
-     * Uses TikaInputStream's internal file caching to avoid consuming the 
stream.
-     */
-    private void storeOriginalDocument(TikaInputStream tis, 
TempFileUnpackHandler tempHandler)
-            throws IOException {
-        String fileName = getFileNameFromFetchKey();
-
-        // TikaInputStream caches to a temp file internally - get that file
-        Path originalPath = tis.getPath();
-        if (originalPath != null && Files.exists(originalPath)) {
-            // Copy from the cached file
-            try (InputStream is = Files.newInputStream(originalPath)) {
-                tempHandler.storeOriginalDocument(is, fileName);
-            }
-        } else {
-            // Stream hasn't been cached yet - we need to read and reset
-            tis.mark(Integer.MAX_VALUE);
-            try {
-                tempHandler.storeOriginalDocument(tis, fileName);
-            } finally {
-                tis.reset();
-            }
-        }
-    }
-
-    /**
-     * Stores the original document to the frictionless handler for inclusion 
in output.
-     * Uses TikaInputStream's internal file caching to avoid consuming the 
stream.
-     */
-    private void storeOriginalDocumentForFrictionless(TikaInputStream tis,
-                                                      
FrictionlessUnpackHandler frictionlessHandler)
-            throws IOException {
-        String fileName = getFileNameFromFetchKey();
-
-        // TikaInputStream caches to a temp file internally - get that file
-        Path originalPath = tis.getPath();
-        if (originalPath != null && Files.exists(originalPath)) {
-            // Copy from the cached file
-            try (InputStream is = Files.newInputStream(originalPath)) {
-                frictionlessHandler.storeOriginalDocument(is, fileName);
-            }
-        } else {
-            // Stream hasn't been cached yet - we need to read and reset
-            tis.mark(Integer.MAX_VALUE);
-            try {
-                frictionlessHandler.storeOriginalDocument(tis, fileName);
-            } finally {
-                tis.reset();
-            }
-        }
-    }
-
-    /**
-     * Extracts the file name from the fetch key.
-     */
-    private String getFileNameFromFetchKey() {
-        String fetchKey = fetchEmitTuple.getFetchKey().getFetchKey();
-        String fileName = fetchKey;
-        int lastSlash = Math.max(fetchKey.lastIndexOf('/'), 
fetchKey.lastIndexOf('\\'));
-        if (lastSlash >= 0 && lastSlash < fetchKey.length() - 1) {
-            fileName = fetchKey.substring(lastSlash + 1);
-        }
-        return fileName;
-    }
-
     protected ParseDataOrPipesResult parseFromTuple() throws TikaException, 
InterruptedException {
         //start a new metadata object to gather info from the fetch process
         //we want to isolate and not touch the metadata sent into the 
fetchEmitTuple
@@ -569,22 +486,24 @@ class PipesWorker implements Callable<PipesResult> {
         }
         // Use newMetadata() to apply any configured write limits
         Metadata metadata = localContext.newMetadata();
+        // Carry the caller-supplied resource name across the fresh-metadata boundary so
+        // detection, suffix selection, and the Frictionless manifest's name field see
+        // the logical filename rather than whatever the fetcher's path happens to be
+        // (e.g., a server-side spool prefix). TikaInputStream.get(path, metadata)
+        // already honors a pre-set RESOURCE_NAME_KEY.
+        Metadata tupleMetadata = fetchEmitTuple.getMetadata();
+        String suppliedName = tupleMetadata == null
+                ? null
+                : tupleMetadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
+        if (!StringUtils.isBlank(suppliedName)) {
+            metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, suppliedName);
+        }
         FetchHandler.TisOrResult tisOrResult = 
fetchHandler.fetch(fetchEmitTuple, metadata, localContext);
         if (tisOrResult.pipesResult() != null) {
             return new ParseDataOrPipesResult(null, tisOrResult.pipesResult());
         }
 
         try (TikaInputStream tis = tisOrResult.tis()) {
-            // Store original document for zipping/frictionless if requested
-            UnpackHandler handler = localContext.get(UnpackHandler.class);
-            UnpackConfig uc = localContext.get(UnpackConfig.class);
-            if (uc != null && uc.isIncludeOriginal()) {
-                if (handler instanceof FrictionlessUnpackHandler 
frictionlessHandler) {
-                    storeOriginalDocumentForFrictionless(tis, 
frictionlessHandler);
-                } else if (handler instanceof TempFileUnpackHandler 
tempHandler) {
-                    storeOriginalDocument(tis, tempHandler);
-                }
-            }
             return parseHandler.parseWithStream(fetchEmitTuple, tis, metadata, 
localContext);
         } catch (SecurityException e) {
             LOG.error("security exception id={}", fetchEmitTuple.getId(), e);
diff --git 
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/FrictionlessUnpackTest.java
 
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/FrictionlessUnpackTest.java
index deaf5e3561..14a05aad70 100644
--- 
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/FrictionlessUnpackTest.java
+++ 
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/FrictionlessUnpackTest.java
@@ -635,28 +635,30 @@ public class FrictionlessUnpackTest {
 
     @Test
     public void testFrictionlessWithIncludeOriginal(@TempDir Path tmp) throws 
Exception {
-        // Test that includeOriginal works with Frictionless format
+        // includeOriginal=true causes the container to appear in the Frictionless
+        // package as "unpacked/0.<ext>" (added by ParseHandler._preParse via
+        // unpackHandler.add(0, ...)) and to be listed once in datapackage.json.
         Path outputDir = tmp.resolve("output");
         Files.createDirectories(outputDir);
 
         try (PipesClient pipesClient = init(tmp, TEST_DOC_WITH_EMBEDDED)) {
             ParseContext parseContext = new ParseContext();
             parseContext.set(ParseMode.class, ParseMode.UNPACK);
-            
+
             UnpackConfig unpackConfig = new UnpackConfig();
             unpackConfig.setEmitter(EMITTER_NAME);
             
unpackConfig.setOutputFormat(UnpackConfig.OUTPUT_FORMAT.FRICTIONLESS);
             unpackConfig.setOutputMode(UnpackConfig.OUTPUT_MODE.ZIPPED);
-            unpackConfig.setIncludeOriginal(true);  // Include container 
document
+            unpackConfig.setIncludeOriginal(true);
             parseContext.set(UnpackConfig.class, unpackConfig);
-            
+
             PipesResult pipesResult = pipesClient.process(
                     new FetchEmitTuple(TEST_DOC_WITH_EMBEDDED,
                             new FetchKey(FETCHER_NAME, TEST_DOC_WITH_EMBEDDED),
                             new EmitKey(EMITTER_NAME, TEST_DOC_WITH_EMBEDDED),
                             new Metadata(), parseContext,
                             FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT));
-            
+
             assertTrue(pipesResult.isSuccess(),
                     "Frictionless with includeOriginal should succeed");
         }
@@ -664,39 +666,43 @@ public class FrictionlessUnpackTest {
         List<Path> zipFiles = Files.list(outputDir)
                 .filter(p -> p.toString().endsWith("-frictionless.zip"))
                 .toList();
+        assertEquals(1, zipFiles.size(), "Should create exactly one 
frictionless zip");
 
         try (ZipFile zip = new ZipFile(zipFiles.get(0).toFile())) {
-            // Original file should be at root level or in a specific location
-            boolean hasOriginal = false;
+            // The container itself should be present as the id-0 entry under
+            // unpacked/. Match "unpacked/0" exactly or "unpacked/0.<ext>"
+            // (whichever the active SUFFIX_STRATEGY produces).
+            Set<String> allEntries = new HashSet<>();
             Enumeration<? extends ZipEntry> entries = zip.entries();
             while (entries.hasMoreElements()) {
-                ZipEntry entry = entries.nextElement();
-                // Original could be at root or documented location
-                if (entry.getName().contains(TEST_DOC_WITH_EMBEDDED) ||
-                        entry.getName().equals("original/" + 
TEST_DOC_WITH_EMBEDDED)) {
-                    hasOriginal = true;
-                    break;
-                }
+                allEntries.add(entries.nextElement().getName());
             }
-
-            // Also check datapackage.json for original in resources
+            boolean hasContainerAsId0 = allEntries.stream()
+                    .anyMatch(n -> n.equals("unpacked/0") || 
n.startsWith("unpacked/0."));
+            assertTrue(hasContainerAsId0,
+                    "With includeOriginal=true, the container should appear as 
the " +
+                            "unpacked/0 entry. Entries: " + allEntries);
+
+            // And the manifest's resources should list the container at unpacked/0;
+            // no resource path should escape the unpacked/ prefix (no separate
+            // root-level "original" entry should exist).
             ZipEntry dpEntry = zip.getEntry("datapackage.json");
-            if (dpEntry != null) {
-                JsonNode dataPackage;
-                try (InputStream is = zip.getInputStream(dpEntry)) {
-                    dataPackage = OBJECT_MAPPER.readTree(is);
-                }
-                for (JsonNode resource : dataPackage.get("resources")) {
-                    String path = resource.get("path").asText();
-                    if (!path.startsWith("unpacked/")) {
-                        hasOriginal = true;
-                        break;
-                    }
+            assertNotNull(dpEntry, "datapackage.json should be present");
+            JsonNode dataPackage;
+            try (InputStream is = zip.getInputStream(dpEntry)) {
+                dataPackage = OBJECT_MAPPER.readTree(is);
+            }
+            boolean manifestListsContainer = false;
+            for (JsonNode resource : dataPackage.get("resources")) {
+                String path = resource.get("path").asText();
+                assertTrue(path.startsWith("unpacked/"),
+                        "Manifest resources should only list unpacked/ paths; 
got " + path);
+                if (path.equals("unpacked/0") || 
path.startsWith("unpacked/0.")) {
+                    manifestListsContainer = true;
                 }
             }
-
-            assertTrue(hasOriginal,
-                    "With includeOriginal=true, original document should be in 
package");
+            assertTrue(manifestListsContainer,
+                    "Manifest should list the container at unpacked/0");
         }
     }
 
diff --git 
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
 
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
index 21443d5795..b8c62b17ae 100644
--- 
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
+++ 
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
@@ -33,8 +33,10 @@ import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import javax.imageio.ImageIO;
 
 import com.fasterxml.jackson.databind.JsonNode;
@@ -494,6 +496,311 @@ public class UnpackerResourceTest extends CXFTestBase {
         assertTrue(hasZeroPaddedName, "Should have zero-padded file names 
(e.g., 0000.jpeg)");
     }
 
+    /**
+     * The datapackage.json "resources" array is the manifest for the Frictionless
+     * package. Verifies it lists exactly the data files present in the zip, with
+     * no missing or extraneous entries. The package envelope (datapackage.json
+     * and the optional metadata.json) is not itself a "resource" and is excluded
+     * from the comparison.
+     * <p>
+     * Steps: posts the test document together with a FRICTIONLESS/ZIPPED config
+     * (includeFullMetadata and includeOriginal both set to true) to ALL_PATH,
+     * collects the "path" of every entry in the "resources" array into a set, and
+     * compares that set with the zip entry names minus the two envelope files.
+     * Both one-sided differences are placed in the failure message so a mismatch
+     * shows which side holds the stray entry.
+     * <p>
+     * NOTE(review): a resource that lacks a "path" field would presumably surface
+     * as a NullPointerException rather than as an assertion failure -- confirm
+     * that this is acceptable for this test.
+     *
+     * @throws Exception on any failure while posting, reading the zip or parsing JSON
+     */
+    @Test
+    public void testFrictionlessDataPackageMatchesArchiveContents() throws 
Exception {
+        String configJson = """
+                {
+                  "parse-context": {
+                    "unpack-config": {
+                      "outputFormat": "FRICTIONLESS",
+                      "outputMode": "ZIPPED",
+                      "includeFullMetadata": true,
+                      "includeOriginal": true
+                    }
+                  }
+                }
+                """;
+        ContentDisposition fileCd = new ContentDisposition("form-data; 
name=\"file\"; filename=\"Doc1_ole.doc\"");
+        Attachment fileAtt = new Attachment("file",
+                ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd);
+        Attachment configAtt = new Attachment("config", "application/json",
+                new 
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
+
+        Response response = WebClient
+                .create(endPoint + ALL_PATH)
+                .type("multipart/form-data")
+                .accept("application/zip")
+                .post(new MultipartBody(Arrays.asList(fileAtt, configAtt)));
+
+        assertEquals(200, response.getStatus());
+        Map<String, byte[]> data = readZipArchiveBytes((InputStream) 
response.getEntity());
+
+        byte[] dpBytes = data.get("datapackage.json");
+        assertNotNull(dpBytes, "datapackage.json should be present");
+
+        JsonNode dataPackage = MAPPER.readTree(dpBytes);
+        JsonNode resources = dataPackage.get("resources");
+        assertNotNull(resources, "datapackage.json should have a 'resources' 
array");
+        assertTrue(resources.isArray() && resources.size() > 0,
+                "resources array should be non-empty");
+
+        Set<String> manifestPaths = new HashSet<>();
+        for (JsonNode resource : resources) {
+            manifestPaths.add(resource.get("path").asText());
+        }
+
+        Set<String> archiveDataFiles = new HashSet<>(data.keySet());
+        archiveDataFiles.remove("datapackage.json");
+        archiveDataFiles.remove("metadata.json");
+
+        assertEquals(archiveDataFiles, manifestPaths,
+                "datapackage.json 'resources' must list exactly the data files 
in the zip. " +
+                        "Only-in-manifest: " + difference(manifestPaths, 
archiveDataFiles) +
+                        ", only-in-archive: " + difference(archiveDataFiles, 
manifestPaths));
+    }
+
+    /**
+     * The Frictionless manifest's "name" field is supposed to carry the
+     * original filename of each resource. For the container
+     * (unpacked/0.&lt;ext&gt;), that name should be the filename the user
+     * supplied on the multipart upload -- not the server's internal spool
+     * filename.
+     * <p>
+     * Steps: uploads the test document under the filename "Doc1_ole.doc" with a
+     * FRICTIONLESS/ZIPPED config to ALL_PATH, then scans the "resources" array of
+     * datapackage.json for the first entry whose "path" is "unpacked/0" or starts
+     * with "unpacked/0.". That entry is treated as the container; the test fails
+     * if it is missing, if it has no "name" field, or if the name differs from
+     * the upload filename. The offending resource is appended to each failure
+     * message to make the mismatch visible.
+     *
+     * @throws Exception on any failure while posting, reading the zip or parsing JSON
+     */
+    @Test
+    public void testFrictionlessContainerManifestNameMatchesUploadFilename() 
throws Exception {
+        String configJson = """
+                {
+                  "parse-context": {
+                    "unpack-config": {
+                      "outputFormat": "FRICTIONLESS",
+                      "outputMode": "ZIPPED"
+                    }
+                  }
+                }
+                """;
+        String uploadFilename = "Doc1_ole.doc";
+        ContentDisposition fileCd = new ContentDisposition(
+                "form-data; name=\"file\"; filename=\"" + uploadFilename + 
"\"");
+        Attachment fileAtt = new Attachment("file",
+                ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd);
+        Attachment configAtt = new Attachment("config", "application/json",
+                new 
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
+
+        Response response = WebClient
+                .create(endPoint + ALL_PATH)
+                .type("multipart/form-data")
+                .accept("application/zip")
+                .post(new MultipartBody(Arrays.asList(fileAtt, configAtt)));
+
+        assertEquals(200, response.getStatus());
+        Map<String, byte[]> data = readZipArchiveBytes((InputStream) 
response.getEntity());
+
+        byte[] dpBytes = data.get("datapackage.json");
+        assertNotNull(dpBytes, "datapackage.json should be present");
+        JsonNode dataPackage = MAPPER.readTree(dpBytes);
+
+        JsonNode containerResource = null;
+        for (JsonNode resource : dataPackage.get("resources")) {
+            String path = resource.get("path").asText();
+            if (path.equals("unpacked/0") || path.startsWith("unpacked/0.")) {
+                containerResource = resource;
+                break;
+            }
+        }
+        assertNotNull(containerResource,
+                "Manifest should list the container at unpacked/0. Resources: 
" +
+                        dataPackage.get("resources"));
+
+        JsonNode nameNode = containerResource.get("name");
+        assertNotNull(nameNode,
+                "Container resource should carry a 'name' field. Resource: " + 
containerResource);
+        assertEquals(uploadFilename, nameNode.asText(),
+                "Container's manifest name should be the user-supplied upload 
filename, " +
+                        "not the server's internal spool filename. Resource: " 
+ containerResource);
+    }
+
+    private static Set<String> difference(Set<String> a, Set<String> b) { // set difference: the elements of a that are not in b
+        Set<String> diff = new HashSet<>(a); // copy first so the caller's set a is not modified
+        diff.removeAll(b);
+        return diff; // a new HashSet; empty when every element of a is also in b
+    }
+
+    /**
+     * /unpack/all forces unpack-config.includeOriginal=true. In REGULAR mode,
+     * the container must appear exactly once in the archive — at "0.&lt;ext&gt;"
+     * at the zip root — with no additional copy elsewhere.
+     * <p>
+     * Steps: posts only the test document (no config attachment) to ALL_PATH and
+     * counts the zip entries whose name is "0" or starts with "0.", ignoring any
+     * name that ends with ".metadata.json". The count must be exactly one.
+     * <p>
+     * NOTE(review): no config is sent, so this presumably relies on REGULAR being
+     * the default output format -- confirm. Also, the filter only inspects
+     * root-level names of the form "0" or "0."-prefixed, so a second copy stored
+     * under a different name or directory would not be caught by this count.
+     *
+     * @throws Exception on any failure while posting or reading the zip
+     */
+    @Test
+    public void testRegularAllContainerAppearsOnce() throws Exception {
+        ContentDisposition fileCd = new ContentDisposition("form-data; 
name=\"file\"; filename=\"Doc1_ole.doc\"");
+        Attachment fileAtt = new Attachment("file",
+                ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd);
+
+        Response response = WebClient
+                .create(endPoint + ALL_PATH)
+                .type("multipart/form-data")
+                .accept("application/zip")
+                .post(new MultipartBody(Arrays.asList(fileAtt)));
+
+        assertEquals(200, response.getStatus());
+        Map<String, String> data = readZipArchive((InputStream) 
response.getEntity());
+
+        long containerEntries = data.keySet().stream()
+                .filter(k -> !k.endsWith(".metadata.json"))
+                .filter(k -> k.equals("0") || k.startsWith("0."))
+                .count();
+        assertEquals(1, containerEntries,
+                "Container should appear exactly once at the zip root as 
0.<ext>. " +
+                        "Entries: " + data.keySet());
+    }
+
+    /**
+     * Documents the shape of /unpack output in FRICTIONLESS mode (no /all; the
+     * config sets only outputFormat and outputMode): a datapackage.json manifest
+     * plus the unpacked/&lt;id&gt; children, and no metadata.json envelope.
+     * <p>
+     * Steps: posts the test document and the config to UNPACKER_PATH, reads the
+     * returned zip, and checks three things on the entry names: datapackage.json
+     * is present, metadata.json is absent, and at least one entry name starts
+     * with "unpacked/". The full entry list is appended to each failure message.
+     *
+     * @throws Exception on any failure while posting or reading the zip
+     */
+    @Test
+    public void testFrictionlessUnpackShape() throws Exception {
+        String configJson = """
+                {
+                  "parse-context": {
+                    "unpack-config": {
+                      "outputFormat": "FRICTIONLESS",
+                      "outputMode": "ZIPPED"
+                    }
+                  }
+                }
+                """;
+        ContentDisposition fileCd = new ContentDisposition("form-data; 
name=\"file\"; filename=\"Doc1_ole.doc\"");
+        Attachment fileAtt = new Attachment("file",
+                ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd);
+        Attachment configAtt = new Attachment("config", "application/json",
+                new 
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
+
+        Response response = WebClient
+                .create(endPoint + UNPACKER_PATH)
+                .type("multipart/form-data")
+                .accept("application/zip")
+                .post(new MultipartBody(Arrays.asList(fileAtt, configAtt)));
+
+        assertEquals(200, response.getStatus());
+        Map<String, String> data = readZipArchive((InputStream) 
response.getEntity());
+
+        assertTrue(data.containsKey("datapackage.json"),
+                "Should contain datapackage.json manifest. Entries: " + 
data.keySet());
+        assertFalse(data.containsKey("metadata.json"),
+                "Should not contain metadata.json without includeFullMetadata. 
Entries: " + data.keySet());
+        boolean hasUnpacked = data.keySet().stream().anyMatch(k -> 
k.startsWith("unpacked/"));
+        assertTrue(hasUnpacked, "Should contain unpacked/ entries. Entries: " 
+ data.keySet());
+    }
+
+    /**
+     * Documents the difference between /unpack and /unpack/all in FRICTIONLESS:
+     * /unpack/all forces unpack-config.includeOriginal=true, which causes the
+     * container itself to be added to the unpack output as id 0
+     * ("unpacked/0.&lt;ext&gt;"). /unpack with no extra config does not.
+     * <p>
+     * Steps: sends the same document and the same FRICTIONLESS/ZIPPED config
+     * first to UNPACKER_PATH and then to ALL_PATH, building fresh attachments for
+     * the second request (presumably because the first post consumes the input
+     * streams -- confirm). The two sets of zip entry names are then compared:
+     * "unpacked/0.doc" must be absent from the /unpack result, present in the
+     * /unpack/all result, and must be the only name found in /unpack/all but not
+     * in /unpack.
+     * <p>
+     * NOTE(review): the expected name hard-codes the ".doc" extension of the test
+     * document. The final comparison is one-directional: entries that exist only
+     * in the /unpack result would not make it fail.
+     *
+     * @throws Exception on any failure while posting or reading either zip
+     */
+    @Test
+    public void testFrictionlessUnpackAllAddsContainerAsUnpackedZero() throws 
Exception {
+        String configJson = """
+                {
+                  "parse-context": {
+                    "unpack-config": {
+                      "outputFormat": "FRICTIONLESS",
+                      "outputMode": "ZIPPED"
+                    }
+                  }
+                }
+                """;
+        ContentDisposition fileCd = new ContentDisposition("form-data; 
name=\"file\"; filename=\"Doc1_ole.doc\"");
+        Attachment unpackFile = new Attachment("file",
+                ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd);
+        Attachment unpackConfig = new Attachment("config", "application/json",
+                new 
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
+
+        Response unpackResponse = WebClient
+                .create(endPoint + UNPACKER_PATH)
+                .type("multipart/form-data")
+                .accept("application/zip")
+                .post(new MultipartBody(Arrays.asList(unpackFile, 
unpackConfig)));
+        assertEquals(200, unpackResponse.getStatus());
+        Set<String> unpackEntries = new HashSet<>(readZipArchive(
+                (InputStream) unpackResponse.getEntity()).keySet());
+
+        Attachment allFile = new Attachment("file",
+                ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd);
+        Attachment allConfig = new Attachment("config", "application/json",
+                new 
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
+        Response allResponse = WebClient
+                .create(endPoint + ALL_PATH)
+                .type("multipart/form-data")
+                .accept("application/zip")
+                .post(new MultipartBody(Arrays.asList(allFile, allConfig)));
+        assertEquals(200, allResponse.getStatus());
+        Set<String> allEntries = new HashSet<>(readZipArchive(
+                (InputStream) allResponse.getEntity()).keySet());
+
+        boolean unpackHasContainer = unpackEntries.stream().anyMatch(k -> 
k.equals("unpacked/0.doc"));
+        boolean allHasContainer = allEntries.stream().anyMatch(k -> 
k.equals("unpacked/0.doc"));
+        assertFalse(unpackHasContainer,
+                "/unpack alone should not include the container. Entries: " + 
unpackEntries);
+        assertTrue(allHasContainer,
+                "/unpack/all should include the container as unpacked/0.<ext>. 
" +
+                        "Entries: " + allEntries);
+
+        Set<String> onlyInAll = difference(allEntries, unpackEntries);
+        assertEquals(Set.of("unpacked/0.doc"), onlyInAll,
+                "Only the container (unpacked/0.<ext>) should distinguish 
/unpack/all " +
+                        "from /unpack. Difference: " + onlyInAll);
+    }
+
+    /**
+     * Documents that includeFullMetadata=true in FRICTIONLESS adds a
+     * metadata.json envelope whose entries carry X-TIKA:content (the extracted
+     * text) and Content-Type alongside the per-document metadata.
+     * <p>
+     * Steps: posts the test document with a FRICTIONLESS/ZIPPED config that sets
+     * includeFullMetadata to true to ALL_PATH, reads metadata.json from the
+     * returned zip and requires it to be a non-empty JSON array. The first array
+     * element is then checked for a "Content-Type" equal to "application/msword"
+     * and for a non-empty "X-TIKA:content" value.
+     * <p>
+     * NOTE(review): the test treats element 0 of the array as the container
+     * document -- presumably the metadata list is ordered container-first;
+     * confirm against the code that writes metadata.json.
+     *
+     * @throws Exception on any failure while posting, reading the zip or parsing JSON
+     */
+    @Test
+    public void testFrictionlessIncludeFullMetadataAddsMetadataJson() throws 
Exception {
+        String configJson = """
+                {
+                  "parse-context": {
+                    "unpack-config": {
+                      "outputFormat": "FRICTIONLESS",
+                      "outputMode": "ZIPPED",
+                      "includeFullMetadata": true
+                    }
+                  }
+                }
+                """;
+        ContentDisposition fileCd = new ContentDisposition("form-data; 
name=\"file\"; filename=\"Doc1_ole.doc\"");
+        Attachment fileAtt = new Attachment("file",
+                ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd);
+        Attachment configAtt = new Attachment("config", "application/json",
+                new 
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
+
+        Response response = WebClient
+                .create(endPoint + ALL_PATH)
+                .type("multipart/form-data")
+                .accept("application/zip")
+                .post(new MultipartBody(Arrays.asList(fileAtt, configAtt)));
+        assertEquals(200, response.getStatus());
+
+        Map<String, byte[]> data = readZipArchiveBytes((InputStream) 
response.getEntity());
+        byte[] metadataBytes = data.get("metadata.json");
+        assertNotNull(metadataBytes, "metadata.json should be present when 
includeFullMetadata=true. " +
+                "Entries: " + data.keySet());
+
+        JsonNode metadata = MAPPER.readTree(metadataBytes);
+        assertTrue(metadata.isArray() && metadata.size() > 0,
+                "metadata.json should be a non-empty array");
+
+        JsonNode container = metadata.get(0);
+        assertTrue(container.has("Content-Type"),
+                "Container metadata entry should carry Content-Type. Entry: " 
+ container);
+        assertEquals("application/msword", 
container.get("Content-Type").asText(),
+                "Container metadata entry should describe the submitted .doc");
+        assertTrue(container.has("X-TIKA:content"),
+                "Container metadata entry should carry X-TIKA:content 
(extracted text). Entry: " + container);
+        assertTrue(container.get("X-TIKA:content").asText().length() > 0,
+                "X-TIKA:content for the container should be non-empty");
+    }
+
     /**
      * Tests UnpackSelector filtering by mime type.
      */


Reply via email to