This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new a803c16e7d TIKA-4732 (#2820)
a803c16e7d is described below
commit a803c16e7dbea2969ab6d6efdd959c72f14881f5
Author: Tim Allison <[email protected]>
AuthorDate: Mon May 18 11:55:06 2026 -0400
TIKA-4732 (#2820)
---------
Co-authored-by: Lawrence Moorehead <[email protected]>
---
.../core/extractor/FrictionlessUnpackHandler.java | 111 --------
.../core/extractor/TempFileUnpackHandler.java | 38 ---
.../apache/tika/pipes/core/server/PipesWorker.java | 123 ++-------
.../tika/pipes/core/FrictionlessUnpackTest.java | 66 +++--
.../tika/server/standard/UnpackerResourceTest.java | 307 +++++++++++++++++++++
5 files changed, 364 insertions(+), 281 deletions(-)
diff --git
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/FrictionlessUnpackHandler.java
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/FrictionlessUnpackHandler.java
index 705288bcb0..e1b89e9869 100644
---
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/FrictionlessUnpackHandler.java
+++
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/FrictionlessUnpackHandler.java
@@ -61,10 +61,6 @@ public class FrictionlessUnpackHandler extends
AbstractUnpackHandler implements
private final EmitKey containerEmitKey;
private final UnpackConfig unpackConfig;
private final List<FrictionlessFileInfo> embeddedFiles = new ArrayList<>();
- private Path originalDocumentPath;
- private String originalDocumentName;
- private String originalDocumentHash;
- private long originalDocumentBytes;
private boolean closed = false;
/**
@@ -153,39 +149,6 @@ public class FrictionlessUnpackHandler extends
AbstractUnpackHandler implements
return emitKey;
}
- /**
- * Stores the original container document for optional inclusion.
- *
- * @param inputStream the original document input stream
- * @param fileName the file name for the original document
- * @throws IOException if storing fails
- */
- public void storeOriginalDocument(InputStream inputStream, String
fileName) throws IOException {
- this.originalDocumentName = fileName;
- this.originalDocumentPath = tempDirectory.resolve(fileName);
-
- MessageDigest digest;
- try {
- digest = MessageDigest.getInstance("SHA-256");
- } catch (NoSuchAlgorithmException e) {
- throw new IOException("SHA-256 algorithm not available", e);
- }
-
- long bytes = 0;
- try (DigestInputStream dis = new DigestInputStream(inputStream,
digest);
- OutputStream os = Files.newOutputStream(originalDocumentPath)) {
- byte[] buffer = new byte[8192];
- int read;
- while ((read = dis.read(buffer)) != -1) {
- os.write(buffer, 0, read);
- bytes += read;
- }
- }
-
- this.originalDocumentHash =
FrictionlessResource.formatHash(digest.digest());
- this.originalDocumentBytes = bytes;
- }
-
/**
* Builds the DataPackage manifest from collected files.
*
@@ -195,17 +158,6 @@ public class FrictionlessUnpackHandler extends
AbstractUnpackHandler implements
public DataPackage buildDataPackage(String containerName) {
DataPackage dataPackage = new DataPackage(containerName);
- // Add original document if included
- if (unpackConfig.isIncludeOriginal() && hasOriginalDocument()) {
- dataPackage.addResource(FrictionlessResource.create(
- originalDocumentName,
- detectMediatypeFromFilename(originalDocumentName),
- originalDocumentBytes,
- originalDocumentHash,
- originalDocumentName
- ));
- }
-
// Add all embedded files with unpacked/ prefix
for (FrictionlessFileInfo fileInfo : embeddedFiles) {
String path = UNPACKED_DIR + "/" + fileInfo.fileName();
@@ -222,48 +174,6 @@ public class FrictionlessUnpackHandler extends
AbstractUnpackHandler implements
return dataPackage;
}
- /**
- * Simple mediatype detection from filename extension.
- */
- private String detectMediatypeFromFilename(String filename) {
- if (filename == null) {
- return "application/octet-stream";
- }
- String lower = filename.toLowerCase(java.util.Locale.ROOT);
- if (lower.endsWith(".pdf")) {
- return "application/pdf";
- } else if (lower.endsWith(".xml")) {
- return "application/xml";
- } else if (lower.endsWith(".doc")) {
- return "application/msword";
- } else if (lower.endsWith(".docx")) {
- return
"application/vnd.openxmlformats-officedocument.wordprocessingml.document";
- } else if (lower.endsWith(".xls")) {
- return "application/vnd.ms-excel";
- } else if (lower.endsWith(".xlsx")) {
- return
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
- } else if (lower.endsWith(".ppt")) {
- return "application/vnd.ms-powerpoint";
- } else if (lower.endsWith(".pptx")) {
- return
"application/vnd.openxmlformats-officedocument.presentationml.presentation";
- } else if (lower.endsWith(".txt")) {
- return "text/plain";
- } else if (lower.endsWith(".html") || lower.endsWith(".htm")) {
- return "text/html";
- } else if (lower.endsWith(".json")) {
- return "application/json";
- } else if (lower.endsWith(".png")) {
- return "image/png";
- } else if (lower.endsWith(".jpg") || lower.endsWith(".jpeg")) {
- return "image/jpeg";
- } else if (lower.endsWith(".gif")) {
- return "image/gif";
- } else if (lower.endsWith(".zip")) {
- return "application/zip";
- }
- return "application/octet-stream";
- }
-
/**
* Returns the temporary directory where files are stored.
*/
@@ -292,27 +202,6 @@ public class FrictionlessUnpackHandler extends
AbstractUnpackHandler implements
return !embeddedFiles.isEmpty();
}
- /**
- * Returns the path to the original document if stored.
- */
- public Path getOriginalDocumentPath() {
- return originalDocumentPath;
- }
-
- /**
- * Returns the name of the original document if stored.
- */
- public String getOriginalDocumentName() {
- return originalDocumentName;
- }
-
- /**
- * Returns true if the original document was stored.
- */
- public boolean hasOriginalDocument() {
- return originalDocumentPath != null &&
Files.exists(originalDocumentPath);
- }
-
/**
* Returns the UnpackConfig used by this handler.
*/
diff --git
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/TempFileUnpackHandler.java
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/TempFileUnpackHandler.java
index 6f665f27d3..ada8a2daa7 100644
---
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/TempFileUnpackHandler.java
+++
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/TempFileUnpackHandler.java
@@ -42,8 +42,6 @@ public class TempFileUnpackHandler extends
AbstractUnpackHandler
private final EmitKey containerEmitKey;
private final UnpackConfig unpackConfig;
private final List<EmbeddedFileInfo> embeddedFiles = new ArrayList<>();
- private Path originalDocumentPath;
- private String originalDocumentName;
private boolean closed = false;
/**
@@ -112,42 +110,6 @@ public class TempFileUnpackHandler extends
AbstractUnpackHandler
return !embeddedFiles.isEmpty();
}
- /**
- * Stores the original container document for inclusion in the zip.
- * Call this before parsing if includeOriginal is enabled.
- *
- * @param inputStream the original document input stream
- * @param fileName the file name for the original document
- */
- public void storeOriginalDocument(InputStream inputStream, String
fileName) throws IOException {
- this.originalDocumentName = fileName;
- this.originalDocumentPath = tempDirectory.resolve("_original_" +
fileName);
- try (OutputStream os = Files.newOutputStream(originalDocumentPath)) {
- inputStream.transferTo(os);
- }
- }
-
- /**
- * Returns the path to the original document if stored.
- */
- public Path getOriginalDocumentPath() {
- return originalDocumentPath;
- }
-
- /**
- * Returns the name of the original document if stored.
- */
- public String getOriginalDocumentName() {
- return originalDocumentName;
- }
-
- /**
- * Returns true if the original document was stored.
- */
- public boolean hasOriginalDocument() {
- return originalDocumentPath != null &&
Files.exists(originalDocumentPath);
- }
-
@Override
public void close() throws IOException {
if (!closed) {
diff --git
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
index a76defc641..b60ed056c3 100644
---
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
+++
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
@@ -37,6 +37,7 @@ import
org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
import org.apache.tika.extractor.UnpackHandler;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.writefilter.MetadataWriteLimiterFactory;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
@@ -319,15 +320,9 @@ class PipesWorker implements Callable<PipesResult> {
DataPackage dataPackage =
frictionlessHandler.buildDataPackage(containerName);
try {
- // Emit original document if included
- if (unpackConfig.isIncludeOriginal() &&
frictionlessHandler.hasOriginalDocument()) {
- String originalEmitKey = baseEmitKey + "/" +
frictionlessHandler.getOriginalDocumentName();
- try (InputStream is =
Files.newInputStream(frictionlessHandler.getOriginalDocumentPath())) {
- streamEmitter.emit(originalEmitKey, is, new Metadata(),
parseContext);
- }
- }
-
- // Emit each embedded file under unpacked/
+ // Emit each embedded file under unpacked/.
+ // When includeOriginal=true the container itself is added as id 0
by
+ // ParseHandler._preParse, so it appears here as one of the
embedded entries.
for (FrictionlessUnpackHandler.FrictionlessFileInfo fileInfo :
frictionlessHandler.getEmbeddedFiles()) {
String fileEmitKey = baseEmitKey + "/unpacked/" +
fileInfo.fileName();
try (InputStream is =
Files.newInputStream(fileInfo.filePath())) {
@@ -384,15 +379,9 @@ class PipesWorker implements Callable<PipesResult> {
zos.closeEntry();
}
- // Add original document if included (at root level)
- if (unpackConfig.isIncludeOriginal() &&
frictionlessHandler.hasOriginalDocument()) {
- ZipEntry originalEntry = new
ZipEntry(frictionlessHandler.getOriginalDocumentName());
- zos.putNextEntry(originalEntry);
- Files.copy(frictionlessHandler.getOriginalDocumentPath(), zos);
- zos.closeEntry();
- }
-
- // Add all embedded files under unpacked/
+ // Add all embedded files under unpacked/.
+ // When includeOriginal=true the container itself is added as id 0
by
+ // ParseHandler._preParse, so it appears here as one of the
embedded entries.
for (FrictionlessUnpackHandler.FrictionlessFileInfo fileInfo :
frictionlessHandler.getEmbeddedFiles()) {
ZipEntry fileEntry = new ZipEntry("unpacked/" +
fileInfo.fileName());
zos.putNextEntry(fileEntry);
@@ -441,14 +430,8 @@ class PipesWorker implements Callable<PipesResult> {
private void createZipFile(Path zipFile, TempFileUnpackHandler tempHandler,
UnpackConfig unpackConfig) throws IOException {
try (ZipOutputStream zos = new
ZipOutputStream(Files.newOutputStream(zipFile))) {
- // Include original document if requested
- if (unpackConfig.isIncludeOriginal() &&
tempHandler.hasOriginalDocument()) {
- ZipEntry originalEntry = new
ZipEntry(tempHandler.getOriginalDocumentName());
- zos.putNextEntry(originalEntry);
- Files.copy(tempHandler.getOriginalDocumentPath(), zos);
- zos.closeEntry();
- }
-
+ // When includeOriginal=true the container itself is added as id 0
by
+ // ParseHandler._preParse, so it appears here as one of the
embedded entries.
for (TempFileUnpackHandler.EmbeddedFileInfo fileInfo :
tempHandler.getEmbeddedFiles()) {
// Add the embedded file
ZipEntry fileEntry = new ZipEntry(fileInfo.fileName());
@@ -489,72 +472,6 @@ class PipesWorker implements Callable<PipesResult> {
mapper.writeValue(os, metadataMap);
}
- /**
- * Stores the original document to the temp handler for inclusion in the
zip.
- * Uses TikaInputStream's internal file caching to avoid consuming the
stream.
- */
- private void storeOriginalDocument(TikaInputStream tis,
TempFileUnpackHandler tempHandler)
- throws IOException {
- String fileName = getFileNameFromFetchKey();
-
- // TikaInputStream caches to a temp file internally - get that file
- Path originalPath = tis.getPath();
- if (originalPath != null && Files.exists(originalPath)) {
- // Copy from the cached file
- try (InputStream is = Files.newInputStream(originalPath)) {
- tempHandler.storeOriginalDocument(is, fileName);
- }
- } else {
- // Stream hasn't been cached yet - we need to read and reset
- tis.mark(Integer.MAX_VALUE);
- try {
- tempHandler.storeOriginalDocument(tis, fileName);
- } finally {
- tis.reset();
- }
- }
- }
-
- /**
- * Stores the original document to the frictionless handler for inclusion
in output.
- * Uses TikaInputStream's internal file caching to avoid consuming the
stream.
- */
- private void storeOriginalDocumentForFrictionless(TikaInputStream tis,
-
FrictionlessUnpackHandler frictionlessHandler)
- throws IOException {
- String fileName = getFileNameFromFetchKey();
-
- // TikaInputStream caches to a temp file internally - get that file
- Path originalPath = tis.getPath();
- if (originalPath != null && Files.exists(originalPath)) {
- // Copy from the cached file
- try (InputStream is = Files.newInputStream(originalPath)) {
- frictionlessHandler.storeOriginalDocument(is, fileName);
- }
- } else {
- // Stream hasn't been cached yet - we need to read and reset
- tis.mark(Integer.MAX_VALUE);
- try {
- frictionlessHandler.storeOriginalDocument(tis, fileName);
- } finally {
- tis.reset();
- }
- }
- }
-
- /**
- * Extracts the file name from the fetch key.
- */
- private String getFileNameFromFetchKey() {
- String fetchKey = fetchEmitTuple.getFetchKey().getFetchKey();
- String fileName = fetchKey;
- int lastSlash = Math.max(fetchKey.lastIndexOf('/'),
fetchKey.lastIndexOf('\\'));
- if (lastSlash >= 0 && lastSlash < fetchKey.length() - 1) {
- fileName = fetchKey.substring(lastSlash + 1);
- }
- return fileName;
- }
-
protected ParseDataOrPipesResult parseFromTuple() throws TikaException,
InterruptedException {
//start a new metadata object to gather info from the fetch process
//we want to isolate and not touch the metadata sent into the
fetchEmitTuple
@@ -569,22 +486,24 @@ class PipesWorker implements Callable<PipesResult> {
}
// Use newMetadata() to apply any configured write limits
Metadata metadata = localContext.newMetadata();
+ // Carry the caller-supplied resource name across the fresh-metadata
boundary so
+ // detection, suffix selection, and the Frictionless manifest's name
field see
+ // the logical filename rather than whatever the fetcher's path
happens to be
+ // (e.g., a server-side spool prefix). TikaInputStream.get(path,
metadata)
+ // already honors a pre-set RESOURCE_NAME_KEY.
+ Metadata tupleMetadata = fetchEmitTuple.getMetadata();
+ String suppliedName = tupleMetadata == null
+ ? null
+ : tupleMetadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
+ if (!StringUtils.isBlank(suppliedName)) {
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, suppliedName);
+ }
FetchHandler.TisOrResult tisOrResult =
fetchHandler.fetch(fetchEmitTuple, metadata, localContext);
if (tisOrResult.pipesResult() != null) {
return new ParseDataOrPipesResult(null, tisOrResult.pipesResult());
}
try (TikaInputStream tis = tisOrResult.tis()) {
- // Store original document for zipping/frictionless if requested
- UnpackHandler handler = localContext.get(UnpackHandler.class);
- UnpackConfig uc = localContext.get(UnpackConfig.class);
- if (uc != null && uc.isIncludeOriginal()) {
- if (handler instanceof FrictionlessUnpackHandler
frictionlessHandler) {
- storeOriginalDocumentForFrictionless(tis,
frictionlessHandler);
- } else if (handler instanceof TempFileUnpackHandler
tempHandler) {
- storeOriginalDocument(tis, tempHandler);
- }
- }
return parseHandler.parseWithStream(fetchEmitTuple, tis, metadata,
localContext);
} catch (SecurityException e) {
LOG.error("security exception id={}", fetchEmitTuple.getId(), e);
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/FrictionlessUnpackTest.java
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/FrictionlessUnpackTest.java
index deaf5e3561..14a05aad70 100644
---
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/FrictionlessUnpackTest.java
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/FrictionlessUnpackTest.java
@@ -635,28 +635,30 @@ public class FrictionlessUnpackTest {
@Test
public void testFrictionlessWithIncludeOriginal(@TempDir Path tmp) throws
Exception {
- // Test that includeOriginal works with Frictionless format
+ // includeOriginal=true causes the container to appear in the
Frictionless
+ // package as "unpacked/0.<ext>" (added by ParseHandler._preParse via
+ // unpackHandler.add(0, ...)) and to be listed once in
datapackage.json.
Path outputDir = tmp.resolve("output");
Files.createDirectories(outputDir);
try (PipesClient pipesClient = init(tmp, TEST_DOC_WITH_EMBEDDED)) {
ParseContext parseContext = new ParseContext();
parseContext.set(ParseMode.class, ParseMode.UNPACK);
-
+
UnpackConfig unpackConfig = new UnpackConfig();
unpackConfig.setEmitter(EMITTER_NAME);
unpackConfig.setOutputFormat(UnpackConfig.OUTPUT_FORMAT.FRICTIONLESS);
unpackConfig.setOutputMode(UnpackConfig.OUTPUT_MODE.ZIPPED);
- unpackConfig.setIncludeOriginal(true); // Include container
document
+ unpackConfig.setIncludeOriginal(true);
parseContext.set(UnpackConfig.class, unpackConfig);
-
+
PipesResult pipesResult = pipesClient.process(
new FetchEmitTuple(TEST_DOC_WITH_EMBEDDED,
new FetchKey(FETCHER_NAME, TEST_DOC_WITH_EMBEDDED),
new EmitKey(EMITTER_NAME, TEST_DOC_WITH_EMBEDDED),
new Metadata(), parseContext,
FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT));
-
+
assertTrue(pipesResult.isSuccess(),
"Frictionless with includeOriginal should succeed");
}
@@ -664,39 +666,43 @@ public class FrictionlessUnpackTest {
List<Path> zipFiles = Files.list(outputDir)
.filter(p -> p.toString().endsWith("-frictionless.zip"))
.toList();
+ assertEquals(1, zipFiles.size(), "Should create exactly one
frictionless zip");
try (ZipFile zip = new ZipFile(zipFiles.get(0).toFile())) {
- // Original file should be at root level or in a specific location
- boolean hasOriginal = false;
+ // The container itself should be present as the id-0 entry under
+ // unpacked/. Match "unpacked/0" exactly or "unpacked/0.<ext>"
+ // (whichever the active SUFFIX_STRATEGY produces).
+ Set<String> allEntries = new HashSet<>();
Enumeration<? extends ZipEntry> entries = zip.entries();
while (entries.hasMoreElements()) {
- ZipEntry entry = entries.nextElement();
- // Original could be at root or documented location
- if (entry.getName().contains(TEST_DOC_WITH_EMBEDDED) ||
- entry.getName().equals("original/" +
TEST_DOC_WITH_EMBEDDED)) {
- hasOriginal = true;
- break;
- }
+ allEntries.add(entries.nextElement().getName());
}
-
- // Also check datapackage.json for original in resources
+ boolean hasContainerAsId0 = allEntries.stream()
+ .anyMatch(n -> n.equals("unpacked/0") ||
n.startsWith("unpacked/0."));
+ assertTrue(hasContainerAsId0,
+ "With includeOriginal=true, the container should appear as
the " +
+ "unpacked/0 entry. Entries: " + allEntries);
+
+ // And the manifest's resources should list the container at
unpacked/0;
+ // no resource path should escape the unpacked/ prefix (no separate
+ // root-level "original" entry should exist).
ZipEntry dpEntry = zip.getEntry("datapackage.json");
- if (dpEntry != null) {
- JsonNode dataPackage;
- try (InputStream is = zip.getInputStream(dpEntry)) {
- dataPackage = OBJECT_MAPPER.readTree(is);
- }
- for (JsonNode resource : dataPackage.get("resources")) {
- String path = resource.get("path").asText();
- if (!path.startsWith("unpacked/")) {
- hasOriginal = true;
- break;
- }
+ assertNotNull(dpEntry, "datapackage.json should be present");
+ JsonNode dataPackage;
+ try (InputStream is = zip.getInputStream(dpEntry)) {
+ dataPackage = OBJECT_MAPPER.readTree(is);
+ }
+ boolean manifestListsContainer = false;
+ for (JsonNode resource : dataPackage.get("resources")) {
+ String path = resource.get("path").asText();
+ assertTrue(path.startsWith("unpacked/"),
+ "Manifest resources should only list unpacked/ paths;
got " + path);
+ if (path.equals("unpacked/0") ||
path.startsWith("unpacked/0.")) {
+ manifestListsContainer = true;
}
}
-
- assertTrue(hasOriginal,
- "With includeOriginal=true, original document should be in
package");
+ assertTrue(manifestListsContainer,
+ "Manifest should list the container at unpacked/0");
}
}
diff --git
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
index 21443d5795..b8c62b17ae 100644
---
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
+++
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
@@ -33,8 +33,10 @@ import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import javax.imageio.ImageIO;
import com.fasterxml.jackson.databind.JsonNode;
@@ -494,6 +496,311 @@ public class UnpackerResourceTest extends CXFTestBase {
assertTrue(hasZeroPaddedName, "Should have zero-padded file names
(e.g., 0000.jpeg)");
}
+ /**
+ * The datapackage.json "resources" array is the manifest for the
Frictionless
+ * package. Verifies it lists exactly the data files present in the zip,
with
+ * no missing or extraneous entries. The package envelope (datapackage.json
+ * and the optional metadata.json) is not itself a "resource" and is
excluded
+ * from the comparison.
+ */
+ @Test
+ public void testFrictionlessDataPackageMatchesArchiveContents() throws
Exception {
+ String configJson = """
+ {
+ "parse-context": {
+ "unpack-config": {
+ "outputFormat": "FRICTIONLESS",
+ "outputMode": "ZIPPED",
+ "includeFullMetadata": true,
+ "includeOriginal": true
+ }
+ }
+ }
+ """;
+ ContentDisposition fileCd = new ContentDisposition("form-data;
name=\"file\"; filename=\"Doc1_ole.doc\"");
+ Attachment fileAtt = new Attachment("file",
+ ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd);
+ Attachment configAtt = new Attachment("config", "application/json",
+ new
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
+
+ Response response = WebClient
+ .create(endPoint + ALL_PATH)
+ .type("multipart/form-data")
+ .accept("application/zip")
+ .post(new MultipartBody(Arrays.asList(fileAtt, configAtt)));
+
+ assertEquals(200, response.getStatus());
+ Map<String, byte[]> data = readZipArchiveBytes((InputStream)
response.getEntity());
+
+ byte[] dpBytes = data.get("datapackage.json");
+ assertNotNull(dpBytes, "datapackage.json should be present");
+
+ JsonNode dataPackage = MAPPER.readTree(dpBytes);
+ JsonNode resources = dataPackage.get("resources");
+ assertNotNull(resources, "datapackage.json should have a 'resources'
array");
+ assertTrue(resources.isArray() && resources.size() > 0,
+ "resources array should be non-empty");
+
+ Set<String> manifestPaths = new HashSet<>();
+ for (JsonNode resource : resources) {
+ manifestPaths.add(resource.get("path").asText());
+ }
+
+ Set<String> archiveDataFiles = new HashSet<>(data.keySet());
+ archiveDataFiles.remove("datapackage.json");
+ archiveDataFiles.remove("metadata.json");
+
+ assertEquals(archiveDataFiles, manifestPaths,
+ "datapackage.json 'resources' must list exactly the data files
in the zip. " +
+ "Only-in-manifest: " + difference(manifestPaths,
archiveDataFiles) +
+ ", only-in-archive: " + difference(archiveDataFiles,
manifestPaths));
+ }
+
+ /**
+ * The Frictionless manifest's "name" field is supposed to carry the
+ * original filename of each resource. For the container
(unpacked/0.<ext>),
+ * that name should be the filename the user supplied on the multipart
+ * upload -- not the server's internal spool filename.
+ */
+ @Test
+ public void testFrictionlessContainerManifestNameMatchesUploadFilename()
throws Exception {
+ String configJson = """
+ {
+ "parse-context": {
+ "unpack-config": {
+ "outputFormat": "FRICTIONLESS",
+ "outputMode": "ZIPPED"
+ }
+ }
+ }
+ """;
+ String uploadFilename = "Doc1_ole.doc";
+ ContentDisposition fileCd = new ContentDisposition(
+ "form-data; name=\"file\"; filename=\"" + uploadFilename +
"\"");
+ Attachment fileAtt = new Attachment("file",
+ ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd);
+ Attachment configAtt = new Attachment("config", "application/json",
+ new
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
+
+ Response response = WebClient
+ .create(endPoint + ALL_PATH)
+ .type("multipart/form-data")
+ .accept("application/zip")
+ .post(new MultipartBody(Arrays.asList(fileAtt, configAtt)));
+
+ assertEquals(200, response.getStatus());
+ Map<String, byte[]> data = readZipArchiveBytes((InputStream)
response.getEntity());
+
+ byte[] dpBytes = data.get("datapackage.json");
+ assertNotNull(dpBytes, "datapackage.json should be present");
+ JsonNode dataPackage = MAPPER.readTree(dpBytes);
+
+ JsonNode containerResource = null;
+ for (JsonNode resource : dataPackage.get("resources")) {
+ String path = resource.get("path").asText();
+ if (path.equals("unpacked/0") || path.startsWith("unpacked/0.")) {
+ containerResource = resource;
+ break;
+ }
+ }
+ assertNotNull(containerResource,
+ "Manifest should list the container at unpacked/0. Resources:
" +
+ dataPackage.get("resources"));
+
+ JsonNode nameNode = containerResource.get("name");
+ assertNotNull(nameNode,
+ "Container resource should carry a 'name' field. Resource: " +
containerResource);
+ assertEquals(uploadFilename, nameNode.asText(),
+ "Container's manifest name should be the user-supplied upload
filename, " +
+ "not the server's internal spool filename. Resource: "
+ containerResource);
+ }
+
+ private static Set<String> difference(Set<String> a, Set<String> b) {
+ Set<String> diff = new HashSet<>(a);
+ diff.removeAll(b);
+ return diff;
+ }
+
+ /**
+ * /unpack/all forces unpack-config.includeOriginal=true. In REGULAR mode,
+ * the container must appear exactly once in the archive — at
"0.<ext>"
+ * at the zip root — with no additional copy elsewhere.
+ */
+ @Test
+ public void testRegularAllContainerAppearsOnce() throws Exception {
+ ContentDisposition fileCd = new ContentDisposition("form-data;
name=\"file\"; filename=\"Doc1_ole.doc\"");
+ Attachment fileAtt = new Attachment("file",
+ ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd);
+
+ Response response = WebClient
+ .create(endPoint + ALL_PATH)
+ .type("multipart/form-data")
+ .accept("application/zip")
+ .post(new MultipartBody(Arrays.asList(fileAtt)));
+
+ assertEquals(200, response.getStatus());
+ Map<String, String> data = readZipArchive((InputStream)
response.getEntity());
+
+ long containerEntries = data.keySet().stream()
+ .filter(k -> !k.endsWith(".metadata.json"))
+ .filter(k -> k.equals("0") || k.startsWith("0."))
+ .count();
+ assertEquals(1, containerEntries,
+ "Container should appear exactly once at the zip root as
0.<ext>. " +
+ "Entries: " + data.keySet());
+ }
+
+ /**
+ * Documents the shape of /unpack output in FRICTIONLESS mode (no /all,
+ * default config): a datapackage.json manifest plus the
unpacked/<id>
+ * children, and no metadata.json envelope.
+ */
+ @Test
+ public void testFrictionlessUnpackShape() throws Exception {
+ String configJson = """
+ {
+ "parse-context": {
+ "unpack-config": {
+ "outputFormat": "FRICTIONLESS",
+ "outputMode": "ZIPPED"
+ }
+ }
+ }
+ """;
+ ContentDisposition fileCd = new ContentDisposition("form-data;
name=\"file\"; filename=\"Doc1_ole.doc\"");
+ Attachment fileAtt = new Attachment("file",
+ ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd);
+ Attachment configAtt = new Attachment("config", "application/json",
+ new
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
+
+ Response response = WebClient
+ .create(endPoint + UNPACKER_PATH)
+ .type("multipart/form-data")
+ .accept("application/zip")
+ .post(new MultipartBody(Arrays.asList(fileAtt, configAtt)));
+
+ assertEquals(200, response.getStatus());
+ Map<String, String> data = readZipArchive((InputStream)
response.getEntity());
+
+ assertTrue(data.containsKey("datapackage.json"),
+ "Should contain datapackage.json manifest. Entries: " +
data.keySet());
+ assertFalse(data.containsKey("metadata.json"),
+ "Should not contain metadata.json without includeFullMetadata.
Entries: " + data.keySet());
+ boolean hasUnpacked = data.keySet().stream().anyMatch(k ->
k.startsWith("unpacked/"));
+ assertTrue(hasUnpacked, "Should contain unpacked/ entries. Entries: "
+ data.keySet());
+ }
+
+ /**
+ * Documents the difference between /unpack and /unpack/all in
FRICTIONLESS:
+ * /unpack/all forces unpack-config.includeOriginal=true, which causes the
+ * container itself to be added to the unpack output as id 0
+ * ("unpacked/0.<ext>"). /unpack with no extra config does not.
+ */
+ @Test
+ public void testFrictionlessUnpackAllAddsContainerAsUnpackedZero() throws
Exception {
+ String configJson = """
+ {
+ "parse-context": {
+ "unpack-config": {
+ "outputFormat": "FRICTIONLESS",
+ "outputMode": "ZIPPED"
+ }
+ }
+ }
+ """;
+ ContentDisposition fileCd = new ContentDisposition("form-data;
name=\"file\"; filename=\"Doc1_ole.doc\"");
+ Attachment unpackFile = new Attachment("file",
+ ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd);
+ Attachment unpackConfig = new Attachment("config", "application/json",
+ new
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
+
+ Response unpackResponse = WebClient
+ .create(endPoint + UNPACKER_PATH)
+ .type("multipart/form-data")
+ .accept("application/zip")
+ .post(new MultipartBody(Arrays.asList(unpackFile,
unpackConfig)));
+ assertEquals(200, unpackResponse.getStatus());
+ Set<String> unpackEntries = new HashSet<>(readZipArchive(
+ (InputStream) unpackResponse.getEntity()).keySet());
+
+ Attachment allFile = new Attachment("file",
+ ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd);
+ Attachment allConfig = new Attachment("config", "application/json",
+ new
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
+ Response allResponse = WebClient
+ .create(endPoint + ALL_PATH)
+ .type("multipart/form-data")
+ .accept("application/zip")
+ .post(new MultipartBody(Arrays.asList(allFile, allConfig)));
+ assertEquals(200, allResponse.getStatus());
+ Set<String> allEntries = new HashSet<>(readZipArchive(
+ (InputStream) allResponse.getEntity()).keySet());
+
+ boolean unpackHasContainer = unpackEntries.stream().anyMatch(k ->
k.equals("unpacked/0.doc"));
+ boolean allHasContainer = allEntries.stream().anyMatch(k ->
k.equals("unpacked/0.doc"));
+ assertFalse(unpackHasContainer,
+ "/unpack alone should not include the container. Entries: " +
unpackEntries);
+ assertTrue(allHasContainer,
+ "/unpack/all should include the container as unpacked/0.<ext>.
" +
+ "Entries: " + allEntries);
+
+ Set<String> onlyInAll = difference(allEntries, unpackEntries);
+ assertEquals(Set.of("unpacked/0.doc"), onlyInAll,
+ "Only the container (unpacked/0.<ext>) should distinguish
/unpack/all " +
+ "from /unpack. Difference: " + onlyInAll);
+ }
+
+ /**
+ * Documents that includeFullMetadata=true in FRICTIONLESS adds a
+ * metadata.json envelope whose entries carry X-TIKA:content (the extracted
+ * text) and Content-Type alongside the per-document metadata.
+ */
+ @Test
+ public void testFrictionlessIncludeFullMetadataAddsMetadataJson() throws
Exception {
+ String configJson = """
+ {
+ "parse-context": {
+ "unpack-config": {
+ "outputFormat": "FRICTIONLESS",
+ "outputMode": "ZIPPED",
+ "includeFullMetadata": true
+ }
+ }
+ }
+ """;
+ ContentDisposition fileCd = new ContentDisposition("form-data;
name=\"file\"; filename=\"Doc1_ole.doc\"");
+ Attachment fileAtt = new Attachment("file",
+ ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd);
+ Attachment configAtt = new Attachment("config", "application/json",
+ new
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
+
+ Response response = WebClient
+ .create(endPoint + ALL_PATH)
+ .type("multipart/form-data")
+ .accept("application/zip")
+ .post(new MultipartBody(Arrays.asList(fileAtt, configAtt)));
+ assertEquals(200, response.getStatus());
+
+ Map<String, byte[]> data = readZipArchiveBytes((InputStream)
response.getEntity());
+ byte[] metadataBytes = data.get("metadata.json");
+ assertNotNull(metadataBytes, "metadata.json should be present when
includeFullMetadata=true. " +
+ "Entries: " + data.keySet());
+
+ JsonNode metadata = MAPPER.readTree(metadataBytes);
+ assertTrue(metadata.isArray() && metadata.size() > 0,
+ "metadata.json should be a non-empty array");
+
+ JsonNode container = metadata.get(0);
+ assertTrue(container.has("Content-Type"),
+ "Container metadata entry should carry Content-Type. Entry: "
+ container);
+ assertEquals("application/msword",
container.get("Content-Type").asText(),
+ "Container metadata entry should describe the submitted .doc");
+ assertTrue(container.has("X-TIKA:content"),
+ "Container metadata entry should carry X-TIKA:content
(extracted text). Entry: " + container);
+ assertTrue(container.get("X-TIKA:content").asText().length() > 0,
+ "X-TIKA:content for the container should be non-empty");
+ }
+
/**
* Tests UnpackSelector filtering by mime type.
*/