This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch TIKA-4736
in repository https://gitbox.apache.org/repos/asf/tika.git
commit a0e08651c0f366891752ce97110c17d698dc659f Author: tallison <[email protected]> AuthorDate: Thu May 21 16:17:07 2026 -0400 TIKA-4736 -- image extraction fails --- docs/modules/ROOT/pages/using-tika/cli/index.adoc | 3 +++ .../main/java/org/apache/tika/cli/AsyncHelper.java | 7 +++++ .../src/main/java/org/apache/tika/cli/TikaCLI.java | 2 ++ .../java/org/apache/tika/cli/AsyncHelperTest.java | 18 +++++++++++++ .../org/apache/tika/async/cli/PluginsWriter.java | 10 +++++++ .../apache/tika/async/cli/SimpleAsyncConfig.java | 13 +++++++++ .../org/apache/tika/async/cli/TikaAsyncCLI.java | 16 ++++++++++- .../apache/tika/async/cli/AsyncCliParserTest.java | 25 +++++++++++++++++ .../tika/pipes/emitter/fs/FileSystemEmitter.java | 18 ++++++++++++- .../fs/FileSystemEmitterRuntimeConfigTest.java | 31 ++++++++++++++++++++++ 10 files changed, 141 insertions(+), 2 deletions(-) diff --git a/docs/modules/ROOT/pages/using-tika/cli/index.adoc b/docs/modules/ROOT/pages/using-tika/cli/index.adoc index f5d15608cf..c9f9da8f03 100644 --- a/docs/modules/ROOT/pages/using-tika/cli/index.adoc +++ b/docs/modules/ROOT/pages/using-tika/cli/index.adoc @@ -200,6 +200,9 @@ to the output directory. |`--content-only` |Output only extracted content (no metadata, no JSON wrapper); implies `--concatenate` +|`--on-exists` +|Behavior when an output file already exists: `exception` (default), `replace` or `skip` + |`-T` or `--timeoutMs` |Timeout for each parse in milliseconds diff --git a/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java b/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java index 2320fa7df5..df99107871 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java +++ b/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java @@ -44,6 +44,13 @@ public class AsyncHelper { } argList.add("-o"); argList.add(dir); + } else if ("--extract".equals(arg)) { + // tika-app documents --extract as the long form of -z. 
TikaAsyncCLI + // only knows -z/--unzipShallow (and -Z/--unzipRecursive), so without + // this translation --extract falls through as an unrecognized arg and + // trips the "unknown args" / "set inputDir once" errors (TIKA-4736). + // -z passes through untranslated and is already recognized. + argList.add("-z"); } else if ("-a".equals(arg)) { //do nothing } else if (arg.startsWith(UNPACK_FORMAT_KEY)) { diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index aeea5f9eda..d4a5628489 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -801,6 +801,8 @@ public class TikaCLI { out.println(" -pX or --password=X Use document password X"); out.println(" -z or --extract Extract all attachements into current directory"); out.println(" --extract-dir=<dir> Specify target directory for -z"); + out.println(" --on-exists=<mode> When an output file already exists: exception"); + out.println(" (default), replace or skip"); out.println(" --maxEmbeddedDepth=X Maximum depth for embedded document extraction"); out.println(" --maxEmbeddedCount=X Maximum number of embedded documents to extract"); out.println(" -r or --pretty-print For JSON, XML and XHTML outputs, adds newlines and"); diff --git a/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java b/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java index f1a3b79864..bb668b7660 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java @@ -57,6 +57,24 @@ public class AsyncHelperTest { assertArrayEquals(expected, AsyncHelper.translateArgs(args)); } + @Test + public void testExtractLongFormTranslatedToZ() throws Exception { + // TIKA-4736: tika-app's --extract is the long form of -z. 
It must be + // translated to -z (which TikaAsyncCLI recognizes); otherwise it falls + // through as an unknown arg and the batch parse fails. + String[] args = new String[]{"--extract", "--extract-dir=ImageFiles", "input.pdf"}; + String[] expected = new String[]{"-z", "-o", "ImageFiles", "input.pdf"}; + assertArrayEquals(expected, AsyncHelper.translateArgs(args)); + } + + @Test + public void testShortFormZUnchanged() throws Exception { + // -z is already recognized by TikaAsyncCLI and must pass through untranslated. + String[] args = new String[]{"-z", "--extract-dir=ImageFiles", "input.pdf"}; + String[] expected = new String[]{"-z", "-o", "ImageFiles", "input.pdf"}; + assertArrayEquals(expected, AsyncHelper.translateArgs(args)); + } + @Test public void testJsonRecursiveSkipped() throws Exception { // -J is the default in async mode, so it's just skipped diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java index ef04527d95..dbc3de3935 100644 --- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java +++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java @@ -151,6 +151,16 @@ public class PluginsWriter { } } + // Override the emitter's onExists policy if set on the CLI (--on-exists) + if (!StringUtils.isBlank(simpleAsyncConfig.getOnExists()) + && emitters != null && emitters.has("fse")) { + ObjectNode fse = (ObjectNode) emitters.get("fse"); + if (fse != null && fse.has("file-system-emitter")) { + ObjectNode fsEmitter = (ObjectNode) fse.get("file-system-emitter"); + fsEmitter.put("onExists", simpleAsyncConfig.getOnExists()); + } + } + // Write timeout limits to parse-context if configured on CLI if (simpleAsyncConfig.getTimeoutMs() != null) { ObjectNode parseContext = root.has("parse-context") diff --git 
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java index 5ea5e764ba..f10788d89a 100644 --- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java +++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java @@ -46,6 +46,10 @@ class SimpleAsyncConfig { private final String unpackMode; // "ZIPPED" or "DIRECTORY" private final boolean unpackIncludeMetadata; + // Emitter behavior when an output file already exists: "EXCEPTION", "REPLACE" or "SKIP". + // null leaves the emitter/config default (EXCEPTION) in place. + private String onExists; + //TODO -- switch to a builder public SimpleAsyncConfig(String inputDir, String outputDir, Integer numClients, Long timeoutMs, String xmx, String fileList, String tikaConfig, BasicContentHandlerFactory.HANDLER_TYPE handlerType, @@ -136,6 +140,14 @@ class SimpleAsyncConfig { return unpackIncludeMetadata; } + public String getOnExists() { + return onExists; + } + + public void setOnExists(String onExists) { + this.onExists = onExists; + } + @Override public String toString() { return "SimpleAsyncConfig{" + @@ -154,6 +166,7 @@ class SimpleAsyncConfig { ", unpackFormat='" + unpackFormat + '\'' + ", unpackMode='" + unpackMode + '\'' + ", unpackIncludeMetadata=" + unpackIncludeMetadata + + ", onExists='" + onExists + '\'' + '}'; } } diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java index 3f02173a5e..845b5b1940 100644 --- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java +++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java @@ -82,6 +82,8 @@ public class TikaAsyncCLI { "output mode for unpacking: ZIPPED (default) or DIRECTORY"); 
options.addOption(null, "unpack-include-metadata", false, "include metadata.json in Frictionless output"); + options.addOption(null, "on-exists", true, + "behavior when an output file already exists: exception (default), replace or skip"); return options; } @@ -235,6 +237,16 @@ public class TikaAsyncCLI { unpackIncludeMetadata = true; } + String onExists = null; + if (line.hasOption("on-exists")) { + String v = line.getOptionValue("on-exists").toUpperCase(java.util.Locale.ROOT); + if (!v.equals("EXCEPTION") && !v.equals("REPLACE") && !v.equals("SKIP")) { + throw new TikaConfigException("Can't understand --on-exists=" + + line.getOptionValue("on-exists") + "; must be one of: exception, replace, skip"); + } + onExists = v; + } + if (line.getArgList().size() > 2) { throw new TikaConfigException("Can't have more than 2 unknown args: " + line.getArgList()); } @@ -282,10 +294,12 @@ public class TikaAsyncCLI { outputDir = Paths.get("output").toAbsolutePath().toString(); } - return new SimpleAsyncConfig(inputDir, outputDir, + SimpleAsyncConfig config = new SimpleAsyncConfig(inputDir, outputDir, numClients, timeoutMs, xmx, fileList, tikaConfig, handlerType, extractBytesMode, pluginsDir, concatenate, contentOnly, unpackFormat, unpackMode, unpackIncludeMetadata); + config.setOnExists(onExists); + return config; } private static BasicContentHandlerFactory.HANDLER_TYPE getHandlerType(String t) throws TikaConfigException { diff --git a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java index 8795549aab..b8960b7c08 100644 --- a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java +++ b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java @@ -20,6 +20,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; 
import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import java.nio.file.Files; @@ -30,6 +31,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import org.apache.tika.exception.TikaConfigException; import org.apache.tika.sax.BasicContentHandlerFactory; public class AsyncCliParserTest { @@ -91,6 +93,29 @@ public class AsyncCliParserTest { assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.XML, simpleAsyncConfig.getHandlerType()); } + @Test + public void testOnExists() throws Exception { + // TIKA-4736: --on-exists is normalized to upper case and carried on the config. + SimpleAsyncConfig replace = TikaAsyncCLI.parseCommandLine( + new String[]{"-i", "input", "-o", "output", "--on-exists", "replace"}); + assertEquals("REPLACE", replace.getOnExists()); + + SimpleAsyncConfig skip = TikaAsyncCLI.parseCommandLine( + new String[]{"-i", "input", "-o", "output", "--on-exists", "skip"}); + assertEquals("SKIP", skip.getOnExists()); + + // Default (unset) leaves the emitter/config default (EXCEPTION) in place. 
+ SimpleAsyncConfig dflt = TikaAsyncCLI.parseCommandLine( + new String[]{"-i", "input", "-o", "output"}); + assertNull(dflt.getOnExists()); + } + + @Test + public void testOnExistsInvalid() { + assertThrows(TikaConfigException.class, () -> TikaAsyncCLI.parseCommandLine( + new String[]{"-i", "input", "-o", "output", "--on-exists", "bogus"})); + } + @Test public void testFileListWithInputDir(@TempDir Path tmp) throws Exception { Path fileList = tmp.resolve("files.txt"); diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java index e990652b30..564beff9af 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java @@ -122,6 +122,8 @@ public class FileSystemEmitter extends AbstractStreamEmitter { try (Writer writer = Files.newBufferedWriter(output, StandardCharsets.UTF_8, StandardOpenOption.CREATE_NEW)) { //CREATE_NEW forces an IOException if the file already exists JsonMetadataList.toJson(metadataList, writer, config.prettyPrint()); + } catch (FileAlreadyExistsException e) { + throw alreadyExistsException(output); } } else { try (Writer writer = Files.newBufferedWriter(output, StandardCharsets.UTF_8)) { @@ -157,7 +159,11 @@ public class FileSystemEmitter extends AbstractStreamEmitter { if (config.onExists() == FileSystemEmitterConfig.ON_EXISTS.REPLACE) { Files.copy(inputStream, output, StandardCopyOption.REPLACE_EXISTING); } else if (config.onExists() == FileSystemEmitterConfig.ON_EXISTS.EXCEPTION) { - Files.copy(inputStream, output); + try { + Files.copy(inputStream, output); + } catch (FileAlreadyExistsException e) { + throw alreadyExistsException(output); + } } 
else if (config.onExists() == FileSystemEmitterConfig.ON_EXISTS.SKIP) { if (!Files.isRegularFile(output)) { try { @@ -169,6 +175,16 @@ public class FileSystemEmitter extends AbstractStreamEmitter { } } + /** + * Actionable error for the {@code onExists=EXCEPTION} case; the bare + * {@link FileAlreadyExistsException} reports only the path (TIKA-4736). + */ + private static IOException alreadyExistsException(Path output) { + return new IOException("Output already exists (onExists=EXCEPTION, not overwritten): " + + output.toAbsolutePath() + + ". Use an empty output dir, delete the file, or set onExists to REPLACE or SKIP."); + } + private FileSystemEmitterConfig getConfig(ParseContext parseContext) throws TikaConfigException, IOException { FileSystemEmitterConfig config = fileSystemEmitterConfig; String configKey = getExtensionConfig().id(); diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitterRuntimeConfigTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitterRuntimeConfigTest.java index c13bef3004..f3abd3dd67 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitterRuntimeConfigTest.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitterRuntimeConfigTest.java @@ -229,4 +229,35 @@ public class FileSystemEmitterRuntimeConfigTest { assertFalse(Files.exists(otherDir.resolve("test.json")), "File should not be created in other directory"); } + + @Test + public void testExceptionMessageWhenOutputExists(@TempDir Path tempDir) throws Exception { + // TIKA-4736: the onExists=EXCEPTION failure should carry an actionable message, + // not just the bare path from FileAlreadyExistsException. 
+ String config = String.format(Locale.ROOT, + "{\"basePath\":\"%s\", \"onExists\":\"EXCEPTION\"}", + tempDir.toString().replace("\\", "\\\\")); + FileSystemEmitter emitter = FileSystemEmitter.build( + new ExtensionConfig("test-emitter", "test", config)); + ParseContext context = new ParseContext(); + + // Bytes path (the --extract / image-extraction scenario) + emitter.emit("img", new ByteArrayInputStream("a".getBytes(StandardCharsets.UTF_8)), + new Metadata(), context); + IOException bytesEx = assertThrows(IOException.class, () -> + emitter.emit("img", new ByteArrayInputStream("b".getBytes(StandardCharsets.UTF_8)), + new Metadata(), context)); + assertTrue(bytesEx.getMessage().contains("onExists=EXCEPTION"), bytesEx.getMessage()); + assertTrue(bytesEx.getMessage().contains("REPLACE or SKIP"), bytesEx.getMessage()); + + // Metadata-JSON path + List<Metadata> metadataList = new ArrayList<>(); + Metadata m = new Metadata(); + m.set(TikaCoreProperties.TIKA_CONTENT, "x"); + metadataList.add(m); + emitter.emit("meta.json", metadataList, context); + IOException metaEx = assertThrows(IOException.class, () -> + emitter.emit("meta.json", metadataList, context)); + assertTrue(metaEx.getMessage().contains("onExists=EXCEPTION"), metaEx.getMessage()); + } }
