This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 4b66205620 TIKA-4736 -- image extraction fails (#2828)
4b66205620 is described below

commit 4b66205620250bb59602dca11b0f44374152f0f2
Author: Tim Allison <[email protected]>
AuthorDate: Thu May 21 21:35:07 2026 -0400

    TIKA-4736 -- image extraction fails (#2828)
---
 docs/modules/ROOT/pages/using-tika/cli/index.adoc  |  3 +++
 .../main/java/org/apache/tika/cli/AsyncHelper.java |  7 +++++
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |  2 ++
 .../java/org/apache/tika/cli/AsyncHelperTest.java  | 18 +++++++++++++
 .../org/apache/tika/async/cli/PluginsWriter.java   | 10 +++++++
 .../apache/tika/async/cli/SimpleAsyncConfig.java   | 13 +++++++++
 .../org/apache/tika/async/cli/TikaAsyncCLI.java    | 16 ++++++++++-
 .../apache/tika/async/cli/AsyncCliParserTest.java  | 25 +++++++++++++++++
 .../tika/pipes/emitter/fs/FileSystemEmitter.java   | 18 ++++++++++++-
 .../fs/FileSystemEmitterRuntimeConfigTest.java     | 31 ++++++++++++++++++++++
 10 files changed, 141 insertions(+), 2 deletions(-)

diff --git a/docs/modules/ROOT/pages/using-tika/cli/index.adoc b/docs/modules/ROOT/pages/using-tika/cli/index.adoc
index f5d15608cf..c9f9da8f03 100644
--- a/docs/modules/ROOT/pages/using-tika/cli/index.adoc
+++ b/docs/modules/ROOT/pages/using-tika/cli/index.adoc
@@ -200,6 +200,9 @@ to the output directory.
 |`--content-only`
 |Output only extracted content (no metadata, no JSON wrapper); implies `--concatenate`
 
+|`--on-exists`
+|Behavior when an output file already exists: `exception` (default), `replace` or `skip`
+
 |`-T` or `--timeoutMs`
 |Timeout for each parse in milliseconds
 
diff --git a/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java b/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
index 2320fa7df5..df99107871 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
@@ -44,6 +44,13 @@ public class AsyncHelper {
                 }
                 argList.add("-o");
                 argList.add(dir);
+            } else if ("--extract".equals(arg)) {
+                // tika-app documents --extract as the long form of -z. TikaAsyncCLI
+                // only knows -z/--unzipShallow (and -Z/--unzipRecursive), so without
+                // this translation --extract falls through as an unrecognized arg and
+                // trips the "unknown args" / "set inputDir once" errors (TIKA-4736).
+                // -z passes through untranslated and is already recognized.
+                argList.add("-z");
             } else if ("-a".equals(arg)) {
                 //do nothing
             } else if (arg.startsWith(UNPACK_FORMAT_KEY)) {
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index aeea5f9eda..d4a5628489 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -801,6 +801,8 @@ public class TikaCLI {
         out.println("    -pX or --password=X    Use document password X");
         out.println("    -z  or --extract       Extract all attachements into current directory");
         out.println("    --extract-dir=<dir>    Specify target directory for -z");
+        out.println("    --on-exists=<mode>     When an output file already exists: exception");
+        out.println("                           (default), replace or skip");
         out.println("    --maxEmbeddedDepth=X   Maximum depth for embedded document extraction");
         out.println("    --maxEmbeddedCount=X   Maximum number of embedded documents to extract");
         out.println("    -r  or --pretty-print  For JSON, XML and XHTML outputs, adds newlines and");
diff --git a/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java b/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
index f1a3b79864..bb668b7660 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
@@ -57,6 +57,24 @@ public class AsyncHelperTest {
         assertArrayEquals(expected, AsyncHelper.translateArgs(args));
     }
 
+    @Test
+    public void testExtractLongFormTranslatedToZ() throws Exception {
+        // TIKA-4736: tika-app's --extract is the long form of -z. It must be
+        // translated to -z (which TikaAsyncCLI recognizes); otherwise it falls
+        // through as an unknown arg and the batch parse fails.
+        String[] args = new String[]{"--extract", "--extract-dir=ImageFiles", "input.pdf"};
+        String[] expected = new String[]{"-z", "-o", "ImageFiles", "input.pdf"};
+        assertArrayEquals(expected, AsyncHelper.translateArgs(args));
+    }
+
+    @Test
+    public void testShortFormZUnchanged() throws Exception {
+        // -z is already recognized by TikaAsyncCLI and must pass through untranslated.
+        String[] args = new String[]{"-z", "--extract-dir=ImageFiles", "input.pdf"};
+        String[] expected = new String[]{"-z", "-o", "ImageFiles", "input.pdf"};
+        assertArrayEquals(expected, AsyncHelper.translateArgs(args));
+    }
+
     @Test
     public void testJsonRecursiveSkipped() throws Exception {
         // -J is the default in async mode, so it's just skipped
diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java
index ef04527d95..dbc3de3935 100644
--- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java
+++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java
@@ -151,6 +151,16 @@ public class PluginsWriter {
                 }
             }
 
+            // Override the emitter's onExists policy if set on the CLI (--on-exists)
+            if (!StringUtils.isBlank(simpleAsyncConfig.getOnExists())
+                    && emitters != null && emitters.has("fse")) {
+                ObjectNode fse = (ObjectNode) emitters.get("fse");
+                if (fse != null && fse.has("file-system-emitter")) {
+                    ObjectNode fsEmitter = (ObjectNode) fse.get("file-system-emitter");
+                    fsEmitter.put("onExists", simpleAsyncConfig.getOnExists());
+                }
+            }
+
             // Write timeout limits to parse-context if configured on CLI
             if (simpleAsyncConfig.getTimeoutMs() != null) {
                 ObjectNode parseContext = root.has("parse-context")
diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java
index 5ea5e764ba..f10788d89a 100644
--- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java
+++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java
@@ -46,6 +46,10 @@ class SimpleAsyncConfig {
     private final String unpackMode;    // "ZIPPED" or "DIRECTORY"
     private final boolean unpackIncludeMetadata;
 
+    // Emitter behavior when an output file already exists: "EXCEPTION", "REPLACE" or "SKIP".
+    // null leaves the emitter/config default (EXCEPTION) in place.
+    private String onExists;
+
     //TODO -- switch to a builder
     public SimpleAsyncConfig(String inputDir, String outputDir, Integer numClients, Long timeoutMs, String xmx, String fileList,
                              String tikaConfig, BasicContentHandlerFactory.HANDLER_TYPE handlerType,
@@ -136,6 +140,14 @@ class SimpleAsyncConfig {
         return unpackIncludeMetadata;
     }
 
+    public String getOnExists() {
+        return onExists;
+    }
+
+    public void setOnExists(String onExists) {
+        this.onExists = onExists;
+    }
+
     @Override
     public String toString() {
         return "SimpleAsyncConfig{" +
@@ -154,6 +166,7 @@ class SimpleAsyncConfig {
                 ", unpackFormat='" + unpackFormat + '\'' +
                 ", unpackMode='" + unpackMode + '\'' +
                 ", unpackIncludeMetadata=" + unpackIncludeMetadata +
+                ", onExists='" + onExists + '\'' +
                 '}';
     }
 }
diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
index 3f02173a5e..845b5b1940 100644
--- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
+++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
@@ -82,6 +82,8 @@ public class TikaAsyncCLI {
                 "output mode for unpacking: ZIPPED (default) or DIRECTORY");
         options.addOption(null, "unpack-include-metadata", false,
                 "include metadata.json in Frictionless output");
+        options.addOption(null, "on-exists", true,
+                "behavior when an output file already exists: exception (default), replace or skip");
 
         return options;
     }
@@ -235,6 +237,16 @@ public class TikaAsyncCLI {
             unpackIncludeMetadata = true;
         }
 
+        String onExists = null;
+        if (line.hasOption("on-exists")) {
+            String v = line.getOptionValue("on-exists").toUpperCase(java.util.Locale.ROOT);
+            if (!v.equals("EXCEPTION") && !v.equals("REPLACE") && !v.equals("SKIP")) {
+                throw new TikaConfigException("Can't understand --on-exists=" +
+                        line.getOptionValue("on-exists") + "; must be one of: exception, replace, skip");
+            }
+            onExists = v;
+        }
+
         if (line.getArgList().size() > 2) {
             throw new TikaConfigException("Can't have more than 2 unknown args: " + line.getArgList());
         }
@@ -282,10 +294,12 @@ public class TikaAsyncCLI {
             outputDir = Paths.get("output").toAbsolutePath().toString();
         }
 
-        return new SimpleAsyncConfig(inputDir, outputDir,
+        SimpleAsyncConfig config = new SimpleAsyncConfig(inputDir, outputDir,
                 numClients, timeoutMs, xmx, fileList, tikaConfig, handlerType,
                 extractBytesMode, pluginsDir, concatenate, contentOnly,
                 unpackFormat, unpackMode, unpackIncludeMetadata);
+        config.setOnExists(onExists);
+        return config;
     }
 
     private static BasicContentHandlerFactory.HANDLER_TYPE getHandlerType(String t) throws TikaConfigException {
diff --git a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java
index 8795549aab..b8960b7c08 100644
--- a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java
+++ b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java
@@ -20,6 +20,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
 import java.nio.file.Files;
@@ -30,6 +31,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.io.TempDir;
 
+import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.sax.BasicContentHandlerFactory;
 
 public class AsyncCliParserTest {
@@ -91,6 +93,29 @@ public class AsyncCliParserTest {
         assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.XML, simpleAsyncConfig.getHandlerType());
     }
 
+    @Test
+    public void testOnExists() throws Exception {
+        // TIKA-4736: --on-exists is normalized to upper case and carried on the config.
+        SimpleAsyncConfig replace = TikaAsyncCLI.parseCommandLine(
+                new String[]{"-i", "input", "-o", "output", "--on-exists", "replace"});
+        assertEquals("REPLACE", replace.getOnExists());
+
+        SimpleAsyncConfig skip = TikaAsyncCLI.parseCommandLine(
+                new String[]{"-i", "input", "-o", "output", "--on-exists", "skip"});
+        assertEquals("SKIP", skip.getOnExists());
+
+        // Default (unset) leaves the emitter/config default (EXCEPTION) in place.
+        SimpleAsyncConfig dflt = TikaAsyncCLI.parseCommandLine(
+                new String[]{"-i", "input", "-o", "output"});
+        assertNull(dflt.getOnExists());
+    }
+
+    @Test
+    public void testOnExistsInvalid() {
+        assertThrows(TikaConfigException.class, () -> TikaAsyncCLI.parseCommandLine(
+                new String[]{"-i", "input", "-o", "output", "--on-exists", "bogus"}));
+    }
+
     @Test
     public void testFileListWithInputDir(@TempDir Path tmp) throws Exception {
         Path fileList = tmp.resolve("files.txt");
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
index e990652b30..564beff9af 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
@@ -122,6 +122,8 @@ public class FileSystemEmitter extends AbstractStreamEmitter {
             try (Writer writer = Files.newBufferedWriter(output, StandardCharsets.UTF_8,
                     StandardOpenOption.CREATE_NEW)) { //CREATE_NEW forces an IOException if the file already exists
                 JsonMetadataList.toJson(metadataList, writer, 
config.prettyPrint());
+            } catch (FileAlreadyExistsException e) {
+                throw alreadyExistsException(output);
             }
         } else {
             try (Writer writer = Files.newBufferedWriter(output, StandardCharsets.UTF_8)) {
@@ -157,7 +159,11 @@ public class FileSystemEmitter extends AbstractStreamEmitter {
         if (config.onExists() == FileSystemEmitterConfig.ON_EXISTS.REPLACE) {
             Files.copy(inputStream, output, StandardCopyOption.REPLACE_EXISTING);
         } else if (config.onExists() == FileSystemEmitterConfig.ON_EXISTS.EXCEPTION) {
-            Files.copy(inputStream, output);
+            try {
+                Files.copy(inputStream, output);
+            } catch (FileAlreadyExistsException e) {
+                throw alreadyExistsException(output);
+            }
         } else if (config.onExists() == FileSystemEmitterConfig.ON_EXISTS.SKIP) {
             if (!Files.isRegularFile(output)) {
                 try {
@@ -169,6 +175,16 @@ public class FileSystemEmitter extends AbstractStreamEmitter {
         }
     }
 
+    /**
+     * Actionable error for the {@code onExists=EXCEPTION} case; the bare
+     * {@link FileAlreadyExistsException} reports only the path (TIKA-4736).
+     */
+    private static IOException alreadyExistsException(Path output) {
+        return new IOException("Output already exists (onExists=EXCEPTION, not overwritten): "
+                + output.toAbsolutePath()
+                + ". Use an empty output dir, delete the file, or set onExists to REPLACE or SKIP.");
+    }
+
     private FileSystemEmitterConfig getConfig(ParseContext parseContext) throws TikaConfigException, IOException {
         FileSystemEmitterConfig config = fileSystemEmitterConfig;
         String configKey = getExtensionConfig().id();
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitterRuntimeConfigTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitterRuntimeConfigTest.java
index c13bef3004..f3abd3dd67 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitterRuntimeConfigTest.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitterRuntimeConfigTest.java
@@ -229,4 +229,35 @@ public class FileSystemEmitterRuntimeConfigTest {
         assertFalse(Files.exists(otherDir.resolve("test.json")),
                 "File should not be created in other directory");
     }
+
+    @Test
+    public void testExceptionMessageWhenOutputExists(@TempDir Path tempDir) throws Exception {
+        // TIKA-4736: the onExists=EXCEPTION failure should carry an actionable message,
+        // not just the bare path from FileAlreadyExistsException.
+        String config = String.format(Locale.ROOT,
+                "{\"basePath\":\"%s\", \"onExists\":\"EXCEPTION\"}",
+                tempDir.toString().replace("\\", "\\\\"));
+        FileSystemEmitter emitter = FileSystemEmitter.build(
+                new ExtensionConfig("test-emitter", "test", config));
+        ParseContext context = new ParseContext();
+
+        // Bytes path (the --extract / image-extraction scenario)
+        emitter.emit("img", new ByteArrayInputStream("a".getBytes(StandardCharsets.UTF_8)),
+                new Metadata(), context);
+        IOException bytesEx = assertThrows(IOException.class, () ->
+                emitter.emit("img", new ByteArrayInputStream("b".getBytes(StandardCharsets.UTF_8)),
+                        new Metadata(), context));
+        assertTrue(bytesEx.getMessage().contains("onExists=EXCEPTION"), bytesEx.getMessage());
+        assertTrue(bytesEx.getMessage().contains("REPLACE or SKIP"), bytesEx.getMessage());
+
+        // Metadata-JSON path
+        List<Metadata> metadataList = new ArrayList<>();
+        Metadata m = new Metadata();
+        m.set(TikaCoreProperties.TIKA_CONTENT, "x");
+        metadataList.add(m);
+        emitter.emit("meta.json", metadataList, context);
+        IOException metaEx = assertThrows(IOException.class, () ->
+                emitter.emit("meta.json", metadataList, context));
+        assertTrue(metaEx.getMessage().contains("onExists=EXCEPTION"), metaEx.getMessage());
+    }
 }

Reply via email to