This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 2bab6d43f2 TIKA-4734 (#2843)
2bab6d43f2 is described below

commit 2bab6d43f28b1525a7f7a406151136d9f102de95
Author: Tim Allison <[email protected]>
AuthorDate: Thu May 28 17:12:06 2026 -0400

    TIKA-4734 (#2843)
---
 .../pages/migration-to-4x/migrating-to-4x.adoc     |  5 +-
 .../src/main/java/org/apache/tika/cli/TikaCLI.java | 57 ++++++++++++++--------
 .../test/java/org/apache/tika/cli/TikaCLITest.java | 33 +++++++++++++
 3 files changed, 74 insertions(+), 21 deletions(-)

diff --git a/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc b/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
index 830e63104a..d36fd51c31 100644
--- a/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
+++ b/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
@@ -36,9 +36,12 @@ Tika provides a conversion tool in `tika-app` to help migrate your XML configura
 
 [source,bash]
 ----
-java -jar tika-app.jar --convert-config-xml-to-json=tika-config.xml,tika-config.json
+java -jar tika-app.jar --convert-config-xml-to-json=tika-config.xml > tika-config.json
 ----
 
+The converted JSON is written to standard output, so redirect it to the file of your choice
+(as shown above). No separate `--config` argument is needed.
+
 The converter currently supports:
 
 * **Parsers section** - parser declarations with parameters and exclusions
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 6ed26567b6..980f2833c5 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -56,6 +56,7 @@ import javax.xml.transform.sax.TransformerHandler;
 import javax.xml.transform.stream.StreamResult;
 
 import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.commons.io.output.CloseShieldOutputStream;
 import org.apache.logging.log4j.Level;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -392,8 +393,18 @@ public class TikaCLI {
 
     private boolean testForAsync(String[] args) {
 
-        // Single .json file is a config file for async mode
-        if (args.length == 1 && args[0].endsWith(".json")) {
+        // Standalone utility flags are handled directly by process(), never via async mode.
+        // (Without this guard, --convert-config-xml-to-json=in.xml would be misread as a
+        // ".json"/batch arg and routed to async, failing with a TikaConfigException - TIKA-4734.)
+        for (String arg : args) {
+            if (arg.startsWith("--convert-config-xml-to-json=")) {
+                return false;
+            }
+        }
+
+        // Single .json file is a config file for async mode.
+        // Reject option-style args like `--config=foo.json` so they fall 
through to process().
+        if (args.length == 1 && !args[0].startsWith("-") && 
args[0].endsWith(".json")) {
             return true;
         }
 
@@ -604,27 +615,31 @@ public class TikaCLI {
         System.out.println(localConfig.getConfig().toString());
     }*/
 
-    private void convertConfigXmlToJson(String paths) throws Exception {
-        String[] parts = paths.split(",");
-        if (parts.length != 2) {
-            System.err.println("Error: --convert-config-xml-to-json requires 
input and output paths separated by comma");
-            System.err.println("Usage: 
--convert-config-xml-to-json=<input.xml>,<output.json>");
+    private void convertConfigXmlToJson(String inputPath) throws Exception {
+        if (inputPath == null || inputPath.trim().isEmpty()) {
+            System.err.println("Error: --convert-config-xml-to-json requires 
an input XML path");
+            System.err.println("Usage: 
--convert-config-xml-to-json=<input.xml> > <output.json>");
             return;
         }
 
-        Path xmlPath = Paths.get(parts[0].trim());
-        Path jsonPath = Paths.get(parts[1].trim());
+        Path xmlPath = Paths.get(inputPath.trim());
 
-        if (!Files.exists(xmlPath)) {
-            System.err.println("Error: Input XML file not found: " + xmlPath);
+        if (!Files.isRegularFile(xmlPath)) {
+            System.err.println("Error: Input XML path is not a regular file: " 
+ xmlPath);
+            return;
+        }
+        if (!Files.isReadable(xmlPath)) {
+            System.err.println("Error: Input XML file is not readable: " + 
xmlPath);
             return;
         }
 
-        try {
-            XmlToJsonConfigConverter.convert(xmlPath, jsonPath);
-            System.out.println("Successfully converted XML config to JSON:");
-            System.out.println("  Input:  " + xmlPath.toAbsolutePath());
-            System.out.println("  Output: " + jsonPath.toAbsolutePath());
+        // Write JSON to stdout so the user can redirect it (e.g. > tika-config.json).
+        // Informational/diagnostic output from the converter goes to the logger (stderr),
+        // keeping stdout clean for the JSON payload. The converter closes the stream it
+        // is handed, so shield System.out from being closed out from under us.
+        try (InputStream in = Files.newInputStream(xmlPath)) {
+            XmlToJsonConfigConverter.convert(in, 
CloseShieldOutputStream.wrap(System.out));
+            System.out.flush();
         } catch (Exception e) {
             System.err.println("Error converting config: " + e.getMessage());
             throw e;
@@ -771,16 +786,18 @@ public class TikaCLI {
         out.println();
         out.println("    -g  or --gui           Start the Apache Tika GUI");
         out.println();
-        out.println("    --config=<tika-config.xml>");
-        out.println("        TikaConfig file. Must be specified before -g, -s, 
-f or the dump-x-config !");
+        out.println("    --config=<tika-config.json>");
+        out.println("        TikaConfig file (JSON as of Tika 4.x). Must be 
specified before -g or -f !");
         // TODO: TIKA-XXXX - Re-enable config dump options once JSON 
serialization is complete
         // These options are not yet implemented in 4.x due to the migration 
from XML to JSON config
         // out.println("    --dump-minimal-config  Print minimal TikaConfig");
         // out.println("    --dump-current-config  Print current TikaConfig");
         // out.println("    --dump-static-config   Print static config");
         // out.println("    --dump-static-full-config  Print static explicit 
config");
-        out.println("    
--convert-config-xml-to-json=<input.xml>,<output.json>");
-        out.println("        Convert legacy XML config to JSON format (parsers 
section only)");
+        out.println("    --convert-config-xml-to-json=<input.xml>");
+        out.println("        Convert a legacy 3.x XML config to 4.x JSON 
format (parsers section only),");
+        out.println("        writing the JSON to stdout. Redirect to save, 
e.g.:");
+        out.println("        --convert-config-xml-to-json=tika-config.xml > 
tika-config.json");
         out.println("");
         out.println("    -x  or --xml           Output XHTML content 
(default)");
         out.println("    -h  or --html          Output HTML content");
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 5498f3f056..51889ac259 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -38,6 +38,8 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Disabled;
@@ -760,6 +762,37 @@ public class TikaCLITest {
         
assertTrue(content.contains("application/vnd.oasis.opendocument.text-web"));
     }
 
+    /**
+     * Tests --convert-config-xml-to-json with no separate config file.
+     * Regression test for TIKA-4734: the flag used to be misrouted to async
+     * mode (the input arg ended in ".json"), failing with a TikaConfigException
+     * unless a --config was also passed. It must now run standalone and write
+     * the converted JSON to stdout.
+     */
+    @Test
+    public void testConvertConfigXmlToJson() throws Exception {
+        String xmlPath = 
Paths.get(getClass().getResource("/xml-configs/tika-config-simple.xml").toURI()).toString();
+        String content = getParamOutContent("--convert-config-xml-to-json=" + 
xmlPath);
+
+        // stdout should be pure JSON; parse and assert on structure, not 
formatting
+        JsonNode root = new ObjectMapper().readTree(content.trim());
+        JsonNode parsers = root.get("parsers");
+        assertNotNull(parsers, "Expected parsers section, got: " + content);
+        assertTrue(parsers.isArray() && parsers.size() > 0, "Expected 
non-empty parsers array, got: " + content);
+
+        JsonNode pdfEntry = null;
+        for (JsonNode entry : parsers) {
+            if (entry.has("pdf-parser")) {
+                pdfEntry = entry.get("pdf-parser");
+                break;
+            }
+        }
+        assertNotNull(pdfEntry, "Expected pdf-parser entry, got: " + content);
+        JsonNode sortByPosition = pdfEntry.findValue("sortByPosition");
+        assertNotNull(sortByPosition, "Expected sortByPosition under 
pdf-parser, got: " + content);
+        assertTrue(sortByPosition.asBoolean(), "Expected sortByPosition=true, 
got: " + sortByPosition);
+    }
+
     /**
      * reset outContent and errContent if they are not empty
      * run given params in TikaCLI and return outContent String with UTF-8

Reply via email to