This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4734 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 0b0dc4f75c5adbc3bf014c8796e609512572b917 Author: tallison <[email protected]> AuthorDate: Thu May 21 06:34:08 2026 -0400 TIKA-4734 -- fix xml config converter --- .../pages/migration-to-4x/migrating-to-4x.adoc | 5 ++- .../src/main/java/org/apache/tika/cli/TikaCLI.java | 44 ++++++++++++++-------- .../test/java/org/apache/tika/cli/TikaCLITest.java | 19 ++++++++++ 3 files changed, 51 insertions(+), 17 deletions(-) diff --git a/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc b/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc index 34ef91d778..7cca66ec39 100644 --- a/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc +++ b/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc @@ -36,9 +36,12 @@ Tika provides a conversion tool in `tika-app` to help migrate your XML configura [source,bash] ---- -java -jar tika-app.jar --convert-config-xml-to-json=tika-config.xml,tika-config.json +java -jar tika-app.jar --convert-config-xml-to-json=tika-config.xml > tika-config.json ---- +The converted JSON is written to standard output, so redirect it to the file of your choice +(as shown above). No separate `--config` argument is needed. + The converter currently supports: * **Parsers section** - parser declarations with parameters and exclusions diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 82be748314..80999eee21 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -56,6 +56,7 @@ import javax.xml.transform.sax.TransformerHandler; import javax.xml.transform.stream.StreamResult; import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.commons.io.output.CloseShieldOutputStream; import org.apache.logging.log4j.Level; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -364,6 +365,15 @@ public class TikaCLI { private boolean testForAsync(String[] args) { + // Standalone utility flags are handled directly by process(), never via async mode. + // (Without this guard, --convert-config-xml-to-json=in.xml would be misread as a + // ".json"/batch arg and routed to async, failing with a TikaConfigException - TIKA-4734.) + for (String arg : args) { + if (arg.startsWith("--convert-config-xml-to-json=")) { + return false; + } + } + // Single .json file is a config file for async mode if (args.length == 1 && args[0].endsWith(".json")) { return true; @@ -576,27 +586,27 @@ public class TikaCLI { System.out.println(localConfig.getConfig().toString()); }*/ - private void convertConfigXmlToJson(String paths) throws Exception { - String[] parts = paths.split(","); - if (parts.length != 2) { - System.err.println("Error: --convert-config-xml-to-json requires input and output paths separated by comma"); - System.err.println("Usage: --convert-config-xml-to-json=<input.xml>,<output.json>"); + private void convertConfigXmlToJson(String inputPath) throws Exception { + if (inputPath == null || inputPath.trim().isEmpty()) { + System.err.println("Error: --convert-config-xml-to-json requires an input XML path"); + System.err.println("Usage: --convert-config-xml-to-json=<input.xml> > <output.json>"); return; } - Path xmlPath = Paths.get(parts[0].trim()); - Path jsonPath = Paths.get(parts[1].trim()); + Path xmlPath = Paths.get(inputPath.trim()); if (!Files.exists(xmlPath)) { System.err.println("Error: Input XML file not found: " + xmlPath); return; } - try { - XmlToJsonConfigConverter.convert(xmlPath, jsonPath); - System.out.println("Successfully converted XML config to JSON:"); - System.out.println(" Input: " + xmlPath.toAbsolutePath()); - System.out.println(" Output: " + jsonPath.toAbsolutePath()); + // Write JSON to stdout so the user can redirect it (e.g. > tika-config.json). + // Informational/diagnostic output from the converter goes to the logger (stderr), + // keeping stdout clean for the JSON payload. The converter closes the stream it + // is handed, so shield System.out from being closed out from under us. + try (InputStream in = Files.newInputStream(xmlPath)) { + XmlToJsonConfigConverter.convert(in, CloseShieldOutputStream.wrap(System.out)); + System.out.flush(); } catch (Exception e) { System.err.println("Error converting config: " + e.getMessage()); throw e; @@ -743,16 +753,18 @@ public class TikaCLI { out.println(); out.println(" -g or --gui Start the Apache Tika GUI"); out.println(); - out.println(" --config=<tika-config.xml>"); - out.println(" TikaConfig file. Must be specified before -g, -s, -f or the dump-x-config !"); + out.println(" --config=<tika-config.json>"); + out.println(" TikaConfig file (JSON as of Tika 4.x). Must be specified before -g, -s or -f !"); // TODO: TIKA-XXXX - Re-enable config dump options once JSON serialization is complete // These options are not yet implemented in 4.x due to the migration from XML to JSON config // out.println(" --dump-minimal-config Print minimal TikaConfig"); // out.println(" --dump-current-config Print current TikaConfig"); // out.println(" --dump-static-config Print static config"); // out.println(" --dump-static-full-config Print static explicit config"); - out.println(" --convert-config-xml-to-json=<input.xml>,<output.json>"); - out.println(" Convert legacy XML config to JSON format (parsers section only)"); + out.println(" --convert-config-xml-to-json=<input.xml>"); + out.println(" Convert a legacy 3.x XML config to 4.x JSON format (parsers section only),"); + out.println(" writing the JSON to stdout. Redirect to save, e.g.:"); + out.println(" --convert-config-xml-to-json=tika-config.xml > tika-config.json"); out.println(""); out.println(" -x or --xml Output XHTML content (default)"); out.println(" -h or --html Output HTML content"); diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java index 5498f3f056..396e174057 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java @@ -764,6 +764,25 @@ public class TikaCLITest { * reset outContent and errContent if they are not empty * run given params in TikaCLI and return outContent String with UTF-8 */ + /** + * Tests --convert-config-xml-to-json with no separate config file. + * Regression test for TIKA-4734: the flag used to be misrouted to async + * mode (the input arg ended in ".json"), failing with a TikaConfigException + * unless a --config was also passed. It must now run standalone and write + * the converted JSON to stdout. + */ + @Test + public void testConvertConfigXmlToJson() throws Exception { + String xmlPath = Paths.get(getClass().getResource("/xml-configs/tika-config-simple.xml").toURI()).toString(); + String content = getParamOutContent("--convert-config-xml-to-json=" + xmlPath); + + // stdout should contain the converted JSON (and only the JSON) + assertTrue(content.contains("\"parsers\""), "Expected JSON parsers section, got: " + content); + assertTrue(content.contains("pdf-parser"), "Expected pdf-parser in output, got: " + content); + assertTrue(content.contains("\"sortByPosition\" : true"), "Expected converted param, got: " + content); + assertTrue(content.trim().startsWith("{"), "Output should be pure JSON, got: " + content); + } + String getParamOutContent(String... params) throws Exception { resetContent(); TikaCLI.main(params);
