This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 2bab6d43f2 TIKA-4734 (#2843)
2bab6d43f2 is described below
commit 2bab6d43f28b1525a7f7a406151136d9f102de95
Author: Tim Allison <[email protected]>
AuthorDate: Thu May 28 17:12:06 2026 -0400
TIKA-4734 (#2843)
---
.../pages/migration-to-4x/migrating-to-4x.adoc | 5 +-
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 57 ++++++++++++++--------
.../test/java/org/apache/tika/cli/TikaCLITest.java | 33 +++++++++++++
3 files changed, 74 insertions(+), 21 deletions(-)
diff --git a/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
b/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
index 830e63104a..d36fd51c31 100644
--- a/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
+++ b/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
@@ -36,9 +36,12 @@ Tika provides a conversion tool in `tika-app` to help
migrate your XML configura
[source,bash]
----
-java -jar tika-app.jar
--convert-config-xml-to-json=tika-config.xml,tika-config.json
+java -jar tika-app.jar --convert-config-xml-to-json=tika-config.xml >
tika-config.json
----
+The converted JSON is written to standard output, so redirect it to the file
of your choice
+(as shown above). No separate `--config` argument is needed.
+
The converter currently supports:
* **Parsers section** - parser declarations with parameters and exclusions
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 6ed26567b6..980f2833c5 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -56,6 +56,7 @@ import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.commons.io.output.CloseShieldOutputStream;
import org.apache.logging.log4j.Level;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -392,8 +393,18 @@ public class TikaCLI {
private boolean testForAsync(String[] args) {
- // Single .json file is a config file for async mode
- if (args.length == 1 && args[0].endsWith(".json")) {
+ // Standalone utility flags are handled directly by process(), never
via async mode.
+ // (Without this guard, --convert-config-xml-to-json=in.xml would be
misread as a
+ // ".json"/batch arg and routed to async, failing with a
TikaConfigException - TIKA-4734.)
+ for (String arg : args) {
+ if (arg.startsWith("--convert-config-xml-to-json=")) {
+ return false;
+ }
+ }
+
+ // Single .json file is a config file for async mode.
+ // Reject option-style args like `--config=foo.json` so they fall
through to process().
+ if (args.length == 1 && !args[0].startsWith("-") &&
args[0].endsWith(".json")) {
return true;
}
@@ -604,27 +615,31 @@ public class TikaCLI {
System.out.println(localConfig.getConfig().toString());
}*/
- private void convertConfigXmlToJson(String paths) throws Exception {
- String[] parts = paths.split(",");
- if (parts.length != 2) {
- System.err.println("Error: --convert-config-xml-to-json requires
input and output paths separated by comma");
- System.err.println("Usage:
--convert-config-xml-to-json=<input.xml>,<output.json>");
+ private void convertConfigXmlToJson(String inputPath) throws Exception {
+ if (inputPath == null || inputPath.trim().isEmpty()) {
+ System.err.println("Error: --convert-config-xml-to-json requires
an input XML path");
+ System.err.println("Usage:
--convert-config-xml-to-json=<input.xml> > <output.json>");
return;
}
- Path xmlPath = Paths.get(parts[0].trim());
- Path jsonPath = Paths.get(parts[1].trim());
+ Path xmlPath = Paths.get(inputPath.trim());
- if (!Files.exists(xmlPath)) {
- System.err.println("Error: Input XML file not found: " + xmlPath);
+ if (!Files.isRegularFile(xmlPath)) {
+ System.err.println("Error: Input XML path is not a regular file: "
+ xmlPath);
+ return;
+ }
+ if (!Files.isReadable(xmlPath)) {
+ System.err.println("Error: Input XML file is not readable: " +
xmlPath);
return;
}
- try {
- XmlToJsonConfigConverter.convert(xmlPath, jsonPath);
- System.out.println("Successfully converted XML config to JSON:");
- System.out.println(" Input: " + xmlPath.toAbsolutePath());
- System.out.println(" Output: " + jsonPath.toAbsolutePath());
+ // Write JSON to stdout so the user can redirect it (e.g. >
tika-config.json).
+ // Informational/diagnostic output from the converter goes to the
logger (stderr),
+ // keeping stdout clean for the JSON payload. The converter closes the
stream it
+ // is handed, so shield System.out from being closed out from under us.
+ try (InputStream in = Files.newInputStream(xmlPath)) {
+ XmlToJsonConfigConverter.convert(in,
CloseShieldOutputStream.wrap(System.out));
+ System.out.flush();
} catch (Exception e) {
System.err.println("Error converting config: " + e.getMessage());
throw e;
@@ -771,16 +786,18 @@ public class TikaCLI {
out.println();
out.println(" -g or --gui Start the Apache Tika GUI");
out.println();
- out.println(" --config=<tika-config.xml>");
- out.println(" TikaConfig file. Must be specified before -g, -s,
-f or the dump-x-config !");
+ out.println(" --config=<tika-config.json>");
+ out.println(" TikaConfig file (JSON as of Tika 4.x). Must be
specified before -g or -f !");
// TODO: TIKA-XXXX - Re-enable config dump options once JSON
serialization is complete
// These options are not yet implemented in 4.x due to the migration
from XML to JSON config
// out.println(" --dump-minimal-config Print minimal TikaConfig");
// out.println(" --dump-current-config Print current TikaConfig");
// out.println(" --dump-static-config Print static config");
// out.println(" --dump-static-full-config Print static explicit
config");
- out.println("
--convert-config-xml-to-json=<input.xml>,<output.json>");
- out.println(" Convert legacy XML config to JSON format (parsers
section only)");
+ out.println(" --convert-config-xml-to-json=<input.xml>");
+ out.println(" Convert a legacy 3.x XML config to 4.x JSON
format (parsers section only),");
+ out.println(" writing the JSON to stdout. Redirect to save,
e.g.:");
+ out.println(" --convert-config-xml-to-json=tika-config.xml >
tika-config.json");
out.println("");
out.println(" -x or --xml Output XHTML content
(default)");
out.println(" -h or --html Output HTML content");
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 5498f3f056..51889ac259 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -38,6 +38,8 @@ import java.util.HashSet;
import java.util.List;
import java.util.Set;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
@@ -760,6 +762,37 @@ public class TikaCLITest {
assertTrue(content.contains("application/vnd.oasis.opendocument.text-web"));
}
+ /**
+ * Tests --convert-config-xml-to-json with no separate config file.
+ * Regression test for TIKA-4734: the flag used to be misrouted to async
+ * mode (the input arg ended in ".json"), failing with a
TikaConfigException
+ * unless a --config was also passed. It must now run standalone and write
+ * the converted JSON to stdout.
+ */
+ @Test
+ public void testConvertConfigXmlToJson() throws Exception {
+ String xmlPath =
Paths.get(getClass().getResource("/xml-configs/tika-config-simple.xml").toURI()).toString();
+ String content = getParamOutContent("--convert-config-xml-to-json=" +
xmlPath);
+
+ // stdout should be pure JSON; parse and assert on structure, not
formatting
+ JsonNode root = new ObjectMapper().readTree(content.trim());
+ JsonNode parsers = root.get("parsers");
+ assertNotNull(parsers, "Expected parsers section, got: " + content);
+ assertTrue(parsers.isArray() && parsers.size() > 0, "Expected
non-empty parsers array, got: " + content);
+
+ JsonNode pdfEntry = null;
+ for (JsonNode entry : parsers) {
+ if (entry.has("pdf-parser")) {
+ pdfEntry = entry.get("pdf-parser");
+ break;
+ }
+ }
+ assertNotNull(pdfEntry, "Expected pdf-parser entry, got: " + content);
+ JsonNode sortByPosition = pdfEntry.findValue("sortByPosition");
+ assertNotNull(sortByPosition, "Expected sortByPosition under
pdf-parser, got: " + content);
+ assertTrue(sortByPosition.asBoolean(), "Expected sortByPosition=true,
got: " + sortByPosition);
+ }
+
/**
* reset outContent and errContent if they are not empty
* run given params in TikaCLI and return outContent String with UTF-8