This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4663-follow-ons in repository https://gitbox.apache.org/repos/asf/tika.git
commit bb249146467b1df9367b0ddd726e5387364c0428 Author: tallison <[email protected]> AuthorDate: Fri Jun 5 17:17:50 2026 -0400 TIKA-4663 - make markdown the default content handler in tika-app, tika-server, and the async CLI --- CHANGES.txt | 12 ++++++++++++ .../ROOT/pages/migration-to-4x/migrating-to-4x.adoc | 10 ++++++++++ docs/modules/ROOT/pages/pipes/getting-started.adoc | 2 +- docs/modules/ROOT/pages/pipes/parse-modes.adoc | 2 +- docs/modules/ROOT/pages/using-tika/cli/index.adoc | 8 ++++---- docs/modules/ROOT/pages/using-tika/server/index.adoc | 4 ++-- .../main/java/org/apache/tika/cli/AsyncHelper.java | 4 ++++ .../src/main/java/org/apache/tika/cli/TikaCLI.java | 12 +++++++----- .../java/org/apache/tika/cli/AsyncHelperTest.java | 7 +++++++ .../test/java/org/apache/tika/cli/TikaCLITest.java | 12 ++++++++---- .../apache/tika/sax/BasicContentHandlerFactory.java | 2 +- tika-grpc/src/main/proto/tika.proto | 1 + .../java/org/apache/tika/async/cli/TikaAsyncCLI.java | 6 +++--- .../apache/tika/async/cli/AsyncCliParserTest.java | 3 +++ .../core/resource/RecursiveMetadataResource.java | 2 +- .../server/core/RecursiveMetadataResourceTest.java | 20 ++++++++++++++++++++ 16 files changed, 85 insertions(+), 22 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 2e7a5898a5..31c3876a0e 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,3 +1,15 @@ +Release 4.0.0-beta-1 - unreleased + + BREAKING CHANGES + + * The default content handler is now Markdown. tika-app, tika-server + (the /tika and /rmeta endpoints), and the async/pipes CLI now emit + Markdown content by default instead of XHTML/XML (plain text for the + async CLI). Request the previous format explicitly, e.g. tika-app + -x/--xml, the server /tika/xml and /rmeta/xml paths (or the + X-Tika-Handler header), and the async CLI --handler x (TIKA-4663). + + Release 4.0.0-alpha-1 - 5/4/2026 BREAKING CHANGES diff --git a/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc b/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc index 1aed5c00fa..8ce3c69afb 100644 --- a/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc +++ b/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc @@ -40,6 +40,16 @@ java -jar tika-app-<version>.jar [option...] [file...] If you have build scripts or container images that drop in just the jar, update them to unpack the zip and run from inside it. +== Default content handler: XHTML/XML -> Markdown + +In 3.x the default content handler produced XHTML/XML. In 4.x the default is **Markdown** everywhere: + +* `tika-app` outputs Markdown by default (was XHTML). Pass `-x`/`--xml`, `-h`/`--html`, or `-t`/`--text` to choose another format. +* `tika-server` — the `/tika` and `/rmeta` endpoints return Markdown content by default (was XHTML/XML). Use an explicit handler path (`/tika/xml`, `/rmeta/xml`, ...) or the `X-Tika-Handler` header to choose another format. +* The async/pipes CLI emits Markdown by default (was plain text). Use `--handler x` (etc.) to choose another format. + +If you parse the extracted content programmatically and expect XHTML/XML, request it explicitly as shown above (TIKA-4663). + == Configuration: XML to JSON Tika 4.x uses JSON configuration files instead of XML. The legacy `tika-config.xml` format diff --git a/docs/modules/ROOT/pages/pipes/getting-started.adoc b/docs/modules/ROOT/pages/pipes/getting-started.adoc index db6955aeb7..4baa06c88b 100644 --- a/docs/modules/ROOT/pages/pipes/getting-started.adoc +++ b/docs/modules/ROOT/pages/pipes/getting-started.adoc @@ -53,7 +53,7 @@ java -jar tika-app.jar -i /data/input -o /data/output --handler t java -jar tika-app.jar -i /data/input -o /data/output -Z ---- -Handler types: `t` (text), `h` (html), `x` (xml), `m` (markdown), `b` (body), `i` (ignore/metadata only). +Handler types: `t` (text), `h` (html), `x` (xml), `m` (markdown), `b` (body), `i` (ignore/metadata only). The default is `m` (markdown). == JSON Configuration diff --git a/docs/modules/ROOT/pages/pipes/parse-modes.adoc b/docs/modules/ROOT/pages/pipes/parse-modes.adoc index ab81227ac9..1bcc401a24 100644 --- a/docs/modules/ROOT/pages/pipes/parse-modes.adoc +++ b/docs/modules/ROOT/pages/pipes/parse-modes.adoc @@ -69,7 +69,7 @@ and applies to all modes that produce content (`RMETA`, `CONCATENATE`, `CONTENT_ Accepted `type` values: `TEXT`, `HTML`, `XML`, `MARKDOWN`, `BODY`, `IGNORE`. The CLI `--handler` flag uses single-letter shortcuts (`t`, `h`, `x`, `m`, `b`, `i`) that map onto -these values. +these values. If unset, the default is `MARKDOWN`. [cols="1,1,2"] |=== diff --git a/docs/modules/ROOT/pages/using-tika/cli/index.adoc b/docs/modules/ROOT/pages/using-tika/cli/index.adoc index 55e7b5aa75..d7c1fe7d17 100644 --- a/docs/modules/ROOT/pages/using-tika/cli/index.adoc +++ b/docs/modules/ROOT/pages/using-tika/cli/index.adoc @@ -118,7 +118,7 @@ options see <<_tika_pipes_processing,Tika Pipes Processing>> below. |Option |Description |`-x` or `--xml` -|Output XHTML content (default) +|Output XHTML content |`-h` or `--html` |Output HTML content @@ -127,7 +127,7 @@ options see <<_tika_pipes_processing,Tika Pipes Processing>> below. |Output plain text content (body) |`--md` -|Output Markdown content (body) +|Output Markdown content (body) (default) |`-T` or `--text-main` |Output plain text — main content only, via the boilerpipe handler @@ -145,7 +145,7 @@ options see <<_tika_pipes_processing,Tika Pipes Processing>> below. |Output metadata in XMP |`-J` or `--jsonRecursive` -|Output metadata and content from all embedded files. Combine with `-x`/`-h`/`-t`/`-m` to choose the content type (default: `-x`). +|Output metadata and content from all embedded files. Combine with `-x`/`-h`/`-t`/`-m` to choose the content type (default: `--md`). |`-r` or `--pretty-print` |For JSON, XML, and XHTML output, add newlines and whitespace for readability. @@ -378,7 +378,7 @@ This processes all files in the input directory and writes JSON metadata |Option |Description |`--handler=<X>` -|Content handler type: `t`=text, `h`=html, `x`=xml, `m`=markdown, `b`=body, `i`=ignore. Default: `t`. +|Content handler type: `t`=text, `h`=html, `x`=xml, `m`=markdown, `b`=body, `i`=ignore. Default: `m`. |`--concatenate` |Concatenate content from all embedded documents into a single content field. diff --git a/docs/modules/ROOT/pages/using-tika/server/index.adoc b/docs/modules/ROOT/pages/using-tika/server/index.adoc index e630f97887..0c92d3e60f 100644 --- a/docs/modules/ROOT/pages/using-tika/server/index.adoc +++ b/docs/modules/ROOT/pages/using-tika/server/index.adoc @@ -117,7 +117,7 @@ For the root `/tika` PUT endpoint you can also pick the handler with a header: curl -T document.pdf -H "X-Tika-Handler: markdown" http://localhost:9998/tika ---- -Accepted values: `text`, `html`, `xml`, `markdown` (or `md`), `body`, `ignore`. +Accepted values: `text`, `html`, `xml`, `markdown` (or `md`), `body`, `ignore`. The default is `markdown`. === Recursive Metadata (`/rmeta`) @@ -126,7 +126,7 @@ array of metadata objects. The handler controls the content field of each entry: [source,bash] ---- -curl -T document.pdf http://localhost:9998/rmeta # default: text +curl -T document.pdf http://localhost:9998/rmeta # default: markdown curl -T document.pdf http://localhost:9998/rmeta/text curl -T document.pdf http://localhost:9998/rmeta/html curl -T document.pdf http://localhost:9998/rmeta/xml diff --git a/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java b/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java index df99107871..3314298958 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java +++ b/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java @@ -78,6 +78,10 @@ public class AsyncHelper { // Translate TikaCLI xml output to the TikaAsyncCLI handler type. argList.add("--handler"); argList.add("x"); + } else if (arg.equals("--md")) { + // Translate TikaCLI markdown output to the TikaAsyncCLI handler type. + argList.add("--handler"); + argList.add("m"); } else if (arg.equals("-J") || arg.equals("--jsonRecursive")) { // TikaAsyncCLI always outputs JSON with recursive metadata (RMETA mode) // This is already the default, so we just skip this arg diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index b420db0ac2..5bea9f0da6 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -243,7 +243,7 @@ public class TikaCLI { return getTransformerHandler(output, "xml", encoding, prettyPrint); } }; - private OutputType type = XML; + private OutputType type = MARKDOWN; private final OutputType HTML = new OutputType() { @Override protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception { @@ -793,6 +793,8 @@ public class TikaCLI { handlerType = BasicContentHandlerFactory.HANDLER_TYPE.TEXT; } else if (type.equals(TEXT_MAIN)) { handlerType = BasicContentHandlerFactory.HANDLER_TYPE.BODY; + } else if (type.equals(MARKDOWN)) { + handlerType = BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN; } else if (type.equals(METADATA)) { handlerType = BasicContentHandlerFactory.HANDLER_TYPE.IGNORE; } @@ -823,10 +825,10 @@ public class TikaCLI { out.println(" writing the JSON to stdout. Redirect to save, e.g.:"); out.println(" --convert-config-xml-to-json=tika-config.xml > tika-config.json"); out.println(""); - out.println(" -x or --xml Output XHTML content (default)"); + out.println(" -x or --xml Output XHTML content"); out.println(" -h or --html Output HTML content"); out.println(" -t or --text Output plain text content (body)"); - out.println(" --md Output Markdown content (body)"); + out.println(" --md Output Markdown content (body) (default)"); out.println(" -T or --text-main Output plain text content (main content only via boilerpipe handler)"); out.println(" -A or --text-all Output all text content"); out.println(" -m or --metadata Output only metadata"); @@ -834,7 +836,7 @@ public class TikaCLI { out.println(" -y or --xmp Output metadata in XMP"); out.println(" -J or --jsonRecursive Output metadata and content from all"); out.println(" embedded files (choose content type"); - out.println(" with -x, -h, -t or -m; default is -x)"); + out.println(" with -x, -h, -t or -m; default is --md)"); out.println(" -a or --async Run Tika in async mode; must specify details in a" + " tikaConfig file"); out.println(" -l or --language Output only language"); out.println(" -d or --detect Detect document type"); @@ -915,7 +917,7 @@ public class TikaCLI { out.println(" -c, --config=<file> Tika config file (--config=<file> also accepted)"); out.println(" -p, --pluginsDir Plugins directory"); out.println(" --fileList File list (one path per line, relative to -i or absolute)"); - out.println(" --handler Handler type: t=text, h=html, x=xml, m=markdown, b=body, i=ignore"); + out.println(" --handler Handler type: t=text, h=html, x=xml, m=markdown, b=body, i=ignore (default: m)"); out.println(" --concatenate Concatenate content from all embedded documents"); out.println(" --content-only Output only extracted content (no JSON wrapper); implies --concatenate"); out.println(" --on-exists Behavior when an output file exists: exception (default), replace, skip"); diff --git a/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java b/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java index bb668b7660..e59434b519 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java @@ -57,6 +57,13 @@ public class AsyncHelperTest { assertArrayEquals(expected, AsyncHelper.translateArgs(args)); } + @Test + public void testMarkdownHandler() throws Exception { + String[] args = new String[]{"--md", "input", "output"}; + String[] expected = new String[]{"--handler", "m", "input", "output"}; + assertArrayEquals(expected, AsyncHelper.translateArgs(args)); + } + @Test public void testExtractLongFormTranslatedToZ() throws Exception { // TIKA-4736: tika-app's --extract is the long form of -z. It must be diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java index 51889ac259..0cdb516f11 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java @@ -259,7 +259,7 @@ public class TikaCLITest { @Test public void testExtractJavascript() throws Exception { - String json = getParamOutContent("-J", resourcePrefix + "testPDFPackage.pdf"); + String json = getParamOutContent("-J", "-x", resourcePrefix + "testPDFPackage.pdf"); assertTrue(json.contains("type=\\\"PDActionJavaScript\\\"")); assertTrue(json.contains("MACRO")); assertTrue(json.contains("NAMES_TREE")); @@ -341,7 +341,7 @@ public class TikaCLITest { */ @Test public void testListMetModels() throws Exception { - String content = getParamOutContent("--list-met-models", resourcePrefix + "alice.cli.test"); + String content = getParamOutContent("--list-met-models", "-x", resourcePrefix + "alice.cli.test"); assertTrue(content.contains("text/plain")); } @@ -663,7 +663,7 @@ public class TikaCLITest { @Test public void testConfig() throws Exception { - String content = getParamOutContent("--config=" + CONFIGS_DIR.toString() + "/tika-config1.json", resourcePrefix + "bad_xml.xml"); + String content = getParamOutContent("--config=" + CONFIGS_DIR.toString() + "/tika-config1.json", "-x", resourcePrefix + "bad_xml.xml"); assertTrue(content.contains("apple")); assertTrue(content.contains("org.apache.tika.parser.html.JSoupParser")); } @@ -679,8 +679,12 @@ public class TikaCLITest { @Test public void testJsonRecursiveMetadataParserDefault() throws Exception { + // TIKA-4663: default handler is markdown, so recursive content is markdown, not XHTML. String content = getParamOutContent("-J", "-r", resourcePrefix + "test_recursive_embedded.docx"); - assertTrue(content.contains("\"X-TIKA:content\" : \"<html xmlns=\\\"http://www.w3.org/1999/xhtml")); + assertFalse(content.contains("<html xmlns=\\\"http://www.w3.org/1999/xhtml"), + "default recursive content should be markdown, not XHTML"); + assertTrue(content.contains("# embed1.zip"), + "default recursive content should be markdown (heading syntax)"); } @Test diff --git a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java index 53d382b2e0..a7907fe3b7 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java +++ b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java @@ -47,7 +47,7 @@ public class BasicContentHandlerFactory implements StreamingContentHandlerFactor /** * No-arg constructor for bean-style configuration (e.g., Jackson deserialization). - * Creates a factory with TEXT handler type, unlimited write, and throwOnWriteLimitReached=true. + * Creates a factory with MARKDOWN handler type, unlimited write, and throwOnWriteLimitReached=true. */ public BasicContentHandlerFactory() { } diff --git a/tika-grpc/src/main/proto/tika.proto b/tika-grpc/src/main/proto/tika.proto index 0a64f37115..1979b2b162 100644 --- a/tika-grpc/src/main/proto/tika.proto +++ b/tika-grpc/src/main/proto/tika.proto @@ -101,6 +101,7 @@ message FetchAndParseRequest { // The ID of the emitter to use (optional). If not provided, no emitter will be used. string emitter_id = 4; // Optional JSON object to configure the ParseContext for this request, overriding server defaults. + // When unset, the content handler defaults to markdown. // Keys are parse-context component names; values are their JSON configs. // Example: {"basic-content-handler-factory": {"type": "HTML"}, "timeout-limits": {"progressTimeoutMillis": 30000}} // See the parse-context.idx component registry for available component names. diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java index fb19447111..d9b96d359b 100644 --- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java +++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java @@ -71,7 +71,7 @@ public class TikaAsyncCLI { options.addOption(null, "Xmx", true, "heap for the forked clients, e.g. --Xmx 1g"); options.addOption("h", "help", false, "this help message"); options.addOption("T", "timeoutMs", true, "timeout for each parse in milliseconds"); - options.addOption(null, "handler", true, "handler type: t=text, h=html, x=xml, m=markdown, b=body, i=ignore"); + options.addOption(null, "handler", true, "handler type: t=text, h=html, x=xml, m=markdown, b=body, i=ignore (default: m)"); options.addOption("p", "pluginsDir", true, "plugins directory"); options.addOption("l", "fileList", true, "file containing one path per line (relative to inputDir or absolute)"); @@ -164,7 +164,7 @@ public class TikaAsyncCLI { if (args.length == 2 && ! args[0].startsWith("-")) { return new SimpleAsyncConfig(args[0], args[1], 1, 30000L, "-Xmx1g", null, null, - BasicContentHandlerFactory.HANDLER_TYPE.TEXT, + BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN, SimpleAsyncConfig.ExtractBytesMode.NONE, null); } @@ -185,7 +185,7 @@ public class TikaAsyncCLI { String tikaConfig = null; String asyncConfig = null; String pluginsDir = null; - BasicContentHandlerFactory.HANDLER_TYPE handlerType = BasicContentHandlerFactory.HANDLER_TYPE.TEXT; + BasicContentHandlerFactory.HANDLER_TYPE handlerType = BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN; SimpleAsyncConfig.ExtractBytesMode extractBytesMode = SimpleAsyncConfig.ExtractBytesMode.NONE; if (line.hasOption("i")) { inputDir = line.getOptionValue("i"); diff --git a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java index b8960b7c08..975506989e 100644 --- a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java +++ b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java @@ -46,6 +46,8 @@ public class AsyncCliParserTest { assertEquals(1, simpleAsyncConfig.getNumClients()); assertEquals(30000L, simpleAsyncConfig.getTimeoutMs()); assertEquals("-Xmx1g", simpleAsyncConfig.getXmx()); + // TIKA-4663: default content handler is markdown + assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN, simpleAsyncConfig.getHandlerType()); simpleAsyncConfig = TikaAsyncCLI.parseCommandLine(new String[]{"-o", "output", "-i", "input"}); assertEquals("input", simpleAsyncConfig.getInputDir()); @@ -54,6 +56,7 @@ public class AsyncCliParserTest { assertNull(simpleAsyncConfig.getNumClients()); assertNull(simpleAsyncConfig.getTimeoutMs()); assertNull(simpleAsyncConfig.getXmx()); + assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN, simpleAsyncConfig.getHandlerType()); simpleAsyncConfig = TikaAsyncCLI.parseCommandLine(new String[]{"-output", "output", "-input", "input"}); assertEquals("input", simpleAsyncConfig.getInputDir()); diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java index a129e47f30..2f33377762 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java @@ -51,7 +51,7 @@ import org.apache.tika.server.core.MetadataList; public class RecursiveMetadataResource { protected static final String HANDLER_TYPE_PARAM = "handler"; - protected static final BasicContentHandlerFactory.HANDLER_TYPE DEFAULT_HANDLER_TYPE = BasicContentHandlerFactory.HANDLER_TYPE.XML; + protected static final BasicContentHandlerFactory.HANDLER_TYPE DEFAULT_HANDLER_TYPE = BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN; private static final Logger LOG = LoggerFactory.getLogger(RecursiveMetadataResource.class); /** diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/RecursiveMetadataResourceTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/RecursiveMetadataResourceTest.java index 106db976d9..930225db26 100644 --- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/RecursiveMetadataResourceTest.java +++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/RecursiveMetadataResourceTest.java @@ -18,6 +18,7 @@ package org.apache.tika.server.core; import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; import java.io.InputStream; import java.io.InputStreamReader; @@ -71,6 +72,25 @@ public class RecursiveMetadataResourceTest extends CXFTestBase { assertContains("null pointer message", metadata.get(TikaCoreProperties.CONTAINER_EXCEPTION)); } + + @Test + public void testDefaultHandlerIsMarkdown() throws Exception { + // TIKA-4663: /rmeta with no handler now defaults to markdown (was xml). + String defaultContent = rmetaContent(""); + assertEquals(rmetaContent("/markdown"), defaultContent, + "default /rmeta handler should be markdown"); + assertNotEquals(rmetaContent("/xml"), defaultContent, + "default /rmeta handler should no longer be xml"); + } + + private String rmetaContent(String handlerSuffix) throws Exception { + Response response = WebClient + .create(endPoint + META_PATH + handlerSuffix) + .accept("application/json") + .put(ClassLoader.getSystemResourceAsStream(TEST_NULL_POINTER)); + Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8); + return JsonMetadataList.fromJson(reader).get(0).get(TikaCoreProperties.TIKA_CONTENT); + } /* @Test public void testWriteLimitInAll() throws Exception {
