This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch haystack-jina-embedding in repository https://gitbox.apache.org/repos/asf/tika.git
commit 44e81fa0ce023d7cc7e030ae4d811689216e3483 Author: tballison <[email protected]> AuthorDate: Wed May 13 12:43:08 2026 -0400 improve jina-integration --- .../tika/inference/AbstractEmbeddingFilter.java | 2 +- .../org/apache/tika/inference/ChunkSerializer.java | 11 +- .../apache/tika/inference/JinaEmbeddingFilter.java | 91 ++++++++++++ .../tika/inference/OpenAIImageEmbeddingParser.java | 10 +- .../tika/inference/JinaEmbeddingFilterTest.java | 159 +++++++++++++++++++++ 5 files changed, 269 insertions(+), 4 deletions(-) diff --git a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/AbstractEmbeddingFilter.java b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/AbstractEmbeddingFilter.java index 5d5cf78282..a89573c29b 100644 --- a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/AbstractEmbeddingFilter.java +++ b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/AbstractEmbeddingFilter.java @@ -143,7 +143,7 @@ public abstract class AbstractEmbeddingFilter extends MetadataFilter { i, Math.min(i + batchSize, chunks.size())); embed(batch, defaultConfig); } - ChunkSerializer.mergeInto(metadata, chunks); + ChunkSerializer.mergeInto(metadata, chunks, defaultConfig.getOutputField()); } catch (IOException e) { throw new TikaException( "Embedding inference failed: " + e.getMessage(), e); diff --git a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/ChunkSerializer.java b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/ChunkSerializer.java index 95e3234884..84ecafb806 100644 --- a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/ChunkSerializer.java +++ b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/ChunkSerializer.java @@ -76,15 +76,22 @@ public final class ChunkSerializer { public static void mergeInto( org.apache.tika.metadata.Metadata metadata, List<Chunk> newChunks) throws IOException { + mergeInto(metadata, newChunks, TikaCoreProperties.TIKA_CHUNKS); + } + + public static void mergeInto( + org.apache.tika.metadata.Metadata metadata, + List<Chunk> newChunks, + String fieldName) throws IOException { List<Chunk> existing; - String current = metadata.get(TikaCoreProperties.TIKA_CHUNKS); + String current = metadata.get(fieldName); if (current != null && !current.isEmpty()) { existing = fromJson(current); } else { existing = new ArrayList<>(); } existing.addAll(newChunks); - metadata.set(TikaCoreProperties.TIKA_CHUNKS, toJson(existing)); + metadata.set(fieldName, toJson(existing)); } /** diff --git a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/JinaEmbeddingFilter.java b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/JinaEmbeddingFilter.java new file mode 100644 index 0000000000..aacd32447b --- /dev/null +++ b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/JinaEmbeddingFilter.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.inference; + +import java.util.List; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ArrayNode; +import com.fasterxml.jackson.databind.node.ObjectNode; + +import org.apache.tika.config.TikaComponent; +import org.apache.tika.utils.StringUtils; + +/** + * Extends {@link OpenAIEmbeddingFilter} for + * <a href="https://jina.ai/embeddings/">Jina AI v5 text embeddings</a>. + * <p> + * The only difference from the standard OpenAI format is an optional + * {@code "task"} field in the request body that instructs the Jina model + * how to optimise the embedding. Supported values include + * {@code retrieval.passage} (default, for indexing documents), + * {@code retrieval.query} (for query-time embeddings), + * {@code text-matching}, {@code classification}, and {@code separation}. + * <p> + * Configuration key: {@code "jina-embedding-filter"} + * + * @since Apache Tika 4.0 + */ +@TikaComponent(name = "jina-embedding-filter", spi = false) +public class JinaEmbeddingFilter extends OpenAIEmbeddingFilter { + + private static final long serialVersionUID = 1L; + + private static final ObjectMapper MAPPER = new ObjectMapper(); + + /** + * Jina task type. Default: {@code retrieval.passage} (for indexing). + * Set to {@code retrieval.query} when embedding search queries. + */ + private String task = "retrieval.passage"; + + public JinaEmbeddingFilter() { + super(); + } + + public JinaEmbeddingFilter(InferenceConfig config) { + super(config); + } + + @Override + String buildRequest(List<Chunk> chunks, InferenceConfig config) { + ObjectNode root = MAPPER.createObjectNode(); + if (!StringUtils.isBlank(config.getModel())) { + root.put("model", config.getModel()); + } + if (!StringUtils.isBlank(task)) { + root.put("task", task); + } + ArrayNode input = root.putArray("input"); + for (Chunk chunk : chunks) { + input.add(chunk.getText()); + } + return root.toString(); + } + + public String getTask() { + return task; + } + + /** + * Set the Jina task type. Default is {@code retrieval.passage}. + * Use {@code retrieval.query} when embedding search queries. + */ + public void setTask(String task) { + this.task = task; + } +} diff --git a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java index ea771df89e..b186d5465e 100644 --- a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java +++ b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java @@ -177,7 +177,7 @@ public class OpenAIImageEmbeddingParser implements Parser, Initializable, Closea Chunk chunk = new Chunk(null, locators); chunk.setVector(vector); - ChunkSerializer.mergeInto(metadata, List.of(chunk)); + ChunkSerializer.mergeInto(metadata, List.of(chunk), config.getOutputField()); XHTMLContentHandler xhtml = new XHTMLContentHandler( handler, metadata, parseContext); @@ -371,6 +371,14 @@ public class OpenAIImageEmbeddingParser implements Parser, Initializable, Closea defaultConfig.setMaxFileSizeToEmbed(maxFileSizeToEmbed); } + public String getOutputField() { + return defaultConfig.getOutputField(); + } + + public void setOutputField(String outputField) { + defaultConfig.setOutputField(outputField); + } + // ---- Azure / endpoint config getters/setters ---------------------------- public String getEmbeddingsPath() { diff --git a/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/JinaEmbeddingFilterTest.java b/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/JinaEmbeddingFilterTest.java new file mode 100644 index 0000000000..803eb48523 --- /dev/null +++ b/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/JinaEmbeddingFilterTest.java @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.inference; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.ArrayList; +import java.util.List; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import org.apache.tika.http.TikaTestHttpServer; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; + +public class JinaEmbeddingFilterTest { + + private static final ObjectMapper MAPPER = new ObjectMapper(); + + private TikaTestHttpServer server; + private JinaEmbeddingFilter filter; + private InferenceConfig config; + + @BeforeEach + void setUp() throws Exception { + server = new TikaTestHttpServer(); + + config = new InferenceConfig(); + config.setBaseUrl(server.url()); + config.setModel("jina-embeddings-v3"); + config.setMaxChunkChars(500); + config.setOverlapChars(0); + config.setTimeoutSeconds(10); + + filter = new JinaEmbeddingFilter(config); + } + + @AfterEach + void tearDown() { + server.shutdown(); + } + + @Test + void testDefaultTaskInRequest() throws Exception { + server.enqueue(new TikaTestHttpServer.MockResponse(200, + buildEmbeddingResponse(1, 3))); + + Metadata metadata = new Metadata(); + metadata.set(TikaCoreProperties.TIKA_CONTENT.getName(), "Some document text."); + List<Metadata> list = new ArrayList<>(); + list.add(metadata); + filter.filter(list); + + TikaTestHttpServer.RecordedRequest request = server.takeRequest(); + JsonNode body = MAPPER.readTree(request.body()); + assertEquals("retrieval.passage", body.get("task").asText(), + "Default task should be retrieval.passage"); + assertEquals("jina-embeddings-v3", body.get("model").asText()); + } + + @Test + void testCustomTask() throws Exception { + filter.setTask("retrieval.query"); + server.enqueue(new TikaTestHttpServer.MockResponse(200, + buildEmbeddingResponse(1, 3))); + + Metadata metadata = new Metadata(); + metadata.set(TikaCoreProperties.TIKA_CONTENT.getName(), "What is Tika?"); + List<Metadata> list = new ArrayList<>(); + list.add(metadata); + filter.filter(list); + + TikaTestHttpServer.RecordedRequest request = server.takeRequest(); + JsonNode body = MAPPER.readTree(request.body()); + assertEquals("retrieval.query", body.get("task").asText()); + } + + @Test + void testBuildRequestJsonShape() { + List<Chunk> chunks = List.of( + new Chunk("Hello", 0, 5), + new Chunk("World", 6, 11)); + String json = filter.buildRequest(chunks, config); + + assertTrue(json.contains("\"task\":\"retrieval.passage\""), + "Should include task field: " + json); + assertTrue(json.contains("\"model\":\"jina-embeddings-v3\""), + "Should include model field: " + json); + assertTrue(json.contains("\"Hello\""), "Should include first chunk"); + assertTrue(json.contains("\"World\""), "Should include second chunk"); + } + + @Test + void testEndToEnd() throws Exception { + server.enqueue(new TikaTestHttpServer.MockResponse(200, + buildEmbeddingResponse(2, 4))); + + String content = "# Introduction\n\nFirst section text.\n\n" + + "# Background\n\nSecond section text."; + + Metadata metadata = new Metadata(); + metadata.set(TikaCoreProperties.TIKA_CONTENT.getName(), content); + List<Metadata> list = new ArrayList<>(); + list.add(metadata); + filter.filter(list); + + String chunksJson = metadata.get("tika:chunks"); + assertNotNull(chunksJson, "Should have tika:chunks"); + + List<Chunk> chunks = ChunkSerializer.fromJson(chunksJson); + assertEquals(2, chunks.size()); + assertNotNull(chunks.get(0).getVector()); + assertNotNull(chunks.get(1).getVector()); + assertEquals(4, chunks.get(0).getVector().length); + } + + private String buildEmbeddingResponse(int numVectors, int dims) { + StringBuilder sb = new StringBuilder(); + sb.append("{\"object\":\"list\",\"data\":["); + for (int i = 0; i < numVectors; i++) { + if (i > 0) { + sb.append(","); + } + sb.append("{\"object\":\"embedding\",\"index\":").append(i); + sb.append(",\"embedding\":["); + for (int d = 0; d < dims; d++) { + if (d > 0) { + sb.append(","); + } + sb.append(String.format(java.util.Locale.ROOT, + "%.6f", (i + 1) * 0.1 + d * 0.01)); + } + sb.append("]}"); + } + sb.append("],\"model\":\"jina-embeddings-v3\","); + sb.append("\"usage\":{\"prompt_tokens\":10,\"total_tokens\":10}}"); + return sb.toString(); + } +}
