This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch docs/pipes-updates in repository https://gitbox.apache.org/repos/asf/tika.git
commit 06270d3ae2d0b2f31e173f3fcbdb5376497d9432 Author: tallison <[email protected]> AuthorDate: Mon May 11 16:17:42 2026 -0400 azblob --- .../ROOT/examples/pipes-azblob-emitter.json | 1 + .../ROOT/examples/pipes-azblob-fetcher.json | 1 + .../ROOT/examples/pipes-azblob-iterator.json | 1 + .../ROOT/examples/pipes-azblob-pipeline.json | 1 + docs/modules/ROOT/nav.adoc | 1 + docs/modules/ROOT/pages/pipes/plugins/azblob.adoc | 185 +++++++++++++++++++++ .../tika/pipes/azblob/ConfigExamplesTest.java | 134 +++++++++++++++ .../resources/config-examples/az-blob-emitter.json | 14 ++ .../resources/config-examples/az-blob-fetcher.json | 13 ++ .../config-examples/az-blob-pipeline.json | 45 +++++ .../config-examples/az-blob-pipes-iterator.json | 13 ++ 11 files changed, 409 insertions(+) diff --git a/docs/modules/ROOT/examples/pipes-azblob-emitter.json b/docs/modules/ROOT/examples/pipes-azblob-emitter.json new file mode 120000 index 0000000000..8213f434fa --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-azblob-emitter.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-emitter.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-azblob-fetcher.json b/docs/modules/ROOT/examples/pipes-azblob-fetcher.json new file mode 120000 index 0000000000..c7d8ce2d52 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-azblob-fetcher.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-fetcher.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-azblob-iterator.json b/docs/modules/ROOT/examples/pipes-azblob-iterator.json new file mode 120000 index 0000000000..bc68d45fb0 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-azblob-iterator.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipes-iterator.json \ No newline at end of 
file diff --git a/docs/modules/ROOT/examples/pipes-azblob-pipeline.json b/docs/modules/ROOT/examples/pipes-azblob-pipeline.json new file mode 120000 index 0000000000..1e3c9dc860 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-azblob-pipeline.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipeline.json \ No newline at end of file diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc index e5e2a09624..e72c1d637b 100644 --- a/docs/modules/ROOT/nav.adoc +++ b/docs/modules/ROOT/nav.adoc @@ -35,6 +35,7 @@ *** xref:pipes/plugins/filesystem.adoc[File System] *** xref:pipes/plugins/s3.adoc[Amazon S3] *** xref:pipes/plugins/gcs.adoc[Google Cloud Storage] +*** xref:pipes/plugins/azblob.adoc[Azure Blob Storage] * xref:configuration/index.adoc[Configuration] ** xref:configuration/parsers/pdf-parser.adoc[PDF Parser] ** xref:configuration/parsers/tesseract-ocr-parser.adoc[Tesseract OCR] diff --git a/docs/modules/ROOT/pages/pipes/plugins/azblob.adoc b/docs/modules/ROOT/pages/pipes/plugins/azblob.adoc new file mode 100644 index 0000000000..1e462b0f70 --- /dev/null +++ b/docs/modules/ROOT/pages/pipes/plugins/azblob.adoc @@ -0,0 +1,185 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + += Azure Blob Storage Plugin +:toc: +:toclevels: 3 + +The Azure Blob Storage plugin (`tika-pipes-az-blob`) provides fetcher, emitter, and iterator interfaces for blobs in Azure Storage containers. + +[cols="2,1,3"] +|=== +|Interface |Component name |Class + +|Fetcher +|`az-blob-fetcher` +|`AZBlobFetcher` + +|Emitter +|`az-blob-emitter` +|`AZBlobEmitter` + +|Iterator +|`az-blob-pipes-iterator` +|`AZBlobPipesIterator` +|=== + +[#credentials] +== Credentials + +All three components authenticate with a SAS (shared-access-signature) token. There are no other auth modes — managed identity, account keys, and AD-based auth are not currently exposed. + +* `endpoint` — base URL of the storage account, e.g., `https://myaccount.blob.core.windows.net`. +* `sasToken` — the URL query-string portion of a generated SAS, without a leading `?`. Permissions in the token must match the operations the component will perform (read for fetchers, list+read for iterators, read+write for emitters). + +The emitter's `validate()` enforces that `sasToken`, `endpoint`, and `container` are all non-blank, but does not parse the SAS itself — invalid or expired tokens fail later when the Azure SDK makes a request. + +[#az-blob-fetcher] +== Azure Blob Fetcher (`az-blob-fetcher`) + +Reads blobs from an Azure Storage container. The fetch key is the blob name. + +[source,json] +---- +include::example$pipes-azblob-fetcher.json[] +---- + +=== Configuration + +[cols="1,1,3"] +|=== +|Field |Default |Description + +|`endpoint` +|_required_ +|Storage account URL. + +|`container` +|_required_ +|Container name. + +|`sasToken` +|_required_ +|SAS token granting read access to the container. + +|`spoolToTemp` +|`true` +|If `true`, the fetched blob is spooled to a temp file before parsing. + +|`extractUserMetadata` +|`true` +|If `true`, blob user-metadata is copied into the parsed `Metadata`. 
+|=== + +[#az-blob-emitter] +== Azure Blob Emitter (`az-blob-emitter`) + +Writes parsed results to an Azure Storage container. The emit key (relative to `prefix`) is derived from the `FetchEmitTuple`. + +[source,json] +---- +include::example$pipes-azblob-emitter.json[] +---- + +=== Configuration + +[cols="1,1,3"] +|=== +|Field |Default |Description + +|`endpoint` +|_required_ +|Storage account URL (validated non-blank). + +|`container` +|_required_ +|Destination container name (validated non-blank). + +|`sasToken` +|_required_ +|SAS token granting read+write access (validated non-blank). + +|`prefix` +|_no default_ +|Optional blob-name prefix. A trailing `/` is stripped automatically. + +|`fileExtension` +|`json` +|Extension appended to each emitted blob name. + +|`overwriteExisting` +|`false` +|If `true`, an existing blob with the same name is overwritten; otherwise the emit fails. +|=== + +[#az-blob-iterator] +== Azure Blob Iterator (`az-blob-pipes-iterator`) + +Lists blobs under a container/prefix and emits one `FetchEmitTuple` per blob. + +[source,json] +---- +include::example$pipes-azblob-iterator.json[] +---- + +=== Configuration + +[cols="1,1,3"] +|=== +|Field |Default |Description + +|`endpoint` +|_required_ +|Storage account URL. + +|`container` +|_required_ +|Container to enumerate. + +|`sasToken` +|_required_ +|SAS token granting list+read access. + +|`prefix` +|`""` +|Blob-name prefix to scope the listing. + +|`timeoutMillis` +|`360000` +|Per-request timeout, in milliseconds (6 minutes by default). + +|`fetcherId` / `emitterId` +|_required_ +|IDs of the fetcher and emitter to bind to each emitted tuple. See xref:pipes/iterators.adoc[Pipes Iterators] for the shared iterator contract. +|=== + +[#az-blob-pipeline] +== Complete Pipeline Example + +The example below wires the Azure Blob fetcher, emitter, and iterator together into a container-to-container pipeline. 
+ +[source,json] +---- +include::example$pipes-azblob-pipeline.json[] +---- + +[#notes] +== Notes + +* SAS tokens have an expiration baked in. For long-running pipelines, rotate the SAS or use a token that outlives the pipeline window. +* Avoid checking real SAS tokens into source control — the strings in the examples above are placeholders. +* Each component creates its own `BlobServiceClient`. The Azure SDK pools HTTP connections per client. diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/java/org/apache/tika/pipes/azblob/ConfigExamplesTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/java/org/apache/tika/pipes/azblob/ConfigExamplesTest.java new file mode 100644 index 0000000000..0a083f608a --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/java/org/apache/tika/pipes/azblob/ConfigExamplesTest.java @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.pipes.azblob; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.pipes.emitter.azblob.AZBlobEmitterConfig; +import org.apache.tika.pipes.fetcher.azblob.config.AZBlobFetcherConfig; +import org.apache.tika.pipes.iterator.azblob.AZBlobPipesIteratorConfig; + +/** + * Validates Azure Blob fetcher/emitter/iterator configuration examples used in documentation. + * <p> + * The JSON configuration examples are stored in {@code src/test/resources/config-examples/} + * and are included directly in the AsciiDoc documentation via the {@code include::} directive. 
+ */ +public class ConfigExamplesTest { + + private static final String EXAMPLES_DIR = "/config-examples/"; + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + @TempDir + Path tempDir; + + private String readExample(String resourceName) throws Exception { + try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) { + assertNotNull(is, "Resource not found: " + resourceName); + return new String(is.readAllBytes(), StandardCharsets.UTF_8); + } + } + + private void loadViaTikaLoader(String resourceName) throws Exception { + String json = readExample(resourceName); + Path configFile = tempDir.resolve("tika-config.json"); + Files.writeString(configFile, json, StandardCharsets.UTF_8); + TikaLoader loader = TikaLoader.load(configFile); + assertNotNull(loader, "TikaLoader should not be null for: " + resourceName); + } + + private JsonNode innerComponent(String json, String section, String id, String typeName) + throws Exception { + JsonNode root = OBJECT_MAPPER.readTree(json); + JsonNode sectionNode = root.get(section); + assertNotNull(sectionNode, "Missing section: " + section); + JsonNode idNode = id == null ? 
sectionNode : sectionNode.get(id); + assertNotNull(idNode, "Missing id: " + id); + JsonNode typed = idNode.get(typeName); + assertNotNull(typed, "Missing type: " + typeName); + return typed; + } + + @Test + public void testAZBlobFetcherConfig() throws Exception { + loadViaTikaLoader("az-blob-fetcher.json"); + + JsonNode inner = innerComponent(readExample("az-blob-fetcher.json"), + "fetchers", "azf", "az-blob-fetcher"); + AZBlobFetcherConfig config = AZBlobFetcherConfig.load(inner.toString()); + assertEquals("tika-input", config.getContainer()); + assertEquals("https://myaccount.blob.core.windows.net", config.getEndpoint()); + assertNotNull(config.getSasToken()); + } + + @Test + public void testAZBlobEmitterConfig() throws Exception { + loadViaTikaLoader("az-blob-emitter.json"); + + JsonNode inner = innerComponent(readExample("az-blob-emitter.json"), + "emitters", "aze", "az-blob-emitter"); + AZBlobEmitterConfig config = AZBlobEmitterConfig.load(inner.toString()); + assertEquals("tika-output", config.container()); + assertEquals("json", config.fileExtension()); + config.validate(); + assertEquals("results", config.getNormalizedPrefix()); + } + + @Test + public void testAZBlobIteratorConfig() throws Exception { + loadViaTikaLoader("az-blob-pipes-iterator.json"); + + JsonNode inner = innerComponent(readExample("az-blob-pipes-iterator.json"), + "pipes-iterator", null, "az-blob-pipes-iterator"); + AZBlobPipesIteratorConfig config = AZBlobPipesIteratorConfig.load(inner.toString()); + assertEquals("tika-input", config.getContainer()); + assertEquals("incoming/", config.getPrefix()); + assertEquals(360000L, config.getTimeoutMillis()); + assertEquals("azf", config.getFetcherId()); + assertEquals("aze", config.getEmitterId()); + } + + @Test + public void testAZBlobPipelineConfig() throws Exception { + loadViaTikaLoader("az-blob-pipeline.json"); + + String json = readExample("az-blob-pipeline.json"); + AZBlobFetcherConfig fetcher = AZBlobFetcherConfig.load( + 
innerComponent(json, "fetchers", "azf", "az-blob-fetcher").toString()); + AZBlobEmitterConfig emitter = AZBlobEmitterConfig.load( + innerComponent(json, "emitters", "aze", "az-blob-emitter").toString()); + AZBlobPipesIteratorConfig iterator = AZBlobPipesIteratorConfig.load( + innerComponent(json, "pipes-iterator", null, "az-blob-pipes-iterator").toString()); + + emitter.validate(); + assertEquals(fetcher.getContainer(), iterator.getContainer()); + assertEquals("azf", iterator.getFetcherId()); + assertEquals("aze", iterator.getEmitterId()); + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-emitter.json b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-emitter.json new file mode 100644 index 0000000000..9d102868c9 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-emitter.json @@ -0,0 +1,14 @@ +{ + "emitters": { + "aze": { + "az-blob-emitter": { + "endpoint": "https://myaccount.blob.core.windows.net", + "container": "tika-output", + "sasToken": "sv=2024-11-04&ss=b&srt=sco&sp=rwl&se=2030-01-01T00:00:00Z&st=2024-01-01T00:00:00Z&spr=https&sig=REDACTED", + "prefix": "results/", + "fileExtension": "json", + "overwriteExisting": false + } + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-fetcher.json b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-fetcher.json new file mode 100644 index 0000000000..aebdcedf93 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-fetcher.json @@ -0,0 +1,13 @@ +{ + "fetchers": { + "azf": { + "az-blob-fetcher": { + "endpoint": "https://myaccount.blob.core.windows.net", + "container": "tika-input", + "sasToken": 
"sv=2024-11-04&ss=b&srt=sco&sp=rl&se=2030-01-01T00:00:00Z&st=2024-01-01T00:00:00Z&spr=https&sig=REDACTED", + "extractUserMetadata": true, + "spoolToTemp": true + } + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipeline.json b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipeline.json new file mode 100644 index 0000000000..65181a1a57 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipeline.json @@ -0,0 +1,45 @@ +{ + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "throwOnWriteLimitReached": true + } + }, + "fetchers": { + "azf": { + "az-blob-fetcher": { + "endpoint": "https://myaccount.blob.core.windows.net", + "container": "tika-input", + "sasToken": "sv=2024-11-04&ss=b&srt=sco&sp=rl&se=2030-01-01T00:00:00Z&st=2024-01-01T00:00:00Z&spr=https&sig=REDACTED", + "extractUserMetadata": true + } + } + }, + "emitters": { + "aze": { + "az-blob-emitter": { + "endpoint": "https://myaccount.blob.core.windows.net", + "container": "tika-output", + "sasToken": "sv=2024-11-04&ss=b&srt=sco&sp=rwl&se=2030-01-01T00:00:00Z&st=2024-01-01T00:00:00Z&spr=https&sig=REDACTED", + "prefix": "results/", + "fileExtension": "json" + } + } + }, + "pipes-iterator": { + "az-blob-pipes-iterator": { + "endpoint": "https://myaccount.blob.core.windows.net", + "container": "tika-input", + "sasToken": "sv=2024-11-04&ss=b&srt=sco&sp=rl&se=2030-01-01T00:00:00Z&st=2024-01-01T00:00:00Z&spr=https&sig=REDACTED", + "prefix": "incoming/", + "fetcherId": "azf", + "emitterId": "aze" + } + }, + "pipes": { + "parseMode": "RMETA", + "onParseException": "EMIT", + "numClients": 4 + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipes-iterator.json 
b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipes-iterator.json new file mode 100644 index 0000000000..e2875fa92f --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipes-iterator.json @@ -0,0 +1,13 @@ +{ + "pipes-iterator": { + "az-blob-pipes-iterator": { + "endpoint": "https://myaccount.blob.core.windows.net", + "container": "tika-input", + "sasToken": "sv=2024-11-04&ss=b&srt=sco&sp=rl&se=2030-01-01T00:00:00Z&st=2024-01-01T00:00:00Z&spr=https&sig=REDACTED", + "prefix": "incoming/", + "timeoutMillis": 360000, + "fetcherId": "azf", + "emitterId": "aze" + } + } +}
