[
https://issues.apache.org/jira/browse/TIKA-4606?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=18062291#comment-18062291
]
ASF GitHub Bot commented on TIKA-4606:
--------------------------------------
Copilot commented on code in PR #2655:
URL: https://github.com/apache/tika/pull/2655#discussion_r2875110807
##########
tika-e2e-tests/tika-grpc/src/test/java/org/apache/tika/pipes/ExternalTestBase.java:
##########
@@ -0,0 +1,366 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+import java.time.Duration;
+import java.time.temporal.ChronoUnit;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+import java.util.regex.Pattern;
+import java.util.stream.Stream;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import io.grpc.ManagedChannel;
+import io.grpc.ManagedChannelBuilder;
+import lombok.extern.slf4j.Slf4j;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Tag;
+import org.junit.jupiter.api.TestInstance;
+import org.testcontainers.containers.DockerComposeContainer;
+import org.testcontainers.containers.output.Slf4jLogConsumer;
+import org.testcontainers.containers.wait.strategy.Wait;
+import org.testcontainers.junit.jupiter.Testcontainers;
+
+import org.apache.tika.FetchAndParseReply;
+import org.apache.tika.ListFetchersRequest;
+import org.apache.tika.TikaGrpc;
+
+@TestInstance(TestInstance.Lifecycle.PER_CLASS)
+@Testcontainers
+@Slf4j
+@Tag("E2ETest")
+public abstract class ExternalTestBase {
+ public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+ public static final int MAX_STARTUP_TIMEOUT = 120;
+ public static final String GOV_DOCS_FOLDER = "/tika/govdocs1";
+ public static final File TEST_FOLDER = new File("target", "govdocs1");
+ public static final int GOV_DOCS_FROM_IDX =
Integer.parseInt(System.getProperty("govdocs1.fromIndex", "1"));
+ public static final int GOV_DOCS_TO_IDX =
Integer.parseInt(System.getProperty("govdocs1.toIndex", "1"));
+ public static final String DIGITAL_CORPORA_ZIP_FILES_URL =
"https://corp.digitalcorpora.org/corpora/files/govdocs1/zipfiles";
+ private static final boolean USE_LOCAL_SERVER =
Boolean.parseBoolean(System.getProperty("tika.e2e.useLocalServer", "true"));
+ private static final int GRPC_PORT =
Integer.parseInt(System.getProperty("tika.e2e.grpcPort", "50052"));
+
+ public static DockerComposeContainer<?> composeContainer;
+ private static Process localGrpcProcess;
+
+    /**
+     * Prepares the shared environment before any test runs: stages the test documents,
+     * then starts the tika-grpc server either as a local process or through Docker
+     * Compose, depending on the {@code tika.e2e.useLocalServer} system property.
+     *
+     * @throws Exception if the documents cannot be staged or the server fails to start
+     */
+    @BeforeAll
+    static void setup() throws Exception {
+        loadGovdocs1();
+
+        if (!USE_LOCAL_SERVER) {
+            startDockerGrpcServer();
+            return;
+        }
+        startLocalGrpcServer();
+    }
+
+    /**
+     * Launches tika-grpc as a child process through the Maven wrapper's {@code exec:exec}
+     * goal, forwards the child's output to the test log on a daemon thread, and blocks
+     * until the server answers gRPC requests.
+     *
+     * @throws Exception if the config file is missing, the process cannot be started, or
+     *                   the server does not become ready
+     */
+    private static void startLocalGrpcServer() throws Exception {
+        log.info("Starting local tika-grpc server using Maven exec");
+
+        Path grpcModuleDir = findTikaGrpcDirectory();
+        Path serverConfig = Path.of("src/test/resources/tika-config.json").toAbsolutePath();
+        if (!Files.exists(serverConfig)) {
+            throw new IllegalStateException("Config file not found: " + serverConfig);
+        }
+
+        log.info("Using tika-grpc from: {}", grpcModuleDir);
+        log.info("Using config file: {}", serverConfig);
+
+        // The server is run with the java binary of the JVM that is executing the tests.
+        String jvmHome = System.getProperty("java.home");
+        String osName = System.getProperty("os.name").toLowerCase(Locale.ROOT);
+        boolean onWindows = osName.contains("win");
+        String javaBinary = jvmHome + (onWindows ? "\\bin\\java.exe" : "/bin/java");
+        String wrapperScript = onWindows ? "mvnw.cmd" : "mvnw";
+        String mavenWrapper = grpcModuleDir.getParent().resolve(wrapperScript).toString();
+
+        // Everything passed to the forked JVM, as one space-separated exec.args value.
+        String serverArgs =
+                "--add-opens=java.base/java.lang=ALL-UNNAMED " +
+                "--add-opens=java.base/java.nio=ALL-UNNAMED " +
+                "--add-opens=java.base/java.util=ALL-UNNAMED " +
+                "--add-opens=java.base/java.util.concurrent=ALL-UNNAMED " +
+                "-classpath %classpath " +
+                "org.apache.tika.pipes.grpc.TikaGrpcServer " +
+                "-c \"" + serverConfig + "\" " +
+                "-p " + GRPC_PORT;
+
+        ProcessBuilder launcher = new ProcessBuilder(
+                mavenWrapper,
+                "exec:exec",
+                "-Dexec.executable=" + javaBinary,
+                "-Dexec.args=" + serverArgs);
+        launcher.directory(grpcModuleDir.toFile());
+        launcher.redirectErrorStream(true);
+        launcher.redirectOutput(ProcessBuilder.Redirect.PIPE);
+
+        localGrpcProcess = launcher.start();
+
+        // Drain the child's merged stdout/stderr into the test log line by line.
+        Runnable outputPump = () -> {
+            try (BufferedReader reader = new BufferedReader(new InputStreamReader(
+                    localGrpcProcess.getInputStream(), StandardCharsets.UTF_8))) {
+                for (String line = reader.readLine(); line != null; line = reader.readLine()) {
+                    log.info("tika-grpc: {}", line);
+                }
+            } catch (IOException e) {
+                log.error("Error reading server output", e);
+            }
+        };
+        Thread logThread = new Thread(outputPump);
+        logThread.setDaemon(true);
+        logThread.start();
+
+        waitForServerReady();
+
+        log.info("Local tika-grpc server started successfully on port {}", GRPC_PORT);
+    }
+
+    /**
+     * Locates the tika-grpc module by walking up from the working directory to the first
+     * ancestor that contains both a {@code tika-grpc} and a {@code tika-e2e-tests} entry.
+     *
+     * @return the {@code tika-grpc} path inside that ancestor
+     * @throws IllegalStateException if no ancestor contains both entries
+     */
+    private static Path findTikaGrpcDirectory() {
+        Path startDir = Path.of("").toAbsolutePath();
+
+        for (Path candidate = startDir; candidate != null; candidate = candidate.getParent()) {
+            boolean hasGrpcModule = Files.exists(candidate.resolve("tika-grpc"));
+            if (hasGrpcModule && Files.exists(candidate.resolve("tika-e2e-tests"))) {
+                return candidate.resolve("tika-grpc");
+            }
+        }
+
+        throw new IllegalStateException("Cannot find tika root directory. " +
+                "Current dir: " + startDir);
+    }
+
+    /**
+     * Waits for the gRPC endpoint on {@code localhost:GRPC_PORT} to accept requests by
+     * issuing a {@code listFetchers} call, up to 60 times with a one-second pause
+     * between attempts.
+     * <p>
+     * If the locally launched server process has already exited, the wait is aborted
+     * immediately instead of polling until the attempts run out. When every attempt
+     * fails, a still-running server process is killed and the last connection failure
+     * is attached as the cause of the thrown exception.
+     *
+     * @throws Exception if the server process exits early, the server never becomes
+     *                   ready, or the waiting thread is interrupted
+     */
+    private static void waitForServerReady() throws Exception {
+        int maxAttempts = 60;
+        Exception lastFailure = null;
+        for (int i = 0; i < maxAttempts; i++) {
+            // A dead child process can never become ready; fail now rather than keep polling.
+            if (localGrpcProcess != null && !localGrpcProcess.isAlive()) {
+                throw new RuntimeException("Local gRPC server process exited with code " +
+                        localGrpcProcess.exitValue() + " before becoming ready");
+            }
+
+            // A fresh channel per attempt; it is always shut down in the finally block.
+            ManagedChannel testChannel = ManagedChannelBuilder
+                    .forAddress("localhost", GRPC_PORT)
+                    .usePlaintext()
+                    .build();
+            try {
+                TikaGrpc.TikaBlockingStub stub = TikaGrpc.newBlockingStub(testChannel);
+                stub.listFetchers(ListFetchersRequest.newBuilder().build());
+                log.info("gRPC server is ready");
+                return;
+            } catch (Exception e) {
+                // Remember the failure so the final error can explain why startup failed.
+                lastFailure = e;
+                log.trace("gRPC server not ready yet (attempt {}/{}): {}",
+                        i + 1, maxAttempts, e.getMessage());
+            } finally {
+                testChannel.shutdown();
+                testChannel.awaitTermination(1, TimeUnit.SECONDS);
+            }
+            TimeUnit.SECONDS.sleep(1);
+        }
+
+        if (localGrpcProcess != null && localGrpcProcess.isAlive()) {
+            localGrpcProcess.destroyForcibly();
+        }
+        throw new RuntimeException("Local gRPC server failed to start within timeout",
+                lastFailure);
+    }
+
+    /**
+     * Starts the tika-grpc server through Docker Compose, using the compose file named
+     * by the {@code tika.docker.compose.file} system property, and waits for the
+     * service's "Server started" log line.
+     *
+     * @throws IllegalStateException if the property is unset or blank, or does not point
+     *                               to an existing file
+     */
+    private static void startDockerGrpcServer() {
+        log.info("Starting Docker Compose tika-grpc server");
+
+        String configuredPath = System.getProperty("tika.docker.compose.file");
+        boolean pathMissing = configuredPath == null || configuredPath.isBlank();
+        if (pathMissing) {
+            throw new IllegalStateException(
+                    "Docker Compose mode requires system property 'tika.docker.compose.file' " +
+                    "pointing to a valid docker-compose.yml file.");
+        }
+
+        File composeFile = new File(configuredPath);
+        if (!composeFile.isFile()) {
+            throw new IllegalStateException("Docker Compose file not found: " +
+                    composeFile.getAbsolutePath());
+        }
+
+        Duration startupLimit = Duration.ofSeconds(MAX_STARTUP_TIMEOUT);
+        String hostDocsDir = TEST_FOLDER.getAbsolutePath();
+
+        // Publish the container in the shared field before starting it.
+        composeContainer = new DockerComposeContainer<>(composeFile)
+                .withEnv("HOST_GOVDOCS1_DIR", hostDocsDir)
+                .withStartupTimeout(startupLimit)
+                .withExposedService("tika-grpc", 50052,
+                        Wait.forLogMessage(".*Server started.*\\n", 1))
+                .withLogConsumer("tika-grpc", new Slf4jLogConsumer(log));
+
+        composeContainer.start();
+
+        log.info("Docker Compose containers started successfully");
+    }
+
+ private static void loadGovdocs1() throws IOException,
InterruptedException {
+ if (Boolean.parseBoolean(System.getProperty("tika.e2e.useGovdocs",
"false"))) {
+ // Opt-in: download the actual GovDocs1 corpus when explicitly
requested via -Dtika.e2e.useGovdocs=true.
+ // Default CI runs use committed test fixtures to avoid any
network dependency.
+ int retries = 3;
+ int attempt = 0;
+ while (true) {
+ try {
+ downloadAndUnzipGovdocs1(GOV_DOCS_FROM_IDX,
GOV_DOCS_TO_IDX);
+ break;
+ } catch (IOException e) {
+ attempt++;
+ if (attempt >= retries) {
+ throw e;
+ }
+ log.warn("Download attempt {} failed, retrying in 10
seconds...", attempt, e);
+ TimeUnit.SECONDS.sleep(10);
+ }
+ }
+ } else {
+ copyTestFixtures();
+ }
+ }
+
+    /**
+     * Copies the committed sample documents from the {@code test-fixtures/} classpath
+     * folder into {@link #TEST_FOLDER}, creating the folder if needed and overwriting
+     * any existing copies.
+     *
+     * @throws IOException           if the target folder cannot be created or a copy fails
+     * @throws IllegalStateException if a fixture is not on the classpath
+     */
+    public static void copyTestFixtures() throws IOException {
+        Path destination = TEST_FOLDER.toPath();
+        Files.createDirectories(destination);
+
+        List<String> fixtureNames =
+                List.of("sample.txt", "sample.html", "sample.csv", "sample.xml");
+        ClassLoader loader = ExternalTestBase.class.getClassLoader();
+
+        for (String name : fixtureNames) {
+            URL location = loader.getResource("test-fixtures/" + name);
+            if (location == null) {
+                throw new IllegalStateException("Test fixture not found: test-fixtures/" + name);
+            }
+            try (InputStream content = location.openStream()) {
+                Files.copy(content, destination.resolve(name),
+                        StandardCopyOption.REPLACE_EXISTING);
+            }
+        }
+
+        log.info("Copied {} test fixtures to {}", fixtureNames.size(), destination);
+    }
+
+    /**
+     * Shuts down whichever server was started for the tests.
+     * <p>
+     * In local-server mode the child process is asked to terminate and is killed
+     * forcibly if it has not exited within ten seconds or the wait is interrupted.
+     * Otherwise the Docker Compose container, if any, is closed.
+     */
+    @AfterAll
+    void close() {
+        boolean stopLocalServer = USE_LOCAL_SERVER && localGrpcProcess != null;
+        if (!stopLocalServer) {
+            if (composeContainer != null) {
+                composeContainer.close();
+            }
+            return;
+        }
+
+        log.info("Stopping local gRPC server");
+        localGrpcProcess.destroy();
+
+        boolean exited;
+        try {
+            exited = localGrpcProcess.waitFor(10, TimeUnit.SECONDS);
+        } catch (InterruptedException e) {
+            // Keep the interrupt flag set for the caller and fall through to the forced kill.
+            Thread.currentThread().interrupt();
+            exited = false;
+        }
+        if (!exited) {
+            localGrpcProcess.destroyForcibly();
+        }
+    }
+
+ public static void downloadAndUnzipGovdocs1(int fromIndex, int toIndex)
throws IOException {
+ Path targetDir = TEST_FOLDER.toPath();
+ Files.createDirectories(targetDir);
+
+ for (int i = fromIndex; i <= toIndex; i++) {
+ String zipName = String.format(java.util.Locale.ROOT, "%03d.zip",
i);
+ String url = DIGITAL_CORPORA_ZIP_FILES_URL + "/" + zipName;
+ Path zipPath = targetDir.resolve(zipName);
+
+ if (Files.exists(zipPath)) {
+ log.info("{} already exists, skipping download", zipName);
+ continue;
+ }
+
+ log.info("Downloading {} from {}...", zipName, url);
+ try (InputStream in = new URL(url).openStream()) {
+ Files.copy(in, zipPath, StandardCopyOption.REPLACE_EXISTING);
+ }
+
Review Comment:
In downloadAndUnzipGovdocs1(), if the zip already exists you `continue`,
which skips the unzip step entirely. If a previous run downloaded the zip but
extracted files were cleaned (or the prior unzip was interrupted), subsequent
runs will leave the corpus empty and the tests will fail. Consider always
running the unzip step (idempotently) even when download is skipped, or at
least unzipping when extracted content is missing.
```suggestion
if (Files.exists(zipPath)) {
log.info("{} already exists, skipping download", zipName);
} else {
log.info("Downloading {} from {}...", zipName, url);
try (InputStream in = new URL(url).openStream()) {
Files.copy(in, zipPath,
StandardCopyOption.REPLACE_EXISTING);
}
}
```
##########
tika-e2e-tests/tika-grpc/README.md:
##########
@@ -0,0 +1,84 @@
+# Tika gRPC End-to-End Tests
+
+End-to-end integration tests for Apache Tika gRPC Server.
+
+## Overview
+
+This test module validates the functionality of Apache Tika gRPC Server by:
+- Starting a local tika-grpc server using the Maven exec plugin (default)
+- Parsing small committed test fixture documents
+- Testing various fetchers (filesystem, Ignite config store, etc.)
+- Verifying parsing results and metadata extraction
+
+## Prerequisites
+
+- Java 17 or later
+- Maven 3.6 or later
+- Docker and Docker Compose (only required when using
`tika.e2e.useLocalServer=false`)
+
+## Building
+
+```bash
+../../mvnw clean install
+```
+
+## Running Tests
+
+### Run all tests (default: local server mode, committed fixtures)
+
+```bash
+../../mvnw test
+```
+
+### Run specific test
+
+```bash
+../../mvnw test -Dtest=FileSystemFetcherTest
+../../mvnw test -Dtest=IgniteConfigStoreTest
+```
+
+### Test with the full GovDocs1 corpus (opt-in)
+
+By default tests use small committed fixture files. To run against the real
GovDocs1 corpus, set `govdocs1.fromIndex` to trigger a download:
+
+```bash
+../../mvnw test -Dgovdocs1.fromIndex=1 -Dgovdocs1.toIndex=1
+```
+
+To test with more documents, increase the range or set `corpa.numdocs`:
+
+```bash
+../../mvnw test -Dgovdocs1.fromIndex=1 -Dgovdocs1.toIndex=5 -Dcorpa.numdocs=100
Review Comment:
The property name is documented as `corpa.numdocs` here, but the module/test
code uses `corpus.numDocs`. This typo will cause the example command to have no
effect; please update the text and the example command to use `corpus.numDocs`.
```suggestion
To test with more documents, increase the range or set `corpus.numDocs`:
```bash
../../mvnw test -Dgovdocs1.fromIndex=1 -Dgovdocs1.toIndex=5 -Dcorpus.numDocs=100
```
##########
tika-e2e-tests/README.md:
##########
@@ -0,0 +1,59 @@
+# Apache Tika End-to-End Tests
+
+End-to-end integration tests for Apache Tika components.
+
+## Overview
+
+This module contains standalone end-to-end (E2E) tests for various Apache Tika
distribution formats and deployment modes. Unlike unit and integration tests in
the main Tika build, these E2E tests validate complete deployment scenarios
using Docker containers and real-world test data.
+
+**Note:** This module is included in the main Tika build under the `e2e` Maven
profile (`-Pe2e`). Run `mvn test -Pe2e` from the repo root to execute these
tests.
+
+## Test Modules
+
+- **tika-grpc** - E2E tests for tika-grpc server
+
+## Prerequisites
+
+- Java 17 or later
+- Maven 3.6 or later
+- Internet connection (for downloading test documents)
Review Comment:
This README lists an internet connection as a prerequisite, but the tests
default to using committed fixtures and only download GovDocs1 when explicitly
opted in (e.g., `-Dtika.e2e.useGovdocs=true`). Consider updating this
prerequisite to make the network requirement conditional, to avoid implying
CI/dev runs always need internet access.
```suggestion
- Internet connection (only when running tests that download external
corpora, e.g. with `-Dtika.e2e.useGovdocs=true`)
```
##########
tika-e2e-tests/tika-grpc/README.md:
##########
@@ -0,0 +1,84 @@
+# Tika gRPC End-to-End Tests
+
+End-to-end integration tests for Apache Tika gRPC Server.
+
+## Overview
+
+This test module validates the functionality of Apache Tika gRPC Server by:
+- Starting a local tika-grpc server using the Maven exec plugin (default)
+- Parsing small committed test fixture documents
+- Testing various fetchers (filesystem, Ignite config store, etc.)
+- Verifying parsing results and metadata extraction
+
+## Prerequisites
+
+- Java 17 or later
+- Maven 3.6 or later
+- Docker and Docker Compose (only required when using
`tika.e2e.useLocalServer=false`)
+
+## Building
+
+```bash
+../../mvnw clean install
+```
+
+## Running Tests
+
+### Run all tests (default: local server mode, committed fixtures)
+
+```bash
+../../mvnw test
+```
+
+### Run specific test
+
+```bash
+../../mvnw test -Dtest=FileSystemFetcherTest
+../../mvnw test -Dtest=IgniteConfigStoreTest
+```
+
+### Test with the full GovDocs1 corpus (opt-in)
+
+By default tests use small committed fixture files. To run against the real
GovDocs1 corpus, set `govdocs1.fromIndex` to trigger a download:
+
+```bash
+../../mvnw test -Dgovdocs1.fromIndex=1 -Dgovdocs1.toIndex=1
+```
Review Comment:
The README says GovDocs1 download is triggered by setting
`govdocs1.fromIndex`/`govdocs1.toIndex`, but the code actually gates downloads
behind `-Dtika.e2e.useGovdocs=true` and otherwise uses committed fixtures.
Please update these instructions so the documented command matches the actual
opt-in flag and behavior.
##########
.github/workflows/main-jdk17-build.yml:
##########
@@ -45,3 +45,22 @@ jobs:
cache: 'maven'
- name: Build with Maven
run: mvn clean apache-rat:check test install javadoc:aggregate -Pci -B
"-Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn"
+
+ e2e-tests:
+ runs-on: ubuntu-latest
+ timeout-minutes: 30
+ needs: build
+ strategy:
+ matrix:
+ java: [ '17' ]
+
+ steps:
+ - uses: actions/checkout@v4
+ - name: Set up JDK ${{ matrix.java }}
+ uses: actions/setup-java@v4
+ with:
+ distribution: 'temurin'
+ java-version: ${{ matrix.java }}
+ cache: 'maven'
+ - name: Run E2E Tests
+ run: mvn clean install -Pe2e -B
"-Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn"
Review Comment:
The E2E job runs `mvn clean install -Pe2e` from the repo root, which
rebuilds the entire reactor a second time after the main `build` job. To reduce
CI time and duplication, consider scoping this step to just the e2e module
(e.g., `-pl tika-e2e-tests -am` and running `test`/`verify` instead of a full
`install`), or reusing artifacts from the `build` job.
```suggestion
run: mvn -pl tika-e2e-tests -am clean verify -Pe2e -B "-Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn"
```
> Upgrade Ignite config store to Ignite 3.x with Calcite SQL engine
> -----------------------------------------------------------------
>
> Key: TIKA-4606
> URL: https://issues.apache.org/jira/browse/TIKA-4606
> Project: Tika
> Issue Type: Improvement
> Reporter: Nicholas DiPiazza
> Assignee: Nicholas DiPiazza
> Priority: Major
>
> h2. Overview
> Upgrade the tika-pipes-config-store-ignite module from Apache Ignite 2.17.0
> (which uses H2 1.4.x) to Apache Ignite 3.x (which uses Apache Calcite SQL
> engine).
> h2. Current State
> * Module: *tika-pipes-config-store-ignite*
> * Ignite Version: 2.17.0
> * SQL Engine: H2 1.4.197 (embedded)
> * Location: {{tika-pipes/tika-pipes-config-store-ignite/}}
> h2. Goals
> # Upgrade to Apache Ignite 3.x (latest stable release)
> # Replace H2 SQL engine with Calcite-based SQL engine
> # Maintain all existing functionality for config store
> # Update API calls to match Ignite 3.x breaking changes
> # Ensure backward compatibility for stored configurations (if possible)
> h2. Benefits
> * Modern SQL engine with Apache Calcite
> * Better performance and query optimization
> * Active maintenance and future support
> * Improved SQL feature set
> * No dependency on old H2 1.4.x (2018)
> h2. Breaking Changes to Address
> * Ignite 3.x has major API changes from 2.x
> * Configuration format changes
> * Cache API differences
> * SQL query API updates
> * Client connection changes
> h2. Implementation Steps
> # Research Ignite 3.x API changes and migration guide
> # Update Maven dependencies to Ignite 3.x
> # Refactor {{IgniteConfigStore}} to use new Ignite 3.x API
> # Update {{IgniteStoreServer}} for new connection model
> # Modify SQL queries if needed for Calcite compatibility
> # Update configuration handling
> # Update tests to work with Ignite 3.x
> # Test backward compatibility with existing configs
> # Update documentation
> h2. Acceptance Criteria
> * Ignite upgraded to version 3.x (latest stable)
> * Uses Calcite SQL engine instead of H2
> * All existing tests pass
> * Config store functionality preserved
> * No H2 dependencies remain
> * Documentation updated
> h2. References
> * Apache Ignite 3.x: https://ignite.apache.org/docs/3.0.0/
> * Ignite 3.x Migration Guide
> * Apache Calcite: https://calcite.apache.org/
> * Current module: {{tika-pipes/tika-pipes-config-store-ignite/}}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)