This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch haystack-tika-eval-integration in repository https://gitbox.apache.org/repos/asf/tika.git
commit 1777a9bbc6e9ca530f4192879371029d53e62a1f Author: tballison <[email protected]> AuthorDate: Wed May 13 12:42:03 2026 -0400 add tika-eval into tika-app --- tika-app/pom.xml | 6 ++++++ tika-eval/tika-eval-core/pom.xml | 6 ++++++ .../org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java | 2 ++ .../java/org/apache/tika/config/loader/ComponentInstantiator.java | 4 +++- 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/tika-app/pom.xml b/tika-app/pom.xml index 7e33098259..93f1394c92 100644 --- a/tika-app/pom.xml +++ b/tika-app/pom.xml @@ -68,6 +68,12 @@ <artifactId>tika-ml-junkdetect</artifactId> <version>${project.version}</version> </dependency> + <!-- tika-eval: TikaEvalMetadataFilter (tokens, OOV, lang, languageness) --> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-eval-core</artifactId> + <version>${project.version}</version> + </dependency> <dependency> <groupId>${project.groupId}</groupId> <artifactId>tika-xmp</artifactId> diff --git a/tika-eval/tika-eval-core/pom.xml b/tika-eval/tika-eval-core/pom.xml index 28bf95a442..5bf5da8a53 100644 --- a/tika-eval/tika-eval-core/pom.xml +++ b/tika-eval/tika-eval-core/pom.xml @@ -29,6 +29,12 @@ <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-annotation-processor</artifactId> + <version>${revision}</version> + <scope>provided</scope> + </dependency> <dependency> <groupId>${project.groupId}</groupId> <artifactId>tika-core</artifactId> diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java index 945ba84ee8..7c015b79a4 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java @@ -22,6 +22,7 @@ import java.util.Map; import org.apache.commons.lang3.StringUtils; +import org.apache.tika.config.TikaComponent; import org.apache.tika.eval.core.langid.LanguageIDWrapper; import org.apache.tika.eval.core.textstats.BasicTokenCountStatsCalculator; import org.apache.tika.eval.core.textstats.CommonTokens; @@ -37,6 +38,7 @@ import org.apache.tika.metadata.filter.MetadataFilterBase; import org.apache.tika.ml.junkdetect.JunkDetector; import org.apache.tika.quality.TextQualityScore; +@TikaComponent public class TikaEvalMetadataFilter extends MetadataFilterBase { public static String TIKA_EVAL_NS = "tika-eval" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java index f82daa581c..4b87ba78ca 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java @@ -127,7 +127,9 @@ public class ComponentInstantiator { Constructor<?> constructor = componentClass.getConstructor(JsonConfig.class); String jsonString = configNode != null ? configNode.toString() : "{}"; JsonConfig jsonConfig = () -> jsonString; - return (T) constructor.newInstance(jsonConfig); + T component = (T) constructor.newInstance(jsonConfig); + initializeIfNeeded(component); + return component; } catch (NoSuchMethodException e) { // No JsonConfig constructor, fall back to other methods }
