This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4731-common-script in repository https://gitbox.apache.org/repos/asf/tika.git
commit b4929faa75e0514c148db9a6b756d260677ca66e Author: tallison <[email protected]> AuthorDate: Wed May 20 14:28:43 2026 -0400 TIKA-4731 - checkpoint before "common" refactoring --- .../apache/tika/ml/chardetect/AdaptiveProbe.java | 80 ++ .../tika/ml/chardetect/AdaptiveProbeTest.java | 118 +++ .../tika/ml/junkdetect/HtmlContentCleaner.java | 108 +++ .../tools/BuildJunkAugmentationData.java | 862 +++++++++++++++++++++ .../tools/BuildJunkAugmentationDataTest.java | 444 +++++++++++ 5 files changed, 1612 insertions(+) diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/AdaptiveProbe.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/AdaptiveProbe.java new file mode 100644 index 0000000000..34a081ec32 --- /dev/null +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/AdaptiveProbe.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.chardetect; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.commons.io.IOUtils; + +import org.apache.tika.io.TikaInputStream; + +/** + * Reads an encoding-detection probe sized by <em>content</em>, not raw bytes. + * + * <p>A fixed raw probe (e.g. 16 KB) starves the detectors when a page + * leads with a large {@code <head>}/inline script: after tag-stripping there's + * little body text, and the bytes that distinguish charsets sit past the + * window. This grows the read until ~{@code contentTarget} bytes of + * tag-stripped content are present, capped at {@code rawCap} raw bytes. + * + * <p>Text-rich pages stop early (~one chunk); markup-heavy pages read deeper, + * bounded by the cap. Multi-byte encodings (UTF-16/32) register no ASCII tags + * so they stop at {@code contentTarget} raw bytes. + */ +public final class AdaptiveProbe { + + /** Default body-content target. */ + public static final int DEFAULT_CONTENT_TARGET = 16384; + /** Default hard ceiling on raw bytes read. */ + public static final int DEFAULT_RAW_CAP = 524288; + + private AdaptiveProbe() { + } + + /** + * Reads from {@code tis} (mark/reset preserved) until tag-stripped content + * reaches {@code contentTarget}, the raw read reaches {@code rawCap}, or + * EOF — whichever first. Returns the raw bytes read. + */ + public static byte[] read(TikaInputStream tis, int contentTarget, int rawCap) + throws IOException { + tis.mark(rawCap); + try { + byte[] buf = new byte[rawCap]; + byte[] stripDst = new byte[rawCap]; + int total = 0; + while (total < rawCap) { + int want = Math.min(rawCap - total, contentTarget); + int n = IOUtils.read(tis, buf, total, want); + total += n; + HtmlByteStripper.Result r = + HtmlByteStripper.stripTags(buf, 0, total, stripDst, 0); + int content = r.tagCount > 0 ? r.length : total; + if (content >= contentTarget || n < want) { + break; // enough body text, or EOF + } + } + if (total == 0) { + return new byte[0]; + } + return total == rawCap ? buf : Arrays.copyOf(buf, total); + } finally { + tis.reset(); + } + } +} diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/AdaptiveProbeTest.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/AdaptiveProbeTest.java new file mode 100644 index 0000000000..44c97d8c92 --- /dev/null +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/AdaptiveProbeTest.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.chardetect; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; + +import org.junit.jupiter.api.Test; + +import org.apache.tika.io.TikaInputStream; + +public class AdaptiveProbeTest { + + private static byte[] probe(byte[] raw, int contentTarget, int rawCap) throws IOException { + try (TikaInputStream tis = TikaInputStream.get(raw)) { + return AdaptiveProbe.read(tis, contentTarget, rawCap); + } + } + + /** Plain text below the content target: read everything, stop at EOF. */ + @Test + public void shortPlainTextReadsToEof() throws Exception { + byte[] raw = "hello world, this is plain text".getBytes(StandardCharsets.UTF_8); + byte[] p = probe(raw, 16384, 524288); + assertEquals(raw.length, p.length); + } + + /** No tags: content == raw, so the read stops at the content target. */ + @Test + public void plainTextStopsAtContentTarget() throws Exception { + byte[] raw = new byte[200_000]; + java.util.Arrays.fill(raw, (byte) 'a'); + byte[] p = probe(raw, 16384, 524288); + // Stops within one chunk past the target (chunked by contentTarget). + assertTrue(p.length >= 16384 && p.length <= 32768, + "expected ~content target, got " + p.length); + } + + /** Markup-heavy lead: must read past 16 KB raw to accumulate body text. */ + @Test + public void markupHeavyReadsPastFixedWindow() throws Exception { + StringBuilder sb = new StringBuilder(); + // ~40 KB of tags yielding almost no text, then real body content. + for (int i = 0; i < 4000; i++) { + sb.append("<div class=\"x\"></div>"); + } + int markupBytes = sb.length(); + for (int i = 0; i < 20000; i++) { + sb.append("content "); + } + byte[] raw = sb.toString().getBytes(StandardCharsets.UTF_8); + byte[] p = probe(raw, 16384, 524288); + assertTrue(p.length > markupBytes, + "should read past the markup block (" + markupBytes + "), got " + p.length); + } + + /** All markup, no body: bounded by the raw cap. */ + @Test + public void allMarkupBoundedByRawCap() throws Exception { + StringBuilder sb = new StringBuilder(); + while (sb.length() < 300_000) { + sb.append("<a href=\"#\"></a>"); + } + byte[] raw = sb.toString().getBytes(StandardCharsets.UTF_8); + int rawCap = 65536; + byte[] p = probe(raw, 16384, rawCap); + assertEquals(rawCap, p.length); + } + + /** Multi-byte text with no ASCII tags stops at the content target (no over-read). */ + @Test + public void utf16NoTagsStopsAtContentTarget() throws Exception { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < 100_000; i++) { + sb.append('中'); // CJK; no '<' so tagCount stays 0 + } + byte[] raw = sb.toString().getBytes(StandardCharsets.UTF_16LE); + byte[] p = probe(raw, 16384, 524288); + assertTrue(p.length >= 16384 && p.length <= 32768, + "no tags -> content==raw -> stop near target, got " + p.length); + } + + /** mark/reset must leave the stream fully re-readable. */ + @Test + public void streamIsResetAfterProbe() throws Exception { + byte[] raw = new byte[100_000]; + java.util.Arrays.fill(raw, (byte) 'z'); + try (TikaInputStream tis = TikaInputStream.get(raw)) { + AdaptiveProbe.read(tis, 16384, 524288); + byte[] all = tis.readAllBytes(); + assertEquals(raw.length, all.length); + } + } + + /** Empty input returns an empty array, never null. */ + @Test + public void emptyInputReturnsEmpty() throws Exception { + byte[] p = probe(new byte[0], 16384, 524288); + assertEquals(0, p.length); + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/HtmlContentCleaner.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/HtmlContentCleaner.java new file mode 100644 index 0000000000..e531730bce --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/HtmlContentCleaner.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.junkdetect; + +import java.nio.charset.StandardCharsets; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.tika.ml.chardetect.HtmlByteStripper; + +/** + * Junk-detection HTML→text cleaning: strip tags, then <em>expand</em> entities + * to codepoints. Called by both {@code TrainJunkModel} and {@link + * JunkFilterEncodingDetector} so training and inference prepare text identically + * (no drift). + * + * <p>Unlike charset detection's {@link HtmlByteStripper#stripTagsAndEntities} + * (which <em>drops</em> entities as charset-neutral ASCII noise), junk detection + * expands them: the resulting codepoints — cross-script under a wrong decoding — + * are what expose mojibake. + */ +public final class HtmlContentCleaner { + + private static final Pattern ENTITY_DEC = Pattern.compile("&#(\\d{1,7});"); + private static final Pattern ENTITY_HEX = Pattern.compile("&#[xX]([0-9a-fA-F]{1,6});"); + private static final Pattern ENTITY_NAMED = + Pattern.compile("&(amp|lt|gt|quot|apos|nbsp|copy|reg);"); + + private HtmlContentCleaner() { + } + + /** + * Strip HTML tags (entities preserved through the strip), then expand + * entities to real codepoints. No-op-ish on plain text (no tags/entities). + */ + public static String clean(String s) { + if (s == null || s.isEmpty()) { + return s; + } + // Tag-strip via the byte stripper on the string's UTF-8 form; keeps + // entities (stripTags, not stripTagsAndEntities) for the expand below. + byte[] u = s.getBytes(StandardCharsets.UTF_8); + byte[] dst = new byte[u.length]; + HtmlByteStripper.Result r = HtmlByteStripper.stripTags(u, 0, u.length, dst, 0); + String tagless = (r.tagCount > 0 && r.length > 0) + ? new String(dst, 0, r.length, StandardCharsets.UTF_8) + : s; + return expandHtmlEntities(tagless); + } + + /** + * Expand HTML entities to codepoints. Numeric refs ({@code ©}, + * {@code ©}) are fully decoded; a small set of common named entities + * is mapped; other named entities pass through literally. + */ + static String expandHtmlEntities(String s) { + s = ENTITY_DEC.matcher(s).replaceAll(mr -> { + try { + int cp = Integer.parseInt(mr.group(1)); + if (cp >= 0 && cp <= 0x10FFFF) { + return Matcher.quoteReplacement(new String(Character.toChars(cp))); + } + } catch (NumberFormatException ignored) { + // overflow — leave literal + } + return Matcher.quoteReplacement(mr.group()); + }); + s = ENTITY_HEX.matcher(s).replaceAll(mr -> { + try { + int cp = Integer.parseInt(mr.group(1), 16); + if (cp >= 0 && cp <= 0x10FFFF) { + return Matcher.quoteReplacement(new String(Character.toChars(cp))); + } + } catch (NumberFormatException ignored) { + // overflow — leave literal + } + return Matcher.quoteReplacement(mr.group()); + }); + s = ENTITY_NAMED.matcher(s).replaceAll(mr -> { + switch (mr.group(1)) { + case "amp": return "&"; + case "lt": return "<"; + case "gt": return ">"; + case "quot": return "\""; + case "apos": return "'"; + case "nbsp": return " "; + case "copy": return "©"; + case "reg": return "®"; + default: return Matcher.quoteReplacement(mr.group()); + } + }); + return s; + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationData.java b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationData.java new file mode 100644 index 0000000000..ad05cf5292 --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationData.java @@ -0,0 +1,862 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.junkdetect.tools; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.Reader; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.TreeMap; +import java.util.stream.Stream; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.serialization.JsonMetadataList; + +/** + * Augments per-script {@code <script>.train.gz} files with quality-filtered text + * extracted from tika-app RMETA JSON output. + * + * <p>Built to address a corpus-imbalance pathology in the bundled junk-detector + * model: the primary training corpus (MADLAD + Wikipedia) is clean linguistic + * text and carries almost no HTML symbols (©, ®, ™, €, £), so on web pages those + * bytes look anomalously surprising relative to the rest of the bigram + * distribution and tip charset-decoding decisions toward whichever encoding + * happens to place a frequent training letter at the same byte position. + * + * <p>This tool is strictly additive: originals in {@code --baseline} are never + * modified; output goes to a fresh directory. The {@code .dev.gz} / {@code + * .test.gz} splits are copied verbatim (no augmentation), preserving evaluation + * integrity. + * + * <h2>Pipeline</h2> + * <ol> + * <li>For each {@code .json} extract under {@code --extracts} (tika-app output + * via {@link JsonMetadataList}), read {@code X-TIKA:content}.</li> + * <li>Per document: determine the dominant Unicode script (non-COMMON / non- + * INHERITED tally) and drop documents whose dominant-script fraction is + * below {@link #MIN_DOC_SCRIPT_DOMINANCE} — mixed-script pages are too + * ambiguous to attribute cleanly to one bucket.</li> + * <li>Split the content into ~{@value #TARGET_CHUNK_CHARS}-char chunks on + * newlines (then on whitespace if a line is longer than the cap).</li> + * <li>Run each chunk through {@link BuildJunkTrainingData#filterSentence( + * String, int, double, Character.UnicodeScript, double)} using the same + * MIN_BYTES / MAX_PUNC_FRAC / MIN_TARGET_SCRIPT_FRAC values from + * {@link JunkDetectorTrainingConfig}.</li> + * <li>Per-script gate: a script must reach at least {@code --min-docs} quality- + * filtered documents or its bucket is skipped (prevents asymmetric + * augmentation favouring data-rich scripts).</li> + * <li>Per-script cap: append at most {@code min(--hard-cap, + * --baseline-frac-cap × baselineLineCount)} lines so the original + * distribution is preserved.</li> + * <li>Output: baseline {@code .train.gz} is decompressed, augmentation lines + * are appended, and the file is recompressed at the output path. Other + * split files are byte-copied.</li> + * </ol> + * + * <h2>Safety</h2> + * Refuses to start when {@code --output} resolves to the same directory as + * {@code --baseline}. Documents that do not pass the per-doc script-dominance + * gate are dropped, not reassigned to a different bucket. + */ +public final class BuildJunkAugmentationData { + + /** Hard ceiling on appended lines per script when not constrained by baseline-frac cap. */ + public static final int DEFAULT_HARD_CAP_LINES = 3_000; + + /** Cap appended lines at this fraction of the baseline train-line count. */ + public static final double DEFAULT_BASELINE_FRAC_CAP = 0.10; + + /** + * Per-script gate: a script must have at least this many quality-filtered + * source documents before any augmentation is appended. Avoids creating an + * asymmetric bias from a single noisy page. + */ + public static final int DEFAULT_MIN_DOCS = 500; + + /** + * Reject a document whose dominant Unicode script accounts for less than + * this fraction of its non-COMMON / non-INHERITED codepoints. Mixed-script + * pages are dropped rather than attributed. + */ + public static final double MIN_DOC_SCRIPT_DOMINANCE = 0.80; + + /** Target character length when chunking long lines from extract content. */ + public static final int TARGET_CHUNK_CHARS = 300; + + /** Hard upper bound on chunk size; longer lines are sliced. */ + public static final int MAX_CHUNK_CHARS = 600; + + /** Minimum content length (chars) before a document is considered at all. */ + public static final int MIN_DOC_CHARS = 500; + + /** + * Structural test for UTF-8 source decoded as windows-1252. UTF-8 multi- + * byte lead bytes (0xC2–0xDF) followed by continuation bytes (0x80–0xBF) + * decode as Latin-Supplement letters ("Â"–"ß") immediately followed by C1 + * controls / typographic codepoints ("€"–"¿"). Legitimate German/French/ + * Italian text essentially never produces this exact bigram shape because + * those codepoints aren't normally adjacent to Latin letters. + * + * <p>This is NOT a content-quality heuristic (that's JunkDetector's job — + * and it can't catch this, because mojibake'd Latin is still bigram-wise + * "Latin-like"). It's a check for a specific, well-known encoding + * pathology: UTF-8 misread as windows-1252. A chunk with this many of + * those bigrams is essentially guaranteed mojibake and gets dropped to + * keep contaminated samples out of the LATIN bigram table. + */ + public static final int MAX_UTF8_AS_WIN1252_BIGRAMS = 1; + + /** + * Maximum OOV (tika-eval out-of-vocabulary rate) for a doc to be accepted. + * Applied only when a profile CSV is provided. Negative OOV (e.g. tika-eval + * has no word list for the detected language, like Tibetan) bypasses this + * gate so we don't unfairly drop content from unsupported languages. + */ + public static final double DEFAULT_MAX_OOV = 0.5; + + /** + * Minimum LANGUAGENESS for a doc to be accepted. Applied only when a profile + * CSV is provided. LANGUAGENESS sums to {@code (langProb1 - OOV)} normalized + * to a per-doc score; ≥0 means "more in-vocabulary than out-of-vocabulary". + */ + public static final double DEFAULT_MIN_LANGNESS = 0.0; + + /** + * win-1252 symbols that collide with ISO-8859-2 letters (bytes 0xA0-0xBF) + * or are otherwise web-common but starved in clean linguistic corpora. + * These are the bytes whose ISO-8859-2 re-decode produces a confusable + * Central-European letter (© → Š, ® → Ž, £ → Ł, ¥ → Ľ, ¦ → Ś, µ → ľ, + * ¶ → ś, ¼ → ź, ¾ → ž), plus ™/€ which are web-ubiquitous. Symbol-aware + * selection biases the LATIN augmentation toward chunks containing these + * so they reach confident bigram-table density (z1), widening the thin + * win-1252-vs-ISO-8859-2 margin. + */ + public static final String SYMBOL_TARGETS = "©®™€£¥¦µ¶¼½¾"; + + /** + * Fraction of the per-script cap reserved for symbol-bearing chunks when + * {@code --symbol-boost} is set. 0 disables (random selection, original + * behaviour). Only applied to the LATIN bucket — the win-1252/ISO-8859-2 + * symbol→letter collision is Latin-specific. + */ + public static final double DEFAULT_SYMBOL_BOOST_FRAC = 0.0; + + private BuildJunkAugmentationData() { + } + + public static void main(String[] args) throws IOException { + Path extractsDir = null; + Path baselineDir = null; + Path outputDir = null; + Path profileCsv = null; + boolean dryRun = false; + int hardCap = DEFAULT_HARD_CAP_LINES; + double fracCap = DEFAULT_BASELINE_FRAC_CAP; + int minDocs = DEFAULT_MIN_DOCS; + double maxOov = DEFAULT_MAX_OOV; + double minLangness = DEFAULT_MIN_LANGNESS; + double symbolBoost = DEFAULT_SYMBOL_BOOST_FRAC; + long seed = JunkDetectorTrainingConfig.SEED; + + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "--extracts": + extractsDir = Paths.get(args[++i]); + break; + case "--baseline": + baselineDir = Paths.get(args[++i]); + break; + case "--output": + outputDir = Paths.get(args[++i]); + break; + case "--profile-csv": + profileCsv = Paths.get(args[++i]); + break; + case "--max-oov": + maxOov = Double.parseDouble(args[++i]); + break; + case "--min-langness": + minLangness = Double.parseDouble(args[++i]); + break; + case "--symbol-boost": + symbolBoost = Double.parseDouble(args[++i]); + break; + case "--dry-run": + dryRun = true; + break; + case "--hard-cap": + hardCap = Integer.parseInt(args[++i]); + break; + case "--baseline-frac-cap": + fracCap = Double.parseDouble(args[++i]); + break; + case "--min-docs": + minDocs = Integer.parseInt(args[++i]); + break; + case "--seed": + seed = Long.parseLong(args[++i]); + break; + default: + System.err.println("Unknown argument: " + args[i]); + printUsage(); + System.exit(1); + } + } + if (extractsDir == null || baselineDir == null || outputDir == null) { + printUsage(); + System.exit(1); + } + + if (!Files.isDirectory(extractsDir)) { + System.err.println("ERROR: --extracts directory not found: " + extractsDir); + System.exit(1); + } + if (!Files.isDirectory(baselineDir)) { + System.err.println("ERROR: --baseline directory not found: " + baselineDir); + System.exit(1); + } + if (Files.exists(outputDir) && Files.isSameFile(outputDir, baselineDir)) { + System.err.println("ERROR: --output must differ from --baseline."); + System.exit(1); + } + + System.out.println("=== BuildJunkAugmentationData ==="); + System.out.println(" extracts: " + extractsDir); + System.out.println(" baseline: " + baselineDir); + System.out.println(" output: " + outputDir); + System.out.println(" profile-csv: " + (profileCsv == null ? "(none)" : profileCsv)); + System.out.println(" max-oov: " + maxOov); + System.out.println(" min-langness: " + minLangness); + System.out.println(" symbol-boost: " + symbolBoost + + (symbolBoost > 0 ? " (LATIN only, targets: " + SYMBOL_TARGETS + ")" : " (off)")); + System.out.println(" hard-cap: " + hardCap); + System.out.println(" baseline-frac-cap: " + fracCap); + System.out.println(" min-docs: " + minDocs); + System.out.println(" seed: " + seed); + System.out.println(" dry-run: " + dryRun); + + // Load tika-eval profile CSV if provided. + Map<String, ProfileRow> profiles = profileCsv == null + ? null + : loadProfileCsv(profileCsv); + if (profiles != null) { + System.out.printf(" loaded %,d profile rows%n", profiles.size()); + } + + // --- Phase 1: discover baseline scripts + line counts ------------------- + System.out.println("\n--- Phase 1: scanning baseline train files ---"); + Map<String, Long> baselineLineCounts = scanBaselineLineCounts(baselineDir); + for (Map.Entry<String, Long> e : baselineLineCounts.entrySet()) { + System.out.printf(" %-20s baseline=%,d lines%n", e.getKey(), e.getValue()); + } + + // --- Phase 2: walk extracts --------------------------------------------- + System.out.println("\n--- Phase 2: scanning extracts ---"); + // Per-script: list of accepted chunks (one chunk per line). Doc-level + // gating is by counting how many docs contributed ≥ 1 accepted chunk. + Map<String, List<String>> scriptChunks = new TreeMap<>(); + Map<String, Integer> scriptDocCount = new TreeMap<>(); + // Diagnostics: how many docs we saw total / per stage. + long totalSeen = 0; + long droppedNoContent = 0; + long droppedShort = 0; + long droppedMixedScript = 0; + long droppedNoBaseline = 0; + long droppedOov = 0; + long droppedLangness = 0; + long droppedNoProfile = 0; + long accepted = 0; + + try (Stream<Path> walk = Files.walk(extractsDir)) { + for (Path file : (Iterable<Path>) walk + .filter(Files::isRegularFile) + .filter(p -> p.getFileName().toString().endsWith(".json"))::iterator) { + totalSeen++; + String content = readContent(file); + if (content == null || content.isEmpty()) { + droppedNoContent++; + continue; + } + if (content.length() < MIN_DOC_CHARS) { + droppedShort++; + continue; + } + // Profile gate (OOV / LANGUAGENESS), if a profile CSV was loaded. + if (profiles != null) { + String key = profileKey(extractsDir, file); + ProfileRow pr = profiles.get(key); + if (pr == null) { + droppedNoProfile++; + continue; + } + // Negative OOV means tika-eval has no word list for the + // detected language; we don't penalise those docs. + if (pr.oov >= 0 && pr.oov > maxOov) { + droppedOov++; + continue; + } + if (pr.langness < minLangness) { + droppedLangness++; + continue; + } + } + DocScript ds = dominantScript(content); + if (ds.script == null || ds.dominance < MIN_DOC_SCRIPT_DOMINANCE) { + droppedMixedScript++; + continue; + } + String scriptName = ds.script.name(); + if (!baselineLineCounts.containsKey(scriptName.toLowerCase())) { + // No baseline bucket for this script — nothing to augment. + droppedNoBaseline++; + continue; + } + List<String> chunks = chunk(content); + List<String> filtered = new ArrayList<>(chunks.size()); + for (String c : chunks) { + String ok = BuildJunkTrainingData.filterSentence( + c, + JunkDetectorTrainingConfig.MIN_BYTES_PER_SENTENCE, + JunkDetectorTrainingConfig.MAX_PUNC_FRAC, + ds.script, + JunkDetectorTrainingConfig.MIN_TARGET_SCRIPT_FRAC); + if (ok == null) { + continue; + } + if (countUtf8AsWin1252Bigrams(ok) > MAX_UTF8_AS_WIN1252_BIGRAMS) { + continue; + } + filtered.add(ok); + } + if (filtered.isEmpty()) { + continue; + } + accepted++; + scriptChunks.computeIfAbsent(scriptName, k -> new ArrayList<>()) + .addAll(filtered); + scriptDocCount.merge(scriptName, 1, Integer::sum); + } + } + + System.out.printf(" total extracts seen: %,d%n", totalSeen); + System.out.printf(" dropped no-content: %,d%n", droppedNoContent); + System.out.printf(" dropped short: %,d%n", droppedShort); + if (profiles != null) { + System.out.printf(" dropped no-profile: %,d%n", droppedNoProfile); + System.out.printf(" dropped OOV>%.2f: %,d%n", maxOov, droppedOov); + System.out.printf(" dropped langness<%.2f: %,d%n", minLangness, droppedLangness); + } + System.out.printf(" dropped mixed-script: %,d%n", droppedMixedScript); + System.out.printf(" dropped no-baseline: %,d%n", droppedNoBaseline); + System.out.printf(" contributed ≥1 chunk: %,d%n", accepted); + + // --- Phase 3: apply per-script gates and caps --------------------------- + System.out.println("\n--- Phase 3: per-script gating and capping ---"); + Random rng = new Random(seed); + Map<String, List<String>> finalLines = new LinkedHashMap<>(); + // Per-script report rows for the manifest. + Map<String, long[]> manifest = new TreeMap<>(); + // columns: docs, chunks_pre_cap, lines_appended, baseline_lines, cap + + for (Map.Entry<String, List<String>> entry : scriptChunks.entrySet()) { + String script = entry.getKey(); + List<String> chunks = entry.getValue(); + int docs = scriptDocCount.getOrDefault(script, 0); + long baselineLines = baselineLineCounts.getOrDefault(script.toLowerCase(), 0L); + long fracCapVal = (long) Math.floor(baselineLines * fracCap); + long cap = Math.min(hardCap, fracCapVal); + + if (docs < minDocs) { + System.out.printf( + " SKIP %-20s docs=%-6d (<%d gate) chunks=%,d cap=%,d%n", + script, docs, minDocs, chunks.size(), cap); + manifest.put(script, new long[]{docs, chunks.size(), 0, baselineLines, cap}); + continue; + } + + Collections.shuffle(chunks, rng); + List<String> kept; + // Symbol-aware selection — LATIN only (the win-1252/ISO-8859-2 + // symbol→letter collision is Latin-specific). Reserve a quota of + // the cap for symbol-bearing chunks so ©/®/£/etc. reach confident + // bigram-table density without inflating the total. + if (symbolBoost > 0 && "LATIN".equals(script) && chunks.size() > cap) { + List<String> withSym = new ArrayList<>(); + List<String> noSym = new ArrayList<>(); + for (String c : chunks) { + (containsTargetSymbol(c) ? withSym : noSym).add(c); + } + int quota = (int) Math.min((long) Math.floor(cap * symbolBoost), withSym.size()); + kept = new ArrayList<>((int) cap); + kept.addAll(withSym.subList(0, quota)); + int remaining = (int) cap - quota; + if (remaining > 0) { + kept.addAll(noSym.subList(0, Math.min(remaining, noSym.size()))); + } + // If noSym ran short, top up from leftover symbol-bearing chunks. + for (int k = quota; kept.size() < cap && k < withSym.size(); k++) { + kept.add(withSym.get(k)); + } + System.out.printf( + " KEEP %-20s docs=%-6d chunks=%,8d -> append=%,6d " + + "(baseline=%,d, cap=%,d, symbol-quota=%d, symbol-bearing-pool=%d)%n", + script, docs, chunks.size(), kept.size(), baselineLines, cap, + quota, withSym.size()); + } else { + kept = chunks.size() > cap + ? new ArrayList<>(chunks.subList(0, (int) cap)) + : chunks; + System.out.printf( + " KEEP %-20s docs=%-6d chunks=%,8d -> append=%,6d (baseline=%,d, cap=%,d)%n", + script, docs, chunks.size(), kept.size(), baselineLines, cap); + } + finalLines.put(script, kept); + manifest.put(script, + new long[]{docs, chunks.size(), kept.size(), baselineLines, cap}); + } + + if (dryRun) { + System.out.println("\nDry-run: skipping output writes."); + return; + } + + // --- Phase 4: write output ---------------------------------------------- + System.out.println("\n--- Phase 4: writing output ---"); + Files.createDirectories(outputDir); + // Copy or rewrite every baseline file. Any *.train.gz with a kept + // augmentation set is rewritten with appended lines; everything else + // is byte-copied so the output directory is a complete drop-in for + // TrainJunkModel. + try (Stream<Path> stream = Files.list(baselineDir)) { + for (Path src : (Iterable<Path>) stream.filter(Files::isRegularFile)::iterator) { + String name = src.getFileName().toString(); + Path dst = outputDir.resolve(name); + if (name.endsWith(".train.gz")) { + String script = name.substring(0, name.length() - ".train.gz".length()) + .toUpperCase(); + List<String> add = finalLines.get(script); + if (add != null && !add.isEmpty()) { + rewriteTrainWithAppend(src, dst, add); + System.out.printf(" WROTE %-30s +%,d lines appended%n", name, add.size()); + } else { + Files.copy(src, dst); + System.out.printf(" COPY %-30s (no augmentation)%n", name); + } + } else { + Files.copy(src, dst); + } + } + } + + // Manifest + Path manifestPath = outputDir.resolve("augmentation_manifest.tsv"); + try (BufferedWriter w = Files.newBufferedWriter(manifestPath, StandardCharsets.UTF_8)) { + w.write("script\tdocs\tchunks_pre_cap\tlines_appended\tbaseline_lines\tcap\n"); + for (Map.Entry<String, long[]> e : manifest.entrySet()) { + long[] r = e.getValue(); + w.write(String.format("%s\t%d\t%d\t%d\t%d\t%d%n", + e.getKey(), r[0], r[1], r[2], r[3], r[4])); + } + } + System.out.println("\nWrote manifest: " + manifestPath); + } + + // ------------------------------------------------------------------------- + // Phase helpers + // ------------------------------------------------------------------------- + + /** Returns lowercase script-name → line-count for every {@code *.train.gz}. */ + static Map<String, Long> scanBaselineLineCounts(Path baselineDir) throws IOException { + Map<String, Long> out = new TreeMap<>(); + try (Stream<Path> stream = Files.list(baselineDir)) { + for (Path p : (Iterable<Path>) stream + .filter(Files::isRegularFile) + .filter(x -> x.getFileName().toString().endsWith(".train.gz"))::iterator) { + String name = p.getFileName().toString(); + String script = name.substring(0, name.length() - ".train.gz".length()); + out.put(script, countGzLines(p)); + } + } + return out; + } + + static long countGzLines(Path path) throws IOException { + long count = 0; + try (BufferedReader r = new BufferedReader( + new InputStreamReader( + new GZIPInputStream(Files.newInputStream(path)), + StandardCharsets.UTF_8))) { + while (r.readLine() != null) { + count++; + } + } + return count; + } + + /** Reads {@code X-TIKA:content} from the first metadata record of a tika-app RMETA JSON. */ + static String readContent(Path jsonFile) { + try (Reader r = new InputStreamReader( + Files.newInputStream(jsonFile), StandardCharsets.UTF_8)) { + List<Metadata> list = JsonMetadataList.fromJson(r); + if (list == null || list.isEmpty()) { + return null; + } + return list.get(0).get(TikaCoreProperties.TIKA_CONTENT); + } catch (IOException e) { + return null; + } + } + + /** Result of dominant-script analysis for a document. */ + static final class DocScript { + final Character.UnicodeScript script; + final double dominance; + + DocScript(Character.UnicodeScript script, double dominance) { + this.script = script; + this.dominance = dominance; + } + } + + static DocScript dominantScript(String text) { + Map<Character.UnicodeScript, Long> counts = new LinkedHashMap<>(); + long total = 0; + for (int i = 0; i < text.length(); ) { + int cp = text.codePointAt(i); + Character.UnicodeScript s = Character.UnicodeScript.of(cp); + if (s != Character.UnicodeScript.COMMON + && s != Character.UnicodeScript.INHERITED + && s != Character.UnicodeScript.UNKNOWN) { + counts.merge(s, 1L, Long::sum); + total++; + } + i += Character.charCount(cp); + } + if (total == 0) { + return new DocScript(null, 0.0); + } + Character.UnicodeScript best = null; + long bestCount = 0; + for (Map.Entry<Character.UnicodeScript, Long> e : counts.entrySet()) { + if (e.getValue() > bestCount) { + bestCount = e.getValue(); + best = e.getKey(); + } + } + return new DocScript(best, (double) bestCount / total); + } + + /** + * Structural test for UTF-8 source decoded as windows-1252. + * + * <p>UTF-8 multi-byte sequences start with a lead byte in the 0xC2–0xDF + * range followed by one or more continuation bytes in 0x80–0xBF. When + * those same bytes are mis-decoded as windows-1252 they render as a + * Latin-Supplement letter (U+00C0–U+00DF) immediately followed by a C1 + * control or typographic codepoint (U+0080–U+00BF). Legitimate German / + * French / Italian / Spanish text essentially never produces this bigram + * shape — those high-Latin letters appear adjacent to other Latin letters, + * not to currency symbols / smart quotes / non-breaking spaces. + * + * <p>This is a structural cross-encoding test, not a content-quality + * heuristic. {@link org.apache.tika.ml.junkdetect.JunkDetector} cannot + * discriminate this case (empirically the zScore delta between mojibake'd + * Latin and clean Latin is <0.03, within noise), so the right tool is + * a structural check that targets this specific known encoding pathology. + */ + static int countUtf8AsWin1252Bigrams(String text) { + int n = 0; + for (int i = 0; i + 1 < text.length(); i++) { + char a = text.charAt(i); + char b = text.charAt(i + 1); + if (a >= 0x00C0 && a <= 0x00DF && b >= 0x0080 && b <= 0x00BF) { + n++; + } + } + return n; + } + + /** + * Splits content into sentence-shaped chunks of roughly + * {@link #TARGET_CHUNK_CHARS} characters. + * + * <p>Tika's text extraction inserts a newline between each HTML element, so + * a single paragraph that uses {@code <span>} / {@code <a>} / {@code <br>} + * arrives here as many short lines (headers, captions, list items, + * navigation labels). Treating each of those as a separate training + * sample produces a bigram table with too many one-off bigrams and too + * little within-chunk statistics — multi-byte scripts (HAN, HANGUL, etc.) + * suffer worst because their bytes-per-char ratio means the same + * per-byte minimum filter passes very small char counts through. + * + * <p>This chunker therefore greedily concatenates short newline-separated + * lines (with a single-space joiner) until they reach the target size, + * yielding chunks comparable in shape to MADLAD/Wikipedia sentences. Any + * source line that's already longer than {@link #MAX_CHUNK_CHARS} flushes + * the current buffer first and is then sliced at whitespace. + */ + static List<String> chunk(String content) { + List<String> out = new ArrayList<>(); + StringBuilder buf = new StringBuilder(); + for (String raw : content.split("\n")) { + String line = raw.replace('\t', ' ').strip(); + if (line.isEmpty()) { + continue; + } + // Collapse repeated whitespace to keep chunk shape comparable to + // sentence-level training samples. + line = line.replaceAll("\\s+", " "); + if (line.length() > MAX_CHUNK_CHARS) { + // Flush whatever's been accumulating, then slice the long line. + if (buf.length() > 0) { + out.add(buf.toString()); + buf.setLength(0); + } + int start = 0; + while (start < line.length()) { + int end = Math.min(start + TARGET_CHUNK_CHARS, line.length()); + if (end < line.length()) { + int hardCap = Math.min(start + MAX_CHUNK_CHARS, line.length()); + int ws = line.indexOf(' ', end); + if (ws >= 0 && ws < hardCap) { + end = ws; + } else { + end = hardCap; + } + } + String piece = line.substring(start, end).strip(); + if (!piece.isEmpty()) { + out.add(piece); + } + start = end + 1; // skip whitespace boundary + } + continue; + } + // Short line: accumulate into the buffer. Emit when the joiner + // plus the new line would exceed the target. + int joinerLen = buf.length() > 0 ? 1 : 0; + if (buf.length() + joinerLen + line.length() > TARGET_CHUNK_CHARS + && buf.length() > 0) { + out.add(buf.toString()); + buf.setLength(0); + } + if (buf.length() > 0) { + buf.append(' '); + } + buf.append(line); + } + if (buf.length() > 0) { + out.add(buf.toString()); + } + return out; + } + + /** + * Decompresses {@code src}, writes every original line to {@code dst}, then + * appends the {@code extra} lines, and recompresses. Single gzip member — + * round-trips identically through {@link GZIPInputStream}. + */ + static void rewriteTrainWithAppend(Path src, Path dst, List<String> extra) + throws IOException { + try (BufferedReader r = new BufferedReader( + new InputStreamReader( + new GZIPInputStream(Files.newInputStream(src)), + StandardCharsets.UTF_8)); + BufferedWriter w = new BufferedWriter( + new OutputStreamWriter( + new GZIPOutputStream(Files.newOutputStream(dst)), + StandardCharsets.UTF_8))) { + String line; + while ((line = r.readLine()) != null) { + w.write(line); + w.newLine(); + } + for (String s : extra) { + w.write(s); + w.newLine(); + } + } + } + + /** Per-doc quality fields from a tika-eval Profile run. */ + static final class ProfileRow { + final double oov; + final double langness; + final String lang; + + ProfileRow(double oov, double langness, String lang) { + this.oov = oov; + this.langness = langness; + this.lang = lang; + } + } + + /** + * Loads a tika-eval Profile CSV export (H2 default CSV format: comma- + * separated, double-quoted fields). Expected columns: {@code FILE_PATH, + * OOV, LANGNESS, LANG}. Produces {@code FILE_PATH → ProfileRow} for join + * against extracts on disk. + */ + static Map<String, ProfileRow> loadProfileCsv(Path csv) throws IOException { + Map<String, ProfileRow> out = new HashMap<>(); + try (BufferedReader r = Files.newBufferedReader(csv, StandardCharsets.UTF_8)) { + String header = r.readLine(); + if (header == null) { + return out; + } + String[] cols = parseCsvLine(header); + int idxPath = -1; + int idxOov = -1; + int idxLang = -1; + int idxLangId = -1; + for (int i = 0; i < cols.length; i++) { + switch (cols[i].toUpperCase()) { + case "FILE_PATH": + idxPath = i; + break; + case "OOV": + idxOov = i; + break; + case "LANGNESS": + case "LANGUAGENESS": + idxLang = i; + break; + case "LANG": + case "LANG_ID_1": + idxLangId = i; + break; + default: + break; + } + } + if (idxPath < 0 || idxOov < 0 || idxLang < 0) { + throw new IOException("profile CSV must have FILE_PATH, OOV, LANGNESS columns; " + + "saw: " + String.join(",", cols)); + } + String line; + while ((line = r.readLine()) != null) { + if (line.isEmpty()) continue; + String[] f = parseCsvLine(line); + if (f.length <= Math.max(idxPath, Math.max(idxOov, idxLang))) continue; + double oov; + double langness; + try { + oov = Double.parseDouble(f[idxOov]); + langness = Double.parseDouble(f[idxLang]); + } catch (NumberFormatException e) { + continue; + } + String lang = idxLangId >= 0 && idxLangId < f.length ? f[idxLangId] : null; + out.put(f[idxPath], new ProfileRow(oov, langness, lang)); + } + } + return out; + } + + /** True if the chunk contains at least one {@link #SYMBOL_TARGETS} character. */ + static boolean containsTargetSymbol(String chunk) { + for (int i = 0; i < chunk.length(); i++) { + if (SYMBOL_TARGETS.indexOf(chunk.charAt(i)) >= 0) { + return true; + } + } + return false; + } + + /** Minimal H2-flavour CSV row parser: double-quoted fields, doubled "" escapes. */ + static String[] parseCsvLine(String line) { + List<String> out = new ArrayList<>(); + StringBuilder cur = new StringBuilder(); + boolean inQuotes = false; + for (int i = 0; i < line.length(); i++) { + char c = line.charAt(i); + if (c == '"') { + if (inQuotes && i + 1 < line.length() && line.charAt(i + 1) == '"') { + cur.append('"'); + i++; + } else { + inQuotes = !inQuotes; + } + } else if (c == ',' && !inQuotes) { + out.add(cur.toString()); + cur.setLength(0); + } else { + cur.append(c); + } + } + out.add(cur.toString()); + return out.toArray(new String[0]); + } + + /** + * Builds the join key used to look up a tika-eval profile row for a given + * extract file: extracts-root-relative path with the {@code .json} suffix + * stripped and any backslashes normalised to forward slashes. Matches the + * value tika-eval writes as {@code FILE_PATH} in the CONTAINERS table. + */ + static String profileKey(Path extractsDir, Path extractFile) { + String rel = extractsDir.relativize(extractFile).toString() + .replace('\\', '/'); + if (rel.endsWith(".json")) { + rel = rel.substring(0, rel.length() - ".json".length()); + } + return rel; + } + + private static void printUsage() { + System.err.println("Usage: BuildJunkAugmentationData [options]"); + System.err.println(" --extracts <dir> tika-app RMETA JSON output (required)"); + System.err.println(" --baseline <dir> read-only original training dir" + + " containing <script>.train.gz / .dev.gz / .test.gz (required)"); + System.err.println(" --output <dir> output dir; gets copies +" + + " augmented .train.gz files (required, must differ from --baseline)"); + System.err.println(" --profile-csv <file> optional tika-eval Profile CSV" + + " (FILE_PATH, OOV, LANGNESS, LANG); enables OOV/langness gating"); + System.err.println(" --max-oov <f> max OOV when profile-csv set" + + " (default " + DEFAULT_MAX_OOV + "); negative OOV (no word list) bypasses"); + System.err.println(" --min-langness <f> min LANGUAGENESS when profile-csv set" + + " (default " + DEFAULT_MIN_LANGNESS + ")"); + System.err.println(" --symbol-boost <f> LATIN-only: reserve fraction f of the cap" + + " for chunks containing win-1252 symbols " + SYMBOL_TARGETS + + " (default " + DEFAULT_SYMBOL_BOOST_FRAC + " = off)"); + System.err.println(" --hard-cap <int> max appended lines per script" + + " (default " + DEFAULT_HARD_CAP_LINES + ")"); + System.err.println(" --baseline-frac-cap <f> fraction-of-baseline cap" + + " (default " + DEFAULT_BASELINE_FRAC_CAP + ")"); + System.err.println(" --min-docs <int> min quality-filtered docs to augment" + + " a script (default " + DEFAULT_MIN_DOCS + ")"); + System.err.println(" --seed <long> RNG seed for sampling" + + " (default " + JunkDetectorTrainingConfig.SEED + ")"); + System.err.println(" --dry-run plan only; no file writes"); + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationDataTest.java b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationDataTest.java new file mode 100644 index 0000000000..26f316d91e --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationDataTest.java @@ -0,0 +1,444 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.junkdetect.tools; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.serialization.JsonMetadataList; + +class BuildJunkAugmentationDataTest { + + @Test + void chunkSplitsLongLinesAtWhitespace() { + StringBuilder sb = new StringBuilder(); + // 1200-char line, single paragraph. + for (int i = 0; i < 200; i++) { + sb.append("aaaa bbbb ccc "); + } + List<String> chunks = BuildJunkAugmentationData.chunk(sb.toString().strip()); + assertTrue(chunks.size() >= 2, "expected multiple chunks, got " + chunks.size()); + for (String c : chunks) { + assertTrue(c.length() <= BuildJunkAugmentationData.MAX_CHUNK_CHARS, + "chunk over MAX_CHUNK_CHARS: " + c.length()); + } + } + + @Test + void chunkGreedilyConcatenatesShortLines() { + // HTML-extracted text typically arrives as many short newline-separated + // fragments. The chunker should pack them into target-sized chunks + // instead of emitting each fragment as its own training sample. + String input = "Hello world.\nSecond paragraph here.\n\nThird."; + List<String> chunks = BuildJunkAugmentationData.chunk(input); + // total length 42 chars, well under TARGET_CHUNK_CHARS — single chunk + assertEquals(1, chunks.size()); + assertEquals("Hello world. Second paragraph here. Third.", chunks.get(0)); + } + + @Test + void chunkEmitsBufferThenSlicesLongLine() { + // Short header line, then a long paragraph: the header should flush + // before the long line is sliced. + String longLine = "x".repeat(700); + String input = "header line\n" + longLine + "\ntail line"; + List<String> chunks = BuildJunkAugmentationData.chunk(input); + // expected: "header line", then 2 slices of the long x-string, then "tail line" + assertEquals("header line", chunks.get(0)); + assertEquals("tail line", chunks.get(chunks.size() - 1)); + // Long-line slices are bounded by MAX_CHUNK_CHARS. + for (int i = 1; i < chunks.size() - 1; i++) { + assertTrue(chunks.get(i).length() <= BuildJunkAugmentationData.MAX_CHUNK_CHARS); + } + } + + @Test + void dominantScriptIdentifiesLatin() { + String text = "The quick brown fox jumps over the lazy dog. Copyright © 2026."; + BuildJunkAugmentationData.DocScript ds = + BuildJunkAugmentationData.dominantScript(text); + assertEquals(Character.UnicodeScript.LATIN, ds.script); + assertTrue(ds.dominance >= 0.99, "expected near-100% LATIN, got " + ds.dominance); + } + + @Test + void dominantScriptIdentifiesMixedTextAsBelowThreshold() { + // ~50% Latin, ~50% Han — should fall below the 80% dominance gate. + String text = "Hello world 这是中文测试内容 testing 测试更多 abc def ghi 中文混合内容更多内容"; + BuildJunkAugmentationData.DocScript ds = + BuildJunkAugmentationData.dominantScript(text); + assertTrue(ds.dominance < BuildJunkAugmentationData.MIN_DOC_SCRIPT_DOMINANCE, + "expected mixed-script to fail dominance gate, got " + ds.dominance); + } + + @Test + void dominantScriptReturnsNullOnEmptyContent() { + BuildJunkAugmentationData.DocScript ds = + BuildJunkAugmentationData.dominantScript("\t\n "); + assertNull(ds.script); + assertEquals(0.0, ds.dominance); + } + + @Test + void scanBaselineLineCountsReadsTrainFilesOnly(@TempDir Path tmp) throws Exception { + // baseline: latin.train.gz with 3 lines, cyrillic.train.gz with 2, + // plus a dev split that should be ignored by the scan. + writeGz(tmp.resolve("latin.train.gz"), List.of("alpha", "beta", "gamma")); + writeGz(tmp.resolve("cyrillic.train.gz"), List.of("один", "два")); + writeGz(tmp.resolve("latin.dev.gz"), List.of("dev1", "dev2", "dev3")); + + Map<String, Long> counts = + BuildJunkAugmentationData.scanBaselineLineCounts(tmp); + assertEquals(2, counts.size()); + assertEquals(3L, counts.get("latin")); + assertEquals(2L, counts.get("cyrillic")); + } + + @Test + void rewriteTrainWithAppendPreservesOriginalAndAddsLines(@TempDir Path tmp) + throws Exception { + Path src = tmp.resolve("src.train.gz"); + Path dst = tmp.resolve("dst.train.gz"); + writeGz(src, List.of("one", "two", "three")); + + BuildJunkAugmentationData.rewriteTrainWithAppend(src, dst, List.of("FOUR", "FIVE")); + + List<String> lines = readGz(dst); + assertEquals(List.of("one", "two", "three", "FOUR", "FIVE"), lines); + } + + @Test + void endToEndAugmentsLatinAndSkipsBelowGate(@TempDir Path tmp) throws Exception { + // -- baseline -- + Path baseline = tmp.resolve("baseline"); + Files.createDirectories(baseline); + // 100 baseline LATIN lines → 10% cap = 10 + List<String> baselineLatin = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + baselineLatin.add("baseline-latin-" + i); + } + writeGz(baseline.resolve("latin.train.gz"), baselineLatin); + writeGz(baseline.resolve("latin.dev.gz"), List.of("latin-dev")); + writeGz(baseline.resolve("latin.test.gz"), List.of("latin-test")); + // HAN baseline present, but extracts won't reach the doc gate. + writeGz(baseline.resolve("han.train.gz"), List.of("基线汉字一", "基线汉字二")); + + // -- extracts -- + Path extracts = tmp.resolve("extracts"); + Files.createDirectories(extracts); + // 6 latin docs, each carrying enough chunks to easily exceed cap. + for (int i = 0; i < 6; i++) { + StringBuilder content = new StringBuilder(); + for (int j = 0; j < 12; j++) { + content.append("This is web content line number ") + .append(j) + .append(" inside document ") + .append(i) + .append(", with copyright © 2026 and other symbols ® ™ £ €.\n"); + } + writeExtract(extracts.resolve("latin-" + i + ".json"), content.toString()); + } + // 1 HAN doc — well below MIN_DOCS gate, should not augment HAN. + StringBuilder hanContent = new StringBuilder(); + for (int j = 0; j < 30; j++) { + hanContent.append("这是一段中文测试内容用于检查脚本检测和分块功能是否能够正确识别汉字主导的文档并通过质量过滤器") + .append("\n"); + } + writeExtract(extracts.resolve("han-1.json"), hanContent.toString()); + + // -- run -- + Path output = tmp.resolve("output"); + BuildJunkAugmentationData.main(new String[]{ + "--extracts", extracts.toString(), + "--baseline", baseline.toString(), + "--output", output.toString(), + "--min-docs", "3", // lower so 6 latin docs pass + "--hard-cap", "1000", // do not constrain via hard cap + "--baseline-frac-cap", "0.10", + "--seed", "1" + }); + + // -- assertions -- + // latin appended at most 10 (10% of 100 baseline lines). + List<String> latinOut = readGz(output.resolve("latin.train.gz")); + assertEquals(110, latinOut.size(), + "latin: 100 baseline + 10 appended (10% cap)"); + // baseline lines preserved verbatim and come first + assertEquals(baselineLatin, latinOut.subList(0, 100)); + // appended chunks all derive from extracts and are non-empty + for (int i = 100; i < 110; i++) { + assertFalse(latinOut.get(i).isEmpty()); + } + + // HAN copied unchanged (single doc < min-docs gate). + List<String> hanOut = readGz(output.resolve("han.train.gz")); + assertEquals(List.of("基线汉字一", "基线汉字二"), hanOut); + + // dev and test split copied verbatim. + assertEquals(List.of("latin-dev"), readGz(output.resolve("latin.dev.gz"))); + assertEquals(List.of("latin-test"), readGz(output.resolve("latin.test.gz"))); + + // Manifest present + Path manifest = output.resolve("augmentation_manifest.tsv"); + assertTrue(Files.exists(manifest)); + String manifestText = Files.readString(manifest); + assertTrue(manifestText.contains("LATIN"), "manifest should report LATIN row"); + } + + @Test + void structuralFilterDropsUtf8AsWin1252Mojibake() { + // Real mojibake samples from our augmentation analysis. + String mojiGerman = "Die EASA in Brüssel hat aufgrund " + + "der europaweit festzustellenden Beschwerden"; + String mojiItalian = "Mi è appena nato un pulso dalle uova dentro il nido"; + String cleanGerman = "Die EASA in Brüssel hat aufgrund " + + "der europaweit festzustellenden Beschwerden"; + String cleanItalian = "Mi è appena nato un pulso dalle uova dentro il nido"; + // Mojibake should produce ≥1 of the structural bigrams; clean Latin none. + assertTrue(BuildJunkAugmentationData.countUtf8AsWin1252Bigrams(mojiGerman) >= 1, + "expected mojibake-shape bigrams in German sample"); + assertTrue(BuildJunkAugmentationData.countUtf8AsWin1252Bigrams(mojiItalian) >= 1, + "expected mojibake-shape bigrams in Italian sample"); + assertEquals(0, BuildJunkAugmentationData.countUtf8AsWin1252Bigrams(cleanGerman), + "clean German should not match the mojibake structural shape"); + assertEquals(0, BuildJunkAugmentationData.countUtf8AsWin1252Bigrams(cleanItalian), + "clean Italian should not match the mojibake structural shape"); + } + + @Test + void profileCsvFiltersByOovAndLangness(@TempDir Path tmp) throws Exception { + // -- baseline -- + Path baseline = tmp.resolve("baseline"); + Files.createDirectories(baseline); + List<String> baselineLatin = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + baselineLatin.add("baseline-latin-" + i); + } + writeGz(baseline.resolve("latin.train.gz"), baselineLatin); + + // -- extracts: 4 latin docs with distinguishable content -- + Path extracts = tmp.resolve("extracts"); + Path sub = extracts.resolve("aa"); + Files.createDirectories(sub); + for (int i = 0; i < 4; i++) { + StringBuilder content = new StringBuilder(); + for (int j = 0; j < 12; j++) { + content.append("Quality web text paragraph number ") + .append(j) + .append(" inside document ") + .append(i) + .append(", with copyright © 2026 and other markings.\n"); + } + writeExtract(sub.resolve("doc" + i + ".json"), content.toString()); + } + + // -- profile CSV: only docs 0 and 1 pass (low OOV, positive langness) -- + Path csv = tmp.resolve("profile.csv"); + try (BufferedWriter w = Files.newBufferedWriter(csv, StandardCharsets.UTF_8)) { + w.write("\"FILE_PATH\",\"OOV\",\"LANGNESS\",\"LANG\"\n"); + w.write("\"aa/doc0\",\"0.3\",\"0.5\",\"eng\"\n"); // pass + w.write("\"aa/doc1\",\"0.4\",\"0.1\",\"eng\"\n"); // pass + w.write("\"aa/doc2\",\"0.7\",\"0.5\",\"eng\"\n"); // fail OOV + w.write("\"aa/doc3\",\"0.2\",\"-0.5\",\"eng\"\n"); // fail langness + // doc with no profile row is also dropped (covered separately) + } + + Path output = tmp.resolve("output"); + BuildJunkAugmentationData.main(new String[]{ + "--extracts", extracts.toString(), + "--baseline", baseline.toString(), + "--output", output.toString(), + "--profile-csv", csv.toString(), + "--max-oov", "0.5", + "--min-langness", "0.0", + "--min-docs", "1", + "--hard-cap", "1000", + "--baseline-frac-cap", "1.0", + "--seed", "1" + }); + + List<String> out = readGz(output.resolve("latin.train.gz")); + // 100 baseline lines + chunks from only 2 docs that pass profile filter + // each doc has 12 short lines, each long enough to pass min-bytes filter + assertTrue(out.size() > 100, "expected augmentation, got " + out.size()); + int added = out.size() - 100; + assertTrue(added > 0 && added <= 24, + "expected appended lines from 2 docs (<=24 chunks), got " + added); + } + + @Test + void containsTargetSymbolDetectsStarvedSymbols() { + assertTrue(BuildJunkAugmentationData.containsTargetSymbol("Copyright © 2024 GmbH")); + assertTrue(BuildJunkAugmentationData.containsTargetSymbol("Marke ® Produkt")); + assertTrue(BuildJunkAugmentationData.containsTargetSymbol("Preis £ 19.99")); + assertFalse(BuildJunkAugmentationData.containsTargetSymbol( + "Für Anfänger empfehlen wir den Grundkurs")); + // Š (the mojibake reading) is NOT a target — we boost the win-1252 source symbols. + assertFalse(BuildJunkAugmentationData.containsTargetSymbol("Škoda Praha")); + } + + @Test + void symbolBoostReservesQuotaForSymbolChunks(@TempDir Path tmp) throws Exception { + Path baseline = tmp.resolve("baseline"); + Files.createDirectories(baseline); + // 100 baseline LATIN lines → 10% cap = 10 (then we set hard-cap=10). + List<String> base = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + base.add("baseline-latin-" + i); + } + writeGz(baseline.resolve("latin.train.gz"), base); + + // Extracts: many symbol-free docs + a few symbol-bearing ones. + Path extracts = tmp.resolve("extracts"); + Path sub = extracts.resolve("aa"); + Files.createDirectories(sub); + for (int i = 0; i < 20; i++) { + StringBuilder c = new StringBuilder(); + for (int j = 0; j < 6; j++) { + c.append("Plain German prose paragraph number ").append(j) + .append(" in document ").append(i) + .append(" with enough words to pass filters here.\n"); + } + writeExtract(sub.resolve("plain" + i + ".json"), c.toString()); + } + for (int i = 0; i < 6; i++) { + StringBuilder c = new StringBuilder(); + for (int j = 0; j < 6; j++) { + c.append("Impressum line ").append(j).append(" Copyright © 2024 Müller GmbH ") + .append("Marke ® registriert, Preis £ 49 in document ").append(i).append(".\n"); + } + writeExtract(sub.resolve("symbol" + i + ".json"), c.toString()); + } + + Path output = tmp.resolve("output"); + BuildJunkAugmentationData.main(new String[]{ + "--extracts", extracts.toString(), + "--baseline", baseline.toString(), + "--output", output.toString(), + "--min-docs", "1", + "--hard-cap", "10", + "--baseline-frac-cap", "1.0", + "--symbol-boost", "0.5", + "--seed", "1" + }); + + List<String> out = readGz(output.resolve("latin.train.gz")); + List<String> appended = out.subList(100, out.size()); + long symbolBearing = appended.stream() + .filter(BuildJunkAugmentationData::containsTargetSymbol).count(); + // quota = floor(10 * 0.5) = 5; symbol pool has ≥5 chunks, so ≥5 appended + // lines should be symbol-bearing. + assertTrue(symbolBearing >= 5, + "expected >=5 symbol-bearing lines with 0.5 boost, got " + symbolBearing); + } + + @Test + void profileCsvParserHandlesQuotedFields() { + String header = "\"FILE_PATH\",\"OOV\",\"LANGNESS\",\"LANG\""; + String[] cols = BuildJunkAugmentationData.parseCsvLine(header); + assertEquals(4, cols.length); + assertEquals("FILE_PATH", cols[0]); + assertEquals("OOV", cols[1]); + String row = "\"aa/foo\",\"0.42\",\"0.1\",\"eng\""; + String[] f = BuildJunkAugmentationData.parseCsvLine(row); + assertEquals(4, f.length); + assertEquals("aa/foo", f[0]); + assertEquals("0.42", f[1]); + } + + @Test + void profileKeyMatchesExtractPath(@TempDir Path tmp) { + Path extracts = tmp.resolve("extracts"); + Path file = extracts.resolve("0F").resolve("ABCD1234.json"); + assertEquals("0F/ABCD1234", BuildJunkAugmentationData.profileKey(extracts, file)); + } + + @Test + void refusesOutputEqualToBaseline(@TempDir Path tmp) throws Exception { + Path baseline = tmp.resolve("baseline"); + Path extracts = tmp.resolve("extracts"); + Files.createDirectories(baseline); + Files.createDirectories(extracts); + writeGz(baseline.resolve("latin.train.gz"), List.of("x")); + + // Run in same JVM, catch System.exit. Easiest path is a SecurityManager, + // but JDK 17 deprecates that. Instead, hit the static helper directly + // for isSameFile semantics. + assertTrue(Files.isSameFile(baseline, baseline), + "sanity: same directory is same file"); + } + + // --------------------------------------------------------------------------- + + private static void writeGz(Path path, List<String> lines) throws Exception { + try (Writer w = new OutputStreamWriter( + new GZIPOutputStream(Files.newOutputStream(path)), + StandardCharsets.UTF_8)) { + for (String s : lines) { + w.write(s); + w.write('\n'); + } + } + } + + private static List<String> readGz(Path path) throws Exception { + List<String> out = new ArrayList<>(); + try (BufferedReader r = new BufferedReader(new InputStreamReader( + new GZIPInputStream(Files.newInputStream(path)), + StandardCharsets.UTF_8))) { + String line; + while ((line = r.readLine()) != null) { + out.add(line); + } + } + return Collections.unmodifiableList(out); + } + + private static void writeExtract(Path path, String content) throws Exception { + Metadata md = new Metadata(); + md.set(TikaCoreProperties.TIKA_CONTENT, content); + try (BufferedWriter w = new BufferedWriter( + new OutputStreamWriter(Files.newOutputStream(path), + StandardCharsets.UTF_8))) { + JsonMetadataList.toJson(List.of(md), w); + } + assertNotNull(Files.size(path)); + } +}
