This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch universal-junk-detector in repository https://gitbox.apache.org/repos/asf/tika.git
commit 11714d714db07c59c5d65ec3eddebd99a527e832 Author: tballison <[email protected]> AuthorDate: Thu Apr 23 17:20:09 2026 -0400 updates --- .../tika/ml/junkdetect/tools/TrainJunkModel.java | 12 ++++++++++-- .../org/apache/tika/ml/junkdetect/junkdetect.bin | Bin 468582 -> 468510 bytes 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java index d5e7ce2430..bf37734005 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java @@ -823,8 +823,15 @@ public class TrainJunkModel { * <p>Label convention: 1 = clean, 0 = corrupted. At inference, positive * logit → clean text; negative logit → corrupted text. * - * <p>Uses full-batch gradient descent with L2 regularization. Converges - * reliably for {@code numFeatures} ≤ 10 with the default hyperparameters. + * <p>Uses full-batch gradient descent with L2 regularization and a + * non-negativity constraint on feature weights (projected gradient descent: + * {@code w[j] = max(0, w[j])} after each step). The constraint enforces the + * semantic invariant that every feature is calibrated so higher = cleaner; + * a negative weight would mean "more unusual transitions → cleaner text", which + * is semantically wrong and causes pathological behaviour when collinear features + * (e.g. z2 block-transitions and z4 script-transitions) are both present. + * The bias term is unconstrained. Converges reliably for {@code numFeatures} ≤ 10 + * with the default hyperparameters. * * @param features list of feature vectors, each of length {@code numFeatures} * @param labels parallel list of labels (0 or 1) @@ -875,6 +882,7 @@ public class TrainJunkModel { for (int j = 0; j < numFeatures; j++) { w[j] -= lr * (float) (gradW[j] / n + lambda * w[j]); + w[j] = Math.max(0f, w[j]); // projected gradient: feature weights are non-negative by design } bias -= lr * (float) (gradB / n); } diff --git a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin index 050a5c4054..72593ff91b 100644 Binary files a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin and b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin differ
