This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch charset-ship-today in repository https://gitbox.apache.org/repos/asf/tika.git
commit c1eb6201de4c801e47a7888b373a16ee188b1480 Author: tallison <[email protected]> AuthorDate: Wed Apr 15 17:15:29 2026 -0400 git add --- .../tika/langdetect/charsoup/HtmlStripperTest.java | 213 +++++++++++++++++++++ 1 file changed, 213 insertions(+) diff --git a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/HtmlStripperTest.java b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/HtmlStripperTest.java new file mode 100644 index 0000000000..b21b2c8a40 --- /dev/null +++ b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/HtmlStripperTest.java @@ -0,0 +1,213 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.langdetect.charsoup; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.junit.jupiter.api.Test; + +public class HtmlStripperTest { + + @Test + public void stripsBasicTags() { + assertEquals("Hello world", + HtmlStripper.strip("<html><body>Hello world</body></html>")); + assertEquals("no tags here", + HtmlStripper.strip("no tags here")); + assertEquals("", + HtmlStripper.strip("<empty/>")); + } + + @Test + public void removesScriptAndStyleContents() { + // Script bodies (JavaScript) and style bodies (CSS) used to leak into + // the output and skew language detection toward English. Verify they + // are removed entirely. + String input = "<html><head>" + + "<script type=\"text/javascript\">function add(a,b) { return a+b; }</script>" + + "<style>body { font-family: sans-serif; }</style>" + + "</head><body>real content here</body></html>"; + String stripped = HtmlStripper.strip(input); + assertFalse(stripped.contains("function"), + "JavaScript identifier 'function' should not survive: " + stripped); + assertFalse(stripped.contains("font-family"), + "CSS property name should not survive: " + stripped); + assertTrue(stripped.contains("real content here"), + "Body prose should survive: " + stripped); + } + + @Test + public void removesComments() { + String input = "<p>before<!-- some comment with content -->after</p>"; + String stripped = HtmlStripper.strip(input); + assertFalse(stripped.contains("comment"), + "Comment body should not survive: " + stripped); + assertTrue(stripped.contains("before")); + assertTrue(stripped.contains("after")); + } + + @Test + public void handlesEntities() { + // Named entities (e.g. &, ) → stripped to space (low signal, + // and a full named-entity table is heavyweight). + // Numeric entities (e.g. Ӓ, ”) → DECODED to their actual + // code point so the content reaches the language detector. This + // matters for files where the page's primary content is delivered + // via numeric entities (e.g. industrial-product pages emitting CJK + // ideographs as &#NNNN; for cross-charset compatibility). + String stripped = HtmlStripper.strip( + "<p>&hello world–test”end</p>"); + assertFalse(stripped.contains("&"), + "No entity references should survive: " + stripped); + // 0x2013 = en-dash, 0x201D = right double quote — should appear as + // actual chars, not as entity references nor as spaces. + assertTrue(stripped.contains("\u2013"), + "Numeric entity – should decode to en-dash: " + stripped); + assertTrue(stripped.contains("\u201D"), + "Numeric entity ” should decode to right double quote: " + stripped); + assertTrue(stripped.contains("hello")); + assertTrue(stripped.contains("world")); + } + + @Test + public void decodesCjkNumericEntities() { + // Real-world case: industrial-product pages that emit CJK ideographs + // via numeric entities (so they render correctly regardless of the + // page's declared charset). The decoded content must reach the + // language detector — without this, language detection sees only + // ASCII markup and concludes "English" no matter what the page is + // actually about. + String input = "<p>过滤离 cyclone</p>"; + String stripped = HtmlStripper.strip(input); + assertTrue(stripped.contains("\u8FC7"), + "0x8FC7 (过) should decode: " + stripped); + assertTrue(stripped.contains("\u6EE4"), + "0x6EE4 (滤) should decode: " + stripped); + assertTrue(stripped.contains("\u79BB"), + "0x79BB (离) should decode: " + stripped); + } + + @Test + public void rejectsInvalidNumericEntities() { + // Surrogate-half codepoints, control chars, and out-of-range numbers + // should be replaced with a space rather than emitted (they would + // either crash the language detector or skew scores). + String stripped = HtmlStripper.strip("good�bad�bad�good"); + assertFalse(stripped.contains("\uD800"), + "Surrogate code point should not be emitted: " + stripped); + assertTrue(stripped.contains("good")); + } + + @Test + public void nullAndEmptyAreReturnedAsIs() { + assertEquals(null, HtmlStripper.strip(null)); + assertEquals("", HtmlStripper.strip("")); + } + + @Test + public void unclosedTagSwallowsToEnd() { + // Defensive: a `<` with no matching `>` should not dump uninterpreted + // markup back into the output (would dominate language scoring). + // The unclosed tag is consumed silently — no trailing space. + assertEquals("before", HtmlStripper.strip("before<unclosed never ends")); + } + + @Test + public void literalAmpersandPreserved() { + // A `&` not followed by a recognisable entity body must survive. + assertEquals("a & b", HtmlStripper.strip("a & b")); + assertEquals("rock&roll", HtmlStripper.strip("rock&roll")); + } + + // --- defensive-input cases (regression coverage) --- + + @Test + public void truncatedAtScriptDoesNotThrow() { + // Previously triggered AIOOBE: input ending exactly with `<script` + // (or `<style`) — element-name match succeeded but the + // tag-boundary lookahead read past end-of-string. + for (String s : new String[]{ + "<script", "<SCRIPT", "<style", "<STYLE", + "before<script", "x<style", "<scriptX"}) { + // Just verify no exception and a string is returned. + String out = HtmlStripper.strip(s); + assertTrue(out != null, "stripping " + s + " returned null"); + } + } + + @Test + public void manyOpenAnglesDoesNotHang() { + // Worst-case stress: 32K of `<` characters with no matching `>`. + // The main loop should run in O(N) and produce empty output. + StringBuilder sb = new StringBuilder(32 * 1024); + for (int k = 0; k < 32 * 1024; k++) { + sb.append('<'); + } + long start = System.nanoTime(); + String out = HtmlStripper.strip(sb.toString()); + long ms = (System.nanoTime() - start) / 1_000_000; + assertEquals("", out); + assertTrue(ms < 1000, "took " + ms + " ms — possible quadratic blowup"); + } + + @Test + public void manyAmpersandsDoesNotHang() { + // 32K of `&` characters with no entity bodies. Each is treated as + // literal `&` with O(1) lookahead bounded by 12 chars. + StringBuilder sb = new StringBuilder(32 * 1024); + for (int k = 0; k < 32 * 1024; k++) { + sb.append('&'); + } + long start = System.nanoTime(); + String out = HtmlStripper.strip(sb.toString()); + long ms = (System.nanoTime() - start) / 1_000_000; + // All 32K ampersands survive as literals. + assertEquals(32 * 1024, out.length()); + assertTrue(ms < 1000, "took " + ms + " ms — possible quadratic blowup"); + } + + @Test + public void unclosedScriptBlockDoesNotHang() { + // 32K of false `</script` partial closers (no tag boundary). + // skipPastClosing should advance past each one in O(1) and + // terminate cleanly. + StringBuilder sb = new StringBuilder(32 * 1024); + sb.append("<script>"); + while (sb.length() < 32 * 1024) { + sb.append("</scriptX"); // X is not a tag boundary + } + long start = System.nanoTime(); + String out = HtmlStripper.strip(sb.toString()); + long ms = (System.nanoTime() - start) / 1_000_000; + assertTrue(out != null); + assertTrue(ms < 1000, "took " + ms + " ms — possible quadratic blowup"); + } + + @Test + public void truncatedNumericEntityDoesNotThrow() { + // Various truncations. None should throw. + for (String s : new String[]{ + "&", "&#", "&#x", "", "", + "&#;", "&#x;", "&#xZZ;", "�", // overflow + "&;", "&abc", "&abc def;"}) { + String out = HtmlStripper.strip(s); + assertTrue(out != null, "stripping " + s + " returned null"); + } + } +}
