This is an automated email from the ASF dual-hosted git repository. rzo1 pushed a commit to branch OPENNLP-1810 in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit 60a9d47b69e2e92c2325de8e4406dec9aa505ee4 Author: Richard Zowalla <[email protected]> AuthorDate: Sun Mar 22 13:06:21 2026 +0100 OPENNLP-1810: Fix SentenceDetector abbreviation matching for case-insensitive dictionaries with duplicate abbreviations --- .../tools/sentdetect/SentenceDetectorME.java | 33 +++++++++++++--------- .../sentdetect/SentenceDetectorMEGermanTest.java | 13 +++++++++ 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java index 55faf79c..ec71618d 100644 --- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java @@ -340,29 +340,34 @@ public class SentenceDetectorME implements SentenceDetector, Probabilistic { return true; final String text = s.toString(); + final boolean caseSensitive = abbDict.isCaseSensitive(); + final String searchText = caseSensitive ? text : StringUtil.toLowerCase(text); for (StringList abb : abbDict) { - final String abbToken = abb.getToken(0); - final int tokenStartPos = text.indexOf(abbToken, fromIndex); - if (tokenStartPos == -1) { - continue; // skip fast when abb is not present in text - } - if (tokenStartPos == 0 && text.substring(tokenStartPos, candidateIndex + 1).equals(abbToken)) { - return false; // full abbreviation match at sentence start -> no acceptable break - } else { - final int tokenLength = abbToken.length(); + final String abbToken = caseSensitive ? abb.getToken(0) + : StringUtil.toLowerCase(abb.getToken(0)); + final int tokenLength = abbToken.length(); + int tokenStartPos = searchText.indexOf(abbToken, fromIndex); + while (tokenStartPos != -1) { + if (tokenStartPos > candidateIndex) { + break; // past candidate position, no point searching further + } + if (tokenStartPos == 0 + && searchText.substring(tokenStartPos, candidateIndex + 1).equals(abbToken)) { + return false; // full abbreviation match at sentence start -> no acceptable break + } final char prevChar = s.charAt(tokenStartPos == 0 ? tokenStartPos : tokenStartPos - 1); - if (tokenStartPos + tokenLength < candidateIndex || tokenStartPos > candidateIndex || + if (tokenStartPos + tokenLength >= candidateIndex /* * Note: * Skip abbreviation candidate if regular characters exist directly before it, * That is, any letter or digit except: a whitespace, an apostrophe, or an opening round bracket. * This prevents mismatches from overlaps close to an actual sentence end. */ - !(Character.isWhitespace(prevChar) || isApostrophe(prevChar) || prevChar == '(')) { - - continue; + && (Character.isWhitespace(prevChar) || isApostrophe(prevChar) || prevChar == '(')) { + return false; // in case of a valid abbreviation: the (sentence) break is not accepted } - return false; // in case of a valid abbreviation: the (sentence) break is not accepted + // Try next occurrence of this abbreviation in the text + tokenStartPos = searchText.indexOf(abbToken, tokenStartPos + 1); } } return true; // no abbreviation(s) at given positions: valid sentence boundary diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java index 9d271ce0..f145dd9e 100644 --- a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java +++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java @@ -190,6 +190,19 @@ public class SentenceDetectorMEGermanTest extends AbstractSentenceDetectorTest { () -> assertEquals(2, probs.length)); } + // Edge case: The same abbreviation appears twice in a single sentence segment. + @Test + void testSentDetectWithDuplicateAbbreviationInSameSegment() { + prepareResources(true); + final String sent1 = "Lt. Vertrag und lt. Bescheid gelten andere Bedingungen."; + String[] sents = sentenceDetector.sentDetect(sent1); + double[] probs = sentenceDetector.probs(); + assertAll( + () -> assertEquals(1, sents.length), + () -> assertEquals(sent1, sents[0]), + () -> assertEquals(1, probs.length)); + } + /* * A reproducer and test for OPENNLP-1767. * It checks that sentence detection with common abbreviations works correctly,
