(opennlp) 01/01: OPENNLP-1810: Fix SentenceDetector abbreviation matching for case-insensitive dictionaries with duplicate abbreviations

rzo1 Sun, 22 Mar 2026 05:07:07 -0700

This is an automated email from the ASF dual-hosted git repository.

rzo1 pushed a commit to branch OPENNLP-1810
in repository https://gitbox.apache.org/repos/asf/opennlp.git


commit 60a9d47b69e2e92c2325de8e4406dec9aa505ee4
Author: Richard Zowalla <[email protected]>
AuthorDate: Sun Mar 22 13:06:21 2026 +0100

     OPENNLP-1810: Fix SentenceDetector abbreviation matching for case-insensitive dictionaries with duplicate abbreviations
---
 .../tools/sentdetect/SentenceDetectorME.java       | 33 +++++++++++++---------
 .../sentdetect/SentenceDetectorMEGermanTest.java   | 13 +++++++++
 2 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
index 55faf79c..ec71618d 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
@@ -340,29 +340,34 @@ public class SentenceDetectorME implements SentenceDetector, Probabilistic {
       return true;
 
     final String text = s.toString();
+    final boolean caseSensitive = abbDict.isCaseSensitive();
+    final String searchText = caseSensitive ? text : StringUtil.toLowerCase(text);
     for (StringList abb : abbDict) {
-      final String abbToken = abb.getToken(0);
-      final int tokenStartPos = text.indexOf(abbToken, fromIndex);
-      if (tokenStartPos == -1) {
-        continue; // skip fast when abb is not present in text
-      }
-      if (tokenStartPos == 0 && text.substring(tokenStartPos, candidateIndex + 1).equals(abbToken)) {
-        return false; // full abbreviation match at sentence start -> no acceptable break
-      } else {
-        final int tokenLength = abbToken.length();
+      final String abbToken = caseSensitive ? abb.getToken(0)
+          : StringUtil.toLowerCase(abb.getToken(0));
+      final int tokenLength = abbToken.length();
+      int tokenStartPos = searchText.indexOf(abbToken, fromIndex);
+      while (tokenStartPos != -1) {
+        if (tokenStartPos > candidateIndex) {
+          break; // past candidate position, no point searching further
+        }
+        if (tokenStartPos == 0
+            && searchText.substring(tokenStartPos, candidateIndex + 1).equals(abbToken)) {
+          return false; // full abbreviation match at sentence start -> no acceptable break
+        }
         final char prevChar = s.charAt(tokenStartPos == 0 ? tokenStartPos : tokenStartPos - 1);
-        if (tokenStartPos + tokenLength < candidateIndex || tokenStartPos > candidateIndex ||
+        if (tokenStartPos + tokenLength >= candidateIndex
           /*
            * Note:
            * Skip abbreviation candidate if regular characters exist directly before it,
            * That is, any letter or digit except: a whitespace, an apostrophe, or an opening round bracket.
            * This prevents mismatches from overlaps close to an actual sentence end.
            */
-            !(Character.isWhitespace(prevChar) || isApostrophe(prevChar) || prevChar == '(')) {
-
-          continue;
+            && (Character.isWhitespace(prevChar) || isApostrophe(prevChar) || prevChar == '(')) {
+          return false; // in case of a valid abbreviation: the (sentence) break is not accepted
         }
-        return false; // in case of a valid abbreviation: the (sentence) break is not accepted
+        // Try next occurrence of this abbreviation in the text
+        tokenStartPos = searchText.indexOf(abbToken, tokenStartPos + 1);
       }
     }
     return true; // no abbreviation(s) at given positions: valid sentence boundary
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
index 9d271ce0..f145dd9e 100644
--- a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
@@ -190,6 +190,19 @@ public class SentenceDetectorMEGermanTest extends AbstractSentenceDetectorTest {
         () -> assertEquals(2, probs.length));
   }
 
+  // Edge case: The same abbreviation appears twice in a single sentence segment.
+  @Test
+  void testSentDetectWithDuplicateAbbreviationInSameSegment() {
+    prepareResources(true);
+    final String sent1 = "Lt. Vertrag und lt. Bescheid gelten andere Bedingungen.";
+    String[] sents = sentenceDetector.sentDetect(sent1);
+    double[] probs = sentenceDetector.probs();
+    assertAll(
+        () -> assertEquals(1, sents.length),
+        () -> assertEquals(sent1, sents[0]),
+        () -> assertEquals(1, probs.length));
+  }
+
   /*
    * A reproducer and test for OPENNLP-1767.
    * It checks that sentence detection with common abbreviations works correctly,

(opennlp) 01/01: OPENNLP-1810: Fix SentenceDetector abbreviation matching for case-insensitive dictionaries with duplicate abbreviations

Reply via email to