(opennlp) 01/01: OPENNLP-1811: Fix SentenceDetector missing abbreviations at non-first sentence start with useTokenEnd=false

rzo1 Sun, 22 Mar 2026 05:15:11 -0700

This is an automated email from the ASF dual-hosted git repository.

rzo1 pushed a commit to branch OPENNLP-1811
in repository https://gitbox.apache.org/repos/asf/opennlp.git


commit 56c5bea8b3a0a95b560fb5b5e2d2f1c83e8abbc2
Author: Richard Zowalla <[email protected]>
AuthorDate: Sun Mar 22 13:14:50 2026 +0100

     OPENNLP-1811: Fix SentenceDetector missing abbreviations at non-first sentence start with useTokenEnd=false
---
 .../opennlp/tools/sentdetect/SentenceDetectorME.java | 14 ++++++++++----
 .../sentdetect/SentenceDetectorMEGermanTest.java     | 20 ++++++++++++++++++++
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
index ec71618d..6930638e 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
@@ -204,7 +204,13 @@ public class SentenceDetectorME implements SentenceDetector, Probabilistic {
       // skip over the leading parts of non-token final delimiters
       int fws = getFirstWS(s,cint + 1);
       if (i + 1 < end && enders.get(i + 1) < fws) {
-        continue;
+        // Do not skip if the character right after the delimiter is uppercase,
+        // as this likely indicates the start of a new sentence (e.g., "Gedanken.Bek.")
+        // rather than a multi-period abbreviation (e.g., "z.B.").
+        int nextCharIdx = cint + 1;
+        if (nextCharIdx >= s.length() || !Character.isUpperCase(s.charAt(nextCharIdx))) {
+          continue;
+        }
       }
       if (positions.size() > 0 && cint < positions.get(positions.size() - 1)) continue;
 
@@ -351,11 +357,11 @@ public class SentenceDetectorME implements SentenceDetector, Probabilistic {
         if (tokenStartPos > candidateIndex) {
           break; // past candidate position, no point searching further
         }
-        if (tokenStartPos == 0
+        if (tokenStartPos == fromIndex
             && searchText.substring(tokenStartPos, candidateIndex + 1).equals(abbToken)) {
-          return false; // full abbreviation match at sentence start -> no acceptable break
+          return false; // full abbreviation match at segment start -> no acceptable break
         }
-        final char prevChar = s.charAt(tokenStartPos == 0 ? tokenStartPos : tokenStartPos - 1);
+        final char prevChar = s.charAt(tokenStartPos == fromIndex ? tokenStartPos : tokenStartPos - 1);
         if (tokenStartPos + tokenLength >= candidateIndex
           /*
            * Note:
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
index f145dd9e..53203c12 100644
--- a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
@@ -203,6 +203,26 @@ public class SentenceDetectorMEGermanTest extends AbstractSentenceDetectorTest {
         () -> assertEquals(1, probs.length));
   }
 
+  /**
+   * Edge case: Multi-letter abbreviation at the start of a non-first sentence
+   * with {@code useTokenEnd = false} (no space between sentences).
+   */
+  @Test
+  void testSentDetectWithMultiLetterAbbreviationAtNonFirstSentenceStart() {
+    prepareResources(false);
+    final String sent1 = "Träume sind eine Verbindung von Gedanken.";
+    final String sent2 = "Bek. Problem: Schlafmangel.";
+    // No space between sentences (useTokenEnd=false supports this)
+    String sampleSentences = sent1 + sent2;
+    String[] sents = sentenceDetector.sentDetect(sampleSentences);
+    double[] probs = sentenceDetector.probs();
+    assertAll(
+        () -> assertEquals(2, sents.length),
+        () -> assertEquals(sent1, sents[0]),
+        () -> assertEquals(sent2, sents[1]),
+        () -> assertEquals(2, probs.length));
+  }
+
   /*
    * A reproducer and test for OPENNLP-1767.
    * It checks that sentence detection with common abbreviations works correctly,

(opennlp) 01/01: OPENNLP-1811: Fix SentenceDetector missing abbreviations at non-first sentence start with useTokenEnd=false

Reply via email to