(opennlp) branch opennlp-2.x updated: OPENNLP-1811: Fix SentenceDetector missing abbreviations at non-first sentence start with useTokenEnd=false (#985)

mawiesne Sun, 22 Mar 2026 09:10:25 -0700

This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch opennlp-2.x
in repository https://gitbox.apache.org/repos/asf/opennlp.git



The following commit(s) were added to refs/heads/opennlp-2.x by this push:
     new 8be5fd0b  OPENNLP-1811: Fix SentenceDetector missing abbreviations at non-first sentence start with useTokenEnd=false (#985)
8be5fd0b is described below

commit 8be5fd0b9f5d794061a1d243f4d7e9dcd438cb2b
Author: Richard Zowalla <[email protected]>
AuthorDate: Sun Mar 22 17:09:16 2026 +0100

    OPENNLP-1811: Fix SentenceDetector missing abbreviations at non-first sentence start with useTokenEnd=false (#985)
    
    (cherry picked from commit 81f85440ace19a24897c04b9d36418e90c46513b)
---
 .../opennlp/tools/sentdetect/SentenceDetectorME.java | 14 ++++++++++----
 .../sentdetect/SentenceDetectorMEGermanTest.java     | 20 ++++++++++++++++++++
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
index 4217b730..bf2cd7c2 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
@@ -204,7 +204,13 @@ public class SentenceDetectorME implements SentenceDetector, Probabilistic {
       // skip over the leading parts of non-token final delimiters
       int fws = getFirstWS(s,cint + 1);
       if (i + 1 < end && enders.get(i + 1) < fws) {
-        continue;
+        // Do not skip if the character right after the delimiter is uppercase,
+        // as this likely indicates the start of a new sentence (e.g., "Gedanken.Bek.")
+        // rather than a multi-period abbreviation (e.g., "z.B.").
+        int nextCharIdx = cint + 1;
+        if (nextCharIdx >= s.length() || !Character.isUpperCase(s.charAt(nextCharIdx))) {
+          continue;
+        }
       }
       if (positions.size() > 0 && cint < positions.get(positions.size() - 1)) continue;
 
@@ -351,11 +357,11 @@ public class SentenceDetectorME implements SentenceDetector, Probabilistic {
         if (tokenStartPos > candidateIndex) {
           break; // past candidate position, no point searching further
         }
-        if (tokenStartPos == 0
+        if (tokenStartPos == fromIndex
             && searchText.substring(tokenStartPos, candidateIndex + 1).equals(abbToken)) {
-          return false; // full abbreviation match at sentence start -> no acceptable break
+          return false; // full abbreviation match at segment start -> no acceptable break
         }
-        final char prevChar = s.charAt(tokenStartPos == 0 ? tokenStartPos : tokenStartPos - 1);
+        final char prevChar = s.charAt(tokenStartPos == fromIndex ? tokenStartPos : tokenStartPos - 1);
         if (tokenStartPos + tokenLength >= candidateIndex
           /*
            * Note:
diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
index 1d51a089..db6b1a29 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
@@ -203,6 +203,26 @@ public class SentenceDetectorMEGermanTest extends AbstractSentenceDetectorTest {
         () -> assertEquals(1, probs.length));
   }
 
+  /**
+   * Edge case: Multi-letter abbreviation at the start of a non-first sentence
+   * with {@code useTokenEnd = false} (no space between sentences).
+   */
+  @Test
+  void testSentDetectWithMultiLetterAbbreviationAtNonFirstSentenceStart() {
+    prepareResources(false);
+    final String sent1 = "Träume sind eine Verbindung von Gedanken.";
+    final String sent2 = "Bek. Problem: Schlafmangel.";
+    // No space between sentences (useTokenEnd=false supports this)
+    String sampleSentences = sent1 + sent2;
+    String[] sents = sentenceDetector.sentDetect(sampleSentences);
+    double[] probs = sentenceDetector.probs();
+    assertAll(
+        () -> assertEquals(2, sents.length),
+        () -> assertEquals(sent1, sents[0]),
+        () -> assertEquals(sent2, sents[1]),
+        () -> assertEquals(2, probs.length));
+  }
+
   /*
    * A reproducer and test for OPENNLP-1767.
    * It checks that sentence detection with common abbreviations works correctly,

(opennlp) branch opennlp-2.x updated: OPENNLP-1811: Fix SentenceDetector missing abbreviations at non-first sentence start with useTokenEnd=false (#985)

Reply via email to