This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch opennlp-2.x
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/opennlp-2.x by this push:
new 8be5fd0b OPENNLP-1811: Fix SentenceDetector missing abbreviations at
non-first sentence start with useTokenEnd=false (#985)
8be5fd0b is described below
commit 8be5fd0b9f5d794061a1d243f4d7e9dcd438cb2b
Author: Richard Zowalla <[email protected]>
AuthorDate: Sun Mar 22 17:09:16 2026 +0100
OPENNLP-1811: Fix SentenceDetector missing abbreviations at non-first
sentence start with useTokenEnd=false (#985)
(cherry picked from commit 81f85440ace19a24897c04b9d36418e90c46513b)
---
.../opennlp/tools/sentdetect/SentenceDetectorME.java | 14 ++++++++++----
.../sentdetect/SentenceDetectorMEGermanTest.java | 20 ++++++++++++++++++++
2 files changed, 30 insertions(+), 4 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
index 4217b730..bf2cd7c2 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
@@ -204,7 +204,13 @@ public class SentenceDetectorME implements SentenceDetector, Probabilistic {
// skip over the leading parts of non-token final delimiters
int fws = getFirstWS(s,cint + 1);
if (i + 1 < end && enders.get(i + 1) < fws) {
- continue;
+ // Do not skip if the character right after the delimiter is uppercase,
+ // as this likely indicates the start of a new sentence (e.g.,
"Gedanken.Bek.")
+ // rather than a multi-period abbreviation (e.g., "z.B.").
+ int nextCharIdx = cint + 1;
+ if (nextCharIdx >= s.length() ||
!Character.isUpperCase(s.charAt(nextCharIdx))) {
+ continue;
+ }
}
if (positions.size() > 0 && cint < positions.get(positions.size() - 1))
continue;
@@ -351,11 +357,11 @@ public class SentenceDetectorME implements SentenceDetector, Probabilistic {
if (tokenStartPos > candidateIndex) {
break; // past candidate position, no point searching further
}
- if (tokenStartPos == 0
+ if (tokenStartPos == fromIndex
&& searchText.substring(tokenStartPos, candidateIndex +
1).equals(abbToken)) {
- return false; // full abbreviation match at sentence start -> no
acceptable break
+ return false; // full abbreviation match at segment start -> no
acceptable break
}
- final char prevChar = s.charAt(tokenStartPos == 0 ? tokenStartPos :
tokenStartPos - 1);
+ final char prevChar = s.charAt(tokenStartPos == fromIndex ?
tokenStartPos : tokenStartPos - 1);
if (tokenStartPos + tokenLength >= candidateIndex
/*
* Note:
diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
index 1d51a089..db6b1a29 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
@@ -203,6 +203,26 @@ public class SentenceDetectorMEGermanTest extends AbstractSentenceDetectorTest {
() -> assertEquals(1, probs.length));
}
+ /**
+ * Edge case: Multi-letter abbreviation at the start of a non-first sentence
+ * with {@code useTokenEnd = false} (no space between sentences).
+ */
+ @Test
+ void testSentDetectWithMultiLetterAbbreviationAtNonFirstSentenceStart() {
+ prepareResources(false);
+ final String sent1 = "Träume sind eine Verbindung von Gedanken.";
+ final String sent2 = "Bek. Problem: Schlafmangel.";
+ // No space between sentences (useTokenEnd=false supports this)
+ String sampleSentences = sent1 + sent2;
+ String[] sents = sentenceDetector.sentDetect(sampleSentences);
+ double[] probs = sentenceDetector.probs();
+ assertAll(
+ () -> assertEquals(2, sents.length),
+ () -> assertEquals(sent1, sents[0]),
+ () -> assertEquals(sent2, sents[1]),
+ () -> assertEquals(2, probs.length));
+ }
+
/*
* A reproducer and test for OPENNLP-1767.
* It checks that sentence detection with common abbreviations works
correctly,