This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch opennlp-2.x
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/opennlp-2.x by this push:
new db4f2de1 OPENNLP-1810: Fix SentenceDetector fails to detect multiple
identical abbreviations in the same sentence (#984)
db4f2de1 is described below
commit db4f2de1457f4af001ff9dd98a49e86d37adfd0a
Author: Richard Zowalla <[email protected]>
AuthorDate: Sun Mar 22 15:40:02 2026 +0100
OPENNLP-1810: Fix SentenceDetector fails to detect multiple identical
abbreviations in the same sentence (#984)
(cherry picked from commit dfe0a5a9215a92958de71a0ad02e2fb756121285)
---
.../tools/sentdetect/SentenceDetectorME.java | 33 +++++++++++++---------
.../sentdetect/SentenceDetectorMEGermanTest.java | 13 +++++++++
2 files changed, 32 insertions(+), 14 deletions(-)
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
index 0968ea3e..4217b730 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
@@ -340,29 +340,34 @@ public class SentenceDetectorME implements
SentenceDetector, Probabilistic {
return true;
final String text = s.toString();
+ final boolean caseSensitive = abbDict.isCaseSensitive();
+ final String searchText = caseSensitive ? text :
StringUtil.toLowerCase(text);
for (StringList abb : abbDict) {
- final String abbToken = abb.getToken(0);
- final int tokenStartPos = text.indexOf(abbToken, fromIndex);
- if (tokenStartPos == -1) {
- continue; // skip fast when abb is not present in text
- }
- if (tokenStartPos == 0 && text.substring(tokenStartPos, candidateIndex +
1).equals(abbToken)) {
- return false; // full abbreviation match at sentence start -> no
acceptable break
- } else {
- final int tokenLength = abbToken.length();
+ final String abbToken = caseSensitive ? abb.getToken(0)
+ : StringUtil.toLowerCase(abb.getToken(0));
+ final int tokenLength = abbToken.length();
+ int tokenStartPos = searchText.indexOf(abbToken, fromIndex);
+ while (tokenStartPos != -1) {
+ if (tokenStartPos > candidateIndex) {
+ break; // past candidate position, no point searching further
+ }
+ if (tokenStartPos == 0
+ && searchText.substring(tokenStartPos, candidateIndex +
1).equals(abbToken)) {
+ return false; // full abbreviation match at sentence start -> no
acceptable break
+ }
final char prevChar = s.charAt(tokenStartPos == 0 ? tokenStartPos :
tokenStartPos - 1);
- if (tokenStartPos + tokenLength < candidateIndex || tokenStartPos >
candidateIndex ||
+ if (tokenStartPos + tokenLength >= candidateIndex
/*
* Note:
* Skip abbreviation candidate if regular characters exist directly
before it,
* That is, any letter or digit except: a whitespace, an apostrophe,
or an opening round bracket.
* This prevents mismatches from overlaps close to an actual
sentence end.
*/
- !(Character.isWhitespace(prevChar) || isApostrophe(prevChar) ||
prevChar == '(')) {
-
- continue;
+ && (Character.isWhitespace(prevChar) || isApostrophe(prevChar) ||
prevChar == '(')) {
+ return false; // in case of a valid abbreviation: the (sentence)
break is not accepted
}
- return false; // in case of a valid abbreviation: the (sentence) break
is not accepted
+ // Try next occurrence of this abbreviation in the text
+ tokenStartPos = searchText.indexOf(abbToken, tokenStartPos + 1);
}
}
return true; // no abbreviation(s) at given positions: valid sentence
boundary
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
index ccc5d589..1d51a089 100644
---
a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
+++
b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
@@ -190,6 +190,19 @@ public class SentenceDetectorMEGermanTest extends
AbstractSentenceDetectorTest {
() -> assertEquals(2, probs.length));
}
+ // Edge case: The same abbreviation appears twice in a single sentence
segment.
+ @Test
+ void testSentDetectWithDuplicateAbbreviationInSameSegment() {
+ prepareResources(true);
+ final String sent1 = "Lt. Vertrag und lt. Bescheid gelten andere
Bedingungen.";
+ String[] sents = sentenceDetector.sentDetect(sent1);
+ double[] probs = sentenceDetector.probs();
+ assertAll(
+ () -> assertEquals(1, sents.length),
+ () -> assertEquals(sent1, sents[0]),
+ () -> assertEquals(1, probs.length));
+ }
+
/*
* A reproducer and test for OPENNLP-1767.
* It checks that sentence detection with common abbreviations works
correctly,