This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/main by this push:
new dfe0a5a9 OPENNLP-1810: Fix SentenceDetector fails to detect
multiple identical abbreviations in the same sentence (#984)
dfe0a5a9 is described below
commit dfe0a5a9215a92958de71a0ad02e2fb756121285
Author: Richard Zowalla <[email protected]>
AuthorDate: Sun Mar 22 15:40:02 2026 +0100
OPENNLP-1810: Fix SentenceDetector fails to detect multiple identical
abbreviations in the same sentence (#984)
---
.../tools/sentdetect/SentenceDetectorME.java | 33 +++++++++++++---------
.../sentdetect/SentenceDetectorMEGermanTest.java | 13 +++++++++
2 files changed, 32 insertions(+), 14 deletions(-)
diff --git
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
index 55faf79c..ec71618d 100644
---
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
+++
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
@@ -340,29 +340,34 @@ public class SentenceDetectorME implements
SentenceDetector, Probabilistic {
return true;
final String text = s.toString();
+ final boolean caseSensitive = abbDict.isCaseSensitive();
+ final String searchText = caseSensitive ? text :
StringUtil.toLowerCase(text);
for (StringList abb : abbDict) {
- final String abbToken = abb.getToken(0);
- final int tokenStartPos = text.indexOf(abbToken, fromIndex);
- if (tokenStartPos == -1) {
- continue; // skip fast when abb is not present in text
- }
- if (tokenStartPos == 0 && text.substring(tokenStartPos, candidateIndex +
1).equals(abbToken)) {
- return false; // full abbreviation match at sentence start -> no
acceptable break
- } else {
- final int tokenLength = abbToken.length();
+ final String abbToken = caseSensitive ? abb.getToken(0)
+ : StringUtil.toLowerCase(abb.getToken(0));
+ final int tokenLength = abbToken.length();
+ int tokenStartPos = searchText.indexOf(abbToken, fromIndex);
+ while (tokenStartPos != -1) {
+ if (tokenStartPos > candidateIndex) {
+ break; // past candidate position, no point searching further
+ }
+ if (tokenStartPos == 0
+ && searchText.substring(tokenStartPos, candidateIndex +
1).equals(abbToken)) {
+ return false; // full abbreviation match at sentence start -> no
acceptable break
+ }
final char prevChar = s.charAt(tokenStartPos == 0 ? tokenStartPos :
tokenStartPos - 1);
- if (tokenStartPos + tokenLength < candidateIndex || tokenStartPos >
candidateIndex ||
+ if (tokenStartPos + tokenLength >= candidateIndex
/*
* Note:
* Skip abbreviation candidate if regular characters exist directly
before it,
* That is, any letter or digit except: a whitespace, an apostrophe,
or an opening round bracket.
* This prevents mismatches from overlaps close to an actual
sentence end.
*/
- !(Character.isWhitespace(prevChar) || isApostrophe(prevChar) ||
prevChar == '(')) {
-
- continue;
+ && (Character.isWhitespace(prevChar) || isApostrophe(prevChar) ||
prevChar == '(')) {
+ return false; // in case of a valid abbreviation: the (sentence)
break is not accepted
}
- return false; // in case of a valid abbreviation: the (sentence) break
is not accepted
+ // Try next occurrence of this abbreviation in the text
+ tokenStartPos = searchText.indexOf(abbToken, tokenStartPos + 1);
}
}
return true; // no abbreviation(s) at given positions: valid sentence
boundary
diff --git
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
index 9d271ce0..f145dd9e 100644
---
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
+++
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
@@ -190,6 +190,19 @@ public class SentenceDetectorMEGermanTest extends
AbstractSentenceDetectorTest {
() -> assertEquals(2, probs.length));
}
+ // Edge case: The same abbreviation appears twice in a single sentence
segment.
+ @Test
+ void testSentDetectWithDuplicateAbbreviationInSameSegment() {
+ prepareResources(true);
+ final String sent1 = "Lt. Vertrag und lt. Bescheid gelten andere
Bedingungen.";
+ String[] sents = sentenceDetector.sentDetect(sent1);
+ double[] probs = sentenceDetector.probs();
+ assertAll(
+ () -> assertEquals(1, sents.length),
+ () -> assertEquals(sent1, sents[0]),
+ () -> assertEquals(1, probs.length));
+ }
+
/*
* A reproducer and test for OPENNLP-1767.
* It checks that sentence detection with common abbreviations works
correctly,