This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch opennlp-2.x
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/opennlp-2.x by this push:
new 5b690d73 OPENNLP-1809: SentenceDetector misses multi-letter abbreviations at sentence start (#983)
5b690d73 is described below
commit 5b690d73f49db3db9112f21158b4a454e589f306
Author: Martin Wiesner <[email protected]>
AuthorDate: Sun Mar 22 12:43:32 2026 +0100
OPENNLP-1809: SentenceDetector misses multi-letter abbreviations at sentence start (#983)
- adds reproducer & test
- fixes the issue in SentenceDetectorME#isAcceptableBreak(..)
- refactors some code in other spots
(cherry picked from commit d780617c9332f0d4391a6480190fa56ca672531f)
---
.../serializer/DictionaryEntryPersistor.java | 6 ++--
.../tools/sentdetect/SentenceDetectorME.java | 40 ++++++++++++----------
.../sentdetect/SentenceDetectorMEGermanTest.java | 25 ++++++++++++--
.../tools/tokenize/TokenizerFactoryTest.java | 4 +--
.../test/resources/opennlp/tools/lang/abb_DE.xml | 3 ++
5 files changed, 53 insertions(+), 25 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/dictionary/serializer/DictionaryEntryPersistor.java b/opennlp-tools/src/main/java/opennlp/tools/dictionary/serializer/DictionaryEntryPersistor.java
index a20891c9..c8e3ad42 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/dictionary/serializer/DictionaryEntryPersistor.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/dictionary/serializer/DictionaryEntryPersistor.java
@@ -226,7 +226,7 @@ public class DictionaryEntryPersistor {
public static boolean create(InputStream in, EntryInserter inserter)
throws IOException {
- DictionaryContenthandler profileContentHandler = new DictionaryContenthandler(inserter);
+ DictionaryContenthandler handler = new DictionaryContenthandler(inserter);
XMLReader xmlReader;
try {
@@ -235,14 +235,14 @@ public class DictionaryEntryPersistor {
// There is a compatibility problem here: JAXP default is false while SAX 2 default is true!
// OpenNLP requires it activated!
xmlReader.setFeature(SAX_FEATURE_NAMESPACES, true);
- xmlReader.setContentHandler(profileContentHandler);
+ xmlReader.setContentHandler(handler);
xmlReader.parse(new InputSource(new UncloseableInputStream(in)));
}
catch (ParserConfigurationException | SAXException e) {
throw new InvalidFormatException("The profile data stream has " +
"an invalid format!", e);
}
- return profileContentHandler.mIsCaseSensitiveDictionary;
+ return handler.mIsCaseSensitiveDictionary;
}
/**
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
index f64768df..0968ea3e 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
@@ -339,27 +339,31 @@ public class SentenceDetectorME implements SentenceDetector, Probabilistic {
if (abbDict == null)
return true;
+ final String text = s.toString();
for (StringList abb : abbDict) {
- final String token = abb.getToken(0);
- final int tokenPosition = s.toString().indexOf(token, fromIndex);
- if (tokenPosition == -1) {
- continue; // skip fast
+ final String abbToken = abb.getToken(0);
+ final int tokenStartPos = text.indexOf(abbToken, fromIndex);
+ if (tokenStartPos == -1) {
+ continue; // skip fast when abb is not present in text
}
-
- final char prevChar = s.charAt(tokenPosition == 0 ? tokenPosition : tokenPosition - 1);
- int tokenLength = token.length();
- if (tokenPosition + tokenLength < candidateIndex || tokenPosition > candidateIndex ||
- /*
- * Note:
- * Skip abbreviation candidate if regular characters exist directly before it,
- * That is, any letter or digit except: a whitespace, an apostrophe, or an opening round bracket.
- * This prevents mismatches from overlaps close to an actual sentence end.
- */
- !(Character.isWhitespace(prevChar) || isApostrophe(prevChar) || prevChar == '(')) {
-
- continue;
+ if (tokenStartPos == 0 && text.substring(tokenStartPos, candidateIndex + 1).equals(abbToken)) {
+ return false; // full abbreviation match at sentence start -> no acceptable break
+ } else {
+ final int tokenLength = abbToken.length();
+ final char prevChar = s.charAt(tokenStartPos == 0 ? tokenStartPos : tokenStartPos - 1);
+ if (tokenStartPos + tokenLength < candidateIndex || tokenStartPos > candidateIndex ||
+ /*
+ * Note:
+ * Skip abbreviation candidate if regular characters exist directly before it,
+ * That is, any letter or digit except: a whitespace, an apostrophe, or an opening round bracket.
+ * This prevents mismatches from overlaps close to an actual sentence end.
+ */
+ !(Character.isWhitespace(prevChar) || isApostrophe(prevChar) || prevChar == '(')) {
+
+ continue;
+ }
+ return false; // in case of a valid abbreviation: the (sentence) break is not accepted
}
- return false; // in case of a valid abbreviation: the (sentence) break is not accepted
}
return true; // no abbreviation(s) at given positions: valid sentence boundary
}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
index 33560133..ccc5d589 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
@@ -152,10 +152,10 @@ public class SentenceDetectorMEGermanTest extends AbstractSentenceDetectorTest {
}
/*
- * A reproducer and test for OPENNLP-1781.
+ * A reproducer and test for OPENNLP-1781.
*/
@Test
- void testSentDetectWithAbbreviationsAtSentenceStart() {
+ void testSentDetectWithSingleLetterAbbreviationsAtSentenceStart() {
prepareResources(true);
final String sent1 = "S. Träume sind eine Verbindung von Gedanken.";
@@ -169,6 +169,27 @@ public class SentenceDetectorMEGermanTest extends AbstractSentenceDetectorTest {
() -> assertEquals(1, probs.length));
}
+ /*
+ * A reproducer and test for OPENNLP-1809.
+ */
+ @Test
+ void testSentDetectWithMultiLetterAbbreviationsAtSentenceStart() {
+ prepareResources(true);
+
+ final String sent1 = "Bek. Problem: Schlafmangel.";
+ final String sent2 = "Über die letzten Tage hinweg war sie zunehmend müde.";
+
+ String sampleSentences = sent1 + " " + sent2;
+ String[] sents = sentenceDetector.sentDetect(sampleSentences);
+ double[] probs = sentenceDetector.probs();
+
+ assertAll(
+ () -> assertEquals(2, sents.length),
+ () -> assertEquals(sent1, sents[0]),
+ () -> assertEquals(sent2, sents[1]),
+ () -> assertEquals(2, probs.length));
+ }
+
/*
* A reproducer and test for OPENNLP-1767.
* It checks that sentence detection with common abbreviations works correctly,
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
index 23d2ba7a..feff3835 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
@@ -210,8 +210,8 @@ public class TokenizerFactoryTest {
Assertions.assertEquals(expectedNumTokens, tokens.length);
String[] sentSplit = sentence
- .replaceAll("'", " '")
- .replaceAll(",", " ,")
+ .replace("'", " '")
+ .replace(",", " ,")
.split(" ");
for (int i = 0; i < sentSplit.length; i++) {
String sElement = sentSplit[i];
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/lang/abb_DE.xml b/opennlp-tools/src/test/resources/opennlp/tools/lang/abb_DE.xml
index 23e09abc..33c6c875 100644
--- a/opennlp-tools/src/test/resources/opennlp/tools/lang/abb_DE.xml
+++ b/opennlp-tools/src/test/resources/opennlp/tools/lang/abb_DE.xml
@@ -47,4 +47,7 @@
<entry>
<token>z.B.</token>
</entry>
+ <entry>
+ <token>Bek.</token>
+ </entry>
</dictionary>