(opennlp) branch main updated: OPENNLP-1809: SentenceDetector misses multi-letter abbreviations at sentence start (#983)

mawiesne Sun, 22 Mar 2026 04:43:47 -0700

This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git



The following commit(s) were added to refs/heads/main by this push:
     new d780617c OPENNLP-1809: SentenceDetector misses multi-letter abbreviations at sentence start (#983)
d780617c is described below

commit d780617c9332f0d4391a6480190fa56ca672531f
Author: Martin Wiesner <[email protected]>
AuthorDate: Sun Mar 22 12:43:32 2026 +0100

    OPENNLP-1809: SentenceDetector misses multi-letter abbreviations at sentence start (#983)
    
    - adds reproducer & test
    - fixes the issue in SentenceDetectorME#isAcceptableBreak(..)
    - refactors some code in other spots
---
 .../serializer/DictionaryEntryPersistor.java       |  6 ++--
 .../tools/sentdetect/SentenceDetectorME.java       | 40 ++++++++++++----------
 .../sentdetect/SentenceDetectorMEGermanTest.java   | 25 ++++++++++++--
 .../tools/tokenize/TokenizerFactoryTest.java       |  4 +--
 .../test/resources/opennlp/tools/lang/abb_DE.xml   |  3 ++
 5 files changed, 53 insertions(+), 25 deletions(-)

diff --git 
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/dictionary/serializer/DictionaryEntryPersistor.java
 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/dictionary/serializer/DictionaryEntryPersistor.java
index a20891c9..c8e3ad42 100644
--- 
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/dictionary/serializer/DictionaryEntryPersistor.java
+++ 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/dictionary/serializer/DictionaryEntryPersistor.java
@@ -226,7 +226,7 @@ public class DictionaryEntryPersistor {
   public static boolean create(InputStream in, EntryInserter inserter)
       throws IOException {
 
-    DictionaryContenthandler profileContentHandler = new 
DictionaryContenthandler(inserter);
+    DictionaryContenthandler handler = new DictionaryContenthandler(inserter);
 
     XMLReader xmlReader;
     try {
@@ -235,14 +235,14 @@ public class DictionaryEntryPersistor {
       // There is a compatibility problem here: JAXP default is false while 
SAX 2 default is true!
       // OpenNLP requires it activated!
       xmlReader.setFeature(SAX_FEATURE_NAMESPACES, true);
-      xmlReader.setContentHandler(profileContentHandler);
+      xmlReader.setContentHandler(handler);
       xmlReader.parse(new InputSource(new UncloseableInputStream(in)));
     }
     catch (ParserConfigurationException | SAXException e) {
       throw new InvalidFormatException("The profile data stream has " +
           "an invalid format!", e);
     }
-    return profileContentHandler.mIsCaseSensitiveDictionary;
+    return handler.mIsCaseSensitiveDictionary;
   }
 
   /**
diff --git 
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
index 9b113e12..55faf79c 100644
--- 
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
+++ 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
@@ -339,27 +339,31 @@ public class SentenceDetectorME implements 
SentenceDetector, Probabilistic {
     if (abbDict == null)
       return true;
 
+    final String text = s.toString();
     for (StringList abb : abbDict) {
-      final String token = abb.getToken(0);
-      final int tokenPosition = s.toString().indexOf(token, fromIndex);
-      if (tokenPosition == -1) {
-        continue; // skip fast
+      final String abbToken = abb.getToken(0);
+      final int tokenStartPos = text.indexOf(abbToken, fromIndex);
+      if (tokenStartPos == -1) {
+        continue; // skip fast when abb is not present in text
       }
-
-      final char prevChar = s.charAt(tokenPosition == 0 ? tokenPosition : 
tokenPosition - 1);
-      int tokenLength = token.length();
-      if (tokenPosition + tokenLength < candidateIndex || tokenPosition > 
candidateIndex ||
-        /*
-         * Note:
-         * Skip abbreviation candidate if regular characters exist directly 
before it,
-         * That is, any letter or digit except: a whitespace, an apostrophe, 
or an opening round bracket.
-         * This prevents mismatches from overlaps close to an actual sentence 
end.
-         */
-          !(Character.isWhitespace(prevChar) || isApostrophe(prevChar) || 
prevChar == '(')) {
-
-        continue;
+      if (tokenStartPos == 0 && text.substring(tokenStartPos, candidateIndex + 
1).equals(abbToken)) {
+        return false; // full abbreviation match at sentence start -> no 
acceptable break
+      } else {
+        final int tokenLength = abbToken.length();
+        final char prevChar = s.charAt(tokenStartPos == 0 ? tokenStartPos : 
tokenStartPos - 1);
+        if (tokenStartPos + tokenLength < candidateIndex || tokenStartPos > 
candidateIndex ||
+          /*
+           * Note:
+           * Skip abbreviation candidate if regular characters exist directly 
before it,
+           * That is, any letter or digit except: a whitespace, an apostrophe, 
or an opening round bracket.
+           * This prevents mismatches from overlaps close to an actual 
sentence end.
+           */
+            !(Character.isWhitespace(prevChar) || isApostrophe(prevChar) || 
prevChar == '(')) {
+
+          continue;
+        }
+        return false; // in case of a valid abbreviation: the (sentence) break 
is not accepted
       }
-      return false; // in case of a valid abbreviation: the (sentence) break 
is not accepted
     }
     return true; // no abbreviation(s) at given positions: valid sentence 
boundary
   }
diff --git 
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
 
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
index 5417fe17..9d271ce0 100644
--- 
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
+++ 
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
@@ -152,10 +152,10 @@ public class SentenceDetectorMEGermanTest extends 
AbstractSentenceDetectorTest {
   }
 
   /*
-    * A reproducer and test for OPENNLP-1781.
+   * A reproducer and test for OPENNLP-1781.
    */
   @Test
-  void testSentDetectWithAbbreviationsAtSentenceStart() {
+  void testSentDetectWithSingleLetterAbbreviationsAtSentenceStart() {
     prepareResources(true);
 
     final String sent1 = "S. Träume sind eine Verbindung von Gedanken.";
@@ -169,6 +169,27 @@ public class SentenceDetectorMEGermanTest extends 
AbstractSentenceDetectorTest {
         () -> assertEquals(1, probs.length));
   }
 
+  /*
+   * A reproducer and test for OPENNLP-1809.
+   */
+  @Test
+  void testSentDetectWithMultiLetterAbbreviationsAtSentenceStart() {
+    prepareResources(true);
+
+    final String sent1 = "Bek. Problem: Schlafmangel.";
+    final String sent2 = "Über die letzten Tage hinweg war sie zunehmend 
müde.";
+
+    String sampleSentences = sent1 + " " + sent2;
+    String[] sents = sentenceDetector.sentDetect(sampleSentences);
+    double[] probs = sentenceDetector.probs();
+
+    assertAll(
+        () -> assertEquals(2, sents.length),
+        () -> assertEquals(sent1, sents[0]),
+        () -> assertEquals(sent2, sents[1]),
+        () -> assertEquals(2, probs.length));
+  }
+
   /*
    * A reproducer and test for OPENNLP-1767.
    * It checks that sentence detection with common abbreviations works 
correctly,
diff --git 
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
 
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
index 41ff5d44..de478083 100644
--- 
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
+++ 
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
@@ -210,8 +210,8 @@ public class TokenizerFactoryTest {
 
     Assertions.assertEquals(expectedNumTokens, tokens.length);
     String[] sentSplit = sentence
-            .replaceAll("'", " '")
-            .replaceAll(",", " ,")
+            .replace("'", " '")
+            .replace(",", " ,")
             .split(" ");
     for (int i = 0; i < sentSplit.length; i++) {
       String sElement = sentSplit[i];
diff --git 
a/opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/lang/abb_DE.xml 
b/opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/lang/abb_DE.xml
index 23e09abc..33c6c875 100644
--- 
a/opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/lang/abb_DE.xml
+++ 
b/opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/lang/abb_DE.xml
@@ -47,4 +47,7 @@
   <entry>
     <token>z.B.</token>
   </entry>
+  <entry>
+    <token>Bek.</token>
+  </entry>
 </dictionary>

(opennlp) branch main updated: OPENNLP-1809: SentenceDetector misses multi-letter abbreviations at sentence start (#983)

Reply via email to