(opennlp) branch main updated: OPENNLP-1840: Fix native memory leak and vocabulary NPE in NameFinderDL (#1076)

mawiesne Fri, 12 Jun 2026 05:20:00 -0700

This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git



The following commit(s) were added to refs/heads/main by this push:
     new aaf1b1fc4 OPENNLP-1840: Fix native memory leak and vocabulary NPE in NameFinderDL (#1076)
aaf1b1fc4 is described below

commit aaf1b1fc43d1ededa9ab469befcaa457bc31978c
Author: Kristian Rickert <[email protected]>
AuthorDate: Fri Jun 12 08:19:48 2026 -0400

    OPENNLP-1840: Fix native memory leak and vocabulary NPE in NameFinderDL (#1076)
    
    Every find() call leaked the OnnxTensor inputs and the OrtSession.Result
    for each sentence chunk. Tensors are now closed in a finally block and
    the result with try-with-resources.
    
    Tokens absent from the vocabulary caused an opaque NullPointerException
    through auto-unboxing. The token-to-id mapping now throws
    IllegalArgumentException naming the missing token, indicating a
    vocabulary/model mismatch.
---
 .../java/opennlp/dl/namefinder/NameFinderDL.java   | 70 +++++++++++++++-------
 .../opennlp/dl/namefinder/NameFinderDLTest.java    | 60 +++++++++++++++++++
 2 files changed, 110 insertions(+), 20 deletions(-)

diff --git a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java
index 74e5a1aac..f7373700e 100644
--- a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java
+++ b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java
@@ -130,21 +130,30 @@ public class NameFinderDL extends AbstractDL implements TokenNameFinder {
 
           // The inputs to the ONNX model.
           final Map<String, OnnxTensor> inputs = new HashMap<>();
-          inputs.put(INPUT_IDS, OnnxTensor.createTensor(env, 
LongBuffer.wrap(tokens.ids()),
-              new long[] {1, tokens.ids().length}));
 
-          if (inferenceOptions.isIncludeAttentionMask()) {
-            inputs.put(ATTENTION_MASK, OnnxTensor.createTensor(env,
-                LongBuffer.wrap(tokens.mask()), new long[] {1, 
tokens.mask().length}));
-          }
+          final float[][][] v;
+          try {
+            inputs.put(INPUT_IDS, OnnxTensor.createTensor(env, 
LongBuffer.wrap(tokens.ids()),
+                new long[] {1, tokens.ids().length}));
 
-          if (inferenceOptions.isIncludeTokenTypeIds()) {
-            inputs.put(TOKEN_TYPE_IDS, OnnxTensor.createTensor(env,
-                LongBuffer.wrap(tokens.types()), new long[] {1, 
tokens.types().length}));
-          }
+            if (inferenceOptions.isIncludeAttentionMask()) {
+              inputs.put(ATTENTION_MASK, OnnxTensor.createTensor(env,
+                  LongBuffer.wrap(tokens.mask()), new long[] {1, 
tokens.mask().length}));
+            }
 
-          // The outputs from the model.
-          final float[][][] v = (float[][][]) 
session.run(inputs).get(0).getValue();
+            if (inferenceOptions.isIncludeTokenTypeIds()) {
+              inputs.put(TOKEN_TYPE_IDS, OnnxTensor.createTensor(env,
+                  LongBuffer.wrap(tokens.types()), new long[] {1, 
tokens.types().length}));
+            }
+
+            // The outputs from the model.
+            try (OrtSession.Result result = session.run(inputs)) {
+              // getValue() copies the tensor into Java arrays, so the result 
can be closed safely.
+              v = (float[][][]) result.get(0).getValue();
+            }
+          } finally {
+            inputs.values().forEach(OnnxTensor::close);
+          }
 
           // Find consecutive B-PER and I-PER labels and combine the spans 
where necessary.
           // There are also B-LOC and I-LOC tags for locations that might be 
useful at some point.
@@ -376,13 +385,7 @@ public class NameFinderDL extends AbstractDL implements TokenNameFinder {
       // Now we can tokenize the group and continue.
       final String[] tokens = tokenizer.tokenize(group);
 
-      final int[] ids = new int[tokens.length];
-
-      for (int x = 0; x < tokens.length; x++) {
-        ids[x] = vocab.get(tokens[x]);
-      }
-
-      final long[] lids = Arrays.stream(ids).mapToLong(i -> i).toArray();
+      final long[] ids = tokenIds(tokens, vocab);
 
       final long[] mask = new long[ids.length];
       Arrays.fill(mask, 1);
@@ -390,7 +393,7 @@ public class NameFinderDL extends AbstractDL implements TokenNameFinder {
       final long[] types = new long[ids.length];
       Arrays.fill(types, 0);
 
-      t.add(new Tokens(tokens, lids, mask, types));
+      t.add(new Tokens(tokens, ids, mask, types));
 
     }
 
@@ -398,4 +401,31 @@ public class NameFinderDL extends AbstractDL implements TokenNameFinder {
 
   }
 
+  /**
+   * Maps tokens to their vocabulary ids.
+   *
+   * @param tokens The tokens to map.
+   * @param vocab The vocabulary map.
+   * @return The token ids.
+   *
+   * @throws IllegalArgumentException Thrown if a token is not present in the
+   *     vocabulary.
+   */
+  static long[] tokenIds(final String[] tokens, final Map<String, Integer> 
vocab) {
+
+    final long[] ids = new long[tokens.length];
+
+    for (int x = 0; x < tokens.length; x++) {
+      final Integer id = vocab.get(tokens[x]);
+      if (id == null) {
+        throw new IllegalArgumentException("Token '" + tokens[x]
+            + "' is not present in the vocabulary; the vocabulary file does 
not match the model.");
+      }
+      ids[x] = id;
+    }
+
+    return ids;
+
+  }
+
 }
diff --git a/opennlp-core/opennlp-ml/opennlp-dl/src/test/java/opennlp/dl/namefinder/NameFinderDLTest.java b/opennlp-core/opennlp-ml/opennlp-dl/src/test/java/opennlp/dl/namefinder/NameFinderDLTest.java
new file mode 100644
index 000000000..87fe18c9b
--- /dev/null
+++ b/opennlp-core/opennlp-ml/opennlp-dl/src/test/java/opennlp/dl/namefinder/NameFinderDLTest.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.dl.namefinder;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.tokenize.WordpieceTokenizer;
+
+import static org.junit.jupiter.api.Assertions.assertArrayEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class NameFinderDLTest {
+
+  private static Map<String, Integer> vocab() {
+    final Map<String, Integer> vocab = new HashMap<>();
+    vocab.put(WordpieceTokenizer.BERT_CLS_TOKEN, 0);
+    vocab.put(WordpieceTokenizer.BERT_SEP_TOKEN, 1);
+    vocab.put(WordpieceTokenizer.BERT_UNK_TOKEN, 2);
+    vocab.put("hello", 3);
+    vocab.put("world", 4);
+    return vocab;
+  }
+
+  @Test
+  void testTokenIdsMapsTokensToVocabularyIds() {
+    final long[] ids = NameFinderDL.tokenIds(
+        new String[] {WordpieceTokenizer.BERT_CLS_TOKEN, "hello", "world",
+            WordpieceTokenizer.BERT_SEP_TOKEN}, vocab());
+
+    assertArrayEquals(new long[] {0, 3, 4, 1}, ids);
+  }
+
+  @Test
+  void testTokenIdsRejectsTokensMissingFromVocabulary() {
+    final IllegalArgumentException e = 
assertThrows(IllegalArgumentException.class, () ->
+        NameFinderDL.tokenIds(new String[] {"hello", "missing"}, vocab()));
+
+    assertTrue(e.getMessage().contains("missing"),
+        "the error message should name the missing token: " + e.getMessage());
+  }
+}

(opennlp) branch main updated: OPENNLP-1840: Fix native memory leak and vocabulary NPE in NameFinderDL (#1076)

Reply via email to