This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch opennlp-2.x in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit 0939220669af20b4a9dd7d8f3c00b89583bf38f2 Author: Kristian Rickert <[email protected]> AuthorDate: Fri Jun 12 14:04:24 2026 +0200 OPENNLP-1839 : Fix native memory leak and vocabulary NPE in DocumentCategorizerDL (#1074) Every categorize() call leaked the OnnxTensor inputs and the OrtSession.Result for each document chunk. Tensors are now closed in a finally block and the result with try-with-resources. Tokens absent from the vocabulary caused an opaque NullPointerException through auto-unboxing, which the broad catch in categorize() swallowed. The token-to-id mapping now throws IllegalArgumentException naming the missing token, indicating a vocabulary/model mismatch. (cherry picked from commit b6af87589770ca8c39ef94d2726a21cad6d9f32f) --- .../opennlp/dl/doccat/DocumentCategorizerDL.java | 75 +++++++++++++++------- .../dl/doccat/DocumentCategorizerDLTest.java | 60 +++++++++++++++++ 2 files changed, 112 insertions(+), 23 deletions(-) diff --git a/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java b/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java index a0c9ede77..f02dd875b 100644 --- a/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java +++ b/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java @@ -145,23 +145,31 @@ public class DocumentCategorizerDL extends AbstractDL implements DocumentCategor final Map<String, OnnxTensor> inputs = new HashMap<>(); - inputs.put(INPUT_IDS, OnnxTensor.createTensor(env, - LongBuffer.wrap(t.ids()), new long[] {1, t.ids().length})); - - if (inferenceOptions.isIncludeAttentionMask()) { - inputs.put(ATTENTION_MASK, OnnxTensor.createTensor(env, - LongBuffer.wrap(t.mask()), new long[] {1, t.mask().length})); - } - - if (inferenceOptions.isIncludeTokenTypeIds()) { - inputs.put(TOKEN_TYPE_IDS, OnnxTensor.createTensor(env, - LongBuffer.wrap(t.types()), new long[] {1, t.types().length})); + final Object output; + try { + inputs.put(INPUT_IDS, OnnxTensor.createTensor(env, + LongBuffer.wrap(t.ids()), new long[] {1, t.ids().length})); + + if (inferenceOptions.isIncludeAttentionMask()) { + inputs.put(ATTENTION_MASK, OnnxTensor.createTensor(env, + LongBuffer.wrap(t.mask()), new long[] {1, t.mask().length})); + } + + if (inferenceOptions.isIncludeTokenTypeIds()) { + inputs.put(TOKEN_TYPE_IDS, OnnxTensor.createTensor(env, + LongBuffer.wrap(t.types()), new long[] {1, t.types().length})); + } + + // The outputs from the model. Some models return a 2D array (e.g. BERT), + // while others return a 1D array (e.g. RoBERTa). + try (OrtSession.Result result = session.run(inputs)) { + // getValue() copies the tensor into Java arrays, so the result can be closed safely. + output = result.get(0).getValue(); + } + } finally { + inputs.values().forEach(OnnxTensor::close); } - // The outputs from the model. Some models return a 2D array (e.g. BERT), - // while others return a 1D array (e.g. RoBERTa). - final Object output = session.run(inputs).get(0).getValue(); - final float[] rawScores; if (output instanceof float[][] v) { rawScores = v[0]; @@ -300,13 +308,7 @@ public class DocumentCategorizerDL extends AbstractDL implements DocumentCategor // Now we can tokenize the group and continue. final String[] tokens = tokenizer.tokenize(group); - final int[] ids = new int[tokens.length]; - - for (int x = 0; x < tokens.length; x++) { - ids[x] = vocab.get(tokens[x]); - } - - final long[] lids = Arrays.stream(ids).mapToLong(i -> i).toArray(); + final long[] ids = tokenIds(tokens, vocab); final long[] mask = new long[ids.length]; Arrays.fill(mask, 1); @@ -314,7 +316,7 @@ public class DocumentCategorizerDL extends AbstractDL implements DocumentCategor final long[] types = new long[ids.length]; Arrays.fill(types, 0); - t.add(new Tokens(tokens, lids, mask, types)); + t.add(new Tokens(tokens, ids, mask, types)); } @@ -322,6 +324,33 @@ public class DocumentCategorizerDL extends AbstractDL implements DocumentCategor } + /** + * Maps tokens to their vocabulary ids. + * + * @param tokens The tokens to map. + * @param vocab The vocabulary map. + * @return The token ids. + * + * @throws IllegalArgumentException Thrown if a token is not present in the + * vocabulary. + */ + static long[] tokenIds(final String[] tokens, final Map<String, Integer> vocab) { + + final long[] ids = new long[tokens.length]; + + for (int x = 0; x < tokens.length; x++) { + final Integer id = vocab.get(tokens[x]); + if (id == null) { + throw new IllegalArgumentException("Token '" + tokens[x] + + "' is not present in the vocabulary; the vocabulary file does not match the model."); + } + ids[x] = id; + } + + return ids; + + } + /** * Applies softmax to an array of values. * diff --git a/opennlp-dl/src/test/java/opennlp/dl/doccat/DocumentCategorizerDLTest.java b/opennlp-dl/src/test/java/opennlp/dl/doccat/DocumentCategorizerDLTest.java new file mode 100644 index 000000000..a6bab39f6 --- /dev/null +++ b/opennlp-dl/src/test/java/opennlp/dl/doccat/DocumentCategorizerDLTest.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.dl.doccat; + +import java.util.HashMap; +import java.util.Map; + +import org.junit.jupiter.api.Test; + +import opennlp.tools.tokenize.WordpieceTokenizer; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class DocumentCategorizerDLTest { + + private static Map<String, Integer> vocab() { + final Map<String, Integer> vocab = new HashMap<>(); + vocab.put(WordpieceTokenizer.BERT_CLS_TOKEN, 0); + vocab.put(WordpieceTokenizer.BERT_SEP_TOKEN, 1); + vocab.put(WordpieceTokenizer.BERT_UNK_TOKEN, 2); + vocab.put("hello", 3); + vocab.put("world", 4); + return vocab; + } + + @Test + void testTokenIdsMapsTokensToVocabularyIds() { + final long[] ids = DocumentCategorizerDL.tokenIds( + new String[] {WordpieceTokenizer.BERT_CLS_TOKEN, "hello", "world", + WordpieceTokenizer.BERT_SEP_TOKEN}, vocab()); + + assertArrayEquals(new long[] {0, 3, 4, 1}, ids); + } + + @Test + void testTokenIdsRejectsTokensMissingFromVocabulary() { + final IllegalArgumentException e = assertThrows(IllegalArgumentException.class, () -> + DocumentCategorizerDL.tokenIds(new String[] {"hello", "missing"}, vocab())); + + assertTrue(e.getMessage().contains("missing"), + "the error message should name the missing token: " + e.getMessage()); + } +}
