This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch opennlp-2.x
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/opennlp-2.x by this push:
new 95f401d44 [2.x] OPENNLP-1829: Transfer DirectoryModelFinder to OpenNLP
core (#1054)
95f401d44 is described below
commit 95f401d44dce4199ebb53f1a7fb9f46ab489031c
Author: Martin Wiesner <[email protected]>
AuthorDate: Mon May 18 10:27:02 2026 +0200
[2.x] OPENNLP-1829: Transfer DirectoryModelFinder to OpenNLP core (#1054)
---
.../tools/models/AbstractClassPathModelFinder.java | 70 ++++++++++
.../tools/models/dir/DirectoryModelFinder.java | 145 +++++++++++++++++++++
.../models/simple/SimpleClassPathModelFinder.java | 61 +--------
3 files changed, 220 insertions(+), 56 deletions(-)
diff --git
a/opennlp-tools-models/src/main/java/opennlp/tools/models/AbstractClassPathModelFinder.java
b/opennlp-tools-models/src/main/java/opennlp/tools/models/AbstractClassPathModelFinder.java
index b31128dd8..2050892c8 100644
---
a/opennlp-tools-models/src/main/java/opennlp/tools/models/AbstractClassPathModelFinder.java
+++
b/opennlp-tools-models/src/main/java/opennlp/tools/models/AbstractClassPathModelFinder.java
@@ -16,12 +16,22 @@
*/
package opennlp.tools.models;
+import java.io.IOException;
+import java.net.JarURLConnection;
import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Enumeration;
import java.util.HashSet;
import java.util.List;
+import java.util.Locale;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
+import java.util.jar.JarEntry;
+import java.util.jar.JarFile;
+import java.util.regex.Pattern;
/**
* A base implementation of a {@link ClassPathModelFinder} for the detection of
@@ -126,4 +136,64 @@ public abstract class AbstractClassPathModelFinder
implements ClassPathModelFind
return jarModelPrefix;
}
+  /**
+   * Converts a {@code wildcard} expression into an equivalent Java regular expression.
+   * <p>
+   * The wildcard {@code *} is translated to {@code .*} and {@code ?} to {@code .}.
+   * Every other character with a special meaning in a regular expression
+   * (for instance {@code .}, {@code +}, {@code (} or a backslash) is escaped,
+   * so that it is matched literally.
+   *
+   * @param wildcard A valid expression. It must not be {@code null}.
+   * @return The escaped regex.
+   */
+  protected String asRegex(String wildcard) {
+    final StringBuilder regex = new StringBuilder(wildcard.length() + 8);
+    for (int i = 0; i < wildcard.length(); i++) {
+      final char c = wildcard.charAt(i);
+      if (c == '*') {
+        regex.append(".*");
+      } else if (c == '?') {
+        regex.append('.');
+      } else if ("\\.[]{}()+^$|".indexOf(c) >= 0) {
+        // A regex metacharacter that must be taken literally, e.g. '+' in a file name.
+        regex.append('\\').append(c);
+      } else {
+        regex.append(c);
+      }
+    }
+    return regex.toString();
+  }
+
+  /**
+   * Checks whether the file part of a {@link URL} is matched entirely by a {@link Pattern}.
+   *
+   * @param url The {@link URL} whose {@link URL#getFile() file part} is checked.
+   * @param pattern The {@link Pattern} that has to match the whole file part.
+   * @return {@code true} if the file part matches, {@code false} otherwise.
+   */
+  protected boolean matchesPattern(URL url, Pattern pattern) {
+    final String file = url.getFile();
+    return pattern.matcher(file).matches();
+  }
+
+  /**
+   * Converts a {@code location} in String form to a {@link URL}.
+   *
+   * @param location The resource path and/or reference. It must not be {@code null}
+   *                 and has to denote an absolute URI, that is, include a scheme.
+   * @return The converted {@link URL} form.
+   * @throws IOException Thrown if the {@code location} is not a syntactically valid,
+   *                     absolute URI or if it cannot be represented as {@link URL}.
+   */
+  protected static URL toURL(String location) throws IOException {
+    try {
+      return new URI(location).toURL();
+    } catch (URISyntaxException | IllegalArgumentException e) {
+      // URI#toURL() signals a non-absolute URI via an unchecked IllegalArgumentException;
+      // report it through the documented checked exception instead.
+      throw new IOException("Cannot convert '" + location + "' to a URL.", e);
+    }
+  }
+
+  /**
+   * Lists all non-directory entries of a JAR file as {@link URI URIs}.
+   * <p>
+   * Entries whose name cannot be expressed as a valid {@link URI} are skipped.
+   *
+   * @param fileUrl The {@link URL} of the JAR file to read. It must not be {@code null}.
+   * @param isWindows {@code true} if backslashes in {@code fileUrl} shall be replaced
+   *                  by forward slashes, {@code false} otherwise.
+   * @return The {@link URI URIs} of the entries found; empty if the JAR holds no file entries.
+   * @throws IOException Thrown if the JAR file could not be opened or read.
+   */
+  protected List<URI> getURIsFromJar(URL fileUrl, boolean isWindows) throws IOException {
+    final List<URI> uris = new ArrayList<>();
+    final String location = JAR + ":" +
+        (isWindows ? fileUrl.toString().replace("\\", "/") : fileUrl.toString()) + "!/";
+    final URL jarUrl = toURL(location);
+    final JarURLConnection jarConnection = (JarURLConnection) jarUrl.openConnection();
+    // The JarFile is closed below. Without this, the connection may hand out a cached
+    // JarFile instance that is shared with other users of the same JAR; closing that
+    // shared instance would break them.
+    jarConnection.setUseCaches(false);
+    try (JarFile jarFile = jarConnection.getJarFile()) {
+      final Enumeration<JarEntry> entries = jarFile.entries();
+      while (entries.hasMoreElements()) {
+        final JarEntry entry = entries.nextElement();
+        if (!entry.isDirectory()) {
+          try {
+            uris.add(new URI(jarUrl + entry.getName()));
+          } catch (URISyntaxException ignored) {
+            //if we cannot convert to URI here, we ignore that entry.
+          }
+        }
+      }
+    }
+
+    return uris;
+  }
+
+  /**
+   * Detects via the {@code os.name} system property whether the JVM runs on Windows.
+   *
+   * @return {@code true} if the lower-cased {@code os.name} starts with {@code "windows"},
+   *         {@code false} otherwise, also if the property is not set.
+   */
+  protected boolean isWindows() {
+    // A plain contains("win") would also match other names such as "Darwin".
+    return System.getProperty("os.name", "unknown").toLowerCase(Locale.ROOT).startsWith("windows");
+  }
+
}
diff --git
a/opennlp-tools-models/src/main/java/opennlp/tools/models/dir/DirectoryModelFinder.java
b/opennlp-tools-models/src/main/java/opennlp/tools/models/dir/DirectoryModelFinder.java
new file mode 100644
index 000000000..9a7b3d1d6
--- /dev/null
+++
b/opennlp-tools-models/src/main/java/opennlp/tools/models/dir/DirectoryModelFinder.java
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.models.dir;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URL;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.regex.Pattern;
+import java.util.stream.Stream;
+
+import org.slf4j.LoggerFactory;
+
+import opennlp.tools.models.AbstractClassPathModelFinder;
+import opennlp.tools.models.ClassPathModelFinder;
+
+/**
+ * The {@code DirectoryModelFinder} class is responsible for finding model files
+ * in a given directory.
+ *
+ * <p>The directory is scanned for regular files whose name matches the configured
+ * JAR model prefix. Each matching file is opened as a JAR archive and its entries are
+ * matched against the requested wildcard pattern. The scan can be performed recursively
+ * depending on the specified configuration.
+ *
+ * <p><b>Usage:</b>
+ * <ul>
+ *   <li>Provide the prefix for models to be found in JAR files using the
+ *       {@code jarModelPrefix} parameter.</li>
+ *   <li>Specify the directory to search and whether to enable recursive scanning.</li>
+ * </ul>
+ *
+ * <p>Note: The most recently compiled file pattern is cached in plain instance fields
+ * that are updated without synchronization.
+ *
+ * @see AbstractClassPathModelFinder
+ * @see ClassPathModelFinder
+ */
+public class DirectoryModelFinder extends AbstractClassPathModelFinder implements ClassPathModelFinder {
+
+  private static final org.slf4j.Logger logger = LoggerFactory.getLogger(DirectoryModelFinder.class);
+
+  private final Path directory;
+  private final boolean recursive;
+  private final Pattern jarPattern;
+  // Cache of the last compiled file pattern and the regex string it was built from.
+  private Pattern filePattern;
+  private String prevFilePattern;
+
+  /**
+   * Instantiates a new {@link DirectoryModelFinder} with the specified parameters.
+   *
+   * @param jarModelPrefix The prefix for identifying model files in JAR archives;
+   *                       may be {@code null}. If it is {@code null},
+   *                       {@link ClassPathModelFinder#OPENNLP_MODEL_JAR_PREFIX} is used.
+   * @param directory The root directory to scan from for model files; must not be {@code null}.
+   * @param recursive {@code true} if the search should include subdirectories,
+   *                  {@code false} otherwise.
+   * @throws IllegalArgumentException Thrown if {@code directory} is {@code null}.
+   */
+  public DirectoryModelFinder(String jarModelPrefix, Path directory, boolean recursive) {
+    super(jarModelPrefix == null ? OPENNLP_MODEL_JAR_PREFIX : jarModelPrefix);
+    if (directory == null) {
+      throw new IllegalArgumentException("Given directory must not be NULL");
+    }
+    this.directory = directory;
+    this.recursive = recursive;
+    this.jarPattern = Pattern.compile(asRegex("*" + getJarModelPrefix()));
+  }
+
+  /**
+   * @return Always {@code null} as it is not needed for the directory case.
+   */
+  @Override
+  protected Object getContext() {
+    return null; //not needed for the directory case. Just return NULL.
+  }
+
+  /**
+   * {@inheritDoc}
+   */
+  @Override
+  protected List<URI> getMatchingURIs(String wildcardPattern, Object context) {
+    if (wildcardPattern == null) {
+      return Collections.emptyList();
+    }
+
+    final boolean isWindows = isWindows();
+    final List<URL> cp = getDirectoryContent();
+    final List<URI> cpu = new ArrayList<>();
+    final String filePatternString = asRegex("*" + wildcardPattern);
+    // Re-compile only if the requested pattern differs from the previous request.
+    if (!filePatternString.equals(prevFilePattern)) {
+      this.filePattern = Pattern.compile(filePatternString);
+      this.prevFilePattern = filePatternString;
+    }
+
+    for (URL url : cp) {
+      // Only files matching the JAR model prefix are opened and inspected.
+      if (matchesPattern(url, jarPattern)) {
+        try {
+          for (URI u : getURIsFromJar(url, isWindows)) {
+            if (matchesPattern(u.toURL(), filePattern)) {
+              cpu.add(u);
+            }
+          }
+        } catch (IOException e) {
+          logger.warn("Cannot read content of {}.", url, e);
+        }
+      }
+    }
+
+    return cpu;
+  }
+
+  /**
+   * Collects the {@link URL URLs} of all regular files below the configured directory.
+   * The traversal depth is unlimited if {@code recursive} is set, otherwise only
+   * direct children of the directory are considered.
+   *
+   * @return The files found so far; on an IO error a warning is logged and the
+   *         (possibly incomplete) result is returned.
+   */
+  private List<URL> getDirectoryContent() {
+    final List<URL> fileList = new ArrayList<>();
+    try (Stream<Path> files = Files.walk(directory, recursive ? Integer.MAX_VALUE : 1)) {
+      files.filter(Files::isRegularFile).forEach(path -> {
+        try {
+          fileList.add(path.toUri().toURL());
+        } catch (MalformedURLException ignored) {
+          //if we cannot convert a path to a URL, we could not load it anyway. Skip it.
+        }
+      });
+    } catch (IOException | java.io.UncheckedIOException e) {
+      // Files.walk reports failures that occur during the traversal as UncheckedIOException.
+      logger.warn("Cannot scan directory {}: {}", directory, e.getLocalizedMessage(), e);
+    }
+    return fileList;
+  }
+
+}
diff --git
a/opennlp-tools-models/src/main/java/opennlp/tools/models/simple/SimpleClassPathModelFinder.java
b/opennlp-tools-models/src/main/java/opennlp/tools/models/simple/SimpleClassPathModelFinder.java
index ebc7da9f0..e189cef8d 100644
---
a/opennlp-tools-models/src/main/java/opennlp/tools/models/simple/SimpleClassPathModelFinder.java
+++
b/opennlp-tools-models/src/main/java/opennlp/tools/models/simple/SimpleClassPathModelFinder.java
@@ -19,20 +19,15 @@ package opennlp.tools.models.simple;
import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
-import java.net.JarURLConnection;
import java.net.MalformedURLException;
import java.net.URI;
-import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLClassLoader;
+import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
-import java.util.Enumeration;
import java.util.List;
-import java.util.Locale;
-import java.util.jar.JarEntry;
-import java.util.jar.JarFile;
import java.util.regex.Pattern;
import org.slf4j.Logger;
@@ -45,7 +40,7 @@ import opennlp.tools.models.ClassPathModelFinder;
* Enables the detection of OpenNLP models in the classpath via JDK classes
* By default, this class will search for JAR files starting with
"opennlp-models-*".
* This wildcard pattern can be adjusted by using the alternative constructor
of this class.
- *
+ *
* @implNote
* It is a rather simple implementation of scanning the classpath by trying to
obtain {@link URL urls}
* from the actual classpath via a chain of possible options. It might not
work for every use-case
@@ -68,7 +63,6 @@ import opennlp.tools.models.ClassPathModelFinder;
public class SimpleClassPathModelFinder extends AbstractClassPathModelFinder
implements ClassPathModelFinder {
private static final Logger logger =
LoggerFactory.getLogger(SimpleClassPathModelFinder.class);
- private static final String FILE_PREFIX = "file";
private static final Pattern CLASSPATH_SEPARATOR_PATTERN_WINDOWS =
Pattern.compile(";");
private static final Pattern CLASSPATH_SEPARATOR_PATTERN_UNIX =
Pattern.compile(":");
// ; for Windows, : for Linux/OSX
@@ -131,51 +125,6 @@ public class SimpleClassPathModelFinder extends
AbstractClassPathModelFinder imp
return cpu;
}
- /**
- * Escapes a {@code wildcard} expressions for usage as a Java regular
expression.
- *
- * @param wildcard A valid expression. It must not be {@code null}.
- * @return The escaped regex.
- */
- private String asRegex(String wildcard) {
- return wildcard
- .replace(".", "\\.")
- .replace("*", ".*")
- .replace("?", ".");
- }
-
- private boolean matchesPattern(URL url, Pattern pattern) {
- return pattern.matcher(url.getFile()).matches();
- }
-
- private List<URI> getURIsFromJar(URL fileUrl, boolean isWindows) throws
IOException {
- final List<URI> uris = new ArrayList<>();
- final URL jarUrl = new URL(JAR + ":" +
- (isWindows ? fileUrl.toString().replace("\\", "/")
- : fileUrl.toString()) + "!/");
- final JarURLConnection jarConnection = (JarURLConnection)
jarUrl.openConnection();
- try (JarFile jarFile = jarConnection.getJarFile()) {
- final Enumeration<JarEntry> entries = jarFile.entries();
- while (entries.hasMoreElements()) {
- final JarEntry entry = entries.nextElement();
- if (!entry.isDirectory()) {
- final URL entryUrl = new URL(jarUrl + entry.getName());
- try {
- uris.add(entryUrl.toURI());
- } catch (URISyntaxException ignored) {
- //if we cannot convert to URI here, we ignore that entry.
- }
- }
- }
- }
-
- return uris;
- }
-
- private boolean isWindows() {
- return System.getProperty("os.name",
"unknown").toLowerCase(Locale.ROOT).contains("win");
- }
-
/**
* Attempts to obtain {@link URL URLs} from the classpath in the following
order:
* <p>
@@ -206,12 +155,12 @@ public class SimpleClassPathModelFinder extends
AbstractClassPathModelFinder imp
private List<URL> getClassPathUrlsFromSystemProperty() {
final String cp = System.getProperty("java.class.path", "");
final String[] matches = isWindows()
- ? CLASSPATH_SEPARATOR_PATTERN_WINDOWS.split(cp)
- : CLASSPATH_SEPARATOR_PATTERN_UNIX.split(cp);
+ ? CLASSPATH_SEPARATOR_PATTERN_WINDOWS.split(cp)
+ : CLASSPATH_SEPARATOR_PATTERN_UNIX.split(cp);
final List<URL> jarUrls = new ArrayList<>();
for (String classPath: matches) {
try {
- jarUrls.add(new URL(FILE_PREFIX, "", classPath));
+ jarUrls.add(Path.of(classPath).toUri().toURL());
} catch (MalformedURLException ignored) {
//if we cannot parse a URL from the system property, just ignore it...
//we couldn't load it anyway