This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch 2.x-OPENNLP-1829-Transfer-DirectoryModelFinder-to-OpenNLP-core in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit 7ee2ba1be4efe458ad00ca392e34687bdcf493b4 Author: Martin Wiesner <[email protected]> AuthorDate: Mon May 18 09:29:12 2026 +0200 OPENNLP-1829: Transfer DirectoryModelFinder to OpenNLP core --- .../tools/models/AbstractClassPathModelFinder.java | 70 ++++++++++ .../tools/models/dir/DirectoryModelFinder.java | 145 +++++++++++++++++++++ .../models/simple/SimpleClassPathModelFinder.java | 61 +-------- 3 files changed, 220 insertions(+), 56 deletions(-) diff --git a/opennlp-tools-models/src/main/java/opennlp/tools/models/AbstractClassPathModelFinder.java b/opennlp-tools-models/src/main/java/opennlp/tools/models/AbstractClassPathModelFinder.java index b31128dd8..2050892c8 100644 --- a/opennlp-tools-models/src/main/java/opennlp/tools/models/AbstractClassPathModelFinder.java +++ b/opennlp-tools-models/src/main/java/opennlp/tools/models/AbstractClassPathModelFinder.java @@ -16,12 +16,22 @@ */ package opennlp.tools.models; +import java.io.IOException; +import java.net.JarURLConnection; import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Enumeration; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Objects; import java.util.Optional; import java.util.Set; +import java.util.jar.JarEntry; +import java.util.jar.JarFile; +import java.util.regex.Pattern; /** * A base implementation of a {@link ClassPathModelFinder} for the detection of @@ -126,4 +136,64 @@ public abstract class AbstractClassPathModelFinder implements ClassPathModelFind return jarModelPrefix; } + /** + * Escapes a {@code wildcard} expressions for usage as a Java regular expression. + * + * @param wildcard A valid expression. It must not be {@code null}. + * @return The escaped regex. + */ + protected String asRegex(String wildcard) { + return wildcard + .replace(".", "\\.") + .replace("*", ".*") + .replace("?", "."); + } + + protected boolean matchesPattern(URL url, Pattern pattern) { + return pattern.matcher(url.getFile()).matches(); + } + + /** + * Converts a {@code location} in String form to a {@link URL}. + * + * @param location The resource path and/or reference. + * @return The converted {@link URL} form. + * @throws IOException Thrown if IO errors occurred during conversion + */ + protected static URL toURL(String location) throws IOException { + try { + return new URI(location).toURL(); + } catch (URISyntaxException e) { + throw new IOException(e); + } + } + + protected List<URI> getURIsFromJar(URL fileUrl, boolean isWindows) throws IOException { + final List<URI> uris = new ArrayList<>(); + final String location = JAR + ":" + + (isWindows ? fileUrl.toString().replace("\\", "/") + : fileUrl.toString()) + "!/"; + final URL jarUrl = toURL(location); + final JarURLConnection jarConnection = (JarURLConnection) jarUrl.openConnection(); + try (JarFile jarFile = jarConnection.getJarFile()) { + final Enumeration<JarEntry> entries = jarFile.entries(); + while (entries.hasMoreElements()) { + final JarEntry entry = entries.nextElement(); + if (!entry.isDirectory()) { + try { + uris.add(new URI(jarUrl + entry.getName())); + } catch (URISyntaxException ignored) { + //if we cannot convert to URI here, we ignore that entry. + } + } + } + } + + return uris; + } + + protected boolean isWindows() { + return System.getProperty("os.name", "unknown").toLowerCase(Locale.ROOT).contains("win"); + } + } diff --git a/opennlp-tools-models/src/main/java/opennlp/tools/models/dir/DirectoryModelFinder.java b/opennlp-tools-models/src/main/java/opennlp/tools/models/dir/DirectoryModelFinder.java new file mode 100644 index 000000000..9a7b3d1d6 --- /dev/null +++ b/opennlp-tools-models/src/main/java/opennlp/tools/models/dir/DirectoryModelFinder.java @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.models.dir; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URL; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.regex.Pattern; +import java.util.stream.Stream; + +import org.slf4j.LoggerFactory; + +import opennlp.tools.models.AbstractClassPathModelFinder; +import opennlp.tools.models.ClassPathModelFinder; + +/** + * The {@code DirectoryModelFinder} class is responsible for finding model files in a given directory + * on the classpath. + * + * <p>This class allows searching for models based on wildcard patterns, either in plain directory structures + * or within JAR files. The search can be performed recursively depending on the specified configuration. + * + * <p><b>Usage:</b> + * <ul> + * <li>Provide the prefix for models to be found in JAR files using the {@code jarModelPrefix} + * parameter.</li> + * <li>Specify the directory to search and whether to enable recursive scanning.</li> + * <li>The class supports resolving both direct file matches and entries within JAR archives.</li> + * </ul> + * + * @see AbstractClassPathModelFinder + * @see ClassPathModelFinder + */ +public class DirectoryModelFinder extends AbstractClassPathModelFinder implements ClassPathModelFinder { + + private static final org.slf4j.Logger logger = LoggerFactory.getLogger(DirectoryModelFinder.class); + + private final Path directory; + private final boolean recursive; + private final Pattern jarPattern; + private Pattern filePattern; + private String prevFilePattern; + + /** + * Instantiates a new {@link DirectoryModelFinder} with the specified parameters. + * + * @param jarModelPrefix The prefix for identifying model files in JAR archives; may be {@code null}. + * If it is {@code null}, {@link ClassPathModelFinder#OPENNLP_MODEL_JAR_PREFIX} + * is used. + * @param directory The root directory to scan from for model files; must not be {@code null}. + * @param recursive {@code true} if the search should include subdirectories, {@code false} otherwise. + * @throws IllegalArgumentException Thrown if {@code directory} is {@code null}. + */ + public DirectoryModelFinder(String jarModelPrefix, Path directory, boolean recursive) { + super(jarModelPrefix == null ? OPENNLP_MODEL_JAR_PREFIX : jarModelPrefix); + if (directory == null) { + throw new IllegalArgumentException("Given directory must not be NULL"); + } + this.directory = directory; + this.recursive = recursive; + this.jarPattern = Pattern.compile(asRegex("*" + getJarModelPrefix())); + } + + /** + * @return Always {@code null} as it is not needed for the directory case. + */ + @Override + protected Object getContext() { + return null; //not needed for the simple case. Just return NULL. + } + + /** + * {@inheritDoc} + */ + @Override + protected List<URI> getMatchingURIs(String wildcardPattern, Object context) { + if (wildcardPattern == null) { + return Collections.emptyList(); + } + + final boolean isWindows = isWindows(); + final List<URL> cp = getDirectoryContent(); + final List<URI> cpu = new ArrayList<>(); + final String filePatternString = asRegex("*" + wildcardPattern); + if (!filePatternString.equals(prevFilePattern)) { + this.filePattern = Pattern.compile(filePatternString); + this.prevFilePattern = filePatternString; + } + + for (URL url : cp) { + if (matchesPattern(url, jarPattern)) { + try { + for (URI u : getURIsFromJar(url, isWindows)) { + if (matchesPattern(u.toURL(), filePattern)) { + cpu.add(u); + } + } + } catch (IOException e) { + logger.warn("Cannot read content of {}.", url, e); + } + } + } + + return cpu; + } + + private List<URL> getDirectoryContent() { + final List<URL> fileList = new ArrayList<>(); + try (Stream<Path> files = Files.walk(directory, recursive ? Integer.MAX_VALUE : 1)) { + files.filter(Files::isRegularFile).forEach(path -> { + try { + fileList.add(path.toUri().toURL()); + } catch (MalformedURLException ignored) { + + } + }); + } catch (IOException e) { + logger.warn(e.getLocalizedMessage(), e); + } + return fileList; + } + + + +} diff --git a/opennlp-tools-models/src/main/java/opennlp/tools/models/simple/SimpleClassPathModelFinder.java b/opennlp-tools-models/src/main/java/opennlp/tools/models/simple/SimpleClassPathModelFinder.java index ebc7da9f0..e189cef8d 100644 --- a/opennlp-tools-models/src/main/java/opennlp/tools/models/simple/SimpleClassPathModelFinder.java +++ b/opennlp-tools-models/src/main/java/opennlp/tools/models/simple/SimpleClassPathModelFinder.java @@ -19,20 +19,15 @@ package opennlp.tools.models.simple; import java.io.IOException; import java.lang.reflect.Field; import java.lang.reflect.Method; -import java.net.JarURLConnection; import java.net.MalformedURLException; import java.net.URI; -import java.net.URISyntaxException; import java.net.URL; import java.net.URLClassLoader; +import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; -import java.util.Enumeration; import java.util.List; -import java.util.Locale; -import java.util.jar.JarEntry; -import java.util.jar.JarFile; import java.util.regex.Pattern; import org.slf4j.Logger; @@ -45,7 +40,7 @@ import opennlp.tools.models.ClassPathModelFinder; * Enables the detection of OpenNLP models in the classpath via JDK classes * By default, this class will search for JAR files starting with "opennlp-models-*". * This wildcard pattern can be adjusted by using the alternative constructor of this class. - * + * * @implNote * It is a rather simple implementation of scanning the classpath by trying to obtain {@link URL urls} * from the actual classpath via a chain of possible options. It might not work for every use-case @@ -68,7 +63,6 @@ import opennlp.tools.models.ClassPathModelFinder; public class SimpleClassPathModelFinder extends AbstractClassPathModelFinder implements ClassPathModelFinder { private static final Logger logger = LoggerFactory.getLogger(SimpleClassPathModelFinder.class); - private static final String FILE_PREFIX = "file"; private static final Pattern CLASSPATH_SEPARATOR_PATTERN_WINDOWS = Pattern.compile(";"); private static final Pattern CLASSPATH_SEPARATOR_PATTERN_UNIX = Pattern.compile(":"); // ; for Windows, : for Linux/OSX @@ -131,51 +125,6 @@ public class SimpleClassPathModelFinder extends AbstractClassPathModelFinder imp return cpu; } - /** - * Escapes a {@code wildcard} expressions for usage as a Java regular expression. - * - * @param wildcard A valid expression. It must not be {@code null}. - * @return The escaped regex. - */ - private String asRegex(String wildcard) { - return wildcard - .replace(".", "\\.") - .replace("*", ".*") - .replace("?", "."); - } - - private boolean matchesPattern(URL url, Pattern pattern) { - return pattern.matcher(url.getFile()).matches(); - } - - private List<URI> getURIsFromJar(URL fileUrl, boolean isWindows) throws IOException { - final List<URI> uris = new ArrayList<>(); - final URL jarUrl = new URL(JAR + ":" + - (isWindows ? fileUrl.toString().replace("\\", "/") - : fileUrl.toString()) + "!/"); - final JarURLConnection jarConnection = (JarURLConnection) jarUrl.openConnection(); - try (JarFile jarFile = jarConnection.getJarFile()) { - final Enumeration<JarEntry> entries = jarFile.entries(); - while (entries.hasMoreElements()) { - final JarEntry entry = entries.nextElement(); - if (!entry.isDirectory()) { - final URL entryUrl = new URL(jarUrl + entry.getName()); - try { - uris.add(entryUrl.toURI()); - } catch (URISyntaxException ignored) { - //if we cannot convert to URI here, we ignore that entry. - } - } - } - } - - return uris; - } - - private boolean isWindows() { - return System.getProperty("os.name", "unknown").toLowerCase(Locale.ROOT).contains("win"); - } - /** * Attempts to obtain {@link URL URLs} from the classpath in the following order: * <p> @@ -206,12 +155,12 @@ public class SimpleClassPathModelFinder extends AbstractClassPathModelFinder imp private List<URL> getClassPathUrlsFromSystemProperty() { final String cp = System.getProperty("java.class.path", ""); final String[] matches = isWindows() - ? CLASSPATH_SEPARATOR_PATTERN_WINDOWS.split(cp) - : CLASSPATH_SEPARATOR_PATTERN_UNIX.split(cp); + ? CLASSPATH_SEPARATOR_PATTERN_WINDOWS.split(cp) + : CLASSPATH_SEPARATOR_PATTERN_UNIX.split(cp); final List<URL> jarUrls = new ArrayList<>(); for (String classPath: matches) { try { - jarUrls.add(new URL(FILE_PREFIX, "", classPath)); + jarUrls.add(Path.of(classPath).toUri().toURL()); } catch (MalformedURLException ignored) { //if we cannot parse a URL from the system property, just ignore it... //we couldn't load it anyway
