This is an automated email from the ASF dual-hosted git repository.

rzo1 pushed a commit to branch OPENNLP-991
in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit a743012d935955cc416d3df193e9a24981a2057a
Author: Richard Zowalla <[email protected]>
AuthorDate: Sun Mar 22 21:20:53 2026 +0100

    OPENNLP-991 - Validate all passed in language codes
---
 .../opennlp/tools/util/LanguageCodeValidator.java  | 103 ++++++++++++++++++
 .../java/opennlp/tools/cmdline/CmdLineUtil.java    |  11 +-
 .../java/opennlp/tools/util/model/BaseModel.java   |   2 +
 .../tools/util/LanguageCodeValidatorTest.java      | 115 +++++++++++++++++++++
 4 files changed, 222 insertions(+), 9 deletions(-)

diff --git a/opennlp-api/src/main/java/opennlp/tools/util/LanguageCodeValidator.java b/opennlp-api/src/main/java/opennlp/tools/util/LanguageCodeValidator.java
new file mode 100644
index 00000000..c204afa3
--- /dev/null
+++ b/opennlp-api/src/main/java/opennlp/tools/util/LanguageCodeValidator.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Set;
+
+/**
+ * Validates language codes against ISO 639 standards.
+ * <p>
+ * Accepts:
+ * <ul>
+ *   <li>ISO 639-1 two-letter language codes (e.g., {@code "en"}, {@code "de"})</li>
+ *   <li>ISO 639-2/3 three-letter language codes consisting of lowercase ASCII letters
+ *       (e.g., {@code "eng"}, {@code "deu"}, {@code "dut"}, {@code "und"})</li>
+ *   <li>The special code {@code "x-unspecified"} used internally by OpenNLP</li>
+ * </ul>
+ *
+ * @see <a href="https://iso639-3.sil.org/">ISO 639-3</a>
+ */
+public final class LanguageCodeValidator {
+
+  private static final String X_UNSPECIFIED = "x-unspecified";
+
+  private static final Set<String> ISO639_1_CODES =
+      new HashSet<>(Arrays.asList(Locale.getISOLanguages()));
+
+  private LanguageCodeValidator() {
+    // utility class, not intended to be instantiated
+  }
+
+  /**
+   * Checks whether the given language code is a valid ISO 639 code.
+   * <p>
+   * Two-letter codes are validated against the known set of ISO 639-1 codes.
+   * Three-letter codes are accepted if they consist entirely of lowercase ASCII letters,
+   * covering ISO 639-2 (both bibliographic and terminological) and ISO 639-3 codes.
+   *
+   * @param languageCode The language code to check. Must not be {@code null}.
+   * @return {@code true} if the code is valid, {@code false} otherwise.
+   * @throws NullPointerException if {@code languageCode} is {@code null}.
+   */
+  public static boolean isValid(String languageCode) {
+    if (languageCode == null) {
+      throw new NullPointerException("languageCode must not be null");
+    }
+
+    if (X_UNSPECIFIED.equals(languageCode)) {
+      return true;
+    }
+
+    int len = languageCode.length();
+    if (len == 2) {
+      return ISO639_1_CODES.contains(languageCode);
+    }
+    if (len == 3) {
+      return isLowerAsciiAlpha(languageCode);
+    }
+    return false;
+  }
+
+  /**
+   * Validates the given language code and throws an {@link IllegalArgumentException}
+   * if it is not a recognized ISO 639 language code.
+   *
+   * @param languageCode The language code to validate. Must not be {@code null}.
+   * @throws IllegalArgumentException if the code is not a valid ISO 639 language code.
+   * @throws NullPointerException if {@code languageCode} is {@code null}.
+   */
+  public static void validateLanguageCode(String languageCode) {
+    if (!isValid(languageCode)) {
+      throw new IllegalArgumentException(
+          "Unknown language code '" + languageCode + "', must be a valid ISO 639 code!");
+    }
+  }
+
+  private static boolean isLowerAsciiAlpha(String s) {
+    for (int i = 0; i < s.length(); i++) {
+      char c = s.charAt(i);
+      if (c < 'a' || c > 'z') {
+        return false;
+      }
+    }
+    return true;
+  }
+}
diff --git a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java
index 0c774ff8..3c0b90ea 100644
--- a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java
+++ b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java
@@ -26,17 +26,13 @@ import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Locale;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import opennlp.tools.commons.Internal;
 import opennlp.tools.ml.TrainerFactory;
 import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.LanguageCodeValidator;
 import opennlp.tools.util.MarkableFileInputStreamFactory;
 import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.util.model.BaseModel;
@@ -278,10 +274,7 @@ public final class CmdLineUtil {
   }
 
   public static void checkLanguageCode(String code) {
-    List<String> languageCodes = new ArrayList<>(Arrays.asList(Locale.getISOLanguages()));
-    languageCodes.add("x-unspecified");
-
-    if (!languageCodes.contains(code)) {
+    if (!LanguageCodeValidator.isValid(code)) {
       throw new TerminateToolException(1, "Unknown language code " + code + ", " +
           "must be an ISO 639 code!");
     }
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/model/BaseModel.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/model/BaseModel.java
index 7716bd0d..0d3171b3 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/model/BaseModel.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/model/BaseModel.java
@@ -43,6 +43,7 @@ import java.util.zip.ZipOutputStream;
 
 import opennlp.tools.util.BaseToolFactory;
 import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.LanguageCodeValidator;
 import opennlp.tools.util.Version;
 import opennlp.tools.util.ext.ExtensionLoader;
 
@@ -108,6 +109,7 @@ public abstract class BaseModel implements ArtifactProvider, Serializable {
     this(componentName, false);
 
     Objects.requireNonNull(languageCode, "languageCode must not be null");
+    LanguageCodeValidator.validateLanguageCode(languageCode);
 
     createBaseArtifactSerializers(artifactSerializers);
 
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/LanguageCodeValidatorTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/LanguageCodeValidatorTest.java
new file mode 100644
index 00000000..441285e4
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/LanguageCodeValidatorTest.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+class LanguageCodeValidatorTest {
+
+  @Test
+  void testValidIso639_1Codes() {
+    Assertions.assertTrue(LanguageCodeValidator.isValid("en"));
+    Assertions.assertTrue(LanguageCodeValidator.isValid("de"));
+    Assertions.assertTrue(LanguageCodeValidator.isValid("fr"));
+    Assertions.assertTrue(LanguageCodeValidator.isValid("es"));
+    Assertions.assertTrue(LanguageCodeValidator.isValid("pt"));
+    Assertions.assertTrue(LanguageCodeValidator.isValid("it"));
+    Assertions.assertTrue(LanguageCodeValidator.isValid("nl"));
+    Assertions.assertTrue(LanguageCodeValidator.isValid("th"));
+    Assertions.assertTrue(LanguageCodeValidator.isValid("ja"));
+    Assertions.assertTrue(LanguageCodeValidator.isValid("pl"));
+  }
+
+  @Test
+  void testValidIso639_3TerminologicalCodes() {
+    Assertions.assertTrue(LanguageCodeValidator.isValid("eng"));
+    Assertions.assertTrue(LanguageCodeValidator.isValid("deu"));
+    Assertions.assertTrue(LanguageCodeValidator.isValid("fra"));
+    Assertions.assertTrue(LanguageCodeValidator.isValid("spa"));
+    Assertions.assertTrue(LanguageCodeValidator.isValid("por"));
+    Assertions.assertTrue(LanguageCodeValidator.isValid("ita"));
+    Assertions.assertTrue(LanguageCodeValidator.isValid("nld"));
+    Assertions.assertTrue(LanguageCodeValidator.isValid("tha"));
+    Assertions.assertTrue(LanguageCodeValidator.isValid("jpn"));
+    Assertions.assertTrue(LanguageCodeValidator.isValid("pol"));
+  }
+
+  @Test
+  void testValidIso639_2BibliographicCodes() {
+    Assertions.assertTrue(LanguageCodeValidator.isValid("dut"));
+    Assertions.assertTrue(LanguageCodeValidator.isValid("fre"));
+    Assertions.assertTrue(LanguageCodeValidator.isValid("ger"));
+  }
+
+  @Test
+  void testUndeterminedCode() {
+    Assertions.assertTrue(LanguageCodeValidator.isValid("und"));
+  }
+
+  @Test
+  void testSpecialCodes() {
+    Assertions.assertTrue(LanguageCodeValidator.isValid("x-unspecified"));
+  }
+
+  @Test
+  void testInvalidCodes() {
+    Assertions.assertFalse(LanguageCodeValidator.isValid(""));
+    Assertions.assertFalse(LanguageCodeValidator.isValid("xyz123"));
+    Assertions.assertFalse(LanguageCodeValidator.isValid("invalid"));
+    Assertions.assertFalse(LanguageCodeValidator.isValid("EN"));
+    Assertions.assertFalse(LanguageCodeValidator.isValid("ENG"));
+    Assertions.assertFalse(LanguageCodeValidator.isValid("123"));
+    Assertions.assertFalse(LanguageCodeValidator.isValid("e"));
+    Assertions.assertFalse(LanguageCodeValidator.isValid("en-US"));
+    Assertions.assertFalse(LanguageCodeValidator.isValid("abcd"));
+  }
+
+  @Test
+  void testInvalidTwoLetterCode() {
+    Assertions.assertFalse(LanguageCodeValidator.isValid("xx"));
+    Assertions.assertFalse(LanguageCodeValidator.isValid("zz"));
+  }
+
+  @Test
+  void testNullCode() {
+    Assertions.assertThrows(NullPointerException.class,
+        () -> LanguageCodeValidator.isValid(null));
+  }
+
+  @Test
+  void testValidateThrowsForInvalidCode() {
+    IllegalArgumentException ex = Assertions.assertThrows(IllegalArgumentException.class,
+        () -> LanguageCodeValidator.validateLanguageCode("invalid_code"));
+    Assertions.assertTrue(ex.getMessage().contains("invalid_code"));
+  }
+
+  @Test
+  void testValidatePassesForValidCode() {
+    Assertions.assertDoesNotThrow(
+        () -> LanguageCodeValidator.validateLanguageCode("en"));
+    Assertions.assertDoesNotThrow(
+        () -> LanguageCodeValidator.validateLanguageCode("eng"));
+    Assertions.assertDoesNotThrow(
+        () -> LanguageCodeValidator.validateLanguageCode("dut"));
+    Assertions.assertDoesNotThrow(
+        () -> LanguageCodeValidator.validateLanguageCode("und"));
+    Assertions.assertDoesNotThrow(
+        () -> LanguageCodeValidator.validateLanguageCode("x-unspecified"));
+  }
+}
