This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/main by this push:
     new 0e194a33 OPENNLP-991: Validate all passed in language codes (#989)
0e194a33 is described below

commit 0e194a337b4fd976bf339eff5deac30be900445c
Author: Richard Zowalla <[email protected]>
AuthorDate: Sun Mar 22 22:59:49 2026 +0100

    OPENNLP-991: Validate all passed in language codes (#989)
---
 .../opennlp/tools/util/LanguageCodeValidator.java  | 121 ++++++++++++++++++++
 .../java/opennlp/tools/cmdline/CmdLineUtil.java    |  15 +--
 .../java/opennlp/tools/util/model/BaseModel.java   |   2 +
 .../tools/util/LanguageCodeValidatorTest.java      | 122 +++++++++++++++++++++
 4 files changed, 250 insertions(+), 10 deletions(-)

diff --git a/opennlp-api/src/main/java/opennlp/tools/util/LanguageCodeValidator.java b/opennlp-api/src/main/java/opennlp/tools/util/LanguageCodeValidator.java
new file mode 100644
index 00000000..8c8ead1d
--- /dev/null
+++ b/opennlp-api/src/main/java/opennlp/tools/util/LanguageCodeValidator.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util;
+
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.MissingResourceException;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+/**
+ * Validates language codes against ISO 639 standards.
+ * <p>
+ * Accepts:
+ * <ul>
+ *   <li>ISO 639-1 two-letter language codes
+ *       (e.g., {@code "en"}, {@code "de"})</li>
+ *   <li>ISO 639-2/3 three-letter language codes
+ *       (e.g., {@code "eng"}, {@code "deu"})</li>
+ *   <li>The special code {@code "x-unspecified"}
+ *       used internally by OpenNLP</li>
+ * </ul>
+ * <p>
+ * Valid codes are derived from {@link Locale#availableLocales()}
+ * plus additional ISO 639-2 bibliographic codes and
+ * {@code "und"} (undetermined).
+ *
+ * @see <a href="https://iso639-3.sil.org/">ISO 639-3</a>
+ */
+public final class LanguageCodeValidator {
+
+  private static final String X_UNSPECIFIED = "x-unspecified";
+
+  /**
+   * Every accepted language code. Built once in the static initializer
+   * and immutable afterwards.
+   */
+  private static final Set<String> VALID_CODES;
+
+  static {
+    final Set<String> codes = Locale.availableLocales()
+        .flatMap(loc -> {
+          final Set<String> perLocale = new HashSet<>();
+          final String lang = loc.getLanguage();
+          if (!lang.isEmpty()) {
+            perLocale.add(lang);
+          }
+          try {
+            final String iso3 = loc.getISO3Language();
+            if (!iso3.isEmpty()) {
+              perLocale.add(iso3);
+            }
+          } catch (MissingResourceException ignored) {
+            // No three-letter equivalent is available for this locale;
+            // the code returned by getLanguage() (if any) is already recorded.
+          }
+          return perLocale.stream();
+        })
+        .collect(Collectors.toCollection(HashSet::new));
+
+    // ISO 639-2 bibliographic codes not returned by Locale
+    codes.add("dut"); // Dutch (bibliographic)
+    codes.add("fre"); // French (bibliographic)
+    codes.add("ger"); // German (bibliographic)
+
+    // ISO 639-3 special code for undetermined language
+    codes.add("und");
+
+    // OpenNLP-specific special code
+    codes.add(X_UNSPECIFIED);
+
+    // Freeze the lookup table so it cannot be altered after class
+    // initialization. Set.copyOf rejects null look-ups, which is safe
+    // because isValid(String) guards against null before consulting it.
+    VALID_CODES = Set.copyOf(codes);
+  }
+
+  private LanguageCodeValidator() {
+    // utility class, not intended to be instantiated
+  }
+
+  /**
+   * Checks whether the given language code is a valid ISO 639 code.
+   * <p>
+   * The lookup is an exact, case-sensitive match, so {@code "en"} and
+   * {@code "EN"} are different inputs.
+   *
+   * @param languageCode The language code to check.
+   *     Must not be {@code null}.
+   * @return {@code true} if the code is valid,
+   *     {@code false} otherwise.
+   * @throws IllegalArgumentException if {@code languageCode}
+   *     is {@code null}.
+   */
+  public static boolean isValid(String languageCode) {
+    if (languageCode == null) {
+      throw new IllegalArgumentException(
+          "languageCode must not be null");
+    }
+    return VALID_CODES.contains(languageCode);
+  }
+
+  /**
+   * Validates the given language code and throws an
+   * {@link IllegalArgumentException} if it is not a recognized
+   * ISO 639 language code.
+   *
+   * @param languageCode The language code to validate.
+   *     Must not be {@code null}.
+   * @throws IllegalArgumentException if the code is not a valid
+   *     ISO 639 language code or is {@code null}.
+   */
+  public static void validateLanguageCode(String languageCode) {
+    if (!isValid(languageCode)) {
+      throw new IllegalArgumentException(
+          "Unknown language code '" + languageCode
+              + "', must be a valid ISO 639 code!");
+    }
+  }
+}
diff --git a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java
index 0c774ff8..10c1127b 100644
--- a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java
+++ b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java
@@ -26,10 +26,6 @@ import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Locale;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -37,6 +33,7 @@ import org.slf4j.LoggerFactory;
 import opennlp.tools.commons.Internal;
 import opennlp.tools.ml.TrainerFactory;
 import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.LanguageCodeValidator;
 import opennlp.tools.util.MarkableFileInputStreamFactory;
 import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.util.model.BaseModel;
@@ -278,12 +275,10 @@ public final class CmdLineUtil {
   }
 
   public static void checkLanguageCode(String code) {
-    List<String> languageCodes = new 
ArrayList<>(Arrays.asList(Locale.getISOLanguages()));
-    languageCodes.add("x-unspecified");
-
-    if (!languageCodes.contains(code)) {
-      throw new TerminateToolException(1, "Unknown language code " + code + ", 
" +
-          "must be an ISO 639 code!");
+    if (!LanguageCodeValidator.isValid(code)) {
+      throw new TerminateToolException(1,
+          "Unknown language code " + code
+              + ", must be an ISO 639 code!");
     }
   }
 
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/model/BaseModel.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/model/BaseModel.java
index 7716bd0d..0d3171b3 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/model/BaseModel.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/model/BaseModel.java
@@ -43,6 +43,7 @@ import java.util.zip.ZipOutputStream;
 
 import opennlp.tools.util.BaseToolFactory;
 import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.LanguageCodeValidator;
 import opennlp.tools.util.Version;
 import opennlp.tools.util.ext.ExtensionLoader;
 
@@ -108,6 +109,7 @@ public abstract class BaseModel implements ArtifactProvider, Serializable {
     this(componentName, false);
 
     Objects.requireNonNull(languageCode, "languageCode must not be null");
+    LanguageCodeValidator.validateLanguageCode(languageCode);
 
     createBaseArtifactSerializers(artifactSerializers);
 
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/LanguageCodeValidatorTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/LanguageCodeValidatorTest.java
new file mode 100644
index 00000000..6dabe46e
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/LanguageCodeValidatorTest.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests for {@link LanguageCodeValidator}: accepted two- and three-letter
+ * codes, the special codes, rejected inputs, and the {@code null} contract
+ * of both public methods.
+ */
+class LanguageCodeValidatorTest {
+
+  @Test
+  void testValidIso639_1Codes() {
+    assertAllValid("en", "de", "fr", "es", "pt", "it", "nl", "th", "ja", "pl");
+  }
+
+  @Test
+  void testValidIso639_3TerminologicalCodes() {
+    assertAllValid("eng", "deu", "fra", "spa", "por", "ita", "nld", "tha", "jpn", "pol");
+  }
+
+  @Test
+  void testValidIso639_2BibliographicCodes() {
+    assertAllValid("dut", "fre", "ger");
+  }
+
+  @Test
+  void testUndeterminedCode() {
+    assertAllValid("und");
+  }
+
+  @Test
+  void testSpecialCodes() {
+    assertAllValid("x-unspecified");
+  }
+
+  @Test
+  void testInvalidCodes() {
+    // Empty, wrong case, wrong length, digits and region-qualified tags
+    assertAllInvalid("", "xyz123", "invalid", "EN", "ENG", "123", "e", "en-US", "abcd");
+  }
+
+  @Test
+  void testInvalidTwoLetterCode() {
+    assertAllInvalid("xx", "zz");
+  }
+
+  @Test
+  void testInvalidThreeLetterCode() {
+    assertAllInvalid("xyz", "abc", "zzz");
+  }
+
+  @Test
+  void testNullCode() {
+    Assertions.assertThrows(IllegalArgumentException.class,
+        () -> LanguageCodeValidator.isValid(null));
+  }
+
+  @Test
+  void testValidateThrowsForNullCode() {
+    Assertions.assertThrows(IllegalArgumentException.class,
+        () -> LanguageCodeValidator.validateLanguageCode(null));
+  }
+
+  @Test
+  void testValidateThrowsForInvalidCode() {
+    IllegalArgumentException ex = Assertions.assertThrows(
+        IllegalArgumentException.class,
+        () -> LanguageCodeValidator.validateLanguageCode("invalid_code"));
+    Assertions.assertTrue(ex.getMessage().contains("invalid_code"));
+  }
+
+  @Test
+  void testValidatePassesForValidCode() {
+    for (String code : new String[] {"en", "eng", "dut", "und", "x-unspecified"}) {
+      Assertions.assertDoesNotThrow(
+          () -> LanguageCodeValidator.validateLanguageCode(code),
+          "Expected '" + code + "' to pass validation");
+    }
+  }
+
+  /** Asserts that every given code is accepted, naming the offender on failure. */
+  private static void assertAllValid(String... codes) {
+    for (String code : codes) {
+      Assertions.assertTrue(LanguageCodeValidator.isValid(code),
+          "Expected '" + code + "' to be accepted");
+    }
+  }
+
+  /** Asserts that every given code is rejected, naming the offender on failure. */
+  private static void assertAllInvalid(String... codes) {
+    for (String code : codes) {
+      Assertions.assertFalse(LanguageCodeValidator.isValid(code),
+          "Expected '" + code + "' to be rejected");
+    }
+  }
+}

Reply via email to