janhoy commented on code in PR #2886:
URL: https://github.com/apache/solr/pull/2886#discussion_r1860286943


##########
solr/modules/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java:
##########
@@ -464,6 +466,101 @@ public void testMapIndividual() throws Exception {
     assertTrue(mappedIndividual.containsKey("text2_ru"));
   }
 
+  @Test
+  public void testAllowlistWithFallback() throws Exception {
+    ModifiableSolrParams parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "name,subject");
+    parameters.add("langid.langField", "language_s");
+    parameters.add("langid.allowlist", "no,en ,,,sv"); // Any other language 
will fallback to EN
+    parameters.add("langid.fallback", "en");
+    liProcessor = createLangIdProcessor(parameters);
+
+    // Make sure that empty language codes have been filtered out and others 
trimmed.

Review Comment:
   Should be no need to do an actual detection here (which currently fails 
tests). Suffice to check that the allowList got populated:
   ```suggestion
       assertEquals(Set.of("no", "en", "sv"), liProcessor.langAllowlist);
   ```



##########
solr/modules/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java:
##########
@@ -111,18 +111,21 @@ private void initParams(SolrParams params) {
       overwrite = params.getBool(OVERWRITE, false);
       langAllowlist = new HashSet<>();
       threshold = params.getDouble(THRESHOLD, DOCID_THRESHOLD_DEFAULT);
-      String legacyAllowList = params.get(LANG_WHITELIST, "");
-      if (legacyAllowList.length() > 0) {
+      final String legacyAllowList = params.get(LANG_WHITELIST, "").trim();
+      if (!legacyAllowList.isEmpty()) {
         // nowarn compile time string concatenation
         log.warn(
             LANG_WHITELIST
                 + " parameter is deprecated; use "
                 + LANG_ALLOWLIST
                 + " instead."); // nowarn
       }
-      if (params.get(LANG_ALLOWLIST, legacyAllowList).length() > 0) {
-        for (String lang : params.get(LANG_ALLOWLIST, "").split(",")) {
-          langAllowlist.add(lang);
+      if (!params.get(LANG_ALLOWLIST, legacyAllowList).isEmpty()) {
+        for (String lang : params.get(LANG_ALLOWLIST, 
legacyAllowList).split(",")) {
+          if (lang.trim().isEmpty()) {
+            continue;
+          }
+          langAllowlist.add(lang.trim());

Review Comment:
   An alternative is functional style, but up to you:
   ```java
   params.get(LANG_ALLOWLIST, legacyAllowList)
         .flatMap(line -> Arrays.stream(line.split(",")))
         .map(String::trim)
         .filter(lang -> !lang.isEmpty())
         .forEach(langAllowlist::add);
   ```



##########
solr/modules/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java:
##########
@@ -464,6 +466,101 @@ public void testMapIndividual() throws Exception {
     assertTrue(mappedIndividual.containsKey("text2_ru"));
   }
 
+  @Test
+  public void testAllowlistWithFallback() throws Exception {
+    ModifiableSolrParams parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "name,subject");
+    parameters.add("langid.langField", "language_s");
+    parameters.add("langid.allowlist", "no,en ,,,sv"); // Any other language 
will fallback to EN
+    parameters.add("langid.fallback", "en");
+    liProcessor = createLangIdProcessor(parameters);
+
+    // Make sure that empty language codes have been filtered out and others 
trimmed.
+    assertEquals(new HashSet<>(Arrays.asList("no", "en", "sv")), 
liProcessor.langAllowlist);
+    assertLang(
+        "no",
+        "id",
+        "1no",
+        "name",
+        "Lucene",
+        "subject",
+        "Lucene er et fri/åpen kildekode programvarebibliotek for 
informasjonsgjenfinning, opprinnelig utviklet i programmeringsspråket Java av 
Doug Cutting. Lucene støttes av Apache Software Foundation og utgis under 
Apache-lisensen.");
+    assertLang(
+        "en",
+        "id",
+        "2en",
+        "name",
+        "Lucene",
+        "subject",
+        "Apache Lucene is a free/open source information retrieval software 
library, originally created in Java by Doug Cutting. It is supported by the 
Apache Software Foundation and is released under the Apache Software License.");
+    assertLang(
+        "sv",
+        "id",
+        "3sv",
+        "name",
+        "Maven",
+        "subject",
+        "Apache Maven är ett verktyg utvecklat av Apache Software Foundation 
och används inom systemutveckling av datorprogram i programspråket Java. Maven 
används för att automatiskt paketera (bygga) programfilerna till en 
distribuerbar enhet. Maven används inom samma område som Apache Ant men dess 
byggfiler är deklarativa till skillnad ifrån Ants skriptbaserade.");
+    // Based on our langid.allowlist config,
+    // the Thai document is an unknown language, thus, language detection must 
fall back to EN
+    assertLang(
+        "en",
+        "id",
+        "6th",
+        "name",
+        "บทความคัดสรรเดือนนี้",
+        "subject",
+        "อันเนอลีส มารี อันเนอ ฟรังค์ หรือมักรู้จักในภาษาไทยว่า แอนน์ แฟรงค์ 
เป็นเด็กหญิงชาวยิว เกิดที่เมืองแฟรงก์เฟิร์ต ประเทศเยอรมนี 
เธอมีชื่อเสียงโด่งดังในฐานะผู้เขียนบันทึกประจำวันซึ่งต่อมาได้รับการตีพิมพ์เป็นหนังสือ
 บรรยายเหตุการณ์ขณะหลบซ่อนตัวจากการล่าชาวยิวในประเทศเนเธอร์แลนด์ 
ระหว่างที่ถูกเยอรมนีเข้าครอบครองในช่วงสงครามโลกครั้งที่สอง");
+  }
+
+  @Test
+  public void testAllowlistBackwardsCompatabilityWithLegacyAllowlist() throws 
Exception {
+    // The "legacy allowlist" is "langid.whitelist"
+    ModifiableSolrParams parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "name,subject");
+    parameters.add("langid.langField", "language_s");
+    parameters.add("langid.whitelist", "no,en ,,,sv"); // Any other language 
will fallback to EN
+    parameters.add("langid.fallback", "en");
+    liProcessor = createLangIdProcessor(parameters);
+
+    // Make sure that empty language codes have been filtered out and others 
trimmed.

Review Comment:
   Same is true here, just test that the allowList got populated:
   ```suggestion
       assertEquals(Set.of("no", "en", "sv"), liProcessor.langAllowlist);
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@solr.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@solr.apache.org
For additional commands, e-mail: issues-h...@solr.apache.org

Reply via email to