janhoy commented on code in PR #2886: URL: https://github.com/apache/solr/pull/2886#discussion_r1860286943
########## solr/modules/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java: ########## @@ -464,6 +466,101 @@ public void testMapIndividual() throws Exception { assertTrue(mappedIndividual.containsKey("text2_ru")); } + @Test + public void testAllowlistWithFallback() throws Exception { + ModifiableSolrParams parameters = new ModifiableSolrParams(); + parameters.add("langid.fl", "name,subject"); + parameters.add("langid.langField", "language_s"); + parameters.add("langid.allowlist", "no,en ,,,sv"); // Any other language will fallback to EN + parameters.add("langid.fallback", "en"); + liProcessor = createLangIdProcessor(parameters); + + // Make sure that empty language codes have been filtered out and others trimmed. Review Comment: Should be no need to do an actual detection here (which currently fails tests). Suffice to check that the allowList got populated: ```suggestion assertEquals(Set.of("no", "en", "sv"), liProcessor.langAllowlist); ``` ########## solr/modules/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java: ########## @@ -111,18 +111,21 @@ private void initParams(SolrParams params) { overwrite = params.getBool(OVERWRITE, false); langAllowlist = new HashSet<>(); threshold = params.getDouble(THRESHOLD, DOCID_THRESHOLD_DEFAULT); - String legacyAllowList = params.get(LANG_WHITELIST, ""); - if (legacyAllowList.length() > 0) { + final String legacyAllowList = params.get(LANG_WHITELIST, "").trim(); + if (!legacyAllowList.isEmpty()) { // nowarn compile time string concatenation log.warn( LANG_WHITELIST + " parameter is deprecated; use " + LANG_ALLOWLIST + " instead."); // nowarn } - if (params.get(LANG_ALLOWLIST, legacyAllowList).length() > 0) { - for (String lang : params.get(LANG_ALLOWLIST, "").split(",")) { - langAllowlist.add(lang); + if (!params.get(LANG_ALLOWLIST, legacyAllowList).isEmpty()) { + for (String lang : params.get(LANG_ALLOWLIST, legacyAllowList).split(",")) { + if (lang.trim().isEmpty()) { + continue; + } + langAllowlist.add(lang.trim()); Review Comment: An alternative is functional style, but up to you: ```java params.get(LANG_ALLOWLIST, legacyAllowList) .flatMap(line -> Arrays.stream(line.split(","))) .map(String::trim) .filter(lang -> !lang.isEmpty()) .forEach(langAllowlist::add); ``` ########## solr/modules/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java: ########## @@ -464,6 +466,101 @@ public void testMapIndividual() throws Exception { assertTrue(mappedIndividual.containsKey("text2_ru")); } + @Test + public void testAllowlistWithFallback() throws Exception { + ModifiableSolrParams parameters = new ModifiableSolrParams(); + parameters.add("langid.fl", "name,subject"); + parameters.add("langid.langField", "language_s"); + parameters.add("langid.allowlist", "no,en ,,,sv"); // Any other language will fallback to EN + parameters.add("langid.fallback", "en"); + liProcessor = createLangIdProcessor(parameters); + + // Make sure that empty language codes have been filtered out and others trimmed. + assertEquals(new HashSet<>(Arrays.asList("no", "en", "sv")), liProcessor.langAllowlist); + assertLang( + "no", + "id", + "1no", + "name", + "Lucene", + "subject", + "Lucene er et fri/åpen kildekode programvarebibliotek for informasjonsgjenfinning, opprinnelig utviklet i programmeringsspråket Java av Doug Cutting. Lucene støttes av Apache Software Foundation og utgis under Apache-lisensen."); + assertLang( + "en", + "id", + "2en", + "name", + "Lucene", + "subject", + "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License."); + assertLang( + "sv", + "id", + "3sv", + "name", + "Maven", + "subject", + "Apache Maven är ett verktyg utvecklat av Apache Software Foundation och används inom systemutveckling av datorprogram i programspråket Java. Maven används för att automatiskt paketera (bygga) programfilerna till en distribuerbar enhet. Maven används inom samma område som Apache Ant men dess byggfiler är deklarativa till skillnad ifrån Ants skriptbaserade."); + // Based on our langid.allowlist config, + // the Thai document is an unknown language, thus, language detection must fall back to EN + assertLang( + "en", + "id", + "6th", + "name", + "บทความคัดสรรเดือนนี้", + "subject", + "อันเนอลีส มารี อันเนอ ฟรังค์ หรือมักรู้จักในภาษาไทยว่า แอนน์ แฟรงค์ เป็นเด็กหญิงชาวยิว เกิดที่เมืองแฟรงก์เฟิร์ต ประเทศเยอรมนี เธอมีชื่อเสียงโด่งดังในฐานะผู้เขียนบันทึกประจำวันซึ่งต่อมาได้รับการตีพิมพ์เป็นหนังสือ บรรยายเหตุการณ์ขณะหลบซ่อนตัวจากการล่าชาวยิวในประเทศเนเธอร์แลนด์ ระหว่างที่ถูกเยอรมนีเข้าครอบครองในช่วงสงครามโลกครั้งที่สอง"); + } + + @Test + public void testAllowlistBackwardsCompatabilityWithLegacyAllowlist() throws Exception { + // The "legacy allowlist" is "langid.whitelist" + ModifiableSolrParams parameters = new ModifiableSolrParams(); + parameters.add("langid.fl", "name,subject"); + parameters.add("langid.langField", "language_s"); + parameters.add("langid.whitelist", "no,en ,,,sv"); // Any other language will fallback to EN + parameters.add("langid.fallback", "en"); + liProcessor = createLangIdProcessor(parameters); + + // Make sure that empty language codes have been filtered out and others trimmed. Review Comment: Same is true here, just test that the allowList got populated: ```suggestion assertEquals(Set.of("no", "en", "sv"), liProcessor.langAllowlist); ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@solr.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@solr.apache.org For additional commands, e-mail: issues-h...@solr.apache.org