 i18npool/qa/cppunit/test_breakiterator.cxx              | 249 +++++++++++++++-
 i18npool/source/breakiterator/breakiterator_unicode.cxx | 12
 2 files changed, 249 insertions(+), 12 deletions(-)
New commits: commit 10ee7d30f7c1c8c9b80155341c2bf1639ca21d5f Author: Jonathan Clark <jonat...@libreoffice.org> AuthorDate: Mon Dec 2 16:03:43 2024 -0700 Commit: Xisco Fauli <xiscofa...@libreoffice.org> CommitDate: Wed Dec 4 00:19:59 2024 +0100 tdf#162912 i18npool: Updated CJK BreakIterator to use custom rules Regression from commit 14c6cde779d64596eab0f4d3f32f181ce2243929: "tdf#49885 Updated CJK BreakIterator to use ICU" Previously, languages requiring dictionary-based break iterators were handled by instantiating a stock ICU break iterator as a special case. tdf#49885 upgraded our custom rules to support passthrough for dictionary-based breaking, so this special case is no longer necessary. Change-Id: Iebb06de82eb511946e5b220e5dc414440838b03c Reviewed-on: https://gerrit.libreoffice.org/c/core/+/177713 Tested-by: Jenkins Reviewed-by: Jonathan Clark <jonat...@libreoffice.org> Signed-off-by: Xisco Fauli <xiscofa...@libreoffice.org> Reviewed-on: https://gerrit.libreoffice.org/c/core/+/177754 diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx index a8e45875e7a8..c516bb74a854 100644 --- a/i18npool/qa/cppunit/test_breakiterator.cxx +++ b/i18npool/qa/cppunit/test_breakiterator.cxx @@ -49,6 +49,7 @@ public: void testHebrewGereshGershaim(); void testLegacySurrogatePairs(); void testWordCount(); + void testDictionaryIteratorLanguages(); CPPUNIT_TEST_SUITE(TestBreakIterator); CPPUNIT_TEST(testLineBreaking); @@ -70,6 +71,7 @@ public: CPPUNIT_TEST(testHebrewGereshGershaim); CPPUNIT_TEST(testLegacySurrogatePairs); CPPUNIT_TEST(testWordCount); + CPPUNIT_TEST(testDictionaryIteratorLanguages); CPPUNIT_TEST_SUITE_END(); private: @@ -1591,6 +1593,25 @@ void TestBreakIterator::doTestJapanese(uno::Reference< i18n::XBreakIterator > co CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos); CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); } + + { + // tdf#162912: Double-clicking should only select one Basic identifier + static constexpr 
OUString aTest = u"ThisComponent.CurrentSelection"_ustr; + + aBounds = xBreak->getWordBoundary(aTest, 5, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); + + aBounds = xBreak->getWordBoundary(aTest, 5, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.endPos); + + aBounds = xBreak->getWordBoundary(aTest, 15, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); + } } void TestBreakIterator::testJapanese() @@ -1894,7 +1915,233 @@ void TestBreakIterator::testWordCount() const OUString str = u"Wordの様にワード数をするのにTest 植松町"_ustr; - CPPUNIT_ASSERT_EQUAL(7, count_words_fn(str, aLocale)); + CPPUNIT_ASSERT_EQUAL(8, count_words_fn(str, aLocale)); + } +} + +void TestBreakIterator::testDictionaryIteratorLanguages() +{ + // Thai + { + lang::Locale aLocale{ "th", "TH", "" }; + + const OUString aStr = u"รอนานหรือเปล่า"_ustr; + + i18n::Boundary aBounds; + + aBounds + = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 10, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); + 
CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 10, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 10, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos); + } + + // Japanese + { + lang::Locale aLocale{ "ja", "JP", "" }; + + const OUString aStr = u"通産省工業技術院北海道"_ustr; + + i18n::Boundary aBounds; + + aBounds + = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::DICTIONARY_WORD, true); + 
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 2, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 4, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 7, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 9, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 2, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 4, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos); + + aBounds = 
m_xBreak->getWordBoundary(aStr, 7, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 9, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 2, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 4, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 7, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 9, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos); + } + + // Chinese + { + lang::Locale aLocale{ "zh", "CN", "" }; + + const OUString aStr = u"很高兴认识你"_ustr; + + i18n::Boundary aBounds; + + aBounds + = m_xBreak->getWordBoundary(aStr, 0, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), 
aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 5, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 0, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 5, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 0, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 3, 
aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 5, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); } } diff --git a/i18npool/source/breakiterator/breakiterator_unicode.cxx b/i18npool/source/breakiterator/breakiterator_unicode.cxx index 9b47c433f296..a0fe58aae43e 100644 --- a/i18npool/source/breakiterator/breakiterator_unicode.cxx +++ b/i18npool/source/breakiterator/breakiterator_unicode.cxx @@ -74,16 +74,6 @@ class OOoRuleBasedBreakIterator : public icu::RuleBasedBreakIterator }; -bool locale_requires_dictionary_iterator(const css::lang::Locale& rLocale) -{ - return rLocale.Language == "bo" || // Tibetan - rLocale.Language == "dz" || // Dzongkha - rLocale.Language == "ja" || // Japanese - rLocale.Language == "km" || // Khmer - rLocale.Language == "lo" || // Lao - rLocale.Language == "th" || // Thai - rLocale.Language == "zh"; // Chinese -} } // loading ICU breakiterator on demand. @@ -189,7 +179,7 @@ void BreakIterator_Unicode::loadICUBreakIterator(const css::lang::Locale& rLocal rbi.reset(); } } - else if(!locale_requires_dictionary_iterator(rLocale)) + else { // language;rule (not langtag, unless we'd actually load such) OString aLanguage( LanguageTag( rLocale).getLanguage().toUtf8());