i18npool/qa/cppunit/test_breakiterator.cxx | 34 ++++++++++++++++++++
i18npool/source/breakiterator/data/dict_word_hu.txt | 21 +++++++++++-
2 files changed, 53 insertions(+), 2 deletions(-)
New commits: commit be49bc3f64eb8bfcf54628e907c8bd6a5c50dae3 Author: László Németh <[email protected]> AuthorDate: Fri Dec 26 01:07:56 2025 +0100 Commit: Xisco Fauli <[email protected]> CommitDate: Sun Dec 28 14:01:44 2025 +0100 tdf#162514 i18npool: apply fix for Hungarian abbreviations, too Restores Hungarian abbreviation handling to spell checking by applying the fix for dict_word_hu.txt. Regression from commit 44699b3de37f07090ac6fee1cd97aa76036e9700 "tdf#49885 BreakIterator rule upgrades". Follow-up to commit f4fe6df6aa92573368c3fa0edb9fd03e64d9d059 "tdf#162514 i18npool: Handle abbreviations in dictionary breakiterator". Change-Id: I83e30c831759ae896f1db2da697287b8c4dcd26b Reviewed-on: https://gerrit.libreoffice.org/c/core/+/196224 Tested-by: Jenkins Reviewed-by: László Németh <[email protected]> (cherry picked from commit 167bbe31c0620d6ca1c4640a81f2e759f8f65e6a) Reviewed-on: https://gerrit.libreoffice.org/c/core/+/196231 Reviewed-by: Xisco Fauli <[email protected]> diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx index 658481bd4381..5e443bec8fdc 100644 --- a/i18npool/qa/cppunit/test_breakiterator.cxx +++ b/i18npool/qa/cppunit/test_breakiterator.cxx @@ -46,6 +46,7 @@ public: void testKorean(); void testDictWordAbbreviation(); + void testDictWordAbbreviationHU(); void testDictWordPrepostDash(); void testHebrewGereshGershaim(); void testLegacySurrogatePairs(); @@ -69,6 +70,7 @@ public: CPPUNIT_TEST(testChinese); CPPUNIT_TEST(testKorean); CPPUNIT_TEST(testDictWordAbbreviation); + CPPUNIT_TEST(testDictWordAbbreviationHU); CPPUNIT_TEST(testDictWordPrepostDash); CPPUNIT_TEST(testHebrewGereshGershaim); CPPUNIT_TEST(testLegacySurrogatePairs); @@ -1834,6 +1836,38 @@ void TestBreakIterator::testDictWordAbbreviation() } } +void TestBreakIterator::testDictWordAbbreviationHU() +{ + std::vector<lang::Locale> aLocale{ + { "hu", "HU", "" } // dict_word_hu locale + }; + + for (const auto& rLocale : aLocale) + { + auto aTest = 
u"Pl. stb. dr.-ral Mo.-gal"_ustr; + + i18n::Boundary aBounds + = m_xBreak->getWordBoundary(aTest, 1, rLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aTest, 4, rLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aTest, 9, rLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aTest, 17, rLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos); + } +} + void TestBreakIterator::testHebrewGereshGershaim() { // In Hebrew documents, there are multiple valid ways to represent the geresh and gershaim diff --git a/i18npool/source/breakiterator/data/dict_word_hu.txt b/i18npool/source/breakiterator/data/dict_word_hu.txt index 4ba426c8c7db..df28b2fbd679 100644 --- a/i18npool/source/breakiterator/data/dict_word_hu.txt +++ b/i18npool/source/breakiterator/data/dict_word_hu.txt @@ -94,6 +94,9 @@ $IncludedML_hu = [[:name = RIGHT DOUBLE QUOTATION MARK:] [:name = QUESTION MARK:] $Symbols_hu]; +### tdf#162514: For spell checking, abbreviations may end with a period. +$PostPeriod = [:name = FULL STOP:]; + # $MidLetter = [\p{Word_Break = MidLetter}]; $MidLetter = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML $IncludedML_hu]; @@ -160,10 +163,24 @@ $Ideographic $ExFm* {400}; # # rule 5 # Do not break between most letters. # -($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); + +### BEGIN CUSTOMIZATION +### tdf#162514: For spell checking, abbreviations may end with a period. 
+ +# ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); +($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PostPeriod)?; + +### END CUSTOMIZATION # rule 6 and 7 -($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; + +### BEGIN CUSTOMIZATION +### tdf#162514: For spell checking, abbreviations may end with a period. + +# ($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; +($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PostPeriod)? {200}; + +### END CUSTOMIZATION # rule 7a $Hebrew_Letter $ExFm* $Single_Quote {200};
