i18npool/qa/cppunit/test_breakiterator.cxx | 102 ++++++++++++++++- i18npool/source/breakiterator/breakiterator_cjk.cxx | 115 +++++++++++++------- 2 files changed, 174 insertions(+), 43 deletions(-)
New commits: commit 6aad8cb6d5b7c936a7e55126d849e691735fd1f5 Author: Jonathan Clark <jonat...@libreoffice.org> AuthorDate: Mon Apr 7 21:37:26 2025 -0600 Commit: Christian Lohmaier <lohmaier+libreoff...@googlemail.com> CommitDate: Wed Apr 9 16:33:27 2025 +0200 tdf#130592 i18npool: Fix incorrect line breaking in mixed CJK+Latin This change updates the CJK BreakIterator to run ICU line breaking rules after applying user forbidden rules. This fixes several identified CJK GUI text rendering issues, most significantly that embedded English runs were previously broken using CJK break rules, rather than using the correct English rules. By implementing this fallback to the ICU rules, the CJK BreakIterator now also correctly handles non-breaking characters. This change therefore also fixes tdf#117554. Change-Id: I3bacc7e2b559729df2c37c010974d180c8eef1f5 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/183825 Tested-by: Jenkins Reviewed-by: Jonathan Clark <jonat...@libreoffice.org> (cherry picked from commit 5a03d511f46ecc05aab35bb29e714b46f5638b1b) Reviewed-on: https://gerrit.libreoffice.org/c/core/+/183935 Reviewed-by: Christian Lohmaier <lohmaier+libreoff...@googlemail.com> diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx index 80bdeb15c7be..9e18958762cb 100644 --- a/i18npool/qa/cppunit/test_breakiterator.cxx +++ b/i18npool/qa/cppunit/test_breakiterator.cxx @@ -43,6 +43,7 @@ public: #endif void testJapanese(); void testChinese(); + void testKorean(); void testDictWordAbbreviation(); void testDictWordPrepostDash(); @@ -66,6 +67,7 @@ public: #endif CPPUNIT_TEST(testJapanese); CPPUNIT_TEST(testChinese); + CPPUNIT_TEST(testKorean); CPPUNIT_TEST(testDictWordAbbreviation); CPPUNIT_TEST(testDictWordPrepostDash); CPPUNIT_TEST(testHebrewGereshGershaim); @@ -390,17 +392,16 @@ void TestBreakIterator::testLineBreaking() } // i#72868: Writer/Impress line does not break after Chinese punctuation and Latin letters + // tdf#130592: Fixed the regression. If this case fails, UI text will be laid out incorrectly. { aLocale.Language = "zh"; aLocale.Country = "HK"; { - // Per the bug, this should break at the ideographic comma. However, this change has - // been reverted at some point. This test only verifies current behavior. const OUString str = u"word word、word word"_ustr; i18n::LineBreakResults aResult = m_xBreak->getLineBreak( str, strlen("word wordXwor"), aLocale, 0, aHyphOptions, aUserOptions); - CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(13), aResult.breakIndex); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(10), aResult.breakIndex); } } @@ -1648,18 +1649,103 @@ void TestBreakIterator::testJapanese() void TestBreakIterator::testChinese() { - lang::Locale aLocale; - aLocale.Language = "zh"; - aLocale.Country = "CN"; + lang::Locale stLocale; + stLocale.Language = "zh"; + stLocale.Country = "CN"; + // Verify dictionary-based word breakiterator { static constexpr OUStringLiteral aTest = u"\u6A35\u6A30\u69FE\u8919\U00029EDB"; - i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, - i18n::WordType::DICTIONARY_WORD, true); + i18n::Boundary aBounds + = m_xBreak->getWordBoundary(aTest, 4, stLocale, i18n::WordType::DICTIONARY_WORD, true); CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.startPos); CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); } + + // Chinese allows line breaking inside a word + { + i18n::LineBreakHyphenationOptions stHyphOptions; + i18n::LineBreakUserOptions stUserOptions; + + auto aTest = u"手机"_ustr; + auto stBreak = m_xBreak->getLineBreak(aTest, 1, stLocale, 0, stHyphOptions, stUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), stBreak.breakIndex); + } + + // Characteristic test including built-in forbidden rules + { + i18n::LineBreakHyphenationOptions stHyphOptions; + i18n::LineBreakUserOptions stUserOptions; + + // Comma normally not allowed at start of line, quote normally not allowed at end + auto aTest = u"水水水、水水水「水水水水水水水水水"_ustr; + auto stBreak1 = m_xBreak->getLineBreak(aTest, 3, stLocale, 0, stHyphOptions, stUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), stBreak1.breakIndex); + auto stBreak2 = m_xBreak->getLineBreak(aTest, 8, stLocale, 0, stHyphOptions, stUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), stBreak2.breakIndex); + } + + // tdf#117554 Do not break at ZWNBSP + { + i18n::LineBreakHyphenationOptions stHyphOptions; + i18n::LineBreakUserOptions stUserOptions; + + auto aTest = u"手\uFEFF机"_ustr; + auto stBreak = m_xBreak->getLineBreak(aTest, 2, stLocale, 0, stHyphOptions, stUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), stBreak.breakIndex); + } + + // Characteristic test for hanging punctuation + { + i18n::LineBreakHyphenationOptions stHyphOptions; + i18n::LineBreakUserOptions stUserOptions; + stUserOptions.allowPunctuationOutsideMargin = true; + + auto aTest = u"水水水、水水水。"_ustr; + + // Comma normally not allowed at start of line. Usually this should wrap the preceding + // character to the next line, but with hanging punctuation it can overflow the line. + auto stBreak1 = m_xBreak->getLineBreak(aTest, 3, stLocale, 0, stHyphOptions, stUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), stBreak1.breakIndex); + + // With hanging punctuation, the period should be allowed to carry over to the margin. + auto stBreak2 = m_xBreak->getLineBreak(aTest, 7, stLocale, 0, stHyphOptions, stUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), stBreak2.breakIndex); + } + + // tdf#58604: Test for interaction between line breaks and hanging punctuation + { + i18n::LineBreakHyphenationOptions stHyphOptions; + i18n::LineBreakUserOptions stUserOptions; + stUserOptions.allowPunctuationOutsideMargin = true; + + auto aTest = u"水水水、 水水水。"_ustr; + + // Lines should always break after any hanging punctuation + auto stBreak3 = m_xBreak->getLineBreak(aTest, 3, stLocale, 0, stHyphOptions, stUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), stBreak3.breakIndex); + + // Lines should also break after the first-seen hanging punctuation + auto stBreak4 = m_xBreak->getLineBreak(aTest, 4, stLocale, 0, stHyphOptions, stUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), stBreak4.breakIndex); + } +} + +void TestBreakIterator::testKorean() +{ + lang::Locale stLocale; + stLocale.Language = "ko"; + stLocale.Country = "KR"; + + { + i18n::LineBreakHyphenationOptions stHyphOptions; + i18n::LineBreakUserOptions stUserOptions; + + auto aTest = u"저는 한국에서 살고 있어요"_ustr; + auto stBreak = m_xBreak->getLineBreak(aTest, 5, stLocale, 0, stHyphOptions, stUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), stBreak.breakIndex); + } } void TestBreakIterator::testDictWordPrepostDash() diff --git a/i18npool/source/breakiterator/breakiterator_cjk.cxx b/i18npool/source/breakiterator/breakiterator_cjk.cxx index d674b8649341..f9b1b7e46199 100644 --- a/i18npool/source/breakiterator/breakiterator_cjk.cxx +++ b/i18npool/source/breakiterator/breakiterator_cjk.cxx @@ -47,46 +47,91 @@ bool isHangul( sal_Unicode cCh ) } LineBreakResults SAL_CALL BreakIterator_CJK::getLineBreak( - const OUString& Text, sal_Int32 nStartPos, - const css::lang::Locale& /*rLocale*/, sal_Int32 /*nMinBreakPos*/, - const LineBreakHyphenationOptions& /*hOptions*/, - const LineBreakUserOptions& bOptions ) + const OUString& Text, sal_Int32 nStartPos, const css::lang::Locale& rLocale, + sal_Int32 nMinBreakPos, const LineBreakHyphenationOptions& hOptions, + const LineBreakUserOptions& bOptions) { - LineBreakResults lbr; - - const sal_Int32 nOldStartPos = nStartPos; - - if (bOptions.allowPunctuationOutsideMargin && - nStartPos != Text.getLength() && - hangingCharacters.indexOf(Text[nStartPos]) != -1 && - (Text.iterateCodePoints( &nStartPos ), nStartPos == Text.getLength())) { - ; // do nothing - } else if (bOptions.applyForbiddenRules && 0 < nStartPos && nStartPos < Text.getLength()) { - - while (nStartPos > 0 && - (bOptions.forbiddenBeginCharacters.indexOf(Text[nStartPos]) != -1 || - bOptions.forbiddenEndCharacters.indexOf(Text[nStartPos-1]) != -1)) - Text.iterateCodePoints( &nStartPos, -1); - } - - // Prevent cutting Korean words in the middle. - if (nOldStartPos == nStartPos && nStartPos < Text.getLength() - && isHangul(Text[nStartPos])) + auto fnIsForbiddenBreak = [&](sal_Int32 nBreakPos) { - while ( nStartPos >= 0 && isHangul( Text[nStartPos] ) ) - --nStartPos; - - // beginning of the last Korean word. - if ( nStartPos < nOldStartPos ) - ++nStartPos; + return nBreakPos > 0 + && (bOptions.forbiddenBeginCharacters.indexOf(Text[nBreakPos]) != -1 + || bOptions.forbiddenEndCharacters.indexOf(Text[nBreakPos - 1]) != -1); + }; - if ( nStartPos == 0 ) - nStartPos = nOldStartPos; + while (nStartPos > 0 && nStartPos < Text.getLength()) + { + // Apply hanging punctuation + if (bOptions.allowPunctuationOutsideMargin + && hangingCharacters.indexOf(Text[nStartPos]) != -1) + { + // The current character is allowed to overhang the margin. + sal_Int32 nNextPos = nStartPos; + Text.iterateCodePoints(&nNextPos); + + // tdf#130592: The original code always allowed a line break after hanging + // punctuation, even if it's not a valid ICU break. This refactor preserves the + // original behavior in order to avoid regressing tdf#58604. + if (nNextPos >= Text.getLength() || !fnIsForbiddenBreak(nNextPos)) + { + LineBreakResults stBreak; + stBreak.breakIndex = nNextPos; + stBreak.breakType = BreakType::HANGINGPUNCTUATION; + return stBreak; + } + } + + const sal_Int32 nOldStartPos = nStartPos; + + // Apply forbidden rules + if (bOptions.applyForbiddenRules) + { + while (fnIsForbiddenBreak(nStartPos)) + { + Text.iterateCodePoints(&nStartPos, -1); + } + } + + // Prevent cutting Korean words in the middle + if (nOldStartPos == nStartPos && isHangul(Text[nStartPos])) + { + while (nStartPos >= 0 && isHangul(Text[nStartPos])) + --nStartPos; + + // beginning of the last Korean word. + if (nStartPos < nOldStartPos) + ++nStartPos; + + if (nStartPos == 0) + nStartPos = nOldStartPos; + } + + // tdf#130592: Fall back to the ICU breakiterator after applying CJK-specific rules + auto stBreak = BreakIterator_Unicode::getLineBreak(Text, nStartPos, rLocale, nMinBreakPos, + hOptions, bOptions); + if (stBreak.breakIndex == nStartPos) + { + // Located break is valid under both iterators + return stBreak; + } + + // CJK break is not valid; restart search from the next candidate + sal_Int32 nNextCandidate = stBreak.breakIndex; + while (bOptions.allowPunctuationOutsideMargin && nStartPos > stBreak.breakIndex) + { + if (hangingCharacters.indexOf(Text[nStartPos]) != -1) + { + nNextCandidate = nStartPos; + break; + } + + Text.iterateCodePoints(&nStartPos, -1); + } + + nStartPos = nNextCandidate; } - lbr.breakIndex = nStartPos; - lbr.breakType = BreakType::WORDBOUNDARY; - return lbr; + return BreakIterator_Unicode::getLineBreak(Text, nStartPos, rLocale, nMinBreakPos, hOptions, + bOptions); } #define LOCALE(language, country) css::lang::Locale(language, country, OUString())