source

Jonathan Clark (via logerrit) Wed, 09 Apr 2025 10:23:57 -0700

 i18npool/qa/cppunit/test_breakiterator.cxx          |  102 ++++++++++++++++-
 i18npool/source/breakiterator/breakiterator_cjk.cxx |  115 +++++++++++++-------
 2 files changed, 174 insertions(+), 43 deletions(-)


New commits:
commit 6aad8cb6d5b7c936a7e55126d849e691735fd1f5
Author:     Jonathan Clark <jonat...@libreoffice.org>
AuthorDate: Mon Apr 7 21:37:26 2025 -0600
Commit:     Christian Lohmaier <lohmaier+libreoff...@googlemail.com>
CommitDate: Wed Apr 9 16:33:27 2025 +0200

    tdf#130592 i18npool: Fix incorrect line breaking in mixed CJK+Latin
    
    This change updates the CJK BreakIterator to run ICU line breaking rules
    after applying user forbidden rules. This fixes several identified CJK
    GUI text rendering issues, most significantly that embedded English runs
    were previously broken using CJK break rules, rather than using the
    correct English rules.
    
    By implementing this fallback to the ICU rules, the CJK BreakIterator
    now also correctly handles non-breaking characters. This change
    therefore also fixes tdf#117554.
    
    Change-Id: I3bacc7e2b559729df2c37c010974d180c8eef1f5
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/183825
    Tested-by: Jenkins
    Reviewed-by: Jonathan Clark <jonat...@libreoffice.org>
    (cherry picked from commit 5a03d511f46ecc05aab35bb29e714b46f5638b1b)
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/183935
    Reviewed-by: Christian Lohmaier <lohmaier+libreoff...@googlemail.com>

diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx 
b/i18npool/qa/cppunit/test_breakiterator.cxx
index 80bdeb15c7be..9e18958762cb 100644
--- a/i18npool/qa/cppunit/test_breakiterator.cxx
+++ b/i18npool/qa/cppunit/test_breakiterator.cxx
@@ -43,6 +43,7 @@ public:
 #endif
     void testJapanese();
     void testChinese();
+    void testKorean();
 
     void testDictWordAbbreviation();
     void testDictWordPrepostDash();
@@ -66,6 +67,7 @@ public:
 #endif
     CPPUNIT_TEST(testJapanese);
     CPPUNIT_TEST(testChinese);
+    CPPUNIT_TEST(testKorean);
     CPPUNIT_TEST(testDictWordAbbreviation);
     CPPUNIT_TEST(testDictWordPrepostDash);
     CPPUNIT_TEST(testHebrewGereshGershaim);
@@ -390,17 +392,16 @@ void TestBreakIterator::testLineBreaking()
     }
 
     // i#72868: Writer/Impress line does not break after Chinese punctuation 
and Latin letters
+    // tdf#130592: Fixed the regression. If this case fails, UI text will be 
laid out incorrectly.
     {
         aLocale.Language = "zh";
         aLocale.Country = "HK";
 
         {
-            // Per the bug, this should break at the ideographic comma. 
However, this change has
-            // been reverted at some point. This test only verifies current 
behavior.
             const OUString str = u"word word、word word"_ustr;
             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
                 str, strlen("word wordXwor"), aLocale, 0, aHyphOptions, 
aUserOptions);
-            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(13), 
aResult.breakIndex);
+            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(10), 
aResult.breakIndex);
         }
     }
 
@@ -1648,18 +1649,103 @@ void TestBreakIterator::testJapanese()
 
 void TestBreakIterator::testChinese()
 {
-    lang::Locale aLocale;
-    aLocale.Language = "zh";
-    aLocale.Country = "CN";
+    lang::Locale stLocale;
+    stLocale.Language = "zh";
+    stLocale.Country = "CN";
 
+    // Verify dictionary-based word breakiterator
     {
         static constexpr OUStringLiteral aTest = 
u"\u6A35\u6A30\u69FE\u8919\U00029EDB";
 
-        i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale,
-            i18n::WordType::DICTIONARY_WORD, true);
+        i18n::Boundary aBounds
+            = m_xBreak->getWordBoundary(aTest, 4, stLocale, 
i18n::WordType::DICTIONARY_WORD, true);
         CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.startPos);
         CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
     }
+
+    // Chinese allows line breaking inside a word
+    {
+        i18n::LineBreakHyphenationOptions stHyphOptions;
+        i18n::LineBreakUserOptions stUserOptions;
+
+        auto aTest = u"手机"_ustr;
+        auto stBreak = m_xBreak->getLineBreak(aTest, 1, stLocale, 0, 
stHyphOptions, stUserOptions);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(1), stBreak.breakIndex);
+    }
+
+    // Characteristic test including built-in forbidden rules
+    {
+        i18n::LineBreakHyphenationOptions stHyphOptions;
+        i18n::LineBreakUserOptions stUserOptions;
+
+        // Comma normally not allowed at start of line, quote normally not 
allowed at end
+        auto aTest = u"水水水、水水水「水水水水水水水水水"_ustr;
+        auto stBreak1 = m_xBreak->getLineBreak(aTest, 3, stLocale, 0, 
stHyphOptions, stUserOptions);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(2), stBreak1.breakIndex);
+        auto stBreak2 = m_xBreak->getLineBreak(aTest, 8, stLocale, 0, 
stHyphOptions, stUserOptions);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(7), stBreak2.breakIndex);
+    }
+
+    // tdf#117554 Do not break at ZWNBSP
+    {
+        i18n::LineBreakHyphenationOptions stHyphOptions;
+        i18n::LineBreakUserOptions stUserOptions;
+
+        auto aTest = u"手\uFEFF机"_ustr;
+        auto stBreak = m_xBreak->getLineBreak(aTest, 2, stLocale, 0, 
stHyphOptions, stUserOptions);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), stBreak.breakIndex);
+    }
+
+    // Characteristic test for hanging punctuation
+    {
+        i18n::LineBreakHyphenationOptions stHyphOptions;
+        i18n::LineBreakUserOptions stUserOptions;
+        stUserOptions.allowPunctuationOutsideMargin = true;
+
+        auto aTest = u"水水水、水水水。"_ustr;
+
+        // Comma normally not allowed at start of line. Usually this should 
wrap the preceding
+        // character to the next line, but with hanging punctuation it can 
overflow the line.
+        auto stBreak1 = m_xBreak->getLineBreak(aTest, 3, stLocale, 0, 
stHyphOptions, stUserOptions);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(4), stBreak1.breakIndex);
+
+        // With hanging punctuation, the period should be allowed to carry 
over to the margin.
+        auto stBreak2 = m_xBreak->getLineBreak(aTest, 7, stLocale, 0, 
stHyphOptions, stUserOptions);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(8), stBreak2.breakIndex);
+    }
+
+    // tdf#58604: Test for interaction between line breaks and hanging 
punctuation
+    {
+        i18n::LineBreakHyphenationOptions stHyphOptions;
+        i18n::LineBreakUserOptions stUserOptions;
+        stUserOptions.allowPunctuationOutsideMargin = true;
+
+        auto aTest = u"水水水、
水水水。"_ustr;
+
+        // Lines should always break after any hanging punctuation
+        auto stBreak3 = m_xBreak->getLineBreak(aTest, 3, stLocale, 0, 
stHyphOptions, stUserOptions);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(4), stBreak3.breakIndex);
+
+        // Lines should also break after the first-seen hanging punctuation
+        auto stBreak4 = m_xBreak->getLineBreak(aTest, 4, stLocale, 0, 
stHyphOptions, stUserOptions);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(4), stBreak4.breakIndex);
+    }
+}
+
+void TestBreakIterator::testKorean()
+{
+    lang::Locale stLocale;
+    stLocale.Language = "ko";
+    stLocale.Country = "KR";
+
+    {
+        i18n::LineBreakHyphenationOptions stHyphOptions;
+        i18n::LineBreakUserOptions stUserOptions;
+
+        auto aTest = u"저는 한국에서 살고 있어요"_ustr;
+        auto stBreak = m_xBreak->getLineBreak(aTest, 5, stLocale, 0, 
stHyphOptions, stUserOptions);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(3), stBreak.breakIndex);
+    }
 }
 
 void TestBreakIterator::testDictWordPrepostDash()
diff --git a/i18npool/source/breakiterator/breakiterator_cjk.cxx 
b/i18npool/source/breakiterator/breakiterator_cjk.cxx
index d674b8649341..f9b1b7e46199 100644
--- a/i18npool/source/breakiterator/breakiterator_cjk.cxx
+++ b/i18npool/source/breakiterator/breakiterator_cjk.cxx
@@ -47,46 +47,91 @@ bool isHangul( sal_Unicode cCh )
 }
 
 LineBreakResults SAL_CALL BreakIterator_CJK::getLineBreak(
-        const OUString& Text, sal_Int32 nStartPos,
-        const css::lang::Locale& /*rLocale*/, sal_Int32 /*nMinBreakPos*/,
-        const LineBreakHyphenationOptions& /*hOptions*/,
-        const LineBreakUserOptions& bOptions )
+    const OUString& Text, sal_Int32 nStartPos, const css::lang::Locale& 
rLocale,
+    sal_Int32 nMinBreakPos, const LineBreakHyphenationOptions& hOptions,
+    const LineBreakUserOptions& bOptions)
 {
-    LineBreakResults lbr;
-
-    const sal_Int32 nOldStartPos = nStartPos;
-
-    if (bOptions.allowPunctuationOutsideMargin &&
-            nStartPos != Text.getLength() &&
-            hangingCharacters.indexOf(Text[nStartPos]) != -1 &&
-            (Text.iterateCodePoints( &nStartPos ), nStartPos == 
Text.getLength())) {
-        ; // do nothing
-    } else if (bOptions.applyForbiddenRules && 0 < nStartPos && nStartPos < 
Text.getLength()) {
-
-        while (nStartPos > 0 &&
-                (bOptions.forbiddenBeginCharacters.indexOf(Text[nStartPos]) != 
-1 ||
-                 bOptions.forbiddenEndCharacters.indexOf(Text[nStartPos-1]) != 
-1))
-            Text.iterateCodePoints( &nStartPos, -1);
-    }
-
-    // Prevent cutting Korean words in the middle.
-    if (nOldStartPos == nStartPos && nStartPos < Text.getLength()
-        && isHangul(Text[nStartPos]))
+    auto fnIsForbiddenBreak = [&](sal_Int32 nBreakPos)
     {
-        while ( nStartPos >= 0 && isHangul( Text[nStartPos] ) )
-            --nStartPos;
-
-        // beginning of the last Korean word.
-        if ( nStartPos < nOldStartPos )
-            ++nStartPos;
+        return nBreakPos > 0
+               && (bOptions.forbiddenBeginCharacters.indexOf(Text[nBreakPos]) 
!= -1
+                   || bOptions.forbiddenEndCharacters.indexOf(Text[nBreakPos - 
1]) != -1);
+    };
 
-        if ( nStartPos == 0 )
-            nStartPos = nOldStartPos;
+    while (nStartPos > 0 && nStartPos < Text.getLength())
+    {
+        // Apply hanging punctuation
+        if (bOptions.allowPunctuationOutsideMargin
+            && hangingCharacters.indexOf(Text[nStartPos]) != -1)
+        {
+            // The current character is allowed to overhang the margin.
+            sal_Int32 nNextPos = nStartPos;
+            Text.iterateCodePoints(&nNextPos);
+
+            // tdf#130592: The original code always allowed a line break after 
hanging
+            // punctuation, even if it's not a valid ICU break. This refactor 
preserves the
+            // original behavior in order to avoid regressing tdf#58604.
+            if (nNextPos >= Text.getLength() || !fnIsForbiddenBreak(nNextPos))
+            {
+                LineBreakResults stBreak;
+                stBreak.breakIndex = nNextPos;
+                stBreak.breakType = BreakType::HANGINGPUNCTUATION;
+                return stBreak;
+            }
+        }
+
+        const sal_Int32 nOldStartPos = nStartPos;
+
+        // Apply forbidden rules
+        if (bOptions.applyForbiddenRules)
+        {
+            while (fnIsForbiddenBreak(nStartPos))
+            {
+                Text.iterateCodePoints(&nStartPos, -1);
+            }
+        }
+
+        // Prevent cutting Korean words in the middle
+        if (nOldStartPos == nStartPos && isHangul(Text[nStartPos]))
+        {
+            while (nStartPos >= 0 && isHangul(Text[nStartPos]))
+                --nStartPos;
+
+            // beginning of the last Korean word.
+            if (nStartPos < nOldStartPos)
+                ++nStartPos;
+
+            if (nStartPos == 0)
+                nStartPos = nOldStartPos;
+        }
+
+        // tdf#130592: Fall back to the ICU breakiterator after applying 
CJK-specific rules
+        auto stBreak = BreakIterator_Unicode::getLineBreak(Text, nStartPos, 
rLocale, nMinBreakPos,
+                                                           hOptions, bOptions);
+        if (stBreak.breakIndex == nStartPos)
+        {
+            // Located break is valid under both iterators
+            return stBreak;
+        }
+
+        // CJK break is not valid; restart search from the next candidate
+        sal_Int32 nNextCandidate = stBreak.breakIndex;
+        while (bOptions.allowPunctuationOutsideMargin && nStartPos > 
stBreak.breakIndex)
+        {
+            if (hangingCharacters.indexOf(Text[nStartPos]) != -1)
+            {
+                nNextCandidate = nStartPos;
+                break;
+            }
+
+            Text.iterateCodePoints(&nStartPos, -1);
+        }
+
+        nStartPos = nNextCandidate;
     }
 
-    lbr.breakIndex = nStartPos;
-    lbr.breakType = BreakType::WORDBOUNDARY;
-    return lbr;
+    return BreakIterator_Unicode::getLineBreak(Text, nStartPos, rLocale, 
nMinBreakPos, hOptions,
+                                               bOptions);
 }
 
 #define LOCALE(language, country) css::lang::Locale(language, country, 
OUString())

core.git: Branch 'libreoffice-25-2' - i18npool/qa i18npool/source

Reply via email to