 i18nutil/qa/cppunit/test_scriptchangescanner.cxx | 23 ++++++++++++++++++-----
 i18nutil/source/utility/scriptchangescanner.cxx  | 23 ++++++++++++++++++-----
 2 files changed, 36 insertions(+), 10 deletions(-)
New commits: commit 9b505f583954c88ce7b72a07c9bfd65d78d863ef Author: Jonathan Clark <jonat...@libreoffice.org> AuthorDate: Mon Jan 6 03:36:31 2025 -0700 Commit: Jonathan Clark <jonat...@libreoffice.org> CommitDate: Mon Jan 6 13:45:19 2025 +0100 tdf#66791 sw: Apply first-seen script type to leading weak characters Previously, leading weak characters were always assigned a script type derived from the application language. This change updates the algorithm to instead use the logical-order first-seen non-weak script type. Change-Id: I5f6732c6faa1eb35aff51e98e82a87c8008f70ab Reviewed-on: https://gerrit.libreoffice.org/c/core/+/179824 Tested-by: Jenkins Reviewed-by: Jonathan Clark <jonat...@libreoffice.org> diff --git a/i18nutil/qa/cppunit/test_scriptchangescanner.cxx b/i18nutil/qa/cppunit/test_scriptchangescanner.cxx index e0726a45d922..1b033d94218a 100644 --- a/i18nutil/qa/cppunit/test_scriptchangescanner.cxx +++ b/i18nutil/qa/cppunit/test_scriptchangescanner.cxx @@ -27,6 +27,7 @@ public: void testTrivial(); void testTrivialAppLang(); void testWeakAtStart(); + void testOnlyWeak(); void testStrongChange(); void testMongolianAfterNNBSP(); void testNonspacingMark(); @@ -44,6 +45,7 @@ public: CPPUNIT_TEST(testTrivial); CPPUNIT_TEST(testTrivialAppLang); CPPUNIT_TEST(testWeakAtStart); + CPPUNIT_TEST(testOnlyWeak); CPPUNIT_TEST(testStrongChange); CPPUNIT_TEST(testMongolianAfterNNBSP); CPPUNIT_TEST(testNonspacingMark); @@ -101,21 +103,32 @@ void ScriptChangeScannerTest::testTrivialAppLang() void ScriptChangeScannerTest::testWeakAtStart() { + // The first-seen script type is used for weak characters at the start auto aText = u"“x”"_ustr; auto pDirScanner = MakeDirectionChangeScanner(aText, 0); auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::COMPLEX, *pDirScanner); CPPUNIT_ASSERT(!pScanner->AtEnd()); - CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::COMPLEX, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, 
pScanner->Peek().m_nScriptType); CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex); - CPPUNIT_ASSERT_EQUAL(sal_Int32(1), pScanner->Peek().m_nEndIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), pScanner->Peek().m_nEndIndex); pScanner->Advance(); + CPPUNIT_ASSERT(pScanner->AtEnd()); +} + +void ScriptChangeScannerTest::testOnlyWeak() +{ + // The application language is used for text containing only weak characters + auto aText = u"“”"_ustr; + auto pDirScanner = MakeDirectionChangeScanner(aText, 0); + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::COMPLEX, *pDirScanner); + CPPUNIT_ASSERT(!pScanner->AtEnd()); - CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); - CPPUNIT_ASSERT_EQUAL(sal_Int32(1), pScanner->Peek().m_nStartIndex); - CPPUNIT_ASSERT_EQUAL(sal_Int32(3), pScanner->Peek().m_nEndIndex); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::COMPLEX, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), pScanner->Peek().m_nEndIndex); pScanner->Advance(); diff --git a/i18nutil/source/utility/scriptchangescanner.cxx b/i18nutil/source/utility/scriptchangescanner.cxx index 2cb8d9d96819..456af39bb495 100644 --- a/i18nutil/source/utility/scriptchangescanner.cxx +++ b/i18nutil/source/utility/scriptchangescanner.cxx @@ -132,7 +132,7 @@ private: ScriptChange m_stCurr; DirectionChangeScanner* m_pDirScanner; const OUString& m_rText; - sal_Int16 m_nPrevScript; + sal_Int16 m_nPrevScript = css::i18n::ScriptType::WEAK; sal_Int32 m_nIndex = 0; bool m_bAtEnd = false; bool m_bApplyAsianToWeakQuotes = false; @@ -142,16 +142,23 @@ public: DirectionChangeScanner* pDirScanner) : m_pDirScanner(pDirScanner) , m_rText(rText) - , m_nPrevScript(nDefaultScriptType) { // tdf#66791: For compatibility with other programs, the Asian script is // applied to any weak-script quote characters if the enclosing paragraph // contains Chinese- or 
Japanese-script characters. + // In the original Writer algorithm, the application language is used for + // all leading weak characters (#94331#). This implementation deviates by + // instead using the first-seen non-weak script. sal_Int32 nCjBase = 0; while (nCjBase < m_rText.getLength()) { auto nChar = m_rText.iterateCodePoints(&nCjBase); auto nScript = GetScriptClass(nChar); + if (m_nPrevScript == css::i18n::ScriptType::WEAK) + { + m_nPrevScript = nScript; + } + if (nScript == css::i18n::ScriptType::COMPLEX) { m_bApplyAsianToWeakQuotes = false; @@ -172,12 +179,18 @@ public: } } - // In the original Writer algorithm, the application language is used for - // all leading weak characters. Make a change record for those characters. + // Fall back to the application language for leading weak characters if a + // better candidate was not found. + if (m_nPrevScript == css::i18n::ScriptType::WEAK) + { + m_nPrevScript = nDefaultScriptType; + } + + // Make a change record for leading weak characters. Advance(); if (m_stCurr.m_nStartIndex == m_stCurr.m_nEndIndex) { - // The text does not start with application-language leading characters. + // The text does not start with weak characters. // Initialize with a non-empty record. Advance(); }