i18nutil/qa/cppunit/test_scriptchangescanner.cxx | 23 +++ i18nutil/source/utility/scriptchangescanner.cxx | 154 ++++++++++++----------- 2 files changed, 104 insertions(+), 73 deletions(-)
New commits: commit 519cae93f19fac96b5b67aff5e31f9abf89aa1bc Author: Jonathan Clark <jonat...@libreoffice.org> AuthorDate: Tue Jan 7 05:52:27 2025 -0700 Commit: Michael Stahl <michael.st...@allotropia.de> CommitDate: Fri Jan 10 11:39:20 2025 +0100 tdf#164493 Update script change algorithm to always make progress Fixes ofz#385256118 Timeout on ImpEditEngine::InitScriptTypes. Previously, the script assignment algorithm could backtrack in certain cases where a script run ends with weak characters that should be included in the following run. Fuzz testing unearthed a case involving right-to-left override and a CJK combining mark, which caused the algorithm to make no progress and hang. The algorithm now always makes progress on each iteration. Change-Id: I4da138c51d391c152afcee2428c21dc762a2dafc Reviewed-on: https://gerrit.libreoffice.org/c/core/+/179888 Tested-by: Jenkins Reviewed-by: Jonathan Clark <jonat...@libreoffice.org> (cherry picked from commit 1afdda6bca508abe56edf7968677e689b5ee07d4) Reviewed-on: https://gerrit.libreoffice.org/c/core/+/179937 Reviewed-by: Michael Stahl <michael.st...@allotropia.de> diff --git a/i18nutil/qa/cppunit/test_scriptchangescanner.cxx b/i18nutil/qa/cppunit/test_scriptchangescanner.cxx index 1b033d94218a..ea8f6c9e1518 100644 --- a/i18nutil/qa/cppunit/test_scriptchangescanner.cxx +++ b/i18nutil/qa/cppunit/test_scriptchangescanner.cxx @@ -39,6 +39,7 @@ public: void testRtlRunEmbeddedLtrStrong(); void testRtlRunEmbeddedLtrWeakComplex(); void testRtlRunOverrideCJKAsian(); + void testTdf164493InfiniteLoop(); CPPUNIT_TEST_SUITE(ScriptChangeScannerTest); CPPUNIT_TEST(testEmpty); @@ -57,6 +58,7 @@ public: CPPUNIT_TEST(testRtlRunEmbeddedLtrStrong); CPPUNIT_TEST(testRtlRunEmbeddedLtrWeakComplex); CPPUNIT_TEST(testRtlRunOverrideCJKAsian); + CPPUNIT_TEST(testTdf164493InfiniteLoop); CPPUNIT_TEST_SUITE_END(); }; @@ -592,6 +594,27 @@ void ScriptChangeScannerTest::testRtlRunOverrideCJKAsian() CPPUNIT_ASSERT(pScanner->AtEnd()); } +void ScriptChangeScannerTest::testTdf164493InfiniteLoop() +{ + // tdf#164493: Tests a case causing an infinite loop due to interaction + // between right-to-left override and a CJK combining mark. + + // U+202E: RIGHT-TO-LEFT OVERRIDE + // U+302E: HANGUL SINGLE DOT TONE MARK + auto aText = u"\u202e\u302e"_ustr; + auto pDirScanner = MakeDirectionChangeScanner(aText, 0); + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN, *pDirScanner); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::ASIAN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(pScanner->AtEnd()); +} + CPPUNIT_TEST_SUITE_REGISTRATION(ScriptChangeScannerTest); } diff --git a/i18nutil/source/utility/scriptchangescanner.cxx b/i18nutil/source/utility/scriptchangescanner.cxx index 456af39bb495..a3e27bdaf155 100644 --- a/i18nutil/source/utility/scriptchangescanner.cxx +++ b/i18nutil/source/utility/scriptchangescanner.cxx @@ -132,84 +132,28 @@ private: ScriptChange m_stCurr; DirectionChangeScanner* m_pDirScanner; const OUString& m_rText; - sal_Int16 m_nPrevScript = css::i18n::ScriptType::WEAK; sal_Int32 m_nIndex = 0; + sal_Int32 m_nNextStart = 0; + sal_Int16 m_nPrevScript = css::i18n::ScriptType::WEAK; bool m_bAtEnd = false; bool m_bApplyAsianToWeakQuotes = false; -public: - GreedyScriptChangeScanner(const OUString& rText, sal_Int16 nDefaultScriptType, - DirectionChangeScanner* pDirScanner) - : m_pDirScanner(pDirScanner) - , m_rText(rText) + void AdvanceOnce() { - // tdf#66791: For compatibility with other programs, the Asian script is - // applied to any weak-script quote characters if the enclosing paragraph - // contains Chinese- or Japanese-script characters. - // In the original Writer algorithm, the application language is used for - // all leading weak characters (#94331#). This implementation deviates by - // instead using the first-seen non-weak script. - sal_Int32 nCjBase = 0; - while (nCjBase < m_rText.getLength()) - { - auto nChar = m_rText.iterateCodePoints(&nCjBase); - auto nScript = GetScriptClass(nChar); - if (m_nPrevScript == css::i18n::ScriptType::WEAK) - { - m_nPrevScript = nScript; - } + m_stCurr + = ScriptChange{ /*start*/ m_nNextStart, /*end*/ m_nNextStart, /*type*/ m_nPrevScript }; - if (nScript == css::i18n::ScriptType::COMPLEX) - { - m_bApplyAsianToWeakQuotes = false; - break; - } - - auto nUnicodeScript = u_getIntPropertyValue(nChar, UCHAR_SCRIPT); - switch (nUnicodeScript) - { - case USCRIPT_HAN: - case USCRIPT_HIRAGANA: - case USCRIPT_KATAKANA: - m_bApplyAsianToWeakQuotes = true; - break; - - default: - break; - } - } - - // Fall back to the application language for leading weak characters if a - // better candidate was not found. - if (m_nPrevScript == css::i18n::ScriptType::WEAK) - { - m_nPrevScript = nDefaultScriptType; - } - - // Make a change record for leading weak characters. - Advance(); - if (m_stCurr.m_nStartIndex == m_stCurr.m_nEndIndex) - { - // The text does not start with weak characters. - // Initialize with a non-empty record. - Advance(); - } - } - - bool AtEnd() const override { return m_bAtEnd; } - - void Advance() override - { - m_stCurr = ScriptChange{ /*start*/ 0, /*end*/ 0, /*type*/ m_nPrevScript }; - - if (m_nIndex >= m_rText.getLength()) + if (m_nNextStart >= m_rText.getLength()) { m_bAtEnd = true; return; } - auto nRunStart = m_nIndex; + auto nRunStart = m_nNextStart; + m_nNextStart = m_nIndex; + auto nScript = m_nPrevScript; + while (m_nIndex < m_rText.getLength()) { auto nPrevIndex = m_nIndex; @@ -262,23 +206,25 @@ public: if (nScript != m_nPrevScript) { - m_nIndex = nPrevIndex; + m_nNextStart = nPrevIndex; break; } + + m_nNextStart = m_nIndex; } - if (m_nIndex > 0) + if (m_nNextStart > 0) { // special case for dotted circle since it can be used with complex // before a mark, so we want it associated with the mark's script // tdf#112594: another special case for NNBSP followed by a Mongolian // character, since NNBSP has special uses in Mongolian (tdf#112594) - auto nPrevPos = m_nIndex; + auto nPrevPos = m_nNextStart; auto nPrevChar = m_rText.iterateCodePoints(&nPrevPos, -1); - if (m_nIndex < m_rText.getLength() + if (m_nNextStart < m_rText.getLength() && css::i18n::ScriptType::WEAK == GetScriptClass(nPrevChar)) { - auto nChar = m_rText.iterateCodePoints(&m_nIndex, 0); + auto nChar = m_rText.iterateCodePoints(&m_nNextStart, 0); auto nType = unicode::getUnicodeType(nChar); if (nType == css::i18n::UnicodeType::NON_SPACING_MARK || nType == css::i18n::UnicodeType::ENCLOSING_MARK @@ -286,15 +232,77 @@ public: || (nPrevChar == CHAR_NNBSP && u_getIntPropertyValue(nChar, UCHAR_SCRIPT) == USCRIPT_MONGOLIAN)) { - m_nIndex = nPrevPos; + m_nNextStart = nPrevPos; } } } - m_stCurr = ScriptChange{ nRunStart, m_nIndex, m_nPrevScript }; + m_stCurr = ScriptChange{ nRunStart, m_nNextStart, m_nPrevScript }; m_nPrevScript = nScript; } +public: + GreedyScriptChangeScanner(const OUString& rText, sal_Int16 nDefaultScriptType, + DirectionChangeScanner* pDirScanner) + : m_pDirScanner(pDirScanner) + , m_rText(rText) + { + // tdf#66791: For compatibility with other programs, the Asian script is + // applied to any weak-script quote characters if the enclosing paragraph + // contains Chinese- or Japanese-script characters. + // In the original Writer algorithm, the application language is used for + // all leading weak characters (#94331#). This implementation deviates by + // instead using the first-seen non-weak script. + sal_Int32 nCjBase = 0; + while (nCjBase < m_rText.getLength()) + { + auto nChar = m_rText.iterateCodePoints(&nCjBase); + auto nScript = GetScriptClass(nChar); + if (m_nPrevScript == css::i18n::ScriptType::WEAK) + { + m_nPrevScript = nScript; + } + + if (nScript == css::i18n::ScriptType::COMPLEX) + { + m_bApplyAsianToWeakQuotes = false; + break; + } + + auto nUnicodeScript = u_getIntPropertyValue(nChar, UCHAR_SCRIPT); + switch (nUnicodeScript) + { + case USCRIPT_HAN: + case USCRIPT_HIRAGANA: + case USCRIPT_KATAKANA: + m_bApplyAsianToWeakQuotes = true; + break; + + default: + break; + } + } + + // Fall back to the application language for leading weak characters if a + // better candidate was not found. + if (m_nPrevScript == css::i18n::ScriptType::WEAK) + { + m_nPrevScript = nDefaultScriptType; + } + + Advance(); + } + + bool AtEnd() const override { return m_bAtEnd; } + + void Advance() override + { + do + { + AdvanceOnce(); + } while (!AtEnd() && (m_stCurr.m_nStartIndex == m_stCurr.m_nEndIndex)); + } + ScriptChange Peek() const override { return m_stCurr; } }; }