source

Jonathan Clark (via logerrit) Mon, 16 Dec 2024 13:46:32 -0800

 editeng/source/editeng/impedit2.cxx                 |   48 ---
 i18npool/source/breakiterator/breakiteratorImpl.cxx |   93 ------
 i18nutil/CppunitTest_i18nutil.mk                    |    9 
 i18nutil/Library_i18nutil.mk                        |    2 
 i18nutil/Module_i18nutil.mk                         |    2 
 i18nutil/qa/cppunit/test_scriptchangescanner.cxx    |  290 ++++++++++++++++++++
 i18nutil/source/utility/scriptchangescanner.cxx     |  160 +++++++++++
 i18nutil/source/utility/scriptclass.cxx             |  143 +++++++++
 include/i18nutil/scriptchangescanner.hxx            |   38 ++
 include/i18nutil/scriptclass.hxx                    |   18 +
 sd/qa/unit/layout-tests.cxx                         |   12 
 sw/source/core/text/porlay.cxx                      |   78 +----
 12 files changed, 687 insertions(+), 206 deletions(-)


New commits:
commit 537645c0834eab2d277113f1e3fcf039c994832d
Author:     Jonathan Clark <jonat...@libreoffice.org>
AuthorDate: Tue Dec 10 02:25:31 2024 -0700
Commit:     Jonathan Clark <jonat...@libreoffice.org>
CommitDate: Mon Dec 16 22:46:05 2024 +0100

    tdf#66791 sw: Treat weak punctuation as Asian in Asian paragraphs
    
    This change modifies script detection to treat certain weak punctuation
    marks, particularly left and right quotation marks, as Asian script in
    paragraphs that contain Chinese or Japanese characters but no Complex
    characters. This change improves our script detection compatibility with
    other programs.
    
    As part of this change, duplicated script detection code has been
    extracted from Writer and Edit Engine, and consolidated.
    
    Change-Id: Ib2880f2e832aaac4c0093971daa88223c7232d63
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/178608
    Tested-by: Jenkins
    Reviewed-by: Jonathan Clark <jonat...@libreoffice.org>

diff --git a/editeng/source/editeng/impedit2.cxx 
b/editeng/source/editeng/impedit2.cxx
index f8d15dcdbee2..44f9ae6a33e1 100644
--- a/editeng/source/editeng/impedit2.cxx
+++ b/editeng/source/editeng/impedit2.cxx
@@ -63,6 +63,7 @@
 #include <svl/asiancfg.hxx>
 #include <svl/voiditem.hxx>
 #include <i18nutil/unicode.hxx>
+#include <i18nutil/scriptchangescanner.hxx>
 #include <comphelper/diagnose_ex.hxx>
 #include <comphelper/flagguard.hxx>
 #include <comphelper/lok.hxx>
@@ -1715,51 +1716,16 @@ void ImpEditEngine::InitScriptTypes( sal_Int32 nPara )
         pField = pField->GetEnd() ? pNode->GetCharAttribs().FindNextAttrib( 
EE_FEATURE_FIELD, pField->GetEnd() ) : nullptr;
     }
 
-    sal_Int32 nTextLen = aText.getLength();
-
-    sal_Int32 nPos = 0;
-    short nScriptType = _xBI->getScriptType( aText, nPos );
-    rTypes.emplace_back( nScriptType, nPos, nTextLen );
-    nPos = _xBI->endOfScript( aText, nPos, nScriptType );
-    while ( ( nPos != -1 ) && ( nPos < nTextLen ) )
+    auto pScriptScanner = i18nutil::MakeScriptChangeScanner(
+        aText, 
SvtLanguageOptions::GetI18NScriptTypeOfLanguage(GetDefaultLanguage()));
+    while (!pScriptScanner->AtEnd() || rTypes.empty())
     {
-        rTypes.back().nEndPos = nPos;
-
-        nScriptType = _xBI->getScriptType( aText, nPos );
-        tools::Long nEndPos = _xBI->endOfScript( aText, nPos, nScriptType );
-
-        if ( ( nScriptType == i18n::ScriptType::WEAK ) || ( nScriptType == 
rTypes.back().nScriptType ) )
-        {
-            // Expand last ScriptTypePosInfo, don't create weak or unnecessary 
portions
-            rTypes.back().nEndPos = nEndPos;
-        }
-        else
-        {
-            auto nPrevPos = nPos;
-            auto nPrevChar = aText.iterateCodePoints(&nPrevPos, -1);
-            if (_xBI->getScriptType(aText, nPrevPos) == i18n::ScriptType::WEAK)
-            {
-                auto nChar = aText.iterateCodePoints(&nPos, 0);
-                auto nType = unicode::getUnicodeType(nChar);
-                if (nType == css::i18n::UnicodeType::NON_SPACING_MARK ||
-                    nType == css::i18n::UnicodeType::ENCLOSING_MARK ||
-                    nType == css::i18n::UnicodeType::COMBINING_SPACING_MARK ||
-                    (nPrevChar == 0x202F /* NNBSP, tdf#112594 */ &&
-                     u_getIntPropertyValue(nChar, UCHAR_SCRIPT) == 
USCRIPT_MONGOLIAN))
-                {
-                    rTypes.back().nEndPos = nPos = nPrevPos;
-                    break;
-                }
-            }
-            rTypes.emplace_back( nScriptType, nPos, nTextLen );
-        }
+        auto stChange = pScriptScanner->Peek();
+        rTypes.emplace_back(stChange.m_nScriptType, stChange.m_nStartIndex, 
stChange.m_nEndIndex);
 
-        nPos = nEndPos;
+        pScriptScanner->Advance();
     }
 
-    if ( rTypes[0].nScriptType == i18n::ScriptType::WEAK )
-        rTypes[0].nScriptType = ( rTypes.size() > 1 ) ? rTypes[1].nScriptType 
: SvtLanguageOptions::GetI18NScriptTypeOfLanguage( GetDefaultLanguage() );
-
     // create writing direction information:
     WritingDirectionInfos& rDirInfos = 
pParaPortion->getWritingDirectionInfos();
     if (rDirInfos.empty())
diff --git a/i18npool/source/breakiterator/breakiteratorImpl.cxx 
b/i18npool/source/breakiterator/breakiteratorImpl.cxx
index 6b6870f83ecd..ecff78cb790b 100644
--- a/i18npool/source/breakiterator/breakiteratorImpl.cxx
+++ b/i18npool/source/breakiterator/breakiteratorImpl.cxx
@@ -22,6 +22,7 @@
 #include <breakiteratorImpl.hxx>
 #include <cppuhelper/supportsservice.hxx>
 #include <unicode/uchar.h>
+#include <i18nutil/scriptclass.hxx>
 #include <i18nutil/unicode.hxx>
 #include <o3tl/string_view.hxx>
 
@@ -449,94 +450,6 @@ sal_Int16 SAL_CALL BreakIteratorImpl::getWordType( const 
OUString& /*Text*/,
     return 0;
 }
 
-namespace
-{
-sal_Int16 getScriptClassByUAX24Script(sal_uInt32 currentChar)
-{
-    int32_t script = u_getIntPropertyValue(currentChar, UCHAR_SCRIPT);
-    return 
unicode::getScriptClassFromUScriptCode(static_cast<UScriptCode>(script));
-}
-
-struct UBlock2Script
-{
-    UBlockCode from;
-    UBlockCode to;
-    sal_Int16 script;
-};
-
-const UBlock2Script scriptList[] =
-{
-    {UBLOCK_NO_BLOCK, UBLOCK_NO_BLOCK, ScriptType::WEAK},
-    {UBLOCK_BASIC_LATIN, UBLOCK_SPACING_MODIFIER_LETTERS, ScriptType::LATIN},
-    {UBLOCK_GREEK, UBLOCK_ARMENIAN, ScriptType::LATIN},
-    {UBLOCK_HEBREW, UBLOCK_MYANMAR, ScriptType::COMPLEX},
-    {UBLOCK_GEORGIAN, UBLOCK_GEORGIAN, ScriptType::LATIN},
-    {UBLOCK_HANGUL_JAMO, UBLOCK_HANGUL_JAMO, ScriptType::ASIAN},
-    {UBLOCK_ETHIOPIC, UBLOCK_ETHIOPIC, ScriptType::COMPLEX},
-    {UBLOCK_CHEROKEE, UBLOCK_RUNIC, ScriptType::LATIN},
-    {UBLOCK_KHMER, UBLOCK_MONGOLIAN, ScriptType::COMPLEX},
-    {UBLOCK_LATIN_EXTENDED_ADDITIONAL, UBLOCK_GREEK_EXTENDED, 
ScriptType::LATIN},
-    {UBLOCK_NUMBER_FORMS, UBLOCK_NUMBER_FORMS, ScriptType::WEAK},
-    {UBLOCK_CJK_RADICALS_SUPPLEMENT, UBLOCK_HANGUL_SYLLABLES, 
ScriptType::ASIAN},
-    {UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, 
ScriptType::ASIAN},
-    {UBLOCK_ARABIC_PRESENTATION_FORMS_A, UBLOCK_ARABIC_PRESENTATION_FORMS_A, 
ScriptType::COMPLEX},
-    {UBLOCK_CJK_COMPATIBILITY_FORMS, UBLOCK_CJK_COMPATIBILITY_FORMS, 
ScriptType::ASIAN},
-    {UBLOCK_ARABIC_PRESENTATION_FORMS_B, UBLOCK_ARABIC_PRESENTATION_FORMS_B, 
ScriptType::COMPLEX},
-    {UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, 
UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, ScriptType::ASIAN},
-    {UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, 
UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, ScriptType::ASIAN},
-    {UBLOCK_CJK_STROKES, UBLOCK_CJK_STROKES, ScriptType::ASIAN},
-    {UBLOCK_LATIN_EXTENDED_C, UBLOCK_LATIN_EXTENDED_D, ScriptType::LATIN}
-};
-
-#define scriptListCount SAL_N_ELEMENTS(scriptList)
-
-//always sets rScriptType
-
-//returns true for characters historically explicitly assigned to
-//latin/weak/asian
-
-//returns false for characters that historically implicitly assigned to
-//weak as unknown
-bool getCompatibilityScriptClassByBlock(sal_uInt32 currentChar, sal_Int16 
&rScriptType)
-{
-    bool bKnown = true;
-    //handle specific characters always as weak:
-    //  0x01 - this breaks a word
-    //  0x02 - this can be inside a word
-    //  0x20 & 0xA0 - Bug 102975, declare western space and non-break space as 
WEAK char.
-    if( 0x01 == currentChar || 0x02 == currentChar || 0x20 == currentChar || 
0xA0 == currentChar)
-        rScriptType = ScriptType::WEAK;
-    // Few Spacing Modifier Letters that can be Bopomofo tonal marks.
-    else if ( 0x2CA == currentChar || 0x2CB == currentChar || 0x2C7 == 
currentChar || 0x2D9 == currentChar )
-        rScriptType = ScriptType::WEAK;
-    // tdf#52577 superscript numbers should be we weak.
-    else if ( 0xB2 == currentChar || 0xB3 == currentChar || 0xB9 == 
currentChar )
-        rScriptType = ScriptType::WEAK;
-    // workaround for Coptic
-    else if ( 0x2C80 <= currentChar && 0x2CE3 >= currentChar)
-        rScriptType = ScriptType::LATIN;
-    else
-    {
-        UBlockCode block=ublock_getCode(currentChar);
-        size_t i = 0;
-        while (i < scriptListCount)
-        {
-            if (block <= scriptList[i].to)
-                break;
-            ++i;
-        }
-        if (i < scriptListCount && block >= scriptList[i].from)
-            rScriptType = scriptList[i].script;
-        else
-        {
-            rScriptType = ScriptType::WEAK;
-            bKnown = false;
-        }
-    }
-    return bKnown;
-}
-}
-
 sal_Int16  BreakIteratorImpl::getScriptClass(sal_uInt32 currentChar)
 {
     static sal_uInt32 lastChar = 0;
@@ -545,9 +458,7 @@ sal_Int16  BreakIteratorImpl::getScriptClass(sal_uInt32 
currentChar)
     if (currentChar != lastChar)
     {
         lastChar = currentChar;
-
-        if (!getCompatibilityScriptClassByBlock(currentChar, nRet))
-            nRet = getScriptClassByUAX24Script(currentChar);
+        nRet = i18nutil::GetScriptClass(currentChar);
     }
 
     return nRet;
diff --git a/i18nutil/CppunitTest_i18nutil_kashida.mk 
b/i18nutil/CppunitTest_i18nutil.mk
similarity index 62%
rename from i18nutil/CppunitTest_i18nutil_kashida.mk
rename to i18nutil/CppunitTest_i18nutil.mk
index 4920f0a79a54..cdf79a2d2268 100644
--- a/i18nutil/CppunitTest_i18nutil_kashida.mk
+++ b/i18nutil/CppunitTest_i18nutil.mk
@@ -9,13 +9,16 @@ For makefiles:
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-$(eval $(call gb_CppunitTest_CppunitTest,i18nutil_kashida))
+$(eval $(call gb_CppunitTest_CppunitTest,i18nutil))
 
-$(eval $(call gb_CppunitTest_add_exception_objects,i18nutil_kashida,\
+$(eval $(call gb_CppunitTest_use_sdk_api,i18nutil))
+
+$(eval $(call gb_CppunitTest_add_exception_objects,i18nutil,\
        i18nutil/qa/cppunit/test_kashida \
+       i18nutil/qa/cppunit/test_scriptchangescanner \
 ))
 
-$(eval $(call gb_CppunitTest_use_libraries,i18nutil_kashida,\
+$(eval $(call gb_CppunitTest_use_libraries,i18nutil,\
        i18nutil \
        sal \
        test \
diff --git a/i18nutil/Library_i18nutil.mk b/i18nutil/Library_i18nutil.mk
index 264c9c9f969e..8e68c3ba00c2 100644
--- a/i18nutil/Library_i18nutil.mk
+++ b/i18nutil/Library_i18nutil.mk
@@ -47,6 +47,8 @@ $(eval $(call gb_Library_add_exception_objects,i18nutil,\
        i18nutil/source/utility/kashida \
        i18nutil/source/utility/oneToOneMapping \
        i18nutil/source/utility/paper \
+       i18nutil/source/utility/scriptchangescanner \
+       i18nutil/source/utility/scriptclass \
        i18nutil/source/utility/scripttypedetector \
        i18nutil/source/utility/unicode \
        i18nutil/source/utility/widthfolding \
diff --git a/i18nutil/Module_i18nutil.mk b/i18nutil/Module_i18nutil.mk
index bb8ef7056c5c..3fac872e83ed 100644
--- a/i18nutil/Module_i18nutil.mk
+++ b/i18nutil/Module_i18nutil.mk
@@ -13,7 +13,7 @@ $(eval $(call gb_Module_add_targets,i18nutil,\
 ))
 
 $(eval $(call gb_Module_add_check_targets,i18nutil,\
-       CppunitTest_i18nutil_kashida \
+       CppunitTest_i18nutil \
 ))
 
 # vim: set noet sw=4:
diff --git a/i18nutil/qa/cppunit/test_scriptchangescanner.cxx 
b/i18nutil/qa/cppunit/test_scriptchangescanner.cxx
new file mode 100644
index 000000000000..99685bb7be21
--- /dev/null
+++ b/i18nutil/qa/cppunit/test_scriptchangescanner.cxx
@@ -0,0 +1,290 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; 
fill-column: 100 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <sal/types.h>
+#include <cppunit/TestAssert.h>
+#include <cppunit/TestFixture.h>
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/plugin/TestPlugIn.h>
+#include <i18nutil/scriptchangescanner.hxx>
+#include <com/sun/star/i18n/ScriptType.hpp>
+
+using namespace i18nutil;
+namespace css = ::com::sun::star;
+
+namespace
+{
+class ScriptChangeScannerTest : public CppUnit::TestFixture
+{
+public:
+    void testEmpty();
+    void testTrivial();
+    void testTrivialAppLang();
+    void testWeakAtStart();
+    void testStrongChange();
+    void testMongolianAfterNNBSP();
+    void testNonspacingMark();
+    void testSmartQuoteCompatibilityCJ();
+    void testSmartQuoteCompatibilityComplexAndCJ();
+    void testSmartQuoteCJAtStart();
+
+    CPPUNIT_TEST_SUITE(ScriptChangeScannerTest);
+    CPPUNIT_TEST(testEmpty);
+    CPPUNIT_TEST(testTrivial);
+    CPPUNIT_TEST(testTrivialAppLang);
+    CPPUNIT_TEST(testWeakAtStart);
+    CPPUNIT_TEST(testStrongChange);
+    CPPUNIT_TEST(testMongolianAfterNNBSP);
+    CPPUNIT_TEST(testNonspacingMark);
+    CPPUNIT_TEST(testSmartQuoteCompatibilityCJ);
+    CPPUNIT_TEST(testSmartQuoteCompatibilityComplexAndCJ);
+    CPPUNIT_TEST(testSmartQuoteCJAtStart);
+    CPPUNIT_TEST_SUITE_END();
+};
+
+void ScriptChangeScannerTest::testEmpty()
+{
+    auto aText = u""_ustr;
+    auto pScanner = MakeScriptChangeScanner(aText, 
css::i18n::ScriptType::LATIN);
+    CPPUNIT_ASSERT(pScanner->AtEnd());
+    CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, 
pScanner->Peek().m_nScriptType);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nEndIndex);
+}
+
+void ScriptChangeScannerTest::testTrivial()
+{
+    auto aText = u"Trivial case with a single span of a script"_ustr;
+    auto pScanner = MakeScriptChangeScanner(aText, 
css::i18n::ScriptType::LATIN);
+
+    CPPUNIT_ASSERT(!pScanner->AtEnd());
+    CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, 
pScanner->Peek().m_nScriptType);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(43), pScanner->Peek().m_nEndIndex);
+
+    pScanner->Advance();
+    CPPUNIT_ASSERT(pScanner->AtEnd());
+}
+
+void ScriptChangeScannerTest::testTrivialAppLang()
+{
+    auto aText = u"Trivial case with a single span of a script"_ustr;
+    auto pScanner = MakeScriptChangeScanner(aText, 
css::i18n::ScriptType::ASIAN);
+
+    CPPUNIT_ASSERT(!pScanner->AtEnd());
+    CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, 
pScanner->Peek().m_nScriptType);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(43), pScanner->Peek().m_nEndIndex);
+
+    pScanner->Advance();
+    CPPUNIT_ASSERT(pScanner->AtEnd());
+}
+
+void ScriptChangeScannerTest::testWeakAtStart()
+{
+    auto aText = u"“x”"_ustr;
+    auto pScanner = MakeScriptChangeScanner(aText, 
css::i18n::ScriptType::COMPLEX);
+
+    CPPUNIT_ASSERT(!pScanner->AtEnd());
+    CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::COMPLEX, 
pScanner->Peek().m_nScriptType);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(1), pScanner->Peek().m_nEndIndex);
+
+    pScanner->Advance();
+
+    CPPUNIT_ASSERT(!pScanner->AtEnd());
+    CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, 
pScanner->Peek().m_nScriptType);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(1), pScanner->Peek().m_nStartIndex);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(3), pScanner->Peek().m_nEndIndex);
+
+    pScanner->Advance();
+
+    CPPUNIT_ASSERT(pScanner->AtEnd());
+}
+
+void ScriptChangeScannerTest::testStrongChange()
+{
+    auto aText = u"wide 廣 vast"_ustr;
+    auto pScanner = MakeScriptChangeScanner(aText, 
css::i18n::ScriptType::LATIN);
+
+    CPPUNIT_ASSERT(!pScanner->AtEnd());
+    CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, 
pScanner->Peek().m_nScriptType);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(5), pScanner->Peek().m_nEndIndex);
+
+    pScanner->Advance();
+
+    CPPUNIT_ASSERT(!pScanner->AtEnd());
+    CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::ASIAN, 
pScanner->Peek().m_nScriptType);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(5), pScanner->Peek().m_nStartIndex);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(7), pScanner->Peek().m_nEndIndex);
+
+    pScanner->Advance();
+
+    CPPUNIT_ASSERT(!pScanner->AtEnd());
+    CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, 
pScanner->Peek().m_nScriptType);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(7), pScanner->Peek().m_nStartIndex);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(11), pScanner->Peek().m_nEndIndex);
+
+    pScanner->Advance();
+
+    CPPUNIT_ASSERT(pScanner->AtEnd());
+}
+
+void ScriptChangeScannerTest::testMongolianAfterNNBSP()
+{
+    // NNBSP before Mongolian text should be part of the Mongolian run
+    auto aText = u"Before\u202f\u1822\u1822After"_ustr;
+    auto pScanner = MakeScriptChangeScanner(aText, 
css::i18n::ScriptType::LATIN);
+
+    CPPUNIT_ASSERT(!pScanner->AtEnd());
+    CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, 
pScanner->Peek().m_nScriptType);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(6), pScanner->Peek().m_nEndIndex);
+
+    pScanner->Advance();
+
+    CPPUNIT_ASSERT(!pScanner->AtEnd());
+    CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::COMPLEX, 
pScanner->Peek().m_nScriptType);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(6), pScanner->Peek().m_nStartIndex);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(9), pScanner->Peek().m_nEndIndex);
+
+    pScanner->Advance();
+
+    CPPUNIT_ASSERT(!pScanner->AtEnd());
+    CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, 
pScanner->Peek().m_nScriptType);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(9), pScanner->Peek().m_nStartIndex);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(14), pScanner->Peek().m_nEndIndex);
+
+    pScanner->Advance();
+
+    CPPUNIT_ASSERT(pScanner->AtEnd());
+}
+
+void ScriptChangeScannerTest::testNonspacingMark()
+{
+    // A preceding weak character should be included in the run
+    // of a following non-spacing mark
+    auto aText = u"Before \u0944\u0911\u0911 After"_ustr;
+    auto pScanner = MakeScriptChangeScanner(aText, 
css::i18n::ScriptType::LATIN);
+
+    CPPUNIT_ASSERT(!pScanner->AtEnd());
+    CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, 
pScanner->Peek().m_nScriptType);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(6), pScanner->Peek().m_nEndIndex);
+
+    pScanner->Advance();
+
+    CPPUNIT_ASSERT(!pScanner->AtEnd());
+    CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::COMPLEX, 
pScanner->Peek().m_nScriptType);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(6), pScanner->Peek().m_nStartIndex);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(11), pScanner->Peek().m_nEndIndex);
+
+    pScanner->Advance();
+
+    CPPUNIT_ASSERT(!pScanner->AtEnd());
+    CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, 
pScanner->Peek().m_nScriptType);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(11), pScanner->Peek().m_nStartIndex);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(16), pScanner->Peek().m_nEndIndex);
+
+    pScanner->Advance();
+
+    CPPUNIT_ASSERT(pScanner->AtEnd());
+}
+
+void ScriptChangeScannerTest::testSmartQuoteCompatibilityCJ()
+{
+    // tdf#66791: For compatibility with other programs, weak-script quotes in 
paragraphs
+    // containing CJ characters should be treated as Asian script
+
+    auto aText = u"Before \u201c水\u201d After"_ustr;
+    auto pScanner = MakeScriptChangeScanner(aText, 
css::i18n::ScriptType::LATIN);
+
+    CPPUNIT_ASSERT(!pScanner->AtEnd());
+    CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, 
pScanner->Peek().m_nScriptType);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(7), pScanner->Peek().m_nEndIndex);
+
+    pScanner->Advance();
+
+    CPPUNIT_ASSERT(!pScanner->AtEnd());
+    CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::ASIAN, 
pScanner->Peek().m_nScriptType);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(7), pScanner->Peek().m_nStartIndex);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(11), pScanner->Peek().m_nEndIndex);
+
+    pScanner->Advance();
+
+    CPPUNIT_ASSERT(!pScanner->AtEnd());
+    CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, 
pScanner->Peek().m_nScriptType);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(11), pScanner->Peek().m_nStartIndex);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(16), pScanner->Peek().m_nEndIndex);
+
+    pScanner->Advance();
+
+    CPPUNIT_ASSERT(pScanner->AtEnd());
+}
+
+void ScriptChangeScannerTest::testSmartQuoteCompatibilityComplexAndCJ()
+{
+    // tdf#66791: However, if a paragraph contains complex text, weak-script
+    // quotes are assigned in the usual greedy way.
+
+    auto aText = u"Before \u201c水\u201d After \u05d0"_ustr;
+    auto pScanner = MakeScriptChangeScanner(aText, 
css::i18n::ScriptType::LATIN);
+
+    CPPUNIT_ASSERT(!pScanner->AtEnd());
+    CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, 
pScanner->Peek().m_nScriptType);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(8), pScanner->Peek().m_nEndIndex);
+
+    pScanner->Advance();
+
+    CPPUNIT_ASSERT(!pScanner->AtEnd());
+    CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::ASIAN, 
pScanner->Peek().m_nScriptType);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(8), pScanner->Peek().m_nStartIndex);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(11), pScanner->Peek().m_nEndIndex);
+
+    pScanner->Advance();
+
+    CPPUNIT_ASSERT(!pScanner->AtEnd());
+    CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, 
pScanner->Peek().m_nScriptType);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(11), pScanner->Peek().m_nStartIndex);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(17), pScanner->Peek().m_nEndIndex);
+
+    pScanner->Advance();
+
+    CPPUNIT_ASSERT(!pScanner->AtEnd());
+    CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::COMPLEX, 
pScanner->Peek().m_nScriptType);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(17), pScanner->Peek().m_nStartIndex);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(18), pScanner->Peek().m_nEndIndex);
+
+    pScanner->Advance();
+
+    CPPUNIT_ASSERT(pScanner->AtEnd());
+}
+
+void ScriptChangeScannerTest::testSmartQuoteCJAtStart()
+{
+    auto aText = u"“廣”"_ustr;
+    auto pScanner = MakeScriptChangeScanner(aText, 
css::i18n::ScriptType::LATIN);
+
+    CPPUNIT_ASSERT(!pScanner->AtEnd());
+    CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::ASIAN, 
pScanner->Peek().m_nScriptType);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex);
+    CPPUNIT_ASSERT_EQUAL(sal_Int32(3), pScanner->Peek().m_nEndIndex);
+
+    pScanner->Advance();
+
+    CPPUNIT_ASSERT(pScanner->AtEnd());
+}
+
+CPPUNIT_TEST_SUITE_REGISTRATION(ScriptChangeScannerTest);
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s 
cinkeys+=0=break: */
diff --git a/i18nutil/source/utility/scriptchangescanner.cxx 
b/i18nutil/source/utility/scriptchangescanner.cxx
new file mode 100644
index 000000000000..8a62f4de1660
--- /dev/null
+++ b/i18nutil/source/utility/scriptchangescanner.cxx
@@ -0,0 +1,160 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; 
fill-column: 100 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <i18nutil/scriptchangescanner.hxx>
+#include <i18nutil/unicode.hxx>
+#include <i18nutil/scriptclass.hxx>
+#include <unicode/uchar.h>
+#include <sal/log.hxx>
+#include <com/sun/star/i18n/ScriptType.hpp>
+#include <com/sun/star/i18n/CharType.hpp>
+#include <com/sun/star/i18n/UnicodeType.hpp>
+
+namespace css = ::com::sun::star;
+
+namespace i18nutil
+{
+namespace
+{
+constexpr sal_uInt32 CHAR_NNBSP = 0x202f;
+
+class GreedyScriptChangeScanner : public ScriptChangeScanner
+{
+private:
+    ScriptChange m_stCurr;
+    const OUString& m_rText;
+    sal_Int16 m_nPrevScript;
+    sal_Int32 m_nIndex = 0;
+    bool m_bAtEnd = false;
+    bool m_bApplyAsianToWeakQuotes = false;
+
+public:
+    GreedyScriptChangeScanner(const OUString& rText, sal_Int16 
nDefaultScriptType)
+        : m_rText(rText)
+        , m_nPrevScript(nDefaultScriptType)
+    {
+        // tdf#66791: For compatibility with other programs, the Asian script 
is
+        // applied to any weak-script quote characters if the enclosing 
paragraph
+        // contains Chinese- or Japanese-script characters.
+        sal_Int32 nCjBase = 0;
+        while (nCjBase < m_rText.getLength())
+        {
+            auto nChar = m_rText.iterateCodePoints(&nCjBase);
+            auto nScript = GetScriptClass(nChar);
+            if (nScript == css::i18n::ScriptType::COMPLEX)
+            {
+                m_bApplyAsianToWeakQuotes = false;
+                break;
+            }
+
+            auto nUnicodeScript = u_getIntPropertyValue(nChar, UCHAR_SCRIPT);
+            switch (nUnicodeScript)
+            {
+                case USCRIPT_HAN:
+                case USCRIPT_HIRAGANA:
+                case USCRIPT_KATAKANA:
+                    m_bApplyAsianToWeakQuotes = true;
+                    break;
+
+                default:
+                    break;
+            }
+        }
+
+        // In the original Writer algorithm, the application language is used 
for
+        // all leading weak characters. Make a change record for those 
characters.
+        Advance();
+        if (m_stCurr.m_nStartIndex == m_stCurr.m_nEndIndex)
+        {
+            // The text does not start with application-language leading 
characters.
+            // Initialize with a non-empty record.
+            Advance();
+        }
+    }
+
+    bool AtEnd() const override { return m_bAtEnd; }
+
+    void Advance() override
+    {
+        m_stCurr = ScriptChange{ /*start*/ 0, /*end*/ 0, /*type*/ 
m_nPrevScript };
+
+        if (m_nIndex >= m_rText.getLength())
+        {
+            m_bAtEnd = true;
+            return;
+        }
+
+        auto nRunStart = m_nIndex;
+        auto nScript = m_nPrevScript;
+        while (m_nIndex < m_rText.getLength())
+        {
+            auto nPrevIndex = m_nIndex;
+            auto nChar = m_rText.iterateCodePoints(&m_nIndex);
+            nScript = GetScriptClass(nChar);
+            if (nScript == css::i18n::ScriptType::WEAK)
+            {
+                nScript = m_nPrevScript;
+                if (m_bApplyAsianToWeakQuotes)
+                {
+                    auto nType = unicode::getUnicodeType(nChar);
+                    if (nType == css::i18n::UnicodeType::INITIAL_PUNCTUATION
+                        || nType == css::i18n::UnicodeType::FINAL_PUNCTUATION)
+                    {
+                        nScript = css::i18n::ScriptType::ASIAN;
+                    }
+                }
+            }
+
+            if (nScript != m_nPrevScript)
+            {
+                m_nIndex = nPrevIndex;
+                break;
+            }
+        }
+
+        if (m_nIndex > 0)
+        {
+            // Special case for a weak character (e.g. dotted circle) directly
+            // before a combining mark: keep it with the mark's script run.
+            // tdf#112594: likewise for NNBSP followed by a Mongolian
+            // character, since NNBSP has special uses in Mongolian.
+            auto nPrevPos = m_nIndex;
+            auto nPrevChar = m_rText.iterateCodePoints(&nPrevPos, -1);
+            if (m_nIndex < m_rText.getLength()
+                && css::i18n::ScriptType::WEAK == GetScriptClass(nPrevChar))
+            {
+                auto nChar = m_rText.iterateCodePoints(&m_nIndex, 0);
+                auto nType = unicode::getUnicodeType(nChar);
+                if (nType == css::i18n::UnicodeType::NON_SPACING_MARK
+                    || nType == css::i18n::UnicodeType::ENCLOSING_MARK
+                    || nType == css::i18n::UnicodeType::COMBINING_SPACING_MARK
+                    || (nPrevChar == CHAR_NNBSP
+                        && u_getIntPropertyValue(nChar, UCHAR_SCRIPT) == 
USCRIPT_MONGOLIAN))
+                {
+                    m_nIndex = nPrevPos;
+                }
+            }
+        }
+
+        m_stCurr = ScriptChange{ nRunStart, m_nIndex, m_nPrevScript };
+        m_nPrevScript = nScript;
+    }
+
+    ScriptChange Peek() const override { return m_stCurr; }
+};
+}
+}
+
+std::unique_ptr<i18nutil::ScriptChangeScanner>
+i18nutil::MakeScriptChangeScanner(const OUString& rText, sal_Int16 
nDefaultScriptType)
+{
+    return std::make_unique<GreedyScriptChangeScanner>(rText, 
nDefaultScriptType);
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s 
cinkeys+=0=break: */
diff --git a/i18nutil/source/utility/scriptclass.cxx 
b/i18nutil/source/utility/scriptclass.cxx
new file mode 100644
index 000000000000..843f654ac7d0
--- /dev/null
+++ b/i18nutil/source/utility/scriptclass.cxx
@@ -0,0 +1,143 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ *   Licensed to the Apache Software Foundation (ASF) under one or more
+ *   contributor license agreements. See the NOTICE file distributed
+ *   with this work for additional information regarding copyright
+ *   ownership. The ASF licenses this file to you under the Apache
+ *   License, Version 2.0 (the "License"); you may not use this file
+ *   except in compliance with the License. You may obtain a copy of
+ *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#include <config_locales.h>
+
+#include <i18nutil/scriptclass.hxx>
+#include <cppuhelper/supportsservice.hxx>
+#include <unicode/uchar.h>
+#include <i18nutil/unicode.hxx>
+#include <o3tl/string_view.hxx>
+
+#include <com/sun/star/i18n/CharType.hpp>
+#include <com/sun/star/i18n/ScriptType.hpp>
+#include <com/sun/star/i18n/WordType.hpp>
+#include <com/sun/star/uno/XComponentContext.hpp>
+
+using namespace ::com::sun::star;
+using namespace ::com::sun::star::uno;
+using namespace ::com::sun::star::i18n;
+using namespace ::com::sun::star::lang;
+
+namespace i18nutil
+{
+namespace
+{
+sal_Int16 getScriptClassByUAX24Script(sal_uInt32 currentChar)
+{
+    int32_t script = u_getIntPropertyValue(currentChar, UCHAR_SCRIPT);
+    return 
unicode::getScriptClassFromUScriptCode(static_cast<UScriptCode>(script));
+}
+
+struct UBlock2Script
+{
+    UBlockCode from;
+    UBlockCode to;
+    sal_Int16 script;
+};
+
+const UBlock2Script scriptList[] = {
+    { UBLOCK_NO_BLOCK, UBLOCK_NO_BLOCK, ScriptType::WEAK },
+    { UBLOCK_BASIC_LATIN, UBLOCK_SPACING_MODIFIER_LETTERS, ScriptType::LATIN },
+    { UBLOCK_GREEK, UBLOCK_ARMENIAN, ScriptType::LATIN },
+    { UBLOCK_HEBREW, UBLOCK_MYANMAR, ScriptType::COMPLEX },
+    { UBLOCK_GEORGIAN, UBLOCK_GEORGIAN, ScriptType::LATIN },
+    { UBLOCK_HANGUL_JAMO, UBLOCK_HANGUL_JAMO, ScriptType::ASIAN },
+    { UBLOCK_ETHIOPIC, UBLOCK_ETHIOPIC, ScriptType::COMPLEX },
+    { UBLOCK_CHEROKEE, UBLOCK_RUNIC, ScriptType::LATIN },
+    { UBLOCK_KHMER, UBLOCK_MONGOLIAN, ScriptType::COMPLEX },
+    { UBLOCK_LATIN_EXTENDED_ADDITIONAL, UBLOCK_GREEK_EXTENDED, 
ScriptType::LATIN },
+    { UBLOCK_NUMBER_FORMS, UBLOCK_NUMBER_FORMS, ScriptType::WEAK },
+    { UBLOCK_CJK_RADICALS_SUPPLEMENT, UBLOCK_HANGUL_SYLLABLES, 
ScriptType::ASIAN },
+    { UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, 
UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, ScriptType::ASIAN },
+    { UBLOCK_ARABIC_PRESENTATION_FORMS_A, UBLOCK_ARABIC_PRESENTATION_FORMS_A, 
ScriptType::COMPLEX },
+    { UBLOCK_CJK_COMPATIBILITY_FORMS, UBLOCK_CJK_COMPATIBILITY_FORMS, 
ScriptType::ASIAN },
+    { UBLOCK_ARABIC_PRESENTATION_FORMS_B, UBLOCK_ARABIC_PRESENTATION_FORMS_B, 
ScriptType::COMPLEX },
+    { UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, 
UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS,
+      ScriptType::ASIAN },
+    { UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, 
UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
+      ScriptType::ASIAN },
+    { UBLOCK_CJK_STROKES, UBLOCK_CJK_STROKES, ScriptType::ASIAN },
+    { UBLOCK_LATIN_EXTENDED_C, UBLOCK_LATIN_EXTENDED_D, ScriptType::LATIN }
+};
+
+#define scriptListCount SAL_N_ELEMENTS(scriptList)
+
+//always sets rScriptType
+
+//returns true for characters historically explicitly assigned to
+//latin/weak/asian
+
+//returns false for characters that were historically implicitly assigned to
+//weak as unknown
+bool getCompatibilityScriptClassByBlock(sal_uInt32 currentChar, sal_Int16& 
rScriptType)
+{
+    bool bKnown = true;
+    //handle specific characters always as weak:
+    //  0x01 - this breaks a word
+    //  0x02 - this can be inside a word
+    //  0x20 & 0xA0 - Bug 102975, declare western space and non-break space as WEAK char.
+    if (0x01 == currentChar || 0x02 == currentChar || 0x20 == currentChar || 
0xA0 == currentChar)
+        rScriptType = ScriptType::WEAK;
+    // Few Spacing Modifier Letters that can be Bopomofo tonal marks.
+    else if (0x2CA == currentChar || 0x2CB == currentChar || 0x2C7 == 
currentChar
+             || 0x2D9 == currentChar)
+        rScriptType = ScriptType::WEAK;
+    // tdf#52577 superscript numbers should be weak.
+    else if (0xB2 == currentChar || 0xB3 == currentChar || 0xB9 == currentChar)
+        rScriptType = ScriptType::WEAK;
+    // workaround for Coptic
+    else if (0x2C80 <= currentChar && 0x2CE3 >= currentChar)
+        rScriptType = ScriptType::LATIN;
+    else
+    {
+        UBlockCode block = ublock_getCode(currentChar);
+        size_t i = 0;
+        while (i < scriptListCount)
+        {
+            if (block <= scriptList[i].to)
+                break;
+            ++i;
+        }
+        if (i < scriptListCount && block >= scriptList[i].from)
+            rScriptType = scriptList[i].script;
+        else
+        {
+            rScriptType = ScriptType::WEAK;
+            bKnown = false;
+        }
+    }
+    return bKnown;
+}
+}
+}
+
+sal_Int16 i18nutil::GetScriptClass(sal_uInt32 currentChar)
+{
+    sal_Int16 nRet = ScriptType::WEAK;
+
+    if (!getCompatibilityScriptClassByBlock(currentChar, nRet))
+    {
+        nRet = getScriptClassByUAX24Script(currentChar);
+    }
+
+    return nRet;
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/include/i18nutil/scriptchangescanner.hxx 
b/include/i18nutil/scriptchangescanner.hxx
new file mode 100644
index 000000000000..cc7630cb83a5
--- /dev/null
+++ b/include/i18nutil/scriptchangescanner.hxx
@@ -0,0 +1,38 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <i18nutil/i18nutildllapi.h>
+#include <rtl/ustring.hxx>
+#include <optional>
+#include <memory>
+
+namespace i18nutil
+{
+struct ScriptChange
+{
+    sal_Int32 m_nStartIndex = 0;
+    sal_Int32 m_nEndIndex = 0;
+    sal_Int16 m_nScriptType = 0;
+};
+
+class I18NUTIL_DLLPUBLIC ScriptChangeScanner
+{
+public:
+    virtual ~ScriptChangeScanner() = default;
+
+    virtual bool AtEnd() const = 0;
+    virtual void Advance() = 0;
+    virtual ScriptChange Peek() const = 0;
+};
+
+I18NUTIL_DLLPUBLIC std::unique_ptr<ScriptChangeScanner>
+MakeScriptChangeScanner(const OUString& rWord, sal_Int16 nDefaultScriptType);
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s cinkeys+=0=break: */
diff --git a/include/i18nutil/scriptclass.hxx b/include/i18nutil/scriptclass.hxx
new file mode 100644
index 000000000000..5833658cbfd4
--- /dev/null
+++ b/include/i18nutil/scriptclass.hxx
@@ -0,0 +1,18 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <i18nutil/i18nutildllapi.h>
+#include <rtl/ustring.hxx>
+
+namespace i18nutil
+{
+I18NUTIL_DLLPUBLIC sal_Int16 GetScriptClass(sal_uInt32 nChar);
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s cinkeys+=0=break: */
diff --git a/sd/qa/unit/layout-tests.cxx b/sd/qa/unit/layout-tests.cxx
index 9461068480e2..57baf576c8c5 100644
--- a/sd/qa/unit/layout-tests.cxx
+++ b/sd/qa/unit/layout-tests.cxx
@@ -367,15 +367,13 @@ CPPUNIT_TEST_FIXTURE(SdLayoutTest, testTdf112594)
     xmlDocUniquePtr pXmlDoc = load("odp/Tdf112594.fodp");
 
     // Test that a NNBSP is grouped with the Mongolian characters after it, so
-    // we have one text array covering the whole string.
-    //
-    // Without the fix, it fails with:
-    // - Expected: 4
-    // - Actual  : 3
-    // - In <>, attribute 'length' of '/metafile/push[1]/push[1]/textarray[3]' incorrect value.
+    // we have two text arrays, one covering the digits, and the other with the rest.
     assertXPath(pXmlDoc, "/metafile/push[1]/push[1]/textarray[3]", "index", 
u"0");
-    assertXPath(pXmlDoc, "/metafile/push[1]/push[1]/textarray[3]", "length", 
u"4");
+    assertXPath(pXmlDoc, "/metafile/push[1]/push[1]/textarray[3]", "length", 
u"2");
     assertXPathContent(pXmlDoc, "/metafile/push[1]/push[1]/textarray[3]/text", 
u"11\u202f\u1824");
+    assertXPath(pXmlDoc, "/metafile/push[1]/push[1]/textarray[4]", "index", 
u"2");
+    assertXPath(pXmlDoc, "/metafile/push[1]/push[1]/textarray[4]", "length", 
u"2");
+    assertXPathContent(pXmlDoc, "/metafile/push[1]/push[1]/textarray[4]/text", 
u"11\u202f\u1824");
 }
 
 CPPUNIT_TEST_FIXTURE(SdLayoutTest, testTdf152906_AdjustToContour)
diff --git a/sw/source/core/text/porlay.cxx b/sw/source/core/text/porlay.cxx
index cf81585418d9..15958ab1b70f 100644
--- a/sw/source/core/text/porlay.cxx
+++ b/sw/source/core/text/porlay.cxx
@@ -80,6 +80,7 @@
 #include <i18nutil/scripttypedetector.hxx>
 #include <i18nutil/unicode.hxx>
 #include <i18nutil/kashida.hxx>
+#include <i18nutil/scriptchangescanner.hxx>
 #include <unotxdoc.hxx>
 
 using namespace ::com::sun::star;
@@ -1410,40 +1411,17 @@ void SwScriptInfo::InitScriptInfo(const SwTextNode& 
rNode,
     // remove invalid entries from kashida array
     m_Kashida.erase(m_Kashida.begin() + nCntKash, m_Kashida.end());
 
-    // TAKE CARE OF WEAK CHARACTERS: WE MUST FIND AN APPROPRIATE
-    // SCRIPT FOR WEAK CHARACTERS AT THE BEGINNING OF A PARAGRAPH
-
-    if (WEAK == g_pBreakIt->GetBreakIter()->getScriptType(rText, 
sal_Int32(nChg)))
+    // Construct the script change scanner and advance it to the change range
+    auto pScriptScanner = i18nutil::MakeScriptChangeScanner(
+        rText, 
SvtLanguageOptions::GetI18NScriptTypeOfLanguage(GetAppLanguage()));
+    while (!pScriptScanner->AtEnd())
     {
-        // If the beginning of the current group is weak, this means that
-        // all of the characters in this group are weak. We have to assign
-        // the scripts to these characters depending on the fonts which are
-        // set for these characters to display them.
-        TextFrameIndex nEnd(
-            g_pBreakIt->GetBreakIter()->endOfScript(rText, sal_Int32(nChg), 
WEAK));
-
-        if (nEnd > TextFrameIndex(rText.getLength()) || nEnd < 
TextFrameIndex(0))
-            nEnd = TextFrameIndex(rText.getLength());
-
-        nScript = SvtLanguageOptions::GetI18NScriptTypeOfLanguage( 
GetAppLanguage() );
-
-        SAL_WARN_IF( i18n::ScriptType::LATIN != nScript &&
-                i18n::ScriptType::ASIAN != nScript &&
-                i18n::ScriptType::COMPLEX != nScript, "sw.core", "Wrong 
default language" );
-
-        nChg = nEnd;
-
-        // Get next script type or set to weak in order to exit
-        sal_uInt8 nNextScript = (nEnd < TextFrameIndex(rText.getLength()))
-            ? 
static_cast<sal_uInt8>(g_pBreakIt->GetBreakIter()->getScriptType(rText, 
sal_Int32(nEnd)))
-            : sal_uInt8(WEAK);
-
-        if ( nScript != nNextScript )
+        if (pScriptScanner->Peek().m_nStartIndex <= 
static_cast<sal_Int32>(nChg))
         {
-            m_ScriptChanges.emplace_back(nEnd, nScript);
-            nCnt++;
-            nScript = nNextScript;
+            break;
         }
+
+        pScriptScanner->Advance();
     }
 
     // UPDATE THE SCRIPT INFO ARRAYS:
@@ -1451,38 +1429,12 @@ void SwScriptInfo::InitScriptInfo(const SwTextNode& 
rNode,
     while (nChg < TextFrameIndex(rText.getLength())
            || (m_ScriptChanges.empty() && rText.isEmpty()))
     {
-        SAL_WARN_IF( i18n::ScriptType::WEAK == nScript,
-                "sw.core", "Inserting WEAK into SwScriptInfo structure" );
-
-        TextFrameIndex nSearchStt = nChg;
-        nChg = TextFrameIndex(g_pBreakIt->GetBreakIter()->endOfScript(
-                    rText, sal_Int32(nSearchStt), nScript));
-
-        if (nChg > TextFrameIndex(rText.getLength()) || nChg < 
TextFrameIndex(0))
-            nChg = TextFrameIndex(rText.getLength());
-
-        // special case for dotted circle since it can be used with complex
-        // before a mark, so we want it associated with the mark's script
-        // tdf#112594: another special case for NNBSP followed by a Mongolian
-        // character, since NNBSP has special uses in Mongolian (tdf#112594)
-        auto nPos = sal_Int32(nChg);
-        auto nPrevPos = nPos;
-        auto nPrevChar = rText.iterateCodePoints(&nPrevPos, -1);
-        if (nChg < TextFrameIndex(rText.getLength()) && nChg > 
TextFrameIndex(0) &&
-            i18n::ScriptType::WEAK == 
g_pBreakIt->GetBreakIter()->getScriptType(rText, nPrevPos))
-        {
-            auto nChar = rText.iterateCodePoints(&nPos, 0);
-            auto nType = unicode::getUnicodeType(nChar);
-            if (nType == css::i18n::UnicodeType::NON_SPACING_MARK ||
-                nType == css::i18n::UnicodeType::ENCLOSING_MARK ||
-                nType == css::i18n::UnicodeType::COMBINING_SPACING_MARK ||
-                (nPrevChar == CHAR_NNBSP &&
-                 u_getIntPropertyValue(nChar, UCHAR_SCRIPT) == 
USCRIPT_MONGOLIAN))
-            {
-                nPos = nPrevPos;
-            }
-        }
-        m_ScriptChanges.emplace_back(TextFrameIndex(nPos), nScript);
+        auto stChange = pScriptScanner->Peek();
+        pScriptScanner->Advance();
+
+        nScript = stChange.m_nScriptType;
+        nChg = TextFrameIndex{ stChange.m_nEndIndex };
+        m_ScriptChanges.emplace_back(nChg, nScript);
         ++nCnt;
 
         // if current script is asian, we search for compressible characters

core.git: editeng/source i18npool/source i18nutil/CppunitTest_i18nutil_kashida.mk i18nutil/CppunitTest_i18nutil.mk i18nutil/Library_i18nutil.mk i18nutil/Module_i18nutil.mk i18nutil/qa i18nutil/source include/i18nutil sd/qa sw/source

Reply via email to