editeng/source/editeng/impedit2.cxx | 48 --- i18npool/source/breakiterator/breakiteratorImpl.cxx | 93 ------ i18nutil/CppunitTest_i18nutil.mk | 9 i18nutil/Library_i18nutil.mk | 2 i18nutil/Module_i18nutil.mk | 2 i18nutil/qa/cppunit/test_scriptchangescanner.cxx | 290 ++++++++++++++++++++ i18nutil/source/utility/scriptchangescanner.cxx | 160 +++++++++++ i18nutil/source/utility/scriptclass.cxx | 143 +++++++++ include/i18nutil/scriptchangescanner.hxx | 38 ++ include/i18nutil/scriptclass.hxx | 18 + sd/qa/unit/layout-tests.cxx | 12 sw/source/core/text/porlay.cxx | 78 +---- 12 files changed, 687 insertions(+), 206 deletions(-)
New commits: commit 537645c0834eab2d277113f1e3fcf039c994832d Author: Jonathan Clark <jonat...@libreoffice.org> AuthorDate: Tue Dec 10 02:25:31 2024 -0700 Commit: Jonathan Clark <jonat...@libreoffice.org> CommitDate: Mon Dec 16 22:46:05 2024 +0100 tdf#66791 sw: Treat weak punctuation as Asian in Asian paragraphs This change modifies script detection to treat certain weak punctuation marks, particularly left- and right- quotation marks, as Asian script in paragraphs containing Chinese and Japanese characters, but no Complex characters. This change improves our script detection compatibility with other programs. As part of this change, duplicated script detection code has been extracted from Writer and Edit Engine, and consolidated. Change-Id: Ib2880f2e832aaac4c0093971daa88223c7232d63 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/178608 Tested-by: Jenkins Reviewed-by: Jonathan Clark <jonat...@libreoffice.org> diff --git a/editeng/source/editeng/impedit2.cxx b/editeng/source/editeng/impedit2.cxx index f8d15dcdbee2..44f9ae6a33e1 100644 --- a/editeng/source/editeng/impedit2.cxx +++ b/editeng/source/editeng/impedit2.cxx @@ -63,6 +63,7 @@ #include <svl/asiancfg.hxx> #include <svl/voiditem.hxx> #include <i18nutil/unicode.hxx> +#include <i18nutil/scriptchangescanner.hxx> #include <comphelper/diagnose_ex.hxx> #include <comphelper/flagguard.hxx> #include <comphelper/lok.hxx> @@ -1715,51 +1716,16 @@ void ImpEditEngine::InitScriptTypes( sal_Int32 nPara ) pField = pField->GetEnd() ? 
pNode->GetCharAttribs().FindNextAttrib( EE_FEATURE_FIELD, pField->GetEnd() ) : nullptr; } - sal_Int32 nTextLen = aText.getLength(); - - sal_Int32 nPos = 0; - short nScriptType = _xBI->getScriptType( aText, nPos ); - rTypes.emplace_back( nScriptType, nPos, nTextLen ); - nPos = _xBI->endOfScript( aText, nPos, nScriptType ); - while ( ( nPos != -1 ) && ( nPos < nTextLen ) ) + auto pScriptScanner = i18nutil::MakeScriptChangeScanner( + aText, SvtLanguageOptions::GetI18NScriptTypeOfLanguage(GetDefaultLanguage())); + while (!pScriptScanner->AtEnd() || rTypes.empty()) { - rTypes.back().nEndPos = nPos; - - nScriptType = _xBI->getScriptType( aText, nPos ); - tools::Long nEndPos = _xBI->endOfScript( aText, nPos, nScriptType ); - - if ( ( nScriptType == i18n::ScriptType::WEAK ) || ( nScriptType == rTypes.back().nScriptType ) ) - { - // Expand last ScriptTypePosInfo, don't create weak or unnecessary portions - rTypes.back().nEndPos = nEndPos; - } - else - { - auto nPrevPos = nPos; - auto nPrevChar = aText.iterateCodePoints(&nPrevPos, -1); - if (_xBI->getScriptType(aText, nPrevPos) == i18n::ScriptType::WEAK) - { - auto nChar = aText.iterateCodePoints(&nPos, 0); - auto nType = unicode::getUnicodeType(nChar); - if (nType == css::i18n::UnicodeType::NON_SPACING_MARK || - nType == css::i18n::UnicodeType::ENCLOSING_MARK || - nType == css::i18n::UnicodeType::COMBINING_SPACING_MARK || - (nPrevChar == 0x202F /* NNBSP, tdf#112594 */ && - u_getIntPropertyValue(nChar, UCHAR_SCRIPT) == USCRIPT_MONGOLIAN)) - { - rTypes.back().nEndPos = nPos = nPrevPos; - break; - } - } - rTypes.emplace_back( nScriptType, nPos, nTextLen ); - } + auto stChange = pScriptScanner->Peek(); + rTypes.emplace_back(stChange.m_nScriptType, stChange.m_nStartIndex, stChange.m_nEndIndex); - nPos = nEndPos; + pScriptScanner->Advance(); } - if ( rTypes[0].nScriptType == i18n::ScriptType::WEAK ) - rTypes[0].nScriptType = ( rTypes.size() > 1 ) ? 
rTypes[1].nScriptType : SvtLanguageOptions::GetI18NScriptTypeOfLanguage( GetDefaultLanguage() ); - // create writing direction information: WritingDirectionInfos& rDirInfos = pParaPortion->getWritingDirectionInfos(); if (rDirInfos.empty()) diff --git a/i18npool/source/breakiterator/breakiteratorImpl.cxx b/i18npool/source/breakiterator/breakiteratorImpl.cxx index 6b6870f83ecd..ecff78cb790b 100644 --- a/i18npool/source/breakiterator/breakiteratorImpl.cxx +++ b/i18npool/source/breakiterator/breakiteratorImpl.cxx @@ -22,6 +22,7 @@ #include <breakiteratorImpl.hxx> #include <cppuhelper/supportsservice.hxx> #include <unicode/uchar.h> +#include <i18nutil/scriptclass.hxx> #include <i18nutil/unicode.hxx> #include <o3tl/string_view.hxx> @@ -449,94 +450,6 @@ sal_Int16 SAL_CALL BreakIteratorImpl::getWordType( const OUString& /*Text*/, return 0; } -namespace -{ -sal_Int16 getScriptClassByUAX24Script(sal_uInt32 currentChar) -{ - int32_t script = u_getIntPropertyValue(currentChar, UCHAR_SCRIPT); - return unicode::getScriptClassFromUScriptCode(static_cast<UScriptCode>(script)); -} - -struct UBlock2Script -{ - UBlockCode from; - UBlockCode to; - sal_Int16 script; -}; - -const UBlock2Script scriptList[] = -{ - {UBLOCK_NO_BLOCK, UBLOCK_NO_BLOCK, ScriptType::WEAK}, - {UBLOCK_BASIC_LATIN, UBLOCK_SPACING_MODIFIER_LETTERS, ScriptType::LATIN}, - {UBLOCK_GREEK, UBLOCK_ARMENIAN, ScriptType::LATIN}, - {UBLOCK_HEBREW, UBLOCK_MYANMAR, ScriptType::COMPLEX}, - {UBLOCK_GEORGIAN, UBLOCK_GEORGIAN, ScriptType::LATIN}, - {UBLOCK_HANGUL_JAMO, UBLOCK_HANGUL_JAMO, ScriptType::ASIAN}, - {UBLOCK_ETHIOPIC, UBLOCK_ETHIOPIC, ScriptType::COMPLEX}, - {UBLOCK_CHEROKEE, UBLOCK_RUNIC, ScriptType::LATIN}, - {UBLOCK_KHMER, UBLOCK_MONGOLIAN, ScriptType::COMPLEX}, - {UBLOCK_LATIN_EXTENDED_ADDITIONAL, UBLOCK_GREEK_EXTENDED, ScriptType::LATIN}, - {UBLOCK_NUMBER_FORMS, UBLOCK_NUMBER_FORMS, ScriptType::WEAK}, - {UBLOCK_CJK_RADICALS_SUPPLEMENT, UBLOCK_HANGUL_SYLLABLES, ScriptType::ASIAN}, - 
{UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, ScriptType::ASIAN}, - {UBLOCK_ARABIC_PRESENTATION_FORMS_A, UBLOCK_ARABIC_PRESENTATION_FORMS_A, ScriptType::COMPLEX}, - {UBLOCK_CJK_COMPATIBILITY_FORMS, UBLOCK_CJK_COMPATIBILITY_FORMS, ScriptType::ASIAN}, - {UBLOCK_ARABIC_PRESENTATION_FORMS_B, UBLOCK_ARABIC_PRESENTATION_FORMS_B, ScriptType::COMPLEX}, - {UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, ScriptType::ASIAN}, - {UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, ScriptType::ASIAN}, - {UBLOCK_CJK_STROKES, UBLOCK_CJK_STROKES, ScriptType::ASIAN}, - {UBLOCK_LATIN_EXTENDED_C, UBLOCK_LATIN_EXTENDED_D, ScriptType::LATIN} -}; - -#define scriptListCount SAL_N_ELEMENTS(scriptList) - -//always sets rScriptType - -//returns true for characters historically explicitly assigned to -//latin/weak/asian - -//returns false for characters that historically implicitly assigned to -//weak as unknown -bool getCompatibilityScriptClassByBlock(sal_uInt32 currentChar, sal_Int16 &rScriptType) -{ - bool bKnown = true; - //handle specific characters always as weak: - // 0x01 - this breaks a word - // 0x02 - this can be inside a word - // 0x20 & 0xA0 - Bug 102975, declare western space and non-break space as WEAK char. - if( 0x01 == currentChar || 0x02 == currentChar || 0x20 == currentChar || 0xA0 == currentChar) - rScriptType = ScriptType::WEAK; - // Few Spacing Modifier Letters that can be Bopomofo tonal marks. - else if ( 0x2CA == currentChar || 0x2CB == currentChar || 0x2C7 == currentChar || 0x2D9 == currentChar ) - rScriptType = ScriptType::WEAK; - // tdf#52577 superscript numbers should be we weak. 
- else if ( 0xB2 == currentChar || 0xB3 == currentChar || 0xB9 == currentChar ) - rScriptType = ScriptType::WEAK; - // workaround for Coptic - else if ( 0x2C80 <= currentChar && 0x2CE3 >= currentChar) - rScriptType = ScriptType::LATIN; - else - { - UBlockCode block=ublock_getCode(currentChar); - size_t i = 0; - while (i < scriptListCount) - { - if (block <= scriptList[i].to) - break; - ++i; - } - if (i < scriptListCount && block >= scriptList[i].from) - rScriptType = scriptList[i].script; - else - { - rScriptType = ScriptType::WEAK; - bKnown = false; - } - } - return bKnown; -} -} - sal_Int16 BreakIteratorImpl::getScriptClass(sal_uInt32 currentChar) { static sal_uInt32 lastChar = 0; @@ -545,9 +458,7 @@ sal_Int16 BreakIteratorImpl::getScriptClass(sal_uInt32 currentChar) if (currentChar != lastChar) { lastChar = currentChar; - - if (!getCompatibilityScriptClassByBlock(currentChar, nRet)) - nRet = getScriptClassByUAX24Script(currentChar); + nRet = i18nutil::GetScriptClass(currentChar); } return nRet; diff --git a/i18nutil/CppunitTest_i18nutil_kashida.mk b/i18nutil/CppunitTest_i18nutil.mk similarity index 62% rename from i18nutil/CppunitTest_i18nutil_kashida.mk rename to i18nutil/CppunitTest_i18nutil.mk index 4920f0a79a54..cdf79a2d2268 100644 --- a/i18nutil/CppunitTest_i18nutil_kashida.mk +++ b/i18nutil/CppunitTest_i18nutil.mk @@ -9,13 +9,16 @@ For makefiles: # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
# -$(eval $(call gb_CppunitTest_CppunitTest,i18nutil_kashida)) +$(eval $(call gb_CppunitTest_CppunitTest,i18nutil)) -$(eval $(call gb_CppunitTest_add_exception_objects,i18nutil_kashida,\ +$(eval $(call gb_CppunitTest_use_sdk_api,i18nutil)) + +$(eval $(call gb_CppunitTest_add_exception_objects,i18nutil,\ i18nutil/qa/cppunit/test_kashida \ + i18nutil/qa/cppunit/test_scriptchangescanner \ )) -$(eval $(call gb_CppunitTest_use_libraries,i18nutil_kashida,\ +$(eval $(call gb_CppunitTest_use_libraries,i18nutil,\ i18nutil \ sal \ test \ diff --git a/i18nutil/Library_i18nutil.mk b/i18nutil/Library_i18nutil.mk index 264c9c9f969e..8e68c3ba00c2 100644 --- a/i18nutil/Library_i18nutil.mk +++ b/i18nutil/Library_i18nutil.mk @@ -47,6 +47,8 @@ $(eval $(call gb_Library_add_exception_objects,i18nutil,\ i18nutil/source/utility/kashida \ i18nutil/source/utility/oneToOneMapping \ i18nutil/source/utility/paper \ + i18nutil/source/utility/scriptchangescanner \ + i18nutil/source/utility/scriptclass \ i18nutil/source/utility/scripttypedetector \ i18nutil/source/utility/unicode \ i18nutil/source/utility/widthfolding \ diff --git a/i18nutil/Module_i18nutil.mk b/i18nutil/Module_i18nutil.mk index bb8ef7056c5c..3fac872e83ed 100644 --- a/i18nutil/Module_i18nutil.mk +++ b/i18nutil/Module_i18nutil.mk @@ -13,7 +13,7 @@ $(eval $(call gb_Module_add_targets,i18nutil,\ )) $(eval $(call gb_Module_add_check_targets,i18nutil,\ - CppunitTest_i18nutil_kashida \ + CppunitTest_i18nutil \ )) # vim: set noet sw=4: diff --git a/i18nutil/qa/cppunit/test_scriptchangescanner.cxx b/i18nutil/qa/cppunit/test_scriptchangescanner.cxx new file mode 100644 index 000000000000..99685bb7be21 --- /dev/null +++ b/i18nutil/qa/cppunit/test_scriptchangescanner.cxx @@ -0,0 +1,290 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include <sal/types.h> +#include <cppunit/TestAssert.h> +#include <cppunit/TestFixture.h> +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/plugin/TestPlugIn.h> +#include <i18nutil/scriptchangescanner.hxx> +#include <com/sun/star/i18n/ScriptType.hpp> + +using namespace i18nutil; +namespace css = ::com::sun::star; + +namespace +{ +class ScriptChangeScannerTest : public CppUnit::TestFixture +{ +public: + void testEmpty(); + void testTrivial(); + void testTrivialAppLang(); + void testWeakAtStart(); + void testStrongChange(); + void testMongolianAfterNNBSP(); + void testNonspacingMark(); + void testSmartQuoteCompatibilityCJ(); + void testSmartQuoteCompatibilityComplexAndCJ(); + void testSmartQuoteCJAtStart(); + + CPPUNIT_TEST_SUITE(ScriptChangeScannerTest); + CPPUNIT_TEST(testEmpty); + CPPUNIT_TEST(testTrivial); + CPPUNIT_TEST(testTrivialAppLang); + CPPUNIT_TEST(testWeakAtStart); + CPPUNIT_TEST(testStrongChange); + CPPUNIT_TEST(testMongolianAfterNNBSP); + CPPUNIT_TEST(testNonspacingMark); + CPPUNIT_TEST(testSmartQuoteCompatibilityCJ); + CPPUNIT_TEST(testSmartQuoteCompatibilityComplexAndCJ); + CPPUNIT_TEST(testSmartQuoteCJAtStart); + CPPUNIT_TEST_SUITE_END(); +}; + +void ScriptChangeScannerTest::testEmpty() +{ + auto aText = u""_ustr; + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN); + CPPUNIT_ASSERT(pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nEndIndex); +} + +void ScriptChangeScannerTest::testTrivial() +{ + auto aText = u"Trivial case with a single span of a script"_ustr; + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + 
CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(43), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + CPPUNIT_ASSERT(pScanner->AtEnd()); +} + +void ScriptChangeScannerTest::testTrivialAppLang() +{ + auto aText = u"Trivial case with a single span of a script"_ustr; + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::ASIAN); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(43), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + CPPUNIT_ASSERT(pScanner->AtEnd()); +} + +void ScriptChangeScannerTest::testWeakAtStart() +{ + auto aText = u"“x”"_ustr; + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::COMPLEX); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::COMPLEX, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(pScanner->AtEnd()); +} + +void ScriptChangeScannerTest::testStrongChange() +{ + auto aText = u"wide 廣 vast"_ustr; + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex); + 
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::ASIAN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(11), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(pScanner->AtEnd()); +} + +void ScriptChangeScannerTest::testMongolianAfterNNBSP() +{ + // NNBSP before Mongolian text should be part of the Mongolian run + auto aText = u"Before\u202f\u1822\u1822After"_ustr; + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::COMPLEX, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(14), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(pScanner->AtEnd()); +} + +void ScriptChangeScannerTest::testNonspacingMark() +{ + // A preceding weak character should be included in 
the run + // of a following non-spacing mark + auto aText = u"Before \u0944\u0911\u0911 After"_ustr; + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::COMPLEX, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(11), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(11), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(16), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(pScanner->AtEnd()); +} + +void ScriptChangeScannerTest::testSmartQuoteCompatibilityCJ() +{ + // tdf#66791: For compatibility with other programs, weak-script quotes in paragraphs + // containing CJ characters should be treated as Asian script + + auto aText = u"Before \u201c水\u201d After"_ustr; + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::ASIAN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(11), 
pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(11), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(16), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(pScanner->AtEnd()); +} + +void ScriptChangeScannerTest::testSmartQuoteCompatibilityComplexAndCJ() +{ + // tdf#66791: However, if a paragraph contains complex text, weak-script + // quotes are assigned in the usual greedy way. + + auto aText = u"Before \u201c水\u201d After \u05d0"_ustr; + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::ASIAN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(11), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(11), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(17), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::COMPLEX, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(17), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(18), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(pScanner->AtEnd()); +} + +void ScriptChangeScannerTest::testSmartQuoteCJAtStart() +{ 
+ auto aText = u"“廣”"_ustr; + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::ASIAN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(pScanner->AtEnd()); +} + +CPPUNIT_TEST_SUITE_REGISTRATION(ScriptChangeScannerTest); +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s cinkeys+=0=break: */ diff --git a/i18nutil/source/utility/scriptchangescanner.cxx b/i18nutil/source/utility/scriptchangescanner.cxx new file mode 100644 index 000000000000..8a62f4de1660 --- /dev/null +++ b/i18nutil/source/utility/scriptchangescanner.cxx @@ -0,0 +1,160 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#include <i18nutil/scriptchangescanner.hxx> +#include <i18nutil/unicode.hxx> +#include <i18nutil/scriptclass.hxx> +#include <unicode/uchar.h> +#include <sal/log.hxx> +#include <com/sun/star/i18n/ScriptType.hpp> +#include <com/sun/star/i18n/CharType.hpp> +#include <com/sun/star/i18n/UnicodeType.hpp> + +namespace css = ::com::sun::star; + +namespace i18nutil +{ +namespace +{ +constexpr sal_uInt32 CHAR_NNBSP = 0x202f; + +class GreedyScriptChangeScanner : public ScriptChangeScanner +{ +private: + ScriptChange m_stCurr; + const OUString& m_rText; + sal_Int16 m_nPrevScript; + sal_Int32 m_nIndex = 0; + bool m_bAtEnd = false; + bool m_bApplyAsianToWeakQuotes = false; + +public: + GreedyScriptChangeScanner(const OUString& rText, sal_Int16 nDefaultScriptType) + : m_rText(rText) + , m_nPrevScript(nDefaultScriptType) + { + // tdf#66791: For compatibility with other programs, the Asian script is + // applied to any weak-script quote characters if the enclosing paragraph + // contains Chinese- or Japanese-script characters. + sal_Int32 nCjBase = 0; + while (nCjBase < m_rText.getLength()) + { + auto nChar = m_rText.iterateCodePoints(&nCjBase); + auto nScript = GetScriptClass(nChar); + if (nScript == css::i18n::ScriptType::COMPLEX) + { + m_bApplyAsianToWeakQuotes = false; + break; + } + + auto nUnicodeScript = u_getIntPropertyValue(nChar, UCHAR_SCRIPT); + switch (nUnicodeScript) + { + case USCRIPT_HAN: + case USCRIPT_HIRAGANA: + case USCRIPT_KATAKANA: + m_bApplyAsianToWeakQuotes = true; + break; + + default: + break; + } + } + + // In the original Writer algorithm, the application language is used for + // all leading weak characters. Make a change record for those characters. + Advance(); + if (m_stCurr.m_nStartIndex == m_stCurr.m_nEndIndex) + { + // The text does not start with application-language leading characters. + // Initialize with a non-empty record. 
+ Advance(); + } + } + + bool AtEnd() const override { return m_bAtEnd; } + + void Advance() override + { + m_stCurr = ScriptChange{ /*start*/ 0, /*end*/ 0, /*type*/ m_nPrevScript }; + + if (m_nIndex >= m_rText.getLength()) + { + m_bAtEnd = true; + return; + } + + auto nRunStart = m_nIndex; + auto nScript = m_nPrevScript; + while (m_nIndex < m_rText.getLength()) + { + auto nPrevIndex = m_nIndex; + auto nChar = m_rText.iterateCodePoints(&m_nIndex); + nScript = GetScriptClass(nChar); + if (nScript == css::i18n::ScriptType::WEAK) + { + nScript = m_nPrevScript; + if (m_bApplyAsianToWeakQuotes) + { + auto nType = unicode::getUnicodeType(nChar); + if (nType == css::i18n::UnicodeType::INITIAL_PUNCTUATION + || nType == css::i18n::UnicodeType::FINAL_PUNCTUATION) + { + nScript = css::i18n::ScriptType::ASIAN; + } + } + } + + if (nScript != m_nPrevScript) + { + m_nIndex = nPrevIndex; + break; + } + } + + if (m_nIndex > 0) + { + // special case for dotted circle since it can be used with complex + // before a mark, so we want it associated with the mark's script + // tdf#112594: another special case for NNBSP followed by a Mongolian + // character, since NNBSP has special uses in Mongolian + auto nPrevPos = m_nIndex; + auto nPrevChar = m_rText.iterateCodePoints(&nPrevPos, -1); + if (m_nIndex < m_rText.getLength() + && css::i18n::ScriptType::WEAK == GetScriptClass(nPrevChar)) + { + auto nChar = m_rText.iterateCodePoints(&m_nIndex, 0); + auto nType = unicode::getUnicodeType(nChar); + if (nType == css::i18n::UnicodeType::NON_SPACING_MARK + || nType == css::i18n::UnicodeType::ENCLOSING_MARK + || nType == css::i18n::UnicodeType::COMBINING_SPACING_MARK + || (nPrevChar == CHAR_NNBSP + && u_getIntPropertyValue(nChar, UCHAR_SCRIPT) == USCRIPT_MONGOLIAN)) + { + m_nIndex = nPrevPos; + } + } + } + + m_stCurr = ScriptChange{ nRunStart, m_nIndex, m_nPrevScript }; + m_nPrevScript = nScript; + } + + ScriptChange Peek() const override { return m_stCurr; } +}; +} +} + 
+std::unique_ptr<i18nutil::ScriptChangeScanner> +i18nutil::MakeScriptChangeScanner(const OUString& rText, sal_Int16 nDefaultScriptType) +{ + return std::make_unique<GreedyScriptChangeScanner>(rText, nDefaultScriptType); +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s cinkeys+=0=break: */ diff --git a/i18nutil/source/utility/scriptclass.cxx b/i18nutil/source/utility/scriptclass.cxx new file mode 100644 index 000000000000..843f654ac7d0 --- /dev/null +++ b/i18nutil/source/utility/scriptclass.cxx @@ -0,0 +1,143 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . 
+ */ + +#include <config_locales.h> + +#include <i18nutil/scriptclass.hxx> +#include <cppuhelper/supportsservice.hxx> +#include <unicode/uchar.h> +#include <i18nutil/unicode.hxx> +#include <o3tl/string_view.hxx> + +#include <com/sun/star/i18n/CharType.hpp> +#include <com/sun/star/i18n/ScriptType.hpp> +#include <com/sun/star/i18n/WordType.hpp> +#include <com/sun/star/uno/XComponentContext.hpp> + +using namespace ::com::sun::star; +using namespace ::com::sun::star::uno; +using namespace ::com::sun::star::i18n; +using namespace ::com::sun::star::lang; + +namespace i18nutil +{ +namespace +{ +sal_Int16 getScriptClassByUAX24Script(sal_uInt32 currentChar) +{ + int32_t script = u_getIntPropertyValue(currentChar, UCHAR_SCRIPT); + return unicode::getScriptClassFromUScriptCode(static_cast<UScriptCode>(script)); +} + +struct UBlock2Script +{ + UBlockCode from; + UBlockCode to; + sal_Int16 script; +}; + +const UBlock2Script scriptList[] = { + { UBLOCK_NO_BLOCK, UBLOCK_NO_BLOCK, ScriptType::WEAK }, + { UBLOCK_BASIC_LATIN, UBLOCK_SPACING_MODIFIER_LETTERS, ScriptType::LATIN }, + { UBLOCK_GREEK, UBLOCK_ARMENIAN, ScriptType::LATIN }, + { UBLOCK_HEBREW, UBLOCK_MYANMAR, ScriptType::COMPLEX }, + { UBLOCK_GEORGIAN, UBLOCK_GEORGIAN, ScriptType::LATIN }, + { UBLOCK_HANGUL_JAMO, UBLOCK_HANGUL_JAMO, ScriptType::ASIAN }, + { UBLOCK_ETHIOPIC, UBLOCK_ETHIOPIC, ScriptType::COMPLEX }, + { UBLOCK_CHEROKEE, UBLOCK_RUNIC, ScriptType::LATIN }, + { UBLOCK_KHMER, UBLOCK_MONGOLIAN, ScriptType::COMPLEX }, + { UBLOCK_LATIN_EXTENDED_ADDITIONAL, UBLOCK_GREEK_EXTENDED, ScriptType::LATIN }, + { UBLOCK_NUMBER_FORMS, UBLOCK_NUMBER_FORMS, ScriptType::WEAK }, + { UBLOCK_CJK_RADICALS_SUPPLEMENT, UBLOCK_HANGUL_SYLLABLES, ScriptType::ASIAN }, + { UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, ScriptType::ASIAN }, + { UBLOCK_ARABIC_PRESENTATION_FORMS_A, UBLOCK_ARABIC_PRESENTATION_FORMS_A, ScriptType::COMPLEX }, + { UBLOCK_CJK_COMPATIBILITY_FORMS, UBLOCK_CJK_COMPATIBILITY_FORMS, 
ScriptType::ASIAN }, + { UBLOCK_ARABIC_PRESENTATION_FORMS_B, UBLOCK_ARABIC_PRESENTATION_FORMS_B, ScriptType::COMPLEX }, + { UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, + ScriptType::ASIAN }, + { UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, + ScriptType::ASIAN }, + { UBLOCK_CJK_STROKES, UBLOCK_CJK_STROKES, ScriptType::ASIAN }, + { UBLOCK_LATIN_EXTENDED_C, UBLOCK_LATIN_EXTENDED_D, ScriptType::LATIN } +}; + +#define scriptListCount SAL_N_ELEMENTS(scriptList) + +//always sets rScriptType + +//returns true for characters historically explicitly assigned to +//latin/weak/asian + +//returns false for characters that were historically implicitly assigned to +//weak as unknown +bool getCompatibilityScriptClassByBlock(sal_uInt32 currentChar, sal_Int16& rScriptType) +{ + bool bKnown = true; + //handle specific characters always as weak: + // 0x01 - this breaks a word + // 0x02 - this can be inside a word + // 0x20 & 0xA0 - Bug 102975, declare western space and non-break space as WEAK char. + if (0x01 == currentChar || 0x02 == currentChar || 0x20 == currentChar || 0xA0 == currentChar) + rScriptType = ScriptType::WEAK; + // Few Spacing Modifier Letters that can be Bopomofo tonal marks. + else if (0x2CA == currentChar || 0x2CB == currentChar || 0x2C7 == currentChar + || 0x2D9 == currentChar) + rScriptType = ScriptType::WEAK; + // tdf#52577 superscript numbers should be weak. 
+ else if (0xB2 == currentChar || 0xB3 == currentChar || 0xB9 == currentChar) + rScriptType = ScriptType::WEAK; + // workaround for Coptic + else if (0x2C80 <= currentChar && 0x2CE3 >= currentChar) + rScriptType = ScriptType::LATIN; + else + { + UBlockCode block = ublock_getCode(currentChar); + size_t i = 0; + while (i < scriptListCount) + { + if (block <= scriptList[i].to) + break; + ++i; + } + if (i < scriptListCount && block >= scriptList[i].from) + rScriptType = scriptList[i].script; + else + { + rScriptType = ScriptType::WEAK; + bKnown = false; + } + } + return bKnown; +} +} +} + +sal_Int16 i18nutil::GetScriptClass(sal_uInt32 currentChar) +{ + sal_Int16 nRet = ScriptType::WEAK; + + if (!getCompatibilityScriptClassByBlock(currentChar, nRet)) + { + nRet = getScriptClassByUAX24Script(currentChar); + } + + return nRet; +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/i18nutil/scriptchangescanner.hxx b/include/i18nutil/scriptchangescanner.hxx new file mode 100644 index 000000000000..cc7630cb83a5 --- /dev/null +++ b/include/i18nutil/scriptchangescanner.hxx @@ -0,0 +1,38 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#include <i18nutil/i18nutildllapi.h> +#include <rtl/ustring.hxx> +#include <optional> +#include <memory> + +namespace i18nutil /* NOTE(review): this new header shows no #pragma once or include guard, so ScriptChange would be redefined if it were included twice (confirm and add a guard) */ +{ +struct ScriptChange /* one script run as returned by ScriptChangeScanner::Peek(): an index range plus its script type */ +{ + sal_Int32 m_nStartIndex = 0; /* start of the run (presumably an index into the scanned text, confirm) */ + sal_Int32 m_nEndIndex = 0; /* end of the run; whether it is inclusive or exclusive is not visible here (confirm) */ + sal_Int16 m_nScriptType = 0; /* script of the run (presumably a css::i18n::ScriptType value, confirm) */ +}; + +class I18NUTIL_DLLPUBLIC ScriptChangeScanner /* abstract cursor over successive ScriptChange runs; instances come from MakeScriptChangeScanner() */ +{ +public: + virtual ~ScriptChangeScanner() = default; + + virtual bool AtEnd() const = 0; /* callers loop while this is false; presumably true once no further run is available */ + virtual void Advance() = 0; /* presumably steps to the next run (confirm against the implementation) */ + virtual ScriptChange Peek() const = 0; /* returns the current run by value; const, so it does not advance the scanner */ +}; + +I18NUTIL_DLLPUBLIC std::unique_ptr<ScriptChangeScanner> +MakeScriptChangeScanner(const OUString& rWord, sal_Int16 nDefaultScriptType); /* factory for a scanner over rWord; nDefaultScriptType is presumably the script given to weak runs (confirm) */ +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s cinkeys+=0=break: */ diff --git a/include/i18nutil/scriptclass.hxx b/include/i18nutil/scriptclass.hxx new file mode 100644 index 000000000000..5833658cbfd4 --- /dev/null +++ b/include/i18nutil/scriptclass.hxx @@ -0,0 +1,18 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include <i18nutil/i18nutildllapi.h> +#include <rtl/ustring.hxx> + +namespace i18nutil /* NOTE(review): as with scriptchangescanner.hxx, no #pragma once or include guard is visible in this header (confirm) */ +{ +I18NUTIL_DLLPUBLIC sal_Int16 GetScriptClass(sal_uInt32 nChar); /* script class of the single code point nChar; the definition tries the compatibility block table first, then the UAX24 script lookup */ +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s cinkeys+=0=break: */ diff --git a/sd/qa/unit/layout-tests.cxx b/sd/qa/unit/layout-tests.cxx index 9461068480e2..57baf576c8c5 100644 --- a/sd/qa/unit/layout-tests.cxx +++ b/sd/qa/unit/layout-tests.cxx @@ -367,15 +367,13 @@ CPPUNIT_TEST_FIXTURE(SdLayoutTest, testTdf112594) xmlDocUniquePtr pXmlDoc = load("odp/Tdf112594.fodp"); // Test that a NNBSP is grouped with the Mongolian characters after it, so - // we have one text array covering the whole string. 
- // - // Without the fix, it fails with: - // - Expected: 4 - // - Actual : 3 - // - In <>, attribute 'length' of '/metafile/push[1]/push[1]/textarray[3]' incorrect value. + // we have two text arrays, one covering the digits, and the other with the rest. assertXPath(pXmlDoc, "/metafile/push[1]/push[1]/textarray[3]", "index", u"0"); - assertXPath(pXmlDoc, "/metafile/push[1]/push[1]/textarray[3]", "length", u"4"); + assertXPath(pXmlDoc, "/metafile/push[1]/push[1]/textarray[3]", "length", u"2"); assertXPathContent(pXmlDoc, "/metafile/push[1]/push[1]/textarray[3]/text", u"11\u202f\u1824"); + assertXPath(pXmlDoc, "/metafile/push[1]/push[1]/textarray[4]", "index", u"2"); + assertXPath(pXmlDoc, "/metafile/push[1]/push[1]/textarray[4]", "length", u"2"); + assertXPathContent(pXmlDoc, "/metafile/push[1]/push[1]/textarray[4]/text", u"11\u202f\u1824"); } CPPUNIT_TEST_FIXTURE(SdLayoutTest, testTdf152906_AdjustToContour) diff --git a/sw/source/core/text/porlay.cxx b/sw/source/core/text/porlay.cxx index cf81585418d9..15958ab1b70f 100644 --- a/sw/source/core/text/porlay.cxx +++ b/sw/source/core/text/porlay.cxx @@ -80,6 +80,7 @@ #include <i18nutil/scripttypedetector.hxx> #include <i18nutil/unicode.hxx> #include <i18nutil/kashida.hxx> +#include <i18nutil/scriptchangescanner.hxx> #include <unotxdoc.hxx> using namespace ::com::sun::star; @@ -1410,40 +1411,17 @@ void SwScriptInfo::InitScriptInfo(const SwTextNode& rNode, // remove invalid entries from kashida array m_Kashida.erase(m_Kashida.begin() + nCntKash, m_Kashida.end()); - // TAKE CARE OF WEAK CHARACTERS: WE MUST FIND AN APPROPRIATE - // SCRIPT FOR WEAK CHARACTERS AT THE BEGINNING OF A PARAGRAPH - - if (WEAK == g_pBreakIt->GetBreakIter()->getScriptType(rText, sal_Int32(nChg))) + // Construct the script change scanner and advance it to the change range + auto pScriptScanner = i18nutil::MakeScriptChangeScanner( + rText, SvtLanguageOptions::GetI18NScriptTypeOfLanguage(GetAppLanguage())); + while (!pScriptScanner->AtEnd()) { - // 
If the beginning of the current group is weak, this means that - // all of the characters in this group are weak. We have to assign - // the scripts to these characters depending on the fonts which are - // set for these characters to display them. - TextFrameIndex nEnd( - g_pBreakIt->GetBreakIter()->endOfScript(rText, sal_Int32(nChg), WEAK)); - - if (nEnd > TextFrameIndex(rText.getLength()) || nEnd < TextFrameIndex(0)) - nEnd = TextFrameIndex(rText.getLength()); - - nScript = SvtLanguageOptions::GetI18NScriptTypeOfLanguage( GetAppLanguage() ); - - SAL_WARN_IF( i18n::ScriptType::LATIN != nScript && - i18n::ScriptType::ASIAN != nScript && - i18n::ScriptType::COMPLEX != nScript, "sw.core", "Wrong default language" ); - - nChg = nEnd; - - // Get next script type or set to weak in order to exit - sal_uInt8 nNextScript = (nEnd < TextFrameIndex(rText.getLength())) - ? static_cast<sal_uInt8>(g_pBreakIt->GetBreakIter()->getScriptType(rText, sal_Int32(nEnd))) - : sal_uInt8(WEAK); - - if ( nScript != nNextScript ) + if (pScriptScanner->Peek().m_nStartIndex <= static_cast<sal_Int32>(nChg)) { - m_ScriptChanges.emplace_back(nEnd, nScript); - nCnt++; - nScript = nNextScript; + break; } + + pScriptScanner->Advance(); } // UPDATE THE SCRIPT INFO ARRAYS: @@ -1451,38 +1429,12 @@ void SwScriptInfo::InitScriptInfo(const SwTextNode& rNode, while (nChg < TextFrameIndex(rText.getLength()) || (m_ScriptChanges.empty() && rText.isEmpty())) { - SAL_WARN_IF( i18n::ScriptType::WEAK == nScript, - "sw.core", "Inserting WEAK into SwScriptInfo structure" ); - - TextFrameIndex nSearchStt = nChg; - nChg = TextFrameIndex(g_pBreakIt->GetBreakIter()->endOfScript( - rText, sal_Int32(nSearchStt), nScript)); - - if (nChg > TextFrameIndex(rText.getLength()) || nChg < TextFrameIndex(0)) - nChg = TextFrameIndex(rText.getLength()); - - // special case for dotted circle since it can be used with complex - // before a mark, so we want it associated with the mark's script - // tdf#112594: another special case 
for NNBSP followed by a Mongolian - // character, since NNBSP has special uses in Mongolian (tdf#112594) - auto nPos = sal_Int32(nChg); - auto nPrevPos = nPos; - auto nPrevChar = rText.iterateCodePoints(&nPrevPos, -1); - if (nChg < TextFrameIndex(rText.getLength()) && nChg > TextFrameIndex(0) && - i18n::ScriptType::WEAK == g_pBreakIt->GetBreakIter()->getScriptType(rText, nPrevPos)) - { - auto nChar = rText.iterateCodePoints(&nPos, 0); - auto nType = unicode::getUnicodeType(nChar); - if (nType == css::i18n::UnicodeType::NON_SPACING_MARK || - nType == css::i18n::UnicodeType::ENCLOSING_MARK || - nType == css::i18n::UnicodeType::COMBINING_SPACING_MARK || - (nPrevChar == CHAR_NNBSP && - u_getIntPropertyValue(nChar, UCHAR_SCRIPT) == USCRIPT_MONGOLIAN)) - { - nPos = nPrevPos; - } - } - m_ScriptChanges.emplace_back(TextFrameIndex(nPos), nScript); + auto stChange = pScriptScanner->Peek(); + pScriptScanner->Advance(); + + nScript = stChange.m_nScriptType; + nChg = TextFrameIndex{ stChange.m_nEndIndex }; + m_ScriptChanges.emplace_back(nChg, nScript); ++nCnt; // if current script is asian, we search for compressible characters