editeng/source/editeng/impedit3.cxx | 248 ++++++++++++++++++++++++++++-------- 1 file changed, 199 insertions(+), 49 deletions(-)
New commits: commit 640a2bc1cd093875b701bb958b6156d5752c0fac Author: Khaled Hosny <kha...@aliftype.com> AuthorDate: Thu Aug 11 14:53:19 2022 +0200 Commit: Caolán McNamara <caol...@redhat.com> CommitDate: Sun Aug 14 21:10:43 2022 +0200 tdf#137528: Sync Kashida algorithm in editeng with sw It seems that editeng is using an old copy of the code in sw and it was never updated. With this change, the text in the test document from tdf#137528 looks essentially identical in both cases. Fixes also tdf#146199. Change-Id: I38541bfb35eae36681af84e73a7965d36152448f Reviewed-on: https://gerrit.libreoffice.org/c/core/+/138145 Tested-by: Jenkins Reviewed-by: Caolán McNamara <caol...@redhat.com> diff --git a/editeng/source/editeng/impedit3.cxx b/editeng/source/editeng/impedit3.cxx index 9d71560c108b..d7129999dfa5 100644 --- a/editeng/source/editeng/impedit3.cxx +++ b/editeng/source/editeng/impedit3.cxx @@ -75,6 +75,8 @@ #include <vcl/outdev/ScopedStates.hxx> +#include <unicode/uchar.h> + using namespace ::com::sun::star; using namespace ::com::sun::star::uno; using namespace ::com::sun::star::beans; @@ -254,19 +256,93 @@ static Point lcl_ImplCalcRotatedPos( Point rPos, Point rOrigin, double nSin, dou return aTranslatedPos; } -static bool lcl_IsLigature( sal_Unicode cCh, sal_Unicode cNextCh ) // For Kashidas from sw/source/core/text/porlay.txt +// For Kashidas from sw/source/core/text/porlay.cxx + +#define IS_JOINING_GROUP(c, g) ( u_getIntPropertyValue( (c), UCHAR_JOINING_GROUP ) == U_JG_##g ) +#define isAinChar(c) IS_JOINING_GROUP((c), AIN) +#define isAlefChar(c) IS_JOINING_GROUP((c), ALEF) +#define isDalChar(c) IS_JOINING_GROUP((c), DAL) +#if U_ICU_VERSION_MAJOR_NUM >= 58 +#define isFehChar(c) (IS_JOINING_GROUP((c), FEH) || IS_JOINING_GROUP((c), AFRICAN_FEH)) +#else +#define isFehChar(c) IS_JOINING_GROUP((c), FEH) +#endif +#define isGafChar(c) IS_JOINING_GROUP((c), GAF) +#define isHehChar(c) IS_JOINING_GROUP((c), HEH) +#define isKafChar(c) IS_JOINING_GROUP((c), KAF) +#define isLamChar(c) IS_JOINING_GROUP((c), LAM) +#if U_ICU_VERSION_MAJOR_NUM >= 58 +#define isQafChar(c) (IS_JOINING_GROUP((c), QAF) || IS_JOINING_GROUP((c), AFRICAN_QAF)) +#else +#define isQafChar(c) IS_JOINING_GROUP((c), QAF) +#endif +#define isRehChar(c) IS_JOINING_GROUP((c), REH) +#define isTahChar(c) IS_JOINING_GROUP((c), TAH) +#define isTehMarbutaChar(c) IS_JOINING_GROUP((c), TEH_MARBUTA) +#define isWawChar(c) IS_JOINING_GROUP((c), WAW) +#define isSeenOrSadChar(c) (IS_JOINING_GROUP((c), SAD) || IS_JOINING_GROUP((c), SEEN)) + +// Beh and characters that behave like Beh in medial form. +static bool isBehChar(sal_Unicode cCh) +{ + bool bRet = false; + switch (u_getIntPropertyValue(cCh, UCHAR_JOINING_GROUP)) + { + case U_JG_BEH: + case U_JG_NOON: +#if U_ICU_VERSION_MAJOR_NUM >= 58 + case U_JG_AFRICAN_NOON: +#endif + case U_JG_NYA: + case U_JG_YEH: + case U_JG_FARSI_YEH: + case U_JG_BURUSHASKI_YEH_BARREE: + bRet = true; + break; + default: + bRet = false; + break; + } + + return bRet; +} + +// Yeh and characters that behave like Yeh in final form. +static bool isYehChar(sal_Unicode cCh) +{ + bool bRet = false; + switch (u_getIntPropertyValue(cCh, UCHAR_JOINING_GROUP)) + { + case U_JG_YEH: + case U_JG_FARSI_YEH: + case U_JG_YEH_BARREE: + case U_JG_BURUSHASKI_YEH_BARREE: + case U_JG_YEH_WITH_TAIL: + bRet = true; + break; + default: + bRet = false; + break; + } + + return bRet; +} + +static bool isTransparentChar ( sal_Unicode cCh ) +{ + return u_getIntPropertyValue( cCh, UCHAR_JOINING_TYPE ) == U_JT_TRANSPARENT; +} + +static bool lcl_IsLigature( sal_Unicode cCh, sal_Unicode cNextCh ) { // Lam + Alef - return ( 0x644 == cCh && 0x627 == cNextCh ) || - // Beh + Reh - ( 0x628 == cCh && 0x631 == cNextCh ); + return ( isLamChar ( cCh ) && isAlefChar ( cNextCh )); } -static bool lcl_ConnectToPrev( sal_Unicode cCh, sal_Unicode cPrevCh ) // For Kashidas from sw/source/core/text/porlay.txt +static bool lcl_ConnectToPrev( sal_Unicode cCh, sal_Unicode cPrevCh ) { - // Alef, Dal, Thal, Reh, Zain, and Waw do not connect to the left - bool bRet = 0x627 != cPrevCh && 0x62F != cPrevCh && 0x630 != cPrevCh && - 0x631 != cPrevCh && 0x632 != cPrevCh && 0x648 != cPrevCh; + const int32_t nJoiningType = u_getIntPropertyValue( cPrevCh, UCHAR_JOINING_TYPE ); + bool bRet = nJoiningType != U_JT_RIGHT_JOINING && nJoiningType != U_JT_NON_JOINING; // check for ligatures cPrevChar + cChar if ( bRet ) @@ -2236,8 +2312,15 @@ void ImpEditEngine::ImpAdjustBlocks( ParaPortion* pParaPortion, EditLine* pLine, pLine->SetTextWidth( pLine->GetTextWidth() + nRemainingSpace ); } +// For Kashidas from sw/source/core/text/porlay.cxx void ImpEditEngine::ImpFindKashidas( ContentNode* pNode, sal_Int32 nStart, sal_Int32 nEnd, std::vector<sal_Int32>& rArray ) { + // Kashida glyph looks suspicious, skip Kashida justification + if (GetRefDevice()->GetMinKashida() < 0) + return; + + std::vector<sal_Int32> aKashidaArray; + // the search has to be performed on a per word base EditSelection aWordSel( EditPaM( pNode, nStart ) ); @@ -2261,7 +2344,16 @@ void ImpEditEngine::ImpFindKashidas( ContentNode* pNode, sal_Int32 nStart, sal_I sal_Unicode cCh; sal_Unicode cPrevCh = 0; - while ( nIdx < aWord.getLength() ) + int nPriorityLevel = 7; // 0..6 = level found + // 7 not found + + sal_Int32 nWordLen = aWord.getLength(); + + // ignore trailing vowel chars + while( nWordLen && isTransparentChar( aWord[ nWordLen - 1 ] )) + --nWordLen; + + while ( nIdx < nWordLen ) { cCh = aWord[ nIdx ]; @@ -2270,87 +2362,145 @@ void ImpEditEngine::ImpFindKashidas( ContentNode* pNode, sal_Int32 nStart, sal_I if ( 0x640 == cCh ) { nKashidaPos = aWordSel.Min().GetIndex() + nIdx; - break; + nPriorityLevel = 0; } // 2. Priority: // after a Seen or Sad - if ( nIdx + 1 < aWord.getLength() && - ( 0x633 == cCh || 0x635 == cCh ) ) + if (nPriorityLevel >= 1 && nIdx < nWordLen - 1) { - nKashidaPos = aWordSel.Min().GetIndex() + nIdx; - break; + if( isSeenOrSadChar( cCh ) + && (aWord[ nIdx+1 ] != 0x200C) ) // #i98410#: prevent ZWNJ expansion + { + nKashidaPos = aWordSel.Min().GetIndex() + nIdx; + nPriorityLevel = 1; + } } // 3. Priority: - // before final form of the Marbuta, Hah, Dal - // 4. Priority: - // before final form of Alef, Lam or Kaf - if ( nIdx && nIdx + 1 == aWord.getLength() && - ( 0x629 == cCh || 0x62D == cCh || 0x62F == cCh || - 0x627 == cCh || 0x644 == cCh || 0x643 == cCh ) ) + // before final form of Teh Marbuta, Heh, Dal + if ( nPriorityLevel >= 2 && nIdx > 0 ) { - DBG_ASSERT( 0 != cPrevCh, "No previous character" ); + if ( isTehMarbutaChar ( cCh ) || // Teh Marbuta (right joining) + isDalChar ( cCh ) || // Dal (right joining) final form may appear in the middle of word + ( isHehChar ( cCh ) && nIdx == nWordLen - 1)) // Heh (dual joining) only at end of word + { - // check if character is connectable to previous character, - if ( lcl_ConnectToPrev( cCh, cPrevCh ) ) + SAL_WARN_IF( 0 == cPrevCh, "editeng", "No previous character" ); + // check if character is connectable to previous character, + if ( lcl_ConnectToPrev( cCh, cPrevCh ) ) + { + nKashidaPos = aWordSel.Min().GetIndex() + nIdx - 1; + nPriorityLevel = 2; + } + } + } + + // 4. Priority: + // before final form of Alef, Tah, Lam, Kaf or Gaf + if ( nPriorityLevel >= 3 && nIdx > 0 ) + { + if ( isAlefChar ( cCh ) || // Alef (right joining) final form may appear in the middle of word + (( isLamChar ( cCh ) || // Lam, + isTahChar ( cCh ) || // Tah, + isKafChar ( cCh ) || // Kaf (all dual joining) + isGafChar ( cCh ) ) + && nIdx == nWordLen - 1)) // only at end of word { - nKashidaPos = aWordSel.Min().GetIndex() + nIdx - 1; - break; + SAL_WARN_IF( 0 == cPrevCh, "editeng", "No previous character" ); + // check if character is connectable to previous character, + if ( lcl_ConnectToPrev( cCh, cPrevCh ) ) + { + nKashidaPos = aWordSel.Min().GetIndex() + nIdx - 1; + nPriorityLevel = 3; + } } } // 5. Priority: - // before media Bah - if ( nIdx && nIdx + 1 < aWord.getLength() && 0x628 == cCh ) + // before medial Beh-like + if ( nPriorityLevel >= 4 && nIdx > 0 && nIdx < nWordLen - 1 ) { - DBG_ASSERT( 0 != cPrevCh, "No previous character" ); - - // check if next character is Reh, Yeh or Alef Maksura - sal_Unicode cNextCh = aWord[ nIdx + 1 ]; + if ( isBehChar ( cCh ) ) + { + // check if next character is Reh or Yeh-like + sal_Unicode cNextCh = aWord[ nIdx + 1 ]; + if ( isRehChar ( cNextCh ) || isYehChar ( cNextCh )) + { + SAL_WARN_IF( 0 == cPrevCh, "editeng", "No previous character" ); + // check if character is connectable to previous character, + if ( lcl_ConnectToPrev( cCh, cPrevCh ) ) + { + nKashidaPos = aWordSel.Min().GetIndex() + nIdx - 1; + nPriorityLevel = 4; + } + } + } + } - if ( 0x631 == cNextCh || 0x64A == cNextCh || - 0x649 == cNextCh ) + // 6. Priority: + // before the final form of Waw, Ain, Qaf and Feh + if ( nPriorityLevel >= 5 && nIdx > 0 ) + { + if ( isWawChar ( cCh ) || // Wav (right joining) + // final form may appear in the middle of word + (( isAinChar ( cCh ) || // Ain (dual joining) + isQafChar ( cCh ) || // Qaf (dual joining) + isFehChar ( cCh ) ) // Feh (dual joining) + && nIdx == nWordLen - 1)) // only at end of word { + SAL_WARN_IF( 0 == cPrevCh, "editeng", "No previous character" ); // check if character is connectable to previous character, if ( lcl_ConnectToPrev( cCh, cPrevCh ) ) + { nKashidaPos = aWordSel.Min().GetIndex() + nIdx - 1; + nPriorityLevel = 5; + } } } - // 6. Priority: // other connecting possibilities - if ( nIdx && nIdx + 1 == aWord.getLength() && - 0x60C <= cCh && 0x6FE >= cCh ) + if ( nPriorityLevel >= 6 && nIdx > 0 ) { - DBG_ASSERT( 0 != cPrevCh, "No previous character" ); - - // check if character is connectable to previous character, - if ( lcl_ConnectToPrev( cCh, cPrevCh ) ) + // Reh, Zain + if ( isRehChar ( cCh ) ) { - // only choose this position if we did not find - // a better one: - if ( nKashidaPos<0 ) + SAL_WARN_IF( 0 == cPrevCh, "editeng", "No previous character" ); + // check if character is connectable to previous character, + if ( lcl_ConnectToPrev( cCh, cPrevCh ) ) + { nKashidaPos = aWordSel.Min().GetIndex() + nIdx - 1; - break; + nPriorityLevel = 6; + } } } - // Do not consider Fathatan, Dammatan, Kasratan, Fatha, - // Damma, Kasra, Shadda and Sukun when checking if - // a character can be connected to previous character. - if ( cCh < 0x64B || cCh > 0x652 ) + // Do not consider vowel marks when checking if a character + // can be connected to previous character. + if ( !isTransparentChar ( cCh) ) cPrevCh = cCh; ++nIdx; } // end of current word if ( nKashidaPos>=0 ) - rArray.push_back( nKashidaPos ); + aKashidaArray.push_back( nKashidaPos ); aWordSel = WordRight( aWordSel.Max(), css::i18n::WordType::DICTIONARY_WORD ); aWordSel = SelectWord( aWordSel, css::i18n::WordType::DICTIONARY_WORD ); } + + // Validate + std::vector<sal_Int32> aDropped(aKashidaArray.size()); + auto nOldLayout = GetRefDevice()->GetLayoutMode(); + GetRefDevice()->SetLayoutMode(nOldLayout | vcl::text::ComplexTextLayoutFlags::BiDiRtl); + GetRefDevice()->ValidateKashidas(pNode->GetString(), nStart, nEnd - nStart, + aKashidaArray.size(), aKashidaArray.data(), aDropped.data()); + GetRefDevice()->SetLayoutMode(nOldLayout); + + for (auto const& pos : aKashidaArray) + if (std::find(aDropped.begin(), aDropped.end(), pos) == aDropped.end()) + rArray.push_back(pos); } sal_Int32 ImpEditEngine::SplitTextPortion( ParaPortion* pPortion, sal_Int32 nPos, EditLine* pCurLine )