editeng/source/editeng/impedit4.cxx | 36 +++++++++--------- sc/qa/unit/copy_paste_test.cxx | 28 ++++++++++++++ sc/qa/unit/data/xlsx/tdf122716_font_with_charset.xlsx |binary 3 files changed, 47 insertions(+), 17 deletions(-)
New commits: commit 0c1ae785e3fb3a800f6b7743a03245dca6c01f14 Author: Mike Kaganski <mike.kagan...@collabora.com> AuthorDate: Tue Nov 5 16:08:50 2024 +0500 Commit: Mike Kaganski <mike.kagan...@collabora.com> CommitDate: Wed Nov 6 08:32:16 2024 +0100 tdf#122716: take encoding defined for font into account Before this, the non-ASCII Windows-1252 characters get exported to RTF without Unicode markup, regardless of the font-defined charset; and on import to Writer (and other compliant RTF readers), this 8-bit markup was interpreted using the font data, producing different characters. Change-Id: I2032930b6585287fde3eb3b5e6abed0298d29330 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/176048 Tested-by: Jenkins Reviewed-by: Mike Kaganski <mike.kagan...@collabora.com> diff --git a/editeng/source/editeng/impedit4.cxx b/editeng/source/editeng/impedit4.cxx index 22e99ccf894f..b4ba8ea835a0 100644 --- a/editeng/source/editeng/impedit4.cxx +++ b/editeng/source/editeng/impedit4.cxx @@ -293,6 +293,15 @@ void ImpEditEngine::WriteXML(SvStream& rOutput, const EditSelection& rSel) SvxWriteXML( *GetEditEnginePtr(), rOutput, aESel ); } +static size_t GetFontIndex(const SfxPoolItem& rItem, + const std::vector<std::unique_ptr<SvxFontItem>>& rFontTable) +{ + for (size_t i = 0; i < rFontTable.size(); ++i) + if (*rFontTable[i] == rItem) + return i; + return 0; +} + ErrCode ImpEditEngine::WriteRTF( SvStream& rOutput, EditSelection aSel, bool bClipboard ) { assert( IsUpdateLayout() && "WriteRTF for UpdateMode = sal_False!" ); @@ -381,10 +390,6 @@ ErrCode ImpEditEngine::WriteRTF( SvStream& rOutput, EditSelection aSel, bool bCl rtl_TextEncoding eChrSet = pFontItem->GetCharSet(); // tdf#47679 OpenSymbol is not encoded in Symbol Encoding - // and anyway we always attempt to write as eDestEnc - // of RTL_TEXTENCODING_MS_1252 and pay no attention - // on export what encoding we claim to use for these - // fonts. 
if (IsOpenSymbol(pFontItem->GetFamilyName())) { SAL_WARN_IF(eChrSet == RTL_TEXTENCODING_SYMBOL, "editeng", "OpenSymbol should not have charset of RTL_TEXTENCODING_SYMBOL in new documents"); @@ -687,10 +692,17 @@ ErrCode ImpEditEngine::WriteRTF( SvStream& rOutput, EditSelection aSel, bool bCl aAttribItems.Clear(); sal_uInt16 nScriptTypeI18N = GetI18NScriptType( EditPaM( pNode, nIndex+1 ) ); SvtScriptType nScriptType = SvtLanguageOptions::FromI18NToSvtScriptType(nScriptTypeI18N); + rtl_TextEncoding actEncoding = eDestEnc; if ( !n || IsScriptChange( EditPaM( pNode, nIndex ) ) ) { SfxItemSet aAttribs = GetAttribs( nNode, nIndex+1, nIndex+1 ); - aAttribItems.Insert( &aAttribs.Get( GetScriptItemId( EE_CHAR_FONTINFO, nScriptType ) ) ); + auto& item = aAttribs.Get(GetScriptItemId(EE_CHAR_FONTINFO, nScriptType)); + aAttribItems.Insert(&item); + // The actual encoding that RTF uses for the portion is defined by the font + if (auto i = GetFontIndex(item, aFontTable); + i < aFontTable.size() + && aFontTable[i]->GetCharSet() != RTL_TEXTENCODING_DONTKNOW) + actEncoding = aFontTable[i]->GetCharSet(); aAttribItems.Insert( &aAttribs.Get( GetScriptItemId( EE_CHAR_FONTHEIGHT, nScriptType ) ) ); aAttribItems.Insert( &aAttribs.Get( GetScriptItemId( EE_CHAR_WEIGHT, nScriptType ) ) ); aAttribItems.Insert( &aAttribs.Get( GetScriptItemId( EE_CHAR_ITALIC, nScriptType ) ) ); @@ -711,7 +723,7 @@ ErrCode ImpEditEngine::WriteRTF( SvStream& rOutput, EditSelection aSel, bool bCl nE = nEndPos; OUString aRTFStr = EditDoc::GetParaAsString( pNode, nS, nE); - RTFOutFuncs::Out_String( rOutput, aRTFStr, eDestEnc ); + RTFOutFuncs::Out_String(rOutput, aRTFStr, actEncoding); rOutput.WriteChar( '}' ); } if ( bFinishPortion ) @@ -847,18 +859,8 @@ void ImpEditEngine::WriteItemAsRTF( const SfxPoolItem& rItem, SvStream& rOutput, case EE_CHAR_FONTINFO_CJK: case EE_CHAR_FONTINFO_CTL: { - sal_uInt32 n = 0; - for (size_t i = 0; i < rFontTable.size(); ++i) - { - if (*rFontTable[i] == rItem) - { - n = i; - break; - } 
- } - rOutput.WriteOString( OOO_STRING_SVTOOLS_RTF_F ); - rOutput.WriteNumberAsString( n ); + rOutput.WriteNumberAsString(GetFontIndex(rItem, rFontTable)); } break; case EE_CHAR_FONTHEIGHT: diff --git a/sc/qa/unit/copy_paste_test.cxx b/sc/qa/unit/copy_paste_test.cxx index 41d764e82381..b1691fc23d5d 100644 --- a/sc/qa/unit/copy_paste_test.cxx +++ b/sc/qa/unit/copy_paste_test.cxx @@ -45,6 +45,7 @@ public: void tdf113500_autofillMixed(); void tdf137625_autofillMergedUserlist(); void tdf137624_autofillMergedMixed(); + void tdf122716_rtf_portion_encoding(); CPPUNIT_TEST_SUITE(ScCopyPasteTest); CPPUNIT_TEST(testCopyPasteXLS); @@ -62,6 +63,7 @@ public: CPPUNIT_TEST(tdf113500_autofillMixed); CPPUNIT_TEST(tdf137625_autofillMergedUserlist); CPPUNIT_TEST(tdf137624_autofillMergedMixed); + CPPUNIT_TEST(tdf122716_rtf_portion_encoding); CPPUNIT_TEST_SUITE_END(); private: @@ -775,6 +777,32 @@ void ScCopyPasteTest::tdf137624_autofillMergedMixed() } } +void ScCopyPasteTest::tdf122716_rtf_portion_encoding() +{ + // Given a document with an explicitly defined "204" (Russian) charset for a font, + // and a cell having contents of "Šampūnas", which has character "Š" representable + // in Windows-1252 (RTF default), but not in Windows-1251 (i.e. 
charset 204): + createScDoc("xlsx/tdf122716_font_with_charset.xlsx"); + ScModelObj* pModelObj = comphelper::getFromUnoTunnel<ScModelObj>(mxComponent); + // Obtain a transferable, similar to what happens on copy to clipboard: + auto xTransferable = pModelObj->getSelection(); + // Get the RTF data: + auto rtf_any = xTransferable->getTransferData({ u"text/rtf"_ustr, {}, {} }); + css::uno::Sequence<sal_Int8> rtf_bytes; + CPPUNIT_ASSERT(rtf_any >>= rtf_bytes); + OString rtf_string(reinterpret_cast<const char*>(rtf_bytes.getConstArray()), + rtf_bytes.getLength()); + // Check that the font with charset was actually emitted + CPPUNIT_ASSERT(rtf_string.indexOf("\\fcharset204 Liberation Sans;") >= 0); + // Make sure that Unicode markup is emitted for the non-Ascii characters. + // Without the fix, "\u352" wasn't there, because the export was using Windows-1252 + // encoding unconditionally, even though the exported font defined a different one; + // so the exported characters only had Unicode markup, when not representable in the + // Windows-1252 encoding, and "Š" got exported as "\'8a". On import to Writer, font + // encoding was used, and "\'8a" was interpreted as a Cyrillic alphabet character. + CPPUNIT_ASSERT(rtf_string.indexOf("\\u352\\'3famp\\u363\\'3fnas") >= 0); +} + ScCopyPasteTest::ScCopyPasteTest() : ScModelTestBase(u"/sc/qa/unit/data/"_ustr) { diff --git a/sc/qa/unit/data/xlsx/tdf122716_font_with_charset.xlsx b/sc/qa/unit/data/xlsx/tdf122716_font_with_charset.xlsx new file mode 100644 index 000000000000..6c2326e3ed28 Binary files /dev/null and b/sc/qa/unit/data/xlsx/tdf122716_font_with_charset.xlsx differ