sw/qa/extras/rtfexport/data/tdf95706_2.rtf | 17 ++ sw/qa/extras/rtfexport/rtfexport4.cxx | 12 + writerfilter/source/rtftok/rtfdispatchvalue.cxx | 4 writerfilter/source/rtftok/rtfdocumentimpl.cxx | 162 +++++++++++++----------- writerfilter/source/rtftok/rtfdocumentimpl.hxx | 1 5 files changed, 122 insertions(+), 74 deletions(-)
New commits: commit ea042a0a742d8df369ce849086194be2f76a35dc Merge: 73e7e7d5acf7 8daac72b7a0b Author: Michael Weghorn <m.wegh...@posteo.de> AuthorDate: Mon Apr 11 15:39:48 2022 +0200 Commit: Michael Weghorn <m.wegh...@posteo.de> CommitDate: Mon Apr 11 15:39:48 2022 +0200 Merge branch 'libreoffice-7-3' into distro/lhm/libreoffice-7-3+backports Change-Id: I65f63c73c056174d7c20905822e17efbc14a1625 commit 8daac72b7a0b7cdf6eb520273829c0c0c15ddef5 Author: Vasily Melenchuk <vasily.melenc...@cib.de> AuthorDate: Thu Apr 7 20:59:08 2022 +0300 Commit: Xisco Fauli <xiscofa...@libreoffice.org> CommitDate: Mon Apr 11 11:53:51 2022 +0200 tdf#95706: RTF import: tolerant font table parsing While font name in font table should end with semicolon ({\fonttbl{\f42 Arial;}}) it is not always true and MS Word is tolerant to it: it still able to parse this correctly. Seems LO also should not require strict spec conformance. So idea of font parsing is changed: instead of inserting font on semicolon, it is done on next \fN or destination end. All collected text to this moment is a font name. Change-Id: I6b41951217442a71fd2ebbfc58a3fc79f6f913db Reviewed-on: https://gerrit.libreoffice.org/c/core/+/132686 Tested-by: Jenkins Reviewed-by: Miklos Vajna <vmik...@collabora.com> (cherry picked from commit 844be7358f1eec00094a55fa1fb4fadadb8cd1bf) Reviewed-on: https://gerrit.libreoffice.org/c/core/+/132699 Reviewed-by: Xisco Fauli <xiscofa...@libreoffice.org> diff --git a/sw/qa/extras/rtfexport/data/tdf95706_2.rtf b/sw/qa/extras/rtfexport/data/tdf95706_2.rtf new file mode 100644 index 000000000000..d36d2ccd2396 --- /dev/null +++ b/sw/qa/extras/rtfexport/data/tdf95706_2.rtf @@ -0,0 +1,17 @@ +{\rtf\ansi +{\fonttbl +{\f1 Arial} +\f2 Impact +\f3 T\'69mes New Roman +\f4 T +a +h +o +m +a +} +\pard\f1\fs26 Arial\par +\pard\f2\fs26 Impact\par +\pard\f3\fs26 Times New Roman\par +\pard\f4\fs26 Tahoma\par +} diff --git a/sw/qa/extras/rtfexport/rtfexport4.cxx b/sw/qa/extras/rtfexport/rtfexport4.cxx index 1f1434054085..33a2a246a181 100644 --- a/sw/qa/extras/rtfexport/rtfexport4.cxx +++ b/sw/qa/extras/rtfexport/rtfexport4.cxx @@ -522,6 +522,18 @@ DECLARE_RTFEXPORT_TEST(testTdf95706, "tdf95706.rtf") CPPUNIT_ASSERT_EQUAL(OUString("Arial"), getProperty<OUString>(xRun16, "CharFontName")); } +DECLARE_RTFEXPORT_TEST(testTdf95706_2, "tdf95706_2.rtf") +{ + CPPUNIT_ASSERT_EQUAL(OUString("Arial"), + getProperty<OUString>(getRun(getParagraph(1), 1), "CharFontName")); + CPPUNIT_ASSERT_EQUAL(OUString("Impact"), + getProperty<OUString>(getRun(getParagraph(2), 1), "CharFontName")); + CPPUNIT_ASSERT_EQUAL(OUString("Times New Roman"), + getProperty<OUString>(getRun(getParagraph(3), 1), "CharFontName")); + CPPUNIT_ASSERT_EQUAL(OUString("Tahoma"), + getProperty<OUString>(getRun(getParagraph(4), 1), "CharFontName")); +} + DECLARE_RTFEXPORT_TEST(testTdf111851, "tdf111851.rtf") { uno::Reference<text::XTextTable> xTable(getParagraphOrTable(1), uno::UNO_QUERY); diff --git a/writerfilter/source/rtftok/rtfdispatchvalue.cxx b/writerfilter/source/rtftok/rtfdispatchvalue.cxx index d78f087d76e3..35d3e0128c84 100644 --- a/writerfilter/source/rtftok/rtfdispatchvalue.cxx +++ b/writerfilter/source/rtftok/rtfdispatchvalue.cxx @@ -762,6 +762,10 @@ RTFError RTFDocumentImpl::dispatchValue(RTFKeyword nKeyword, int nParam) if (m_aStates.top().getDestination() == Destination::FONTTABLE || m_aStates.top().getDestination() == Destination::FONTENTRY) { + // Some text in buffer? It is font name. So previous font definition is complete + if (m_aStates.top().getCurrentDestinationText()->getLength()) + handleFontTableEntry(); + m_aFontIndexes.push_back(nParam); m_nCurrentFontIndex = getFontIndex(nParam); } diff --git a/writerfilter/source/rtftok/rtfdocumentimpl.cxx b/writerfilter/source/rtftok/rtfdocumentimpl.cxx index 8ea858abada5..9e77cbf4602e 100644 --- a/writerfilter/source/rtftok/rtfdocumentimpl.cxx +++ b/writerfilter/source/rtftok/rtfdocumentimpl.cxx @@ -1332,6 +1332,74 @@ void RTFDocumentImpl::singleChar(sal_uInt8 nValue, bool bRunProps) } } +void RTFDocumentImpl::handleFontTableEntry() +{ + OUString aName = m_aStates.top().getCurrentDestinationText()->makeStringAndClear(); + + if (aName.isEmpty()) + return; + + if (aName.endsWith(";")) + { + aName = aName.copy(0, aName.getLength() - 1); + } + + // Old documents can contain no encoding information in fontinfo, + // but there can be font name suffixes: Arial CE is not a special + // font, it is ordinal Arial, but with used cp 1250 encoding. + // Moreover these suffixes have priority over \cpgN and \fcharsetN + // in MS Word. + OUString aFontSuffix; + OUString aNameNoSuffix(aName); + sal_Int32 nLastSpace = aName.lastIndexOf(' '); + if (nLastSpace >= 0) + { + aFontSuffix = aName.copy(nLastSpace + 1); + aNameNoSuffix = aName.copy(0, nLastSpace); + sal_Int32 nEncoding = RTL_TEXTENCODING_DONTKNOW; + for (int i = 0; aRTFFontNameSuffixes[i].codepage != RTL_TEXTENCODING_DONTKNOW; i++) + { + if (aFontSuffix.equalsAscii(aRTFFontNameSuffixes[i].suffix)) + { + nEncoding = aRTFFontNameSuffixes[i].codepage; + break; + } + } + if (nEncoding > RTL_TEXTENCODING_DONTKNOW) + { + m_nCurrentEncoding = nEncoding; + m_aStates.top().setCurrentEncoding(m_nCurrentEncoding); + } + else + { + // Unknown suffix: looks like it is just a part of font name, restore it + aNameNoSuffix = aName; + } + } + + m_aFontNames[m_nCurrentFontIndex] = aNameNoSuffix; + if (m_nCurrentEncoding >= 0) + { + m_aFontEncodings[m_nCurrentFontIndex] = m_nCurrentEncoding; + m_nCurrentEncoding = -1; + } + m_aStates.top().getTableAttributes().set(NS_ooxml::LN_CT_Font_name, + new RTFValue(aNameNoSuffix)); + + writerfilter::Reference<Properties>::Pointer_t const pProp(new RTFReferenceProperties( + m_aStates.top().getTableAttributes(), m_aStates.top().getTableSprms())); + + //See fdo#47347 initial invalid font entry properties are inserted first, + //so when we attempt to insert the correct ones, there's already an + //entry in the map for them, so the new ones aren't inserted. + auto lb = m_aFontTableEntries.lower_bound(m_nCurrentFontIndex); + if (lb != m_aFontTableEntries.end() + && !(m_aFontTableEntries.key_comp()(m_nCurrentFontIndex, lb->first))) + lb->second = pProp; + else + m_aFontTableEntries.insert(lb, std::make_pair(m_nCurrentFontIndex, pProp)); +} + void RTFDocumentImpl::text(OUString& rString) { if (rString.getLength() == 1 && m_aStates.top().getDestination() != Destination::DOCCOMM) @@ -1345,10 +1413,7 @@ void RTFDocumentImpl::text(OUString& rString) bool bRet = true; switch (m_aStates.top().getDestination()) { - // Note: in fonttbl there may or may not be groups; in stylesheet - // and revtbl groups are mandatory - case Destination::FONTTABLE: - case Destination::FONTENTRY: + // Note: in stylesheet and revtbl groups are mandatory case Destination::STYLEENTRY: case Destination::LISTNAME: case Destination::REVISIONENTRY: @@ -1368,68 +1433,6 @@ void RTFDocumentImpl::text(OUString& rString) = m_aStates.top().getCurrentDestinationText()->makeStringAndClear(); switch (m_aStates.top().getDestination()) { - case Destination::FONTTABLE: - case Destination::FONTENTRY: - { - // Old documents can contain no encoding information in fontinfo, - // but there can be font name suffixes: Arial CE is not a special - // font, it is ordinal Arial, but with used cp 1250 encoding. - // Moreover these suffixes have priority over \cpgN and \fcharsetN - // in MS Word. - OUString aFontSuffix; - OUString aNameNoSuffix(aName); - sal_Int32 nLastSpace = aName.lastIndexOf(' '); - if (nLastSpace >= 0) - { - aFontSuffix = aName.copy(nLastSpace + 1); - aNameNoSuffix = aName.copy(0, nLastSpace); - sal_Int32 nEncoding = RTL_TEXTENCODING_DONTKNOW; - for (int i = 0; - aRTFFontNameSuffixes[i].codepage != RTL_TEXTENCODING_DONTKNOW; i++) - { - if (aFontSuffix.equalsAscii(aRTFFontNameSuffixes[i].suffix)) - { - nEncoding = aRTFFontNameSuffixes[i].codepage; - break; - } - } - if (nEncoding > RTL_TEXTENCODING_DONTKNOW) - { - m_nCurrentEncoding = nEncoding; - m_aStates.top().setCurrentEncoding(m_nCurrentEncoding); - } - else - { - // Unknown suffix: looks like it is just a part of font name, restore it - aNameNoSuffix = aName; - } - } - - m_aFontNames[m_nCurrentFontIndex] = aNameNoSuffix; - if (m_nCurrentEncoding >= 0) - { - m_aFontEncodings[m_nCurrentFontIndex] = m_nCurrentEncoding; - m_nCurrentEncoding = -1; - } - m_aStates.top().getTableAttributes().set(NS_ooxml::LN_CT_Font_name, - new RTFValue(aNameNoSuffix)); - - writerfilter::Reference<Properties>::Pointer_t const pProp( - new RTFReferenceProperties(m_aStates.top().getTableAttributes(), - m_aStates.top().getTableSprms())); - - //See fdo#47347 initial invalid font entry properties are inserted first, - //so when we attempt to insert the correct ones, there's already an - //entry in the map for them, so the new ones aren't inserted. - auto lb = m_aFontTableEntries.lower_bound(m_nCurrentFontIndex); - if (lb != m_aFontTableEntries.end() - && !(m_aFontTableEntries.key_comp()(m_nCurrentFontIndex, lb->first))) - lb->second = pProp; - else - m_aFontTableEntries.insert(lb, - std::make_pair(m_nCurrentFontIndex, pProp)); - } - break; case Destination::STYLEENTRY: { RTFValue::Pointer_t pType @@ -1467,6 +1470,8 @@ void RTFDocumentImpl::text(OUString& rString) } } break; + case Destination::FONTTABLE: + case Destination::FONTENTRY: case Destination::LEVELTEXT: case Destination::SHAPEPROPERTYNAME: case Destination::SHAPEPROPERTYVALUE: @@ -2216,17 +2221,26 @@ RTFError RTFDocumentImpl::beforePopState(RTFParserState& rState) { switch (rState.getDestination()) { + //Note: in fonttbl there may or may not be groups, so process it as no groups case Destination::FONTTABLE: + case Destination::FONTENTRY: { - writerfilter::Reference<Table>::Pointer_t const pTable( - new RTFReferenceTable(m_aFontTableEntries)); - Mapper().table(NS_ooxml::LN_FONTTABLE, pTable); - if (m_nDefaultFontIndex >= 0) + // Some text unhandled? Seems it is last font name + if (m_aStates.top().getCurrentDestinationText()->getLength()) + handleFontTableEntry(); + + if (rState.getDestination() == Destination::FONTTABLE) { - auto pValue = new RTFValue(m_aFontNames[getFontIndex(m_nDefaultFontIndex)]); - putNestedAttribute(m_aDefaultState.getCharacterSprms(), - NS_ooxml::LN_EG_RPrBase_rFonts, NS_ooxml::LN_CT_Fonts_ascii, - pValue); + writerfilter::Reference<Table>::Pointer_t const pTable( + new RTFReferenceTable(m_aFontTableEntries)); + Mapper().table(NS_ooxml::LN_FONTTABLE, pTable); + if (m_nDefaultFontIndex >= 0) + { + auto pValue = new RTFValue(m_aFontNames[getFontIndex(m_nDefaultFontIndex)]); + putNestedAttribute(m_aDefaultState.getCharacterSprms(), + NS_ooxml::LN_EG_RPrBase_rFonts, NS_ooxml::LN_CT_Fonts_ascii, + pValue); + } } } break; diff --git a/writerfilter/source/rtftok/rtfdocumentimpl.hxx b/writerfilter/source/rtftok/rtfdocumentimpl.hxx index 66e27a509be5..14ffc2f630a4 100644 --- a/writerfilter/source/rtftok/rtfdocumentimpl.hxx +++ b/writerfilter/source/rtftok/rtfdocumentimpl.hxx @@ -777,6 +777,7 @@ private: writerfilter::Reference<Properties>::Pointer_t getProperties(const RTFSprms& rAttributes, RTFSprms const& rSprms, Id nStyleType); void checkNeedPap(); + void handleFontTableEntry(); void sectBreak(bool bFinal = false); void prepareProperties(RTFParserState& rState, writerfilter::Reference<Properties>::Pointer_t& o_rpParagraphProperties,