include/svtools/htmlkywd.hxx | 1 include/svtools/htmltokn.h | 2 + svtools/qa/unit/testHtmlReader.cxx | 23 +++++++++++++++++++++ svtools/source/svhtml/htmlkywd.cxx | 4 +++ svtools/source/svhtml/parhtml.cxx | 39 +++++++++++++++++++++++++++++++++++++ sw/source/filter/html/swhtml.cxx | 1 6 files changed, 70 insertions(+)
New commits: commit f3e629ff15dcf2710901dbb942cee9b3c4e38af3 Author: Miklos Vajna <vmik...@collabora.com> AuthorDate: Tue Oct 25 15:55:34 2022 +0200 Commit: Xisco Fauli <xiscofa...@libreoffice.org> CommitDate: Wed Oct 26 08:43:52 2022 +0200 sw html import: fix handling of CDATA In case the HTML contained markup like <![CDATA[...]]>, we simply ignored it during import, even if e.g. the ODT import handles that correctly. The reason for this is that the svtools/ HTMLParser had code to parse <!-- ... --> style comments, but not for CDATA. Fix the problem by introducing a new HtmlTokenId::CDATA, producing a matching token content in HTMLParser::GetNextToken_(), and finally mapping it to normal text on the Writer side. Note that HtmlTokenId doesn't allow non-on-off tokens past ONOFF_START, nor does it allow inserting a single token before ONOFF_START (it breaks getOnToken()), so for now just add a second, dummy token to avoid breakage. Change-Id: I605c3c21dc11986fda5d93d36148788a638e97b4 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/141813 Reviewed-by: Miklos Vajna <vmik...@collabora.com> Tested-by: Jenkins (cherry picked from commit b38730ae0ae92ca49b84a45853c2ed098ee9064f) Reviewed-on: https://gerrit.libreoffice.org/c/core/+/141838 Reviewed-by: Xisco Fauli <xiscofa...@libreoffice.org> diff --git a/include/svtools/htmlkywd.hxx b/include/svtools/htmlkywd.hxx index 5d6b7e629fe7..9a84cddd37bf 100644 --- a/include/svtools/htmlkywd.hxx +++ b/include/svtools/htmlkywd.hxx @@ -32,6 +32,7 @@ #define OOO_STRING_SVTOOLS_HTML_base "base" #define OOO_STRING_SVTOOLS_HTML_comment "!--" #define OOO_STRING_SVTOOLS_HTML_doctype "!DOCTYPE" +#define OOO_STRING_SVTOOLS_HTML_cdata "![cdata[" #define OOO_STRING_SVTOOLS_HTML_embed "embed" #define OOO_STRING_SVTOOLS_HTML_horzrule "hr" #define OOO_STRING_SVTOOLS_HTML_image "img" diff --git a/include/svtools/htmltokn.h b/include/svtools/htmltokn.h index bfa1f14d6812..9dca8a8f3ea7 100644 --- a/include/svtools/htmltokn.h +++ 
b/include/svtools/htmltokn.h @@ -58,6 +58,8 @@ enum class HtmlTokenId : sal_Int16 AREA, // Netscape 2.0 BASE, // HTML 3.0 COMMENT, + CDATA, + DUMMY, // so ONOFF_START is even DOCTYPE, EMBED, // Netscape 2.0 ignore </EMBED> HORZRULE, // ignore </HR> diff --git a/svtools/qa/unit/testHtmlReader.cxx b/svtools/qa/unit/testHtmlReader.cxx index 146458a200eb..37f74e903bcc 100644 --- a/svtools/qa/unit/testHtmlReader.cxx +++ b/svtools/qa/unit/testHtmlReader.cxx @@ -27,6 +27,7 @@ public: OUString m_aDocument; int m_nLineBreakCount = 0; + OUString m_aCdata; }; TestHTMLParser::TestHTMLParser(SvStream& rStream) @@ -40,6 +41,8 @@ void TestHTMLParser::NextToken(HtmlTokenId nToken) m_aDocument += aToken; else if (nToken == HtmlTokenId::LINEBREAK) ++m_nLineBreakCount; + else if (nToken == HtmlTokenId::CDATA) + m_aCdata = aToken; } /// Tests HTMLParser. @@ -76,6 +79,26 @@ CPPUNIT_TEST_FIXTURE(Test, testLineBreak) // This was 2, <br></br> was interpreted as 2 line breaks in XHTML mode. CPPUNIT_ASSERT_EQUAL(1, xParser->m_nLineBreakCount); } + +CPPUNIT_TEST_FIXTURE(Test, testCdata) +{ + // Given a document with CDATA: + SvMemoryStream aStream; + OString aDocument("A<![CDATA[B ü <]]>C"); + aStream.WriteBytes(aDocument.getStr(), aDocument.getLength()); + aStream.Seek(0); + + // When parsing that HTML: + tools::SvRef<TestHTMLParser> xParser = new TestHTMLParser(aStream); + xParser->CallParser(); + + // Then make sure that we get a cdata token with the correct content: + // Without the accompanying fix in place, this test would have failed with: + // - Expected: B ü < + // - Actual : + // i.e. the content inside CDATA was lost. 
+ CPPUNIT_ASSERT_EQUAL(OUString("B ü <"), xParser->m_aCdata); +} } /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/svtools/source/svhtml/htmlkywd.cxx b/svtools/source/svhtml/htmlkywd.cxx index 2d51910d85e9..584322fac8bc 100644 --- a/svtools/source/svhtml/htmlkywd.cxx +++ b/svtools/source/svhtml/htmlkywd.cxx @@ -27,6 +27,9 @@ #include <svtools/htmltokn.h> #include <svtools/htmlkywd.hxx> +// If this is odd, then getOnToken() breaks. +static_assert(static_cast<sal_Int16>(HtmlTokenId::ABBREVIATION_ON) % 2 == 0); + namespace { template<typename T> @@ -64,6 +67,7 @@ using HTML_TokenEntry = TokenEntry<HtmlTokenId>; HTML_TokenEntry const aHTMLTokenTab[] = { {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_comment), HtmlTokenId::COMMENT}, {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_doctype), HtmlTokenId::DOCTYPE}, + {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_cdata), HtmlTokenId::CDATA}, {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_anchor), HtmlTokenId::ANCHOR_ON}, {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_abbreviation), HtmlTokenId::ABBREVIATION_ON}, // HTML 3.0 {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_acronym), HtmlTokenId::ACRONYM_ON}, // HTML 3.0 diff --git a/svtools/source/svhtml/parhtml.cxx b/svtools/source/svhtml/parhtml.cxx index c6962824b6b6..70d1da950172 100644 --- a/svtools/source/svhtml/parhtml.cxx +++ b/svtools/source/svhtml/parhtml.cxx @@ -1053,6 +1053,10 @@ HtmlTokenId HTMLParser::GetNextToken_() do { sTmpBuffer.appendUtf32( nNextCh ); nNextCh = GetNextChar(); + if (std::u16string_view(sTmpBuffer) == u"![CDATA[") + { + break; + } } while( '>' != nNextCh && '/' != nNextCh && !rtl::isAsciiWhiteSpace( nNextCh ) && !linguistic::IsControlChar(nNextCh) && IsParserWorking() && !rInput.eof() ); @@ -1151,6 +1155,41 @@ HtmlTokenId HTMLParser::GetNextToken_() nNextCh = '>'; } } + else if (nRet == HtmlTokenId::CDATA) + { + // Read until the closing ]]>. 
+ bool bDone = false; + while (!bDone && !rInput.eof() && IsParserWorking()) + { + if (nNextCh == '>') + { + if (sTmpBuffer.getLength() >= 2) + { + bDone = sTmpBuffer[sTmpBuffer.getLength() - 2] == ']' + && sTmpBuffer[sTmpBuffer.getLength() - 1] == ']'; + if (bDone) + { + // Ignore ]] at the end. + sTmpBuffer.setLength(sTmpBuffer.getLength() - 2); + } + } + if (!bDone) + { + sTmpBuffer.appendUtf32(nNextCh); + } + } + else if (!linguistic::IsControlChar(nNextCh)) + { + sTmpBuffer.appendUtf32(nNextCh); + } + if (!bDone) + { + nNextCh = GetNextChar(); + } + } + aToken = sTmpBuffer; + sTmpBuffer.setLength(0); + } else { // TokenString not needed anymore diff --git a/sw/source/filter/html/swhtml.cxx b/sw/source/filter/html/swhtml.cxx index e76421579e9f..c5b33a847560 100644 --- a/sw/source/filter/html/swhtml.cxx +++ b/sw/source/filter/html/swhtml.cxx @@ -1519,6 +1519,7 @@ void SwHTMLParser::NextToken( HtmlTokenId nToken ) break; case HtmlTokenId::TEXTTOKEN: + case HtmlTokenId::CDATA: // insert string without spanning attributes at the end. if( !aToken.isEmpty() && ' '==aToken[0] && !IsReadPRE() ) {