filter/source/config/fragments/types/generic_HTML.xcu | 2 svtools/CppunitTest_svtools_html.mk | 1 svtools/qa/unit/testHtmlReader.cxx | 70 ++++++++++++++++++ svtools/source/svhtml/parhtml.cxx | 3 4 files changed, 74 insertions(+), 2 deletions(-)
New commits: commit 3fe64261b5658e28e2c0a1630cf878f066f77f0c Author: Miklos Vajna <vmik...@collabora.co.uk> Date: Wed Dec 13 14:46:26 2017 +0100 Related: tdf#114428 svtools HTML import: avoid XML declaration in body text Just ignore it for now. Change-Id: Idf82af611370d957c6704cce250941a8a0b90637 Reviewed-on: https://gerrit.libreoffice.org/46388 Tested-by: Jenkins <c...@libreoffice.org> Reviewed-by: Miklos Vajna <vmik...@collabora.co.uk> diff --git a/svtools/CppunitTest_svtools_html.mk b/svtools/CppunitTest_svtools_html.mk index e3e56e4d9949..6fbca2c06442 100644 --- a/svtools/CppunitTest_svtools_html.mk +++ b/svtools/CppunitTest_svtools_html.mk @@ -14,6 +14,7 @@ $(eval $(call gb_CppunitTest_use_external,svtools_html,boost_headers)) $(eval $(call gb_CppunitTest_use_sdk_api,svtools_html)) $(eval $(call gb_CppunitTest_add_exception_objects,svtools_html, \ + svtools/qa/unit/testHtmlReader \ svtools/qa/unit/testHtmlWriter \ )) diff --git a/svtools/qa/unit/testHtmlReader.cxx b/svtools/qa/unit/testHtmlReader.cxx new file mode 100644 index 000000000000..151976eabc9d --- /dev/null +++ b/svtools/qa/unit/testHtmlReader.cxx @@ -0,0 +1,70 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + */ + +#include <cppunit/TestFixture.h> +#include <cppunit/extensions/HelperMacros.h> +#include <com/sun/star/document/XDocumentProperties.hpp> +#include <svtools/parhtml.hxx> +#include <tools/ref.hxx> +#include <tools/stream.hxx> + +namespace +{ +/// Subclass of HTMLParser that can sense the import result. 
+class TestHTMLParser : public HTMLParser +{ +public: + TestHTMLParser(SvStream& rStream); + virtual void NextToken(HtmlTokenId nToken) override; + + OUString m_aDocument; +}; + +TestHTMLParser::TestHTMLParser(SvStream& rStream) + : HTMLParser(rStream) +{ +} + +void TestHTMLParser::NextToken(HtmlTokenId nToken) +{ + if (nToken == HtmlTokenId::TEXTTOKEN) + m_aDocument += aToken; +} + +/// Tests HTMLParser. +class Test : public CppUnit::TestFixture +{ +public: + void testTdf114428(); + + CPPUNIT_TEST_SUITE(Test); + CPPUNIT_TEST(testTdf114428); + CPPUNIT_TEST_SUITE_END(); +}; + +void Test::testTdf114428() +{ + SvMemoryStream aStream; + OString aDocument("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<html>hello</html>"); + aStream.WriteBytes(aDocument.getStr(), aDocument.getLength()); + aStream.Seek(0); + + tools::SvRef<TestHTMLParser> xParser = new TestHTMLParser(aStream); + xParser->CallParser(); + + // This was '<?xml version="1.0" encoding="utf-8"?> hello', XML declaration + // was not ignored. + CPPUNIT_ASSERT_EQUAL(OUString("hello"), xParser->m_aDocument.trim()); +} + +CPPUNIT_TEST_SUITE_REGISTRATION(Test); +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/svtools/source/svhtml/parhtml.cxx b/svtools/source/svhtml/parhtml.cxx index 9e5974ae5d40..7ea1750974be 100644 --- a/svtools/source/svhtml/parhtml.cxx +++ b/svtools/source/svhtml/parhtml.cxx @@ -1042,7 +1042,8 @@ HtmlTokenId HTMLParser::GetNextToken_() bOffState = true; nNextCh = GetNextChar(); } - if( rtl::isAsciiAlpha( nNextCh ) || '!'==nNextCh ) + // Assume '<?' is a start of an XML declaration, ignore it. + if (rtl::isAsciiAlpha(nNextCh) || nNextCh == '!' 
|| nNextCh == '?') { OUStringBuffer sTmpBuffer; do { commit 14daba5bd0ba64ff53ad98de7a84537ff03024ea Author: Miklos Vajna <vmik...@collabora.co.uk> Date: Wed Dec 13 11:13:40 2017 +0100 Related: tdf#114428 filter: associate .xhtml with HTML import There is no XHTML import, and this way the expensive "deep" detection (that calls into all the DLP libs, etc) can be avoided. Times for a hello world input: 56 -> 23 ms is spent in Desktop::loadComponentFromURL() (41% of original). Change-Id: Ia2dec3837cf0c548ae2c5a0ca4d47a57a6cbb92a Reviewed-on: https://gerrit.libreoffice.org/46387 Tested-by: Jenkins <c...@libreoffice.org> Reviewed-by: Miklos Vajna <vmik...@collabora.co.uk> diff --git a/filter/source/config/fragments/types/generic_HTML.xcu b/filter/source/config/fragments/types/generic_HTML.xcu index b00b048d3842..b29ba333aded 100644 --- a/filter/source/config/fragments/types/generic_HTML.xcu +++ b/filter/source/config/fragments/types/generic_HTML.xcu @@ -18,7 +18,7 @@ <node oor:name="generic_HTML" oor:op="replace" > <prop oor:name="DetectService"><value>com.sun.star.comp.filters.PlainTextFilterDetect</value></prop> <prop oor:name="URLPattern"><value>private:factory/swriter/web*</value></prop> - <prop oor:name="Extensions"><value>html htm</value></prop> + <prop oor:name="Extensions"><value>html htm xhtml</value></prop> <prop oor:name="MediaType"><value>text/html</value></prop> <prop oor:name="Preferred"><value>false</value></prop> <prop oor:name="PreferredFilter"><value>HTML</value></prop> _______________________________________________ Libreoffice-commits mailing list libreoffice-comm...@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/libreoffice-commits