include/rtl/character.hxx | 13 ++++++++ include/svtools/svparser.hxx | 4 +- svtools/source/svhtml/parhtml.cxx | 53 ++++++++++++++++++--------------- svtools/source/svrtf/parrtf.cxx | 6 +-- svtools/source/svrtf/svparser.cxx | 29 ++++++++++++------ sw/qa/extras/htmlexport/data/extb.html | 10 ++++++ sw/qa/extras/htmlexport/htmlexport.cxx | 13 ++++++++ 7 files changed, 90 insertions(+), 38 deletions(-)
New commits: commit 4647e778993250b8c9431e2890750916fb986ecc Author: Mark Hung <mark...@gmail.com> Date: Sun Dec 27 00:46:49 2015 +0800 tdf#81129 Support reading non-BMP characters in HTML documents. 1. Allow character entity ( &#nnnn; ) to exceed 0xffff in HTMLParser::ScanText() 2. Return a character as sal_uInt32 ( utf32 ) instead of sal_Unicode ( utf16 ) from SvParser::GetNextChar(). Conflicts: sw/qa/extras/htmlexport/htmlexport.cxx Change-Id: Ida455040970fae800f0f11471b27f53461fb78e4 Reviewed-on: https://gerrit.libreoffice.org/21152 Tested-by: Jenkins <c...@libreoffice.org> Reviewed-by: Mark Hung <mark...@gmail.com> diff --git a/include/rtl/character.hxx b/include/rtl/character.hxx index a3d09b9..49f6803 100644 --- a/include/rtl/character.hxx +++ b/include/rtl/character.hxx @@ -222,6 +222,19 @@ sal_uInt32 const surrogatesLowLast = 0xDFFF; } /// @endcond +/** Check if a codepoint is accessible via utf16 per RFC3629 + + @param code A non-BMP Unicode code point. + + @return True if the code is a valid codepoint. + + @since LibreOffice 5.2 +*/ +inline bool isValidCodePoint( sal_uInt32 code) +{ + return code <= 0x10FFFF; +} + /** Check for high surrogate. @param code A Unicode code point. diff --git a/include/svtools/svparser.hxx b/include/svtools/svparser.hxx index 3f60a40..cfbd115 100644 --- a/include/svtools/svparser.hxx +++ b/include/svtools/svparser.hxx @@ -59,7 +59,7 @@ protected: rtl_TextEncoding eSrcEnc; // Source encoding sal_uLong nNextChPos; - sal_Unicode nNextCh; // current character for the "lex" + sal_uInt32 nNextCh; // current character codepoint in UTF32 for the "lex" bool bDownloadingFile : 1; // true: An external file is @@ -128,7 +128,7 @@ public: inline void SetLineNr( sal_uLong nlNum ); // inline bottom inline void SetLinePos( sal_uLong nlPos ); // inline bottom - sal_Unicode GetNextChar(); + sal_uInt32 GetNextChar(); // Return next Unicode codepoint in UTF32. 
void RereadLookahead(); inline bool IsParserWorking() const { return SVPAR_WORKING == eState; } diff --git a/svtools/source/svhtml/parhtml.cxx b/svtools/source/svhtml/parhtml.cxx index 801e4e0..a8eff6d 100644 --- a/svtools/source/svhtml/parhtml.cxx +++ b/svtools/source/svhtml/parhtml.cxx @@ -25,6 +25,7 @@ #include <tools/color.hxx> #include <rtl/ustrbuf.hxx> #include <rtl/strbuf.hxx> +#include <rtl/character.hxx> #include <tools/tenccvt.hxx> #include <tools/datetime.hxx> @@ -429,7 +430,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak ) OUStringBuffer sTmpBuffer( MAX_LEN ); bool bContinue = true; bool bEqSignFound = false; - sal_Unicode cQuote = 0U; + sal_uInt32 cQuote = 0U; while( bContinue && IsParserWorking() ) { @@ -445,7 +446,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak ) sal_uLong nStreamPos = rInput.Tell(); sal_uLong nLinePos = GetLinePos(); - sal_Unicode cChar = 0U; + sal_uInt32 cChar = 0U; if( '#' == (nNextCh = GetNextChar()) ) { nNextCh = GetNextChar(); @@ -460,10 +461,10 @@ int HTMLParser::ScanText( const sal_Unicode cBreak ) { cChar = cChar * 16U + ( nNextCh <= '9' - ? sal_Unicode( nNextCh - '0' ) + ? sal_uInt32( nNextCh - '0' ) : ( nNextCh <= 'F' - ? sal_Unicode( nNextCh - 'A' + 10 ) - : sal_Unicode( nNextCh - 'a' + 10 ) ) ); + ? sal_uInt32( nNextCh - 'A' + 10 ) + : sal_uInt32( nNextCh - 'a' + 10 ) ) ); nNextCh = GetNextChar(); } } @@ -471,7 +472,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak ) { do { - cChar = cChar * 10U + sal_Unicode( nNextCh - '0'); + cChar = cChar * 10U + sal_uInt32( nNextCh - '0'); nNextCh = GetNextChar(); } while( HTML_ISDIGIT(nNextCh) ); @@ -500,6 +501,9 @@ int HTMLParser::ScanText( const sal_Unicode cBreak ) } else nNextCh = 0U; + + if ( ! 
rtl::isValidCodePoint( cChar ) ) + cChar = '?'; } else if( HTML_ISALPHA( nNextCh ) ) { @@ -507,7 +511,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak ) sal_Int32 nPos = 0L; do { - sEntityBuffer.append( nNextCh ); + sEntityBuffer.appendUtf32( nNextCh ); nPos++; nNextCh = GetNextChar(); } @@ -637,7 +641,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak ) if( IsParserWorking() ) { if( cChar ) - sTmpBuffer.append( cChar ); + sTmpBuffer.appendUtf32( cChar ); } else if( SVPAR_PENDING==eState && '>'!=cBreak ) { @@ -661,7 +665,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak ) case '=': if( '>'==cBreak && !cQuote ) bEqSignFound = true; - sTmpBuffer.append( nNextCh ); + sTmpBuffer.appendUtf32( nNextCh ); break; case '\\': @@ -684,7 +688,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak ) else if( cQuote && (cQuote==nNextCh ) ) cQuote = 0U; } - sTmpBuffer.append( nNextCh ); + sTmpBuffer.appendUtf32( nNextCh ); bEqSignFound = false; break; @@ -695,14 +699,15 @@ int HTMLParser::ScanText( const sal_Unicode cBreak ) } else { - sTmpBuffer.append( nNextCh ); + sTmpBuffer.appendUtf32( nNextCh ); } + break; case '<': bEqSignFound = false; if( '>'==cBreak ) - sTmpBuffer.append( nNextCh ); + sTmpBuffer.appendUtf32( nNextCh ); else bContinue = false; // break, String zusammen break; @@ -725,7 +730,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak ) if( '>'==cBreak ) { // cr/lf in tag is handled in _GetNextToken() - sTmpBuffer.append( nNextCh ); + sTmpBuffer.appendUtf32( nNextCh ); break; } else if( bReadListing || bReadXMP || bReadPRE || bReadTextArea ) @@ -752,7 +757,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak ) nNextCh = ' '; // no break; case ' ': - sTmpBuffer.append( nNextCh ); + sTmpBuffer.appendUtf32( nNextCh ); if( '>'!=cBreak && (!bReadListing && !bReadXMP && !bReadPRE && !bReadTextArea) ) { @@ -787,7 +792,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak ) { do { // All remaining characters make their way into the text. 
- sTmpBuffer.append( nNextCh ); + sTmpBuffer.appendUtf32( nNextCh ); if( MAX_LEN == sTmpBuffer.getLength() ) { aToken += sTmpBuffer.makeStringAndClear(); @@ -864,7 +869,7 @@ int HTMLParser::_GetNextRawToken() } else if( '!' == nNextCh ) { - sTmpBuffer.append( nNextCh ); + sTmpBuffer.appendUtf32( nNextCh ); nNextCh = GetNextChar(); } @@ -872,7 +877,7 @@ int HTMLParser::_GetNextRawToken() while( (HTML_ISALPHA(nNextCh) || '-'==nNextCh) && IsParserWorking() && sTmpBuffer.getLength() < MAX_LEN ) { - sTmpBuffer.append( nNextCh ); + sTmpBuffer.appendUtf32( nNextCh ); nNextCh = GetNextChar(); } @@ -959,7 +964,7 @@ int HTMLParser::_GetNextRawToken() } break; case '-': - sTmpBuffer.append( nNextCh ); + sTmpBuffer.appendUtf32( nNextCh ); if( bReadComment ) { bool bTwoMinus = false; @@ -970,7 +975,7 @@ int HTMLParser::_GetNextRawToken() if( MAX_LEN == sTmpBuffer.getLength() ) aToken += sTmpBuffer.makeStringAndClear(); - sTmpBuffer.append( nNextCh ); + sTmpBuffer.appendUtf32( nNextCh ); nNextCh = GetNextChar(); } @@ -1015,7 +1020,7 @@ int HTMLParser::_GetNextRawToken() // no break default: // all remaining characters are appended to the buffer - sTmpBuffer.append( nNextCh ); + sTmpBuffer.appendUtf32( nNextCh ); break; } @@ -1095,7 +1100,7 @@ int HTMLParser::_GetNextToken() { OUStringBuffer sTmpBuffer; do { - sTmpBuffer.append( nNextCh ); + sTmpBuffer.appendUtf32( nNextCh ); if( MAX_LEN == sTmpBuffer.getLength() ) aToken += sTmpBuffer.makeStringAndClear(); nNextCh = GetNextChar(); @@ -1166,10 +1171,10 @@ int HTMLParser::_GetNextToken() } bDone = aToken.endsWith( "--" ); if( !bDone ) - aToken += OUString(nNextCh); + aToken += OUString(&nNextCh,1); } else - aToken += OUString(nNextCh); + aToken += OUString(&nNextCh,1); if( !bDone ) nNextCh = GetNextChar(); } @@ -1261,7 +1266,7 @@ int HTMLParser::_GetNextToken() bDone = '>'==nNextCh && aToken.endsWith("%"); if( !bDone ) { - aToken += OUString(nNextCh); + aToken += OUString(&nNextCh,1); nNextCh = GetNextChar(); } } diff --git 
a/svtools/source/svrtf/parrtf.cxx b/svtools/source/svrtf/parrtf.cxx index f6f75eb..bdc73d3 100644 --- a/svtools/source/svrtf/parrtf.cxx +++ b/svtools/source/svrtf/parrtf.cxx @@ -191,7 +191,7 @@ int SvRTFParser::_GetNextToken() // can be also \{, \}, \'88 for( sal_uInt8 m = 0; m < nUCharOverread; ++m ) { - sal_Unicode cAnsi = nNextCh; + sal_uInt32 cAnsi = nNextCh; while( 0xD == cAnsi ) cAnsi = GetNextChar(); while( 0xA == cAnsi ) @@ -382,7 +382,7 @@ void SvRTFParser::ScanText( const sal_Unicode cBreak ) case '}': case '{': case '+': // I found in a RTF file - aStrBuffer.append(nNextCh); + aStrBuffer.append(sal_Unicode(nNextCh)); break; case '~': // nonbreaking space aStrBuffer.append(static_cast< sal_Unicode >(0xA0)); @@ -484,7 +484,7 @@ void SvRTFParser::ScanText( const sal_Unicode cBreak ) { do { // all other characters end up in the text - aStrBuffer.append(nNextCh); + aStrBuffer.appendUtf32(nNextCh); if (sal_Unicode(EOF) == (nNextCh = GetNextChar())) { diff --git a/svtools/source/svrtf/svparser.cxx b/svtools/source/svrtf/svparser.cxx index b5c377b..b862e66 100644 --- a/svtools/source/svrtf/svparser.cxx +++ b/svtools/source/svrtf/svparser.cxx @@ -22,6 +22,7 @@ #include <tools/debug.hxx> #include <rtl/textcvt.h> #include <rtl/tencinfo.h> +#include <rtl/character.hxx> #include <vector> @@ -35,7 +36,7 @@ struct SvParser_Impl long nTokenValue; // extra value (RTF) bool bTokenHasValue; // indicates whether nTokenValue is valid int nToken; // actual Token - sal_Unicode nNextCh; // actual character + sal_uInt32 nNextCh; // actual character int nSaveToken; // the token from Continue rtl_TextToUnicodeConverter hConv; @@ -148,9 +149,9 @@ void SvParser::RereadLookahead() nNextCh = GetNextChar(); } -sal_Unicode SvParser::GetNextChar() +sal_uInt32 SvParser::GetNextChar() { - sal_Unicode c = 0U; + sal_uInt32 c = 0U; // When reading multiple bytes, we don't have to care about the file // position when we run into the pending state. 
The file position is @@ -257,7 +258,7 @@ sal_Unicode SvParser::GetNextChar() ) { // no convserion shall take place - c = (sal_Unicode)c1; + c = reinterpret_cast<sal_uChar&>( c1 ); nChars = 1; } else @@ -280,6 +281,7 @@ sal_Unicode SvParser::GetNextChar() // read enough characters. if( pImplData->hContext != reinterpret_cast<rtl_TextToUnicodeContext>(1) ) { + sal_Unicode sCh[2]; while( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL) != 0 ) { rInput.ReadChar( c1 ); @@ -289,7 +291,7 @@ sal_Unicode SvParser::GetNextChar() nChars = rtl_convertTextToUnicode( pImplData->hConv, pImplData->hContext, - &c1, 1, &cUC, 1, + &c1, 1, sCh , 2, RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR| RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR| RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR, @@ -299,7 +301,11 @@ sal_Unicode SvParser::GetNextChar() { if( 1 == nChars && 0 == nInfo ) { - c = cUC; + c = sal_uInt32( sCh[0] ); + } + else if( 2 == nChars && 0 == nInfo ) + { + c = rtl::combineSurrogates( sCh[0], sCh[1] ); } else if( 0 != nChars || 0 != nInfo ) { @@ -311,7 +317,7 @@ sal_Unicode SvParser::GetNextChar() "there is a converted character, but an error" ); // There are still errors, but nothing we can // do - c = (sal_Unicode)'?'; + c = (sal_uInt32)'?'; nChars = 1; } } @@ -356,7 +362,7 @@ sal_Unicode SvParser::GetNextChar() // There are still errors, so we use the first // character and restart after that. - c = (sal_Unicode)sBuffer[0]; + c = reinterpret_cast<sal_uChar&>( sBuffer[0] ); rInput.SeekRel( -(nLen-1) ); nChars = 1; } @@ -378,7 +384,7 @@ sal_Unicode SvParser::GetNextChar() "there is no converted character and no error" ); // #73398#: If the character could not be converted, // because a conversion is not available, do no conversion at all. - c = (sal_Unicode)c1; + c = reinterpret_cast<sal_uChar&>( c1 ); nChars = 1; } @@ -387,6 +393,10 @@ sal_Unicode SvParser::GetNextChar() } while( 0 == nChars && !bErr ); } + + if ( ! rtl::isValidCodePoint( c ) ) + c = (sal_uInt32) '?' 
; + if( bErr ) { if( ERRCODE_IO_PENDING == rInput.GetError() ) @@ -405,6 +415,7 @@ sal_Unicode SvParser::GetNextChar() } else IncLinePos(); + return c; } diff --git a/sw/qa/extras/htmlexport/data/extb.html b/sw/qa/extras/htmlexport/data/extb.html new file mode 100644 index 0000000..be73fea --- /dev/null +++ b/sw/qa/extras/htmlexport/data/extb.html @@ -0,0 +1,10 @@ +<!DOCTYPE html> +<html> +<head> +<meta charset="UTF-8"/> +</head> +<body> +<p>𤭢</p> +<p>𤭢</p> +</body> +</html> diff --git a/sw/qa/extras/htmlexport/htmlexport.cxx b/sw/qa/extras/htmlexport/htmlexport.cxx index f951a0a..69b6b7d 100644 --- a/sw/qa/extras/htmlexport/htmlexport.cxx +++ b/sw/qa/extras/htmlexport/htmlexport.cxx @@ -272,6 +272,19 @@ DECLARE_HTMLEXPORT_TEST(testTdf83890, "tdf83890.odt") assertXPath(pDoc, "/html/body/ol[2]/ol", "start", "2"); } +DECLARE_HTMLEXPORT_TEST(testExtbChars, "extb.html") +{ + sal_uInt32 nCh = 0x24b62; + OUString aExpected( &nCh, 1); + // Assert that UTF8 encoded non-BMP Unicode character is correct + uno::Reference<text::XTextRange> xTextRange1 = getRun(getParagraph(1), 1); + CPPUNIT_ASSERT_EQUAL(aExpected, xTextRange1->getString()); + + // Assert that non-BMP Unicode in character entity format is correct + uno::Reference<text::XTextRange> xTextRange2 = getRun(getParagraph(2), 1); + CPPUNIT_ASSERT_EQUAL(aExpected, xTextRange2->getString()); +} + CPPUNIT_PLUGIN_IMPLEMENT(); /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
_______________________________________________ Libreoffice-commits mailing list libreoffice-comm...@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/libreoffice-commits