sc/qa/unit/subsequent_export-test.cxx | 4 - sc/source/filter/oox/richstring.cxx | 112 +++++++++++++++++++++++++++++++++- 2 files changed, 113 insertions(+), 3 deletions(-)
New commits: commit 2d5ba784a341aea1b7b2403842d2521d1548ea8f Author: Dennis Francis <dennis.fran...@collabora.com> AuthorDate: Tue Aug 17 14:38:21 2021 +0530 Commit: Andras Timar <andras.ti...@collabora.com> CommitDate: Wed Aug 18 20:28:28 2021 +0200 tdf#118470: sc oox: recover escaped unicode chars in strings import according to OOX open spec 2.1.1742 Part 1 Section 22.9.2.19, ST_Xstring (Escaped String). In this implementation, some restrictions mentioned in this spec are not kept for simplicity. Change-Id: If27797a9625d49be54c600c8a864965f1101ceb1 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/120665 Tested-by: Jenkins Reviewed-by: Andras Timar <andras.ti...@collabora.com> diff --git a/sc/qa/unit/subsequent_export-test.cxx b/sc/qa/unit/subsequent_export-test.cxx index 31c0aae3dd71..e1d3f78bb93b 100644 --- a/sc/qa/unit/subsequent_export-test.cxx +++ b/sc/qa/unit/subsequent_export-test.cxx @@ -3672,9 +3672,9 @@ void ScExportTest::testTdf80149() CPPUNIT_ASSERT_EQUAL(OUString("row 1"), rDoc.GetString(0, 0, 0)); // Without the fix in place, this test would have failed with - // - Expected: Character 0x16 is here ->>_x0016_<<-- + // - Expected: Character 0x16 is here ->><<-- // - Actual : - CPPUNIT_ASSERT_EQUAL(OUString("Character 0x16 is here ->>_x0016_<<--"), rDoc.GetString(1, 0, 0)); + CPPUNIT_ASSERT_EQUAL(OUString("Character 0x16 is here ->><<--"), rDoc.GetString(1, 0, 0)); CPPUNIT_ASSERT_EQUAL(OUString("File opens in libre office, but can't be saved as xlsx"), rDoc.GetString(2, 0, 0)); CPPUNIT_ASSERT_EQUAL(OUString("row 2"), rDoc.GetString(0, 1, 0)); CPPUNIT_ASSERT_EQUAL(OUString("Subsequent rows get truncated"), rDoc.GetString(1, 1, 0)); diff --git a/sc/source/filter/oox/richstring.cxx b/sc/source/filter/oox/richstring.cxx index a9d058f75ba5..7f8809824caa 100644 --- a/sc/source/filter/oox/richstring.cxx +++ b/sc/source/filter/oox/richstring.cxx @@ -48,6 +48,116 @@ bool lclNeedsRichTextFormat( const oox::xls::Font* pFont ) return pFont && pFont->needsRichTextFormat(); } +sal_Int32 lcl_getHexLetterValue(sal_Unicode nCode) +{ + if (nCode >= '0' && nCode <= '9') + return nCode - '0'; + + if (nCode >= 'A' && nCode <= 'F') + return nCode - 'A' + 10; + + if (nCode >= 'a' && nCode <= 'f') + return nCode - 'a' + 10; + + return -1; +} + +bool lcl_validEscape(sal_Unicode nCode) +{ + // Valid XML chars that can be escaped (ignoring the restrictions) as in the OOX open spec + // 2.1.1742 Part 1 Section 22.9.2.19, ST_Xstring (Escaped String) + if (nCode == 0x000D || nCode == 0x000A || nCode == 0x0009 || nCode == 0x005F) + return true; + + // Other valid XML chars in basic multilingual plane that cannot be escaped. + if ((nCode >= 0x0020 && nCode <= 0xD7FF) || (nCode >= 0xE000 && nCode <= 0xFFFD)) + return false; + + return true; +} + +OUString lcl_unEscapeUnicodeChars(const OUString& rSrc) +{ + // Example: Escaped representation of unicode char 0x000D is _x000D_ + + sal_Int32 nLen = rSrc.getLength(); + if (!nLen) + return rSrc; + + sal_Int32 nStart = 0; + bool bFound = true; + const OUString aPrefix = "_x"; + sal_Int32 nPrefixStart = rSrc.indexOf(aPrefix, nStart); + + if (nPrefixStart == -1) + return rSrc; + + OUStringBuffer aBuf(rSrc); + sal_Int32 nOffset = 0; // index offset in aBuf w.r.t rSrc. + + do + { + sal_Int32 nEnd = -1; + sal_Unicode nCode = 0; + bool bFoundThis = false; + for (sal_Int32 nIdx = 0; nIdx < 5; ++nIdx) + { + sal_Int32 nThisIdx = nPrefixStart + nIdx + 2; + if (nThisIdx >= nLen) + break; + + sal_Unicode nThisCode = rSrc[nThisIdx]; + sal_Int32 nLetter = lcl_getHexLetterValue(nThisCode); + + if (!nIdx && nLetter < 0) + break; + + if (nLetter >= 0) + { + nCode = (nCode << 4) + static_cast<sal_Unicode>(nLetter); + } + else if (nThisCode == '_') + { + nEnd = nThisIdx + 1; + bFoundThis = true; + break; + } + else + { + break; + } + } + + if (bFoundThis) + { + // nEnd is already set inside the inner loop in this case. + if (lcl_validEscape(nCode)) + { + bFound = true; + sal_Int32 nEscStrLen = nEnd - nPrefixStart; + aBuf.remove(nPrefixStart - nOffset, nEscStrLen); + aBuf.insert(nPrefixStart - nOffset, nCode); + + nOffset += nEscStrLen - 1; + } + } + else + { + // Start the next search just after last "_x" + nEnd = nPrefixStart + 2; + } + + nStart = nEnd; + nPrefixStart = rSrc.indexOf(aPrefix, nStart); + } + while (nPrefixStart != -1); + + if (bFound) + return aBuf.makeStringAndClear(); + + return rSrc; +} + } // namespace RichStringPortion::RichStringPortion( const WorkbookHelper& rHelper ) : @@ -59,7 +169,7 @@ RichStringPortion::RichStringPortion( const WorkbookHelper& rHelper ) : void RichStringPortion::setText( const OUString& rText ) { - maText = rText; + maText = lcl_unEscapeUnicodeChars(rText); } FontRef const & RichStringPortion::createFont()