svtools/source/svrtf/svparser.cxx            |   20 ++++++++++++++++++++
 sw/qa/extras/htmlimport/data/emojis16BE.html |binary
 sw/qa/extras/htmlimport/htmlimport.cxx       |    8 ++++++++
 3 files changed, 28 insertions(+)

New commits:
commit 21154ea8c450f9f5568b32123d34a20e498a9290
Author:     Mike Kaganski <mike.kagan...@collabora.com>
AuthorDate: Sat Dec 11 12:53:26 2021 +0300
Commit:     Mike Kaganski <mike.kagan...@collabora.com>
CommitDate: Sat Dec 11 12:21:57 2021 +0100

    tdf#146173: combine non-BMP characters' surrogates correctly
    
    Change-Id: Ib3af1f9e461f133d2f5b09b9db4fb87c1ede0b9f
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/126658
    Tested-by: Jenkins
    Reviewed-by: Mike Kaganski <mike.kagan...@collabora.com>

diff --git a/svtools/source/svrtf/svparser.cxx b/svtools/source/svrtf/svparser.cxx
index 0fec7a97097e..dd5068976ff7 100644
--- a/svtools/source/svrtf/svparser.cxx
+++ b/svtools/source/svrtf/svparser.cxx
@@ -247,7 +247,27 @@ sal_uInt32 SvParser<T>::GetNextChar()
         rInput.ReadUtf16(cUC);
         bErr = !rInput.good();
         if( !bErr )
+        {
             c = cUC;
+            if (rtl::isHighSurrogate(cUC))
+            {
+                const sal_uInt64 nPos = rInput.Tell();
+                rInput.ReadUtf16(cUC);
+                bErr = !rInput.good();
+                if (!bErr)
+                {
+                    if (rtl::isLowSurrogate(cUC))
+                        c = rtl::combineSurrogates(c, cUC);
+                    else
+                        rInput.Seek(nPos); // process lone high surrogate
+                }
+                else
+                {
+                    bErr = false; // process lone high surrogate
+                    rInput.Seek(nPos); // maybe step 1 byte back
+                }
+            }
+        }
     }
     else
     {
diff --git a/sw/qa/extras/htmlimport/data/emojis16BE.html b/sw/qa/extras/htmlimport/data/emojis16BE.html
new file mode 100644
index 000000000000..023aee1cb20e
Binary files /dev/null and b/sw/qa/extras/htmlimport/data/emojis16BE.html differ
diff --git a/sw/qa/extras/htmlimport/htmlimport.cxx b/sw/qa/extras/htmlimport/htmlimport.cxx
index a8b9a4df65a4..00e2ec99191f 100644
--- a/sw/qa/extras/htmlimport/htmlimport.cxx
+++ b/sw/qa/extras/htmlimport/htmlimport.cxx
@@ -528,6 +528,14 @@ CPPUNIT_TEST_FIXTURE(SwModelTestBase, testOleImgSvg)
     CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), xObjects->getCount());
 }
 
+CPPUNIT_TEST_FIXTURE(HtmlImportTest, testUTF16_nonBMP)
+{
+    load(mpTestDocumentPath, "emojis16BE.html");
+    // tdf#146173: non-BMP characters' surrogates didn't combine correctly
+    CPPUNIT_ASSERT_EQUAL(OUString(u"a text with emojis: 🌾 ☀👨🏼‍🌾🏃🏼‍♂️🤙🏽🔝"),
+                         getParagraph(1)->getString());
+}
+
 CPPUNIT_PLUGIN_IMPLEMENT();
 
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */

Reply via email to