comphelper/qa/string/test_string.cxx               |   29 ++++++++-
 comphelper/source/misc/string.cxx                  |    9 ++
 compilerplugins/clang/stringliteralvar.cxx         |   64 +++++++++++++++++++++
 compilerplugins/clang/test/stringliteralvar.cxx    |   21 ++++++
 include/comphelper/string.hxx                      |    5 +
 sdext/source/pdfimport/tree/drawtreevisiting.cxx   |    2 
 sdext/source/pdfimport/tree/writertreevisiting.cxx |    2 
 7 files changed, 127 insertions(+), 5 deletions(-)

New commits:
commit f1db364f294d2d9a40d77004aeeb36729ae1c4ca
Author:     Stephan Bergmann <sberg...@redhat.com>
AuthorDate: Fri Nov 18 16:47:07 2022 +0100
Commit:     Xisco Fauli <xiscofa...@libreoffice.org>
CommitDate: Fri Nov 25 17:48:10 2022 +0100

    Related tdf#104597, tdf#151546: Introduce 
comphelper::string::reverseCodePoints
    
    69e9925ded584113e52f84ef0ed7c224079fa061 "sdext.pdfimport: resolves 
tdf#104597:
    RTL script text runs are reversed" and 
f6004e1c457ddab5e0c91e6159875d25130b108a
    "tdf#151546: RTL text is reversed (Writer pdfimport)" had introduced two 
calls
    to comphelper::string::reverseString into sdext.  That function reverses on the
    basis of individual UTF-16 code units, not on the basis of Unicode code points.
    And while at least some pre-existing callers of that function want the 
former
    semantics (see below), these two new callers in sdext apparently want the 
latter
    semantics.  Therefore, introduce an additional function
    comphelper::string::reverseCodePoints with the latter semantics.
    
    I identified three other places that call comphelper::string::reverseString:
    * SbRtl_StrReverse in basic/source/runtime/methods1.cxx apparently 
implements
      some StrReverse Basic function, where a (presumably non-existing) Basic 
spec
      would need to decide which of the two semantics is called for.  So leave 
it
      alone for now.
    * SvtFileDialog::IsolateFilterFromPath_Impl in fpicker/source/office/iodlg.cxx
      reverses a string, operates on it, then reverses (parts of) it back.  Whether or
      not that is the most elegant code, using the latter semantics here would
      apparently be wrong, as applying comphelper::string::reverseCodePoints twice
      does not restore the original string when the input is a malformed sequence of
      UTF-16 code units containing a low surrogate followed by a high surrogate.
    * AccessibleCell::getCellName in svx/source/table/accessiblecell.cxx 
apparently
      always operates on a string consisting only of Latin uppercase letters 
A--Z,
      for which both semantics are equivalent.  (So we can just as well stick 
with
      the simpler comphelper::string::reverseString here.)
    
    (Extending the tests in comphelper/qa/string/test_string.cxx ran into an 
issue
    where loplugin:stringliteralvar warns about deliberate uses of sal_Unicode
    arrays rather than UTF-16 string literals wrapped in OUStringLiteral, as 
those
    arrays deliberately contain malformed UTF-16 code unit sequences and thus
    converting them into UTF-16 string literals might be considered 
inappropriate,
    see the newly added comment at
    StringLiteralVar::isPotentiallyInitializedWithMalformedUtf16 in
    compilerplugins/clang/stringliteralvar.cxx for details.  So that loplugin 
had to
    be improved here, too.)
    
    Change-Id: I641cc32c76b0c5f6339ae44d8aa85df0022ffb05
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/142949
    Tested-by: Jenkins
    Reviewed-by: Stephan Bergmann <sberg...@redhat.com>
    Signed-off-by: Xisco Fauli <xiscofa...@libreoffice.org>
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/143290

diff --git a/comphelper/qa/string/test_string.cxx 
b/comphelper/qa/string/test_string.cxx
index 0a9850ed920f..58f9c3f63c16 100644
--- a/comphelper/qa/string/test_string.cxx
+++ b/comphelper/qa/string/test_string.cxx
@@ -17,6 +17,10 @@
  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  */
 
+#include <sal/config.h>
+
+#include <iterator>
+
 #include <comphelper/string.hxx>
 #include <cppuhelper/implbase.hxx>
 #include <com/sun/star/i18n/CharType.hpp>
@@ -43,6 +47,7 @@ public:
     void testDecimalStringToNumber();
     void testIsdigitAsciiString();
     void testReverseString();
+    void testReverseCodePoints();
     void testSplit();
     void testRemoveAny();
 
@@ -55,6 +60,7 @@ public:
     CPPUNIT_TEST(testDecimalStringToNumber);
     CPPUNIT_TEST(testIsdigitAsciiString);
     CPPUNIT_TEST(testReverseString);
+    CPPUNIT_TEST(testReverseCodePoints);
     CPPUNIT_TEST(testSplit);
     CPPUNIT_TEST(testRemoveAny);
     CPPUNIT_TEST_SUITE_END();
@@ -178,9 +184,28 @@ void TestString::testTokenCount()
 
 void TestString::testReverseString() // checks reverseString on UTF-16 input (reversal per code unit)
 {
-    OString aOut = ::comphelper::string::reverseString("ABC");
+    CPPUNIT_ASSERT_EQUAL(OUString(), comphelper::string::reverseString(u"")); // empty input stays empty
+    CPPUNIT_ASSERT_EQUAL(OUString("cba"), 
comphelper::string::reverseString(u"abc")); // ASCII-only input
+    static sal_Unicode const rev[] = {'w', 0xDFFF, 0xDBFF, 'v', 0xDC00, 
0xD800, 'u'}; // code units of the input below in reverse order; both surrogate pairs end up swapped
+    CPPUNIT_ASSERT_EQUAL(
+        OUString(rev, std::size(rev)),
+        comphelper::string::reverseString(u"u\U00010000v\U0010FFFFw"));
+    static sal_Unicode const malformed[] = {0xDC00, 0xD800}; // low surrogate followed by high surrogate
+    CPPUNIT_ASSERT_EQUAL(
+        OUString(u"\U00010000"), // swapping the two units gives 0xD800 0xDC00, the UTF-16 form of U+10000
+        comphelper::string::reverseString(std::u16string_view(malformed, 
std::size(malformed))));
+}
 
-    CPPUNIT_ASSERT_EQUAL(OString("CBA"), aOut);
+void TestString::testReverseCodePoints() { // checks reverseCodePoints (reversal per Unicode code point)
+    CPPUNIT_ASSERT_EQUAL(OUString(), 
comphelper::string::reverseCodePoints("")); // empty input stays empty
+    CPPUNIT_ASSERT_EQUAL(OUString("cba"), 
comphelper::string::reverseCodePoints("abc")); // ASCII-only input
+    CPPUNIT_ASSERT_EQUAL(
+        OUString(u"w\U0010FFFFv\U00010000u"), // U+10000 and U+10FFFF each stay intact, only their order changes
+        comphelper::string::reverseCodePoints(u"u\U00010000v\U0010FFFFw"));
+    static sal_Unicode const malformed[] = {0xDC00, 0xD800}; // low surrogate followed by high surrogate
+    CPPUNIT_ASSERT_EQUAL(
+        OUString(u"\U00010000"), // the two lone surrogates, once swapped, form the well-formed pair for U+10000
+        comphelper::string::reverseCodePoints(OUString(malformed, 
std::size(malformed))));
 }
 
 void TestString::testSplit()
diff --git a/comphelper/source/misc/string.cxx 
b/comphelper/source/misc/string.cxx
index 3b875a78e9a4..d40b9136bcde 100644
--- a/comphelper/source/misc/string.cxx
+++ b/comphelper/source/misc/string.cxx
@@ -496,6 +496,15 @@ OString reverseString(std::string_view rStr)
     return tmpl_reverseString<OString, std::string_view, OStringBuffer>(rStr);
 }
 
+OUString reverseCodePoints(OUString const & str) { // returns a new string holding str's code points in reverse order
+    auto const len = str.getLength();
+    OUStringBuffer buf(len); // buffer constructed with the input's length
+    for (auto i = len; i != 0;) { // no increment expression: i is handed by pointer to iterateCodePoints below
+        buf.appendUtf32(str.iterateCodePoints(&i, -1)); // -1: step i back by one code point and append that code point
+    }
+    return buf.makeStringAndClear();
+}
+
 sal_Int32 indexOfAny(std::u16string_view rIn,
         sal_Unicode const*const pChars, sal_Int32 const nPos)
 {
diff --git a/compilerplugins/clang/stringliteralvar.cxx 
b/compilerplugins/clang/stringliteralvar.cxx
index 5ace384f1e16..fcd3690669e7 100644
--- a/compilerplugins/clang/stringliteralvar.cxx
+++ b/compilerplugins/clang/stringliteralvar.cxx
@@ -28,6 +28,7 @@
 #include <cassert>
 
 #include "check.hxx"
+#include "compat.hxx"
 #include "plugin.hxx"
 
 namespace
@@ -137,6 +138,10 @@ public:
                     return true;
                 }
                 auto const d = e1->getDecl();
+                if (isPotentiallyInitializedWithMalformedUtf16(d))
+                {
+                    return true;
+                }
                 if (!reportedArray_.insert(d).second)
                 {
                     return true;
@@ -188,6 +193,10 @@ public:
             return true;
         }
         auto const d = e->getDecl();
+        if (isPotentiallyInitializedWithMalformedUtf16(d))
+        {
+            return true;
+        }
         if (!reportedArray_.insert(d).second)
         {
             return true;
@@ -246,6 +255,61 @@ private:
         }
     }
 
+    // There is some confusion on the semantics of numeric-escape-sequences in string literals, see
+    // <https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2029r4.html> "Proposed resolution
+    // for core issues 411, 1656, and 2333; numeric and universal character escapes in character and
+    // string literals", so suppress warnings about arrays that are deliberately not written as
+    // UTF-16 string literals because they contain lone surrogates:
+    bool isPotentiallyInitializedWithMalformedUtf16(ValueDecl const* decl) 
const
+    { // returns true if decl's initializer is, or cannot be ruled out to be, malformed UTF-16
+        if (!decl->getType()->getArrayElementTypeNoTypeQual()->isChar16Type()) // NOTE(review): assumes decl has array type -- confirm at call sites
+        {
+            return false; // element type is not char16_t
+        }
+        auto const init = cast<VarDecl>(decl)->getAnyInitializer(); // NOTE(review): cast<> requires decl to be a VarDecl -- confirm callers guarantee that
+        if (init == nullptr)
+        {
+            return true; // no initializer available, so the contents are unknown
+        }
+        auto const list = dyn_cast<InitListExpr>(init);
+        if (list == nullptr) // initializer is not a braced list
+        {
+            // Assuming that the initializer already is a string literal, 
assume that that string
+            // literal has no issues with malformed UTF-16:
+            if (isDebugMode())
+            {
+                assert(isa<clang::StringLiteral>(init));
+            }
+            return false;
+        }
+        auto highSurrogate = false; // true while the previous element was a high surrogate still awaiting its low surrogate
+        for (auto const e : list->inits())
+        {
+            llvm::APSInt v;
+            if (!compat::EvaluateAsInt(e, v, compiler.getASTContext()))
+            {
+                return true; // element could not be evaluated as an integer, so its value is unknown
+            }
+            if (highSurrogate)
+            {
+                if (v < 0xDC00 || v > 0xDFFF)
+                {
+                    return true; // high surrogate not followed by a low surrogate
+                }
+                highSurrogate = false; // pair completed
+            }
+            else if (v >= 0xD800 && v <= 0xDBFF) // high surrogate: the next element must be a low surrogate
+            {
+                highSurrogate = true;
+            }
+            else if (v >= 0xDC00 && v <= 0xDFFF) // low surrogate without a preceding high surrogate
+            {
+                return true;
+            }
+        }
+        return highSurrogate; // a high surrogate as the last element is left unpaired
+    }
+
     std::set<Decl const*> reportedAutomatic_;
     std::set<Decl const*> reportedArray_;
 };
diff --git a/compilerplugins/clang/test/stringliteralvar.cxx 
b/compilerplugins/clang/test/stringliteralvar.cxx
index 6e181be025a8..b79ee95a0669 100644
--- a/compilerplugins/clang/test/stringliteralvar.cxx
+++ b/compilerplugins/clang/test/stringliteralvar.cxx
@@ -110,4 +110,25 @@ void f11(int nStreamType)
     (void)sStreamType;
 }
 
+extern sal_Unicode const extarr[1];
+
+sal_Unicode init();
+
+void f12()
+{
+    // Suppress warnings if the array contains a malformed sequence of UTF-16 
code units...:
+    static sal_Unicode const arr1[] = { 0xD800 }; // a single high surrogate with nothing after it
+    f(OUString(arr1, 1));
+    // ...Or potentially contains a malformed sequence of UTF-16 code units...:
+    f(OUString(extarr, 1)); // extarr is only declared above, its initializer is not visible here
+    sal_Unicode const arr2[] = { init() }; // element value comes from a function call
+    f(OUString(arr2, 1));
+    // ...But generate a warning if the array contains a well-formed sequence 
of UTF-16 code units
+    // containing surrogates:
+    // expected-error-re@+1 {{change type of variable 'arr3' from constant 
character array ('const sal_Unicode{{ ?}}[2]'{{( \(aka 'const 
char16_t\[2\]'\))?}}) to OUStringLiteral [loplugin:stringliteralvar]}}
+    static sal_Unicode const arr3[] = { 0xD800, 0xDC00 }; // high surrogate followed by low surrogate
+    // expected-note-re@+1 {{first passed into a '{{(rtl::)?}}OUString' 
constructor here [loplugin:stringliteralvar]}}
+    f(OUString(arr3, 2));
+}
+
 /* vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s 
cinkeys+=0=break: */
diff --git a/include/comphelper/string.hxx b/include/comphelper/string.hxx
index cbed62679f3c..a722e2cc428d 100644
--- a/include/comphelper/string.hxx
+++ b/include/comphelper/string.hxx
@@ -150,7 +150,7 @@ COMPHELPER_DLLPUBLIC sal_Int32 
getTokenCount(std::string_view rIn, char cTok);
 */
 COMPHELPER_DLLPUBLIC sal_Int32 getTokenCount(std::u16string_view rIn, 
sal_Unicode cTok);
 
-/** Reverse an OUString
+/** Reverse an OUString's UTF-16 code units.
 
   @param    rIn     the input OUString
   @return   the reversed input
@@ -164,6 +164,9 @@ COMPHELPER_DLLPUBLIC OUString 
reverseString(std::u16string_view rStr);
 */
 COMPHELPER_DLLPUBLIC OString reverseString(std::string_view rStr);
 
+/** Reverse an OUString's Unicode code points (a surrogate pair is kept together as one code point).
+    @param str the input OUString.  @return the input with its code points in reverse order. */
+COMPHELPER_DLLPUBLIC OUString reverseCodePoints(OUString const & str);
 
 namespace detail
 {
diff --git a/sdext/source/pdfimport/tree/drawtreevisiting.cxx 
b/sdext/source/pdfimport/tree/drawtreevisiting.cxx
index 5a811f20eede..13034dc9a19e 100644
--- a/sdext/source/pdfimport/tree/drawtreevisiting.cxx
+++ b/sdext/source/pdfimport/tree/drawtreevisiting.cxx
@@ -123,7 +123,7 @@ void DrawXmlEmitter::visit( TextElement& elem, const 
std::list< std::unique_ptr<
     }
 
     if (isRTL)  // If so, reverse string
-        str = ::comphelper::string::reverseString(str);
+        str = ::comphelper::string::reverseCodePoints(str);
 
     m_rEmitContext.rEmitter.beginTag( "text:span", aProps );
 
diff --git a/sdext/source/pdfimport/tree/writertreevisiting.cxx 
b/sdext/source/pdfimport/tree/writertreevisiting.cxx
index 060990f003b9..9ba6efd4265e 100644
--- a/sdext/source/pdfimport/tree/writertreevisiting.cxx
+++ b/sdext/source/pdfimport/tree/writertreevisiting.cxx
@@ -112,7 +112,7 @@ void WriterXmlEmitter::visit( TextElement& elem, const 
std::list< std::unique_pt
     }
 
     if (isRTL)  // If so, reverse string
-        str = ::comphelper::string::reverseString(str);
+        str = ::comphelper::string::reverseCodePoints(str);
 
     m_rEmitContext.rEmitter.beginTag( "text:span", aProps );
 

Reply via email to