sdext/source/pdfimport/test/testdocs/tdf104597_textrun.pdf |binary
 sdext/source/pdfimport/test/tests.cxx                      |   44 +++++++++++++
 sdext/source/pdfimport/tree/drawtreevisiting.cxx           |   17 +----
 sdext/source/pdfimport/tree/pdfiprocessor.cxx              |   18 -----
 4 files changed, 48 insertions(+), 31 deletions(-)

New commits:
commit 69e9925ded584113e52f84ef0ed7c224079fa061
Author:     Kevin Suo <suokunl...@126.com>
AuthorDate: Tue Oct 11 10:04:16 2022 +0800
Commit:     Thorsten Behrens <thorsten.behr...@allotropia.de>
CommitDate: Thu Oct 13 21:38:12 2022 +0200

    sdext.pdfimport: resolves tdf#104597: RTL script text runs are reversed
    
    For the simple Arabic string: ٱلسَّلَامُ عَلَيْك, the xpdfimport binary 
generates the
    follwing (drawchar) sequences:
    كَ
    يْ
    لَ
    عَ
    مُ
    ا
    لَ
    سَّ
    ل
    ٱ
    (i.e., in reversed order, one character by one character).
    
    Before this patch, after pdfimport the text shows up as لَسَّلٱ كَيْلَعَ 
مُا, which is reversed.
    
    It was surposed to combine these characters into text frames in
    DrawXmlOptimizer::optimizeTextElements(Element& rParent) 
(sdext/source/pdfimport/\
    tree/drawtreevisiting.cxx:677), but actually it was not combined 
successfully there.
    The single characters were then passed to 
sdext/source/pdfimport/tree/drawtreevisiting\
    .cxx:105, one by one, in the hope that the strings could be mirrored. The 
mirroring
    failed, because one single character, even after mirroring, always equals 
itself.
    
    The DrawXmlOptimizer::optimizeTextElements failed to combine the characters 
into
    one text frame, because the condition:
    (rCurGC.Transformation == rNextGC.Transformation || notTransformed(rNextGC))
    would never be true, as at least its horizontal position is different. A 
further analysis
    indicates that we do not need to check the transformation here at all, as 
this is an
    optimizer for a TextElement and in case a character is transformed then it 
would already
    be in a different draw element (thus will never be combined with this 
TextElement).
    
    After the fix of DrawXmlOptimizer::optimizeTextElements which now 
successfully
    combines the characters, there is another issue in the old 
PDFIProcessor::mirrorString
    function. It seems to mirror the characters within a word, but if a string 
contains
    two words, then the two words are not successfully switched (e.g. for 
string "abc def"
    it produces "cba fed" rather than "fed cba"),  which is not suitable for
    the case of RTL which requires all the characters been reversed. Fix this 
by using
    comphelper::string::reverseString.
    
    Change-Id: Ifa210836f1d6666dd56205ff0d243adfb4114794
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/141231
    Tested-by: Jenkins
    Reviewed-by: Thorsten Behrens <thorsten.behr...@allotropia.de>

diff --git a/sdext/source/pdfimport/test/testdocs/tdf104597_textrun.pdf 
b/sdext/source/pdfimport/test/testdocs/tdf104597_textrun.pdf
new file mode 100644
index 000000000000..dcee96aa3169
Binary files /dev/null and 
b/sdext/source/pdfimport/test/testdocs/tdf104597_textrun.pdf differ
diff --git a/sdext/source/pdfimport/test/tests.cxx 
b/sdext/source/pdfimport/test/tests.cxx
index 193ec2227620..25c12a23901c 100644
--- a/sdext/source/pdfimport/test/tests.cxx
+++ b/sdext/source/pdfimport/test/tests.cxx
@@ -29,6 +29,7 @@
 #include <rtl/math.hxx>
 #include <osl/file.hxx>
 #include <comphelper/sequence.hxx>
+#include <comphelper/string.hxx>
 
 #include <cppunit/TestAssert.h>
 #include <cppunit/extensions/HelperMacros.h>
@@ -786,6 +787,48 @@ namespace
 #endif
         }
 
+        void testTdf104597_textrun()
+        {
+#if HAVE_FEATURE_POPPLER
+            rtl::Reference<pdfi::PDFIRawAdaptor> xAdaptor(new 
pdfi::PDFIRawAdaptor(OUString(), getComponentContext()));
+            xAdaptor->setTreeVisitorFactory(createDrawTreeVisitorFactory());
+
+            OString aOutput;
+            CPPUNIT_ASSERT_MESSAGE("Converting PDF to ODF XML",
+                
xAdaptor->odfConvert(m_directories.getURLFromSrc(u"/sdext/source/pdfimport/test/testdocs/tdf104597_textrun.pdf"),
+                    new OutputWrapString(aOutput),
+                    nullptr));
+
+            // std::cout << aOutput << std::endl;
+            xmlDocUniquePtr pXmlDoc(xmlParseDoc(reinterpret_cast<xmlChar const 
*>(aOutput.getStr())));
+
+            // Test for امُ عَلَيْكَ
+            // TODO: How to get the "عَلَيْكَ" in xpath, as shown after the 
<text:s> tag?
+            OString xpath = 
"//draw:frame[@draw:transform='matrix(917.222222222222 0 0 917.222222222222 
14821.9583333333 2159.23861112778)']/draw:text-box/text:p/text:span";
+            OUString sContent = getXPathContent(pXmlDoc, xpath); // 
u"\nا\nُ\nم\n"
+            CPPUNIT_ASSERT_EQUAL(OUString(u"اُم"), sContent.replaceAll("\n", 
""));
+
+            // Test for ٱلَّسَل‬ . It appears in the 3rd frame, i.e. after the 
امُ عَلَيْكَ which is in the 2nd frame (from left to right)
+            // thus these two frames together appear as ٱلَّسَل امُ عَلَيْكَ 
in Draw‬.
+            xpath = "//draw:frame[@draw:transform='matrix(917.222222222222 0 0 
917.222222222222 17420.1666666667 
2159.23861112778)']/draw:text-box/text:p/text:span";
+            sContent = getXPathContent(pXmlDoc, xpath);
+            CPPUNIT_ASSERT_EQUAL(OUString(u"ٱلَّسَل"), 
sContent.replaceAll("\n", ""));
+
+            // Test for "LibreOffice LTR"
+            // TODO: How to get the "LTR" as shown after the <text:s> tag?
+            xpath = "//draw:frame[@draw:transform='matrix(917.222222222222 0 0 
917.222222222222 12779.375 5121.79583335)']/draw:text-box/text:p/text:span";
+            sContent = getXPathContent(pXmlDoc, xpath);
+            CPPUNIT_ASSERT_EQUAL(OUString(u"LibreOffice"), 
sContent.replaceAll("\n", ""));
+
+            /* Test for Chinese characters */
+            // Use last() instead of matrix below, because the matrix may be 
different on different OS due to fallback of Chinese fonts.
+            xpath = "//draw:frame[last()]/draw:text-box/text:p/text:span";
+            sContent = getXPathContent(pXmlDoc, xpath);
+            CPPUNIT_ASSERT_EQUAL(OUString(u"中文测试,中文"), 
sContent.replaceAll("\n", ""));
+#endif
+        }
+
+
         CPPUNIT_TEST_SUITE(PDFITest);
         CPPUNIT_TEST(testXPDFParser);
         CPPUNIT_TEST(testOdfWriterExport);
@@ -797,6 +840,7 @@ namespace
         CPPUNIT_TEST(testTdf78427_FontFeatures);
         CPPUNIT_TEST(testTdf78427_FontWeight_MyraidProSemibold);
         CPPUNIT_TEST(testTdf143959_nameFromFontFile);
+        CPPUNIT_TEST(testTdf104597_textrun);
         CPPUNIT_TEST_SUITE_END();
     };
 
diff --git a/sdext/source/pdfimport/tree/drawtreevisiting.cxx 
b/sdext/source/pdfimport/tree/drawtreevisiting.cxx
index 51ef5f232d85..9f760150343c 100644
--- a/sdext/source/pdfimport/tree/drawtreevisiting.cxx
+++ b/sdext/source/pdfimport/tree/drawtreevisiting.cxx
@@ -32,6 +32,7 @@
 #include <com/sun/star/i18n/CharacterClassification.hpp>
 #include <com/sun/star/i18n/ScriptType.hpp>
 #include <com/sun/star/i18n/DirectionProperty.hpp>
+#include <comphelper/string.hxx>
 
 #include <string.h>
 #include <string_view>
@@ -122,7 +123,7 @@ void DrawXmlEmitter::visit( TextElement& elem, const 
std::list< std::unique_ptr<
     }
 
     if (isRTL)  // If so, reverse string
-        str = PDFIProcessor::mirrorString( str );
+        str = ::comphelper::string::reverseString(str);
 
     m_rEmitContext.rEmitter.beginTag( "text:span", aProps );
 
@@ -656,15 +657,6 @@ static bool isSpaces(TextElement* pTextElem)
     return true;
 }
 
-static bool notTransformed(const GraphicsContext& GC)
-{
-    return
-        rtl::math::approxEqual(GC.Transformation.get(0,0), 100.00) &&
-        GC.Transformation.get(1,0) == 0.00 &&
-        GC.Transformation.get(0,1) == 0.00 &&
-        rtl::math::approxEqual(GC.Transformation.get(1,1), -100.00);
-}
-
 void DrawXmlOptimizer::optimizeTextElements(Element& rParent)
 {
     if( rParent.Children.empty() ) // this should not happen
@@ -705,13 +697,12 @@ void DrawXmlOptimizer::optimizeTextElements(Element& 
rParent)
                 // line and space optimization; works only in strictly 
horizontal mode
 
                 // concatenate consecutive text elements unless there is a
-                // font or text color or matrix change, leave a new span in 
that case
+                // font or text color change, leave a new span in that case
                 if( (pCur->FontId == pNext->FontId || isSpaces(pNext)) &&
                     rCurGC.FillColor.Red == rNextGC.FillColor.Red &&
                     rCurGC.FillColor.Green == rNextGC.FillColor.Green &&
                     rCurGC.FillColor.Blue == rNextGC.FillColor.Blue &&
-                    rCurGC.FillColor.Alpha == rNextGC.FillColor.Alpha &&
-                    (rCurGC.Transformation == rNextGC.Transformation || 
notTransformed(rNextGC))
+                    rCurGC.FillColor.Alpha == rNextGC.FillColor.Alpha
                     )
                 {
                     pCur->updateGeometryWith( pNext );
diff --git a/sdext/source/pdfimport/tree/pdfiprocessor.cxx 
b/sdext/source/pdfimport/tree/pdfiprocessor.cxx
index 40ee39705c25..164e06d85bde 100644
--- a/sdext/source/pdfimport/tree/pdfiprocessor.cxx
+++ b/sdext/source/pdfimport/tree/pdfiprocessor.cxx
@@ -695,24 +695,6 @@ void PDFIProcessor::sortElements(Element* pEle)
     pEle->Children.sort(lr_tb_sort);
 }
 
-// helper method: get a mirrored string
-OUString PDFIProcessor::mirrorString( const OUString& i_rString )
-{
-    const sal_Int32 nLen = i_rString.getLength();
-    OUStringBuffer aMirror( nLen );
-
-    sal_Int32 i = 0;
-    while(i < nLen)
-    {
-        // read one code point
-        const sal_uInt32 nCodePoint = i_rString.iterateCodePoints( &i );
-
-        // and append it mirrored
-        aMirror.appendUtf32( GetMirroredChar(nCodePoint) );
-    }
-    return aMirror.makeStringAndClear();
-}
-
 }
 
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */

Reply via email to