vcl/qa/cppunit/pdfexport/pdfexport2.cxx |    2 
 vcl/source/pdf/PDFiumLibrary.cxx        |   74 +++++++++++++++++++++++++++++---
 2 files changed, 70 insertions(+), 6 deletions(-)

New commits:
commit 25550b2daf29a4eb766dd22692c43b7be354a87c
Author:     Caolán McNamara <[email protected]>
AuthorDate: Wed Oct 8 21:23:57 2025 +0100
Commit:     Miklos Vajna <[email protected]>
CommitDate: Thu Oct 9 13:41:00 2025 +0200

    use FPDFText_GetUnicode to get text
    
    instead of FPDFTextObj_GetText, which is returning 0x2
    for some hyphens. If we use the slightly lower-level
    APIs we can get info about substituted hyphens.
    
    Change-Id: I26efa9f1acb5ba819b63034399da4f1961373f13
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/192081
    Tested-by: Jenkins CollaboraOffice <[email protected]>
    Reviewed-by: Miklos Vajna <[email protected]>

diff --git a/vcl/qa/cppunit/pdfexport/pdfexport2.cxx 
b/vcl/qa/cppunit/pdfexport/pdfexport2.cxx
index 0713269c8f20..47c0a7f1fdb1 100644
--- a/vcl/qa/cppunit/pdfexport/pdfexport2.cxx
+++ b/vcl/qa/cppunit/pdfexport/pdfexport2.cxx
@@ -5528,7 +5528,7 @@ CPPUNIT_TEST_FIXTURE(PdfExportTest2, 
testTdf162194SoftHyphen)
 
     CPPUNIT_ASSERT_EQUAL(u"Waffle"_ustr, aText.at(0).trim());
     CPPUNIT_ASSERT_EQUAL(u"AAA Waf"_ustr, aText.at(1).trim());
-    CPPUNIT_ASSERT_EQUAL(u""_ustr, aText.at(2).trim());
+    CPPUNIT_ASSERT_EQUAL(u"-"_ustr, aText.at(2).trim());
     CPPUNIT_ASSERT_EQUAL(u"fle"_ustr, aText.at(3).trim());
 }
 
diff --git a/vcl/source/pdf/PDFiumLibrary.cxx b/vcl/source/pdf/PDFiumLibrary.cxx
index 17265501be58..381080b583d9 100644
--- a/vcl/source/pdf/PDFiumLibrary.cxx
+++ b/vcl/source/pdf/PDFiumLibrary.cxx
@@ -466,8 +466,6 @@ public:
     PDFiumTextPageImpl(FPDF_TEXTPAGE pTextPage);
     ~PDFiumTextPageImpl();
 
-    FPDF_TEXTPAGE getPointer() { return mpTextPage; }
-
     int countChars() override;
     unsigned int getUnicode(int index) override;
     std::unique_ptr<PDFiumSearchHandle> findStart(const OUString& rFindWhat, 
PDFFindFlags nFlags,
@@ -475,6 +473,73 @@ public:
 
     /// Returned rect is no longer upside down and is in mm100.
     basegfx::B2DRectangle getCharBox(int nIndex, double fPageHeight) override;
+
+    OUString getText(FPDF_PAGEOBJECT pPageObject)
+    {
+        OUStringBuffer aResult; // text gathered for pPageObject, one char at a time
+        // containsPreChar: previous char belonged to pPageObject; addLineFeed: a foreign non-space char was seen since
+        bool containsPreChar = false;
+        bool addLineFeed = false;
+        double posY(0), originX(0.0), originY(0.0); // originX is fetched below but only originY is compared
+        // posY is assigned only inside the line-break branch below
+        // FPDFTextObj_GetText also does a similar loop over the entire
+        // contents of the text page; this is intended to be the equivalent
+        // of that except for (currently) added recovery of hyphens.
+        int count = FPDFText_CountChars(mpTextPage);
+        for (int i = 0; i < count; ++i)
+        {
+            FPDF_PAGEOBJECT pOwner = FPDFText_GetTextObject(mpTextPage, i);
+            sal_Unicode cUnicode = FPDFText_GetUnicode(mpTextPage, i);
+            if (pOwner == pPageObject) // char belongs to the requested object
+            {
+                FPDFText_GetCharOrigin(mpTextPage, i, &originX, &originY);
+                // separator is appended only when re-entering the object after a foreign char, at a Y differing from posY
+                if (fabs(posY - originY) > 0 && !containsPreChar && 
addLineFeed)
+                {
+                    posY = originY;
+                    if (!aResult.isEmpty())
+                        aResult.append("
");
+                }
+                containsPreChar = true;
+                addLineFeed = false;
+                // translate the char: 0 is dropped, 0x2 may be a substituted hyphen, everything else is copied
+                switch (cUnicode)
+                {
+                    case 0: // no code point reported: log it and append nothing
+                        SAL_INFO("vcl.filter", "PDFiumImpl: cannot get unicode 
for char");
+                        break;
+                    default: // ordinary char: append unchanged
+                        aResult.append(cUnicode);
+                        break;
+                    case 0x2: // oddly pdfium replaces some '-' with 2.
+                    {
+                        int isHyphen = FPDFText_IsHyphen(mpTextPage, i);
+                        if (isHyphen == 1)
+                            aResult.append('-');
+                        else // any other result: keep the raw char; -1 is additionally warned about as a failure
+                        {
+                            SAL_WARN_IF(isHyphen == -1, "vcl.filter",
+                                        "PDFiumImpl: FPDFText_IsHyphen 
failure");
+                            aResult.append(cUnicode);
+                        }
+                    }
+                    break;
+                }
+            }
+            else if (cUnicode == ' ' && containsPreChar) // space not owned by pPageObject, directly after one of its chars
+            {
+                aResult.append(' ');
+                containsPreChar = false;
+                addLineFeed = false;
+            }
+            else // any other foreign char: the next char of pPageObject may need a separator first
+            {
+                containsPreChar = false;
+                addLineFeed = true;
+            }
+        }
+        return aResult.toString();
+    }
 };
 
 class PDFiumSignatureImpl final : public PDFiumSignature
@@ -1077,9 +1142,8 @@ 
PDFiumPageObjectImpl::PDFiumPageObjectImpl(FPDF_PAGEOBJECT pPageObject)
 OUString PDFiumPageObjectImpl::getText(std::unique_ptr<PDFiumTextPage> const& 
rTextPage)
 {
     auto pTextPage = static_cast<PDFiumTextPageImpl*>(rTextPage.get()); // unchecked downcast; presumably every PDFiumTextPage is a PDFiumTextPageImpl - TODO confirm
-    return getUnicodeString([this, pTextPage](FPDF_WCHAR* buffer, unsigned 
long length) {
-        return FPDFTextObj_GetText(mpPageObject, pTextPage->getPointer(), 
buffer, length);
-    });
+    // FPDFTextObj_GetText may report some hyphens as 0x2, so the text is gathered char by char via the text page instead
+    return pTextPage->getText(mpPageObject);
 }
 
 PDFPageObjectType PDFiumPageObjectImpl::getType()

Reply via email to