vcl/qa/cppunit/pdfexport/pdfexport2.cxx | 2 vcl/source/pdf/PDFiumLibrary.cxx | 74 +++++++++++++++++++++++++++++--- 2 files changed, 70 insertions(+), 6 deletions(-)
New commits: commit 25550b2daf29a4eb766dd22692c43b7be354a87c Author: Caolán McNamara <[email protected]> AuthorDate: Wed Oct 8 21:23:57 2025 +0100 Commit: Miklos Vajna <[email protected]> CommitDate: Thu Oct 9 13:41:00 2025 +0200 use FPDFText_GetUnicode to get text instead of FPDFTextObj_GetText, which is returning 0x2 for some hyphens. If we use the slightly lower level apis we can get info as to substituted hyphens. Change-Id: I26efa9f1acb5ba819b63034399da4f1961373f13 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/192081 Tested-by: Jenkins CollaboraOffice <[email protected]> Reviewed-by: Miklos Vajna <[email protected]> diff --git a/vcl/qa/cppunit/pdfexport/pdfexport2.cxx b/vcl/qa/cppunit/pdfexport/pdfexport2.cxx index 0713269c8f20..47c0a7f1fdb1 100644 --- a/vcl/qa/cppunit/pdfexport/pdfexport2.cxx +++ b/vcl/qa/cppunit/pdfexport/pdfexport2.cxx @@ -5528,7 +5528,7 @@ CPPUNIT_TEST_FIXTURE(PdfExportTest2, testTdf162194SoftHyphen) CPPUNIT_ASSERT_EQUAL(u"Waffle"_ustr, aText.at(0).trim()); CPPUNIT_ASSERT_EQUAL(u"AAA Waf"_ustr, aText.at(1).trim()); - CPPUNIT_ASSERT_EQUAL(u""_ustr, aText.at(2).trim()); + CPPUNIT_ASSERT_EQUAL(u"-"_ustr, aText.at(2).trim()); CPPUNIT_ASSERT_EQUAL(u"fle"_ustr, aText.at(3).trim()); } diff --git a/vcl/source/pdf/PDFiumLibrary.cxx b/vcl/source/pdf/PDFiumLibrary.cxx index 17265501be58..381080b583d9 100644 --- a/vcl/source/pdf/PDFiumLibrary.cxx +++ b/vcl/source/pdf/PDFiumLibrary.cxx @@ -466,8 +466,6 @@ public: PDFiumTextPageImpl(FPDF_TEXTPAGE pTextPage); ~PDFiumTextPageImpl(); - FPDF_TEXTPAGE getPointer() { return mpTextPage; } - int countChars() override; unsigned int getUnicode(int index) override; std::unique_ptr<PDFiumSearchHandle> findStart(const OUString& rFindWhat, PDFFindFlags nFlags, @@ -475,6 +473,73 @@ public: /// Returned rect is no longer upside down and is in mm100. 
basegfx::B2DRectangle getCharBox(int nIndex, double fPageHeight) override; + + OUString getText(FPDF_PAGEOBJECT pPageObject) + { + OUStringBuffer aResult; + + bool containsPreChar = false; + bool addLineFeed = false; + double posY(0), originX(0.0), originY(0.0); + + // FPDFTextObj_GetText also does a similar loop over the entire + // contents of the text page, this is intended to be the equivalent + // of that except for (currently) added recovery of hyphens. + int count = FPDFText_CountChars(mpTextPage); + for (int i = 0; i < count; ++i) + { + FPDF_PAGEOBJECT pOwner = FPDFText_GetTextObject(mpTextPage, i); + sal_Unicode cUnicode = FPDFText_GetUnicode(mpTextPage, i); + if (pOwner == pPageObject) + { + FPDFText_GetCharOrigin(mpTextPage, i, &originX, &originY); + + if (fabs(posY - originY) > 0 && !containsPreChar && addLineFeed) + { + posY = originY; + if (!aResult.isEmpty()) + aResult.append(" "); + } + containsPreChar = true; + addLineFeed = false; + + switch (cUnicode) + { + case 0: + SAL_INFO("vcl.filter", "PDFiumImpl: cannot get unicode for char"); + break; + default: + aResult.append(cUnicode); + break; + case 0x2: // oddly pdfium replaces some '-' with 2. 
+ { + int isHyphen = FPDFText_IsHyphen(mpTextPage, i); + if (isHyphen == 1) + aResult.append('-'); + else + { + SAL_WARN_IF(isHyphen == -1, "vcl.filter", + "PDFiumImpl: FPDFText_IsHyphen failure"); + aResult.append(cUnicode); + } + } + break; + } + } + else if (cUnicode == ' ' && containsPreChar) + { + aResult.append(' '); + containsPreChar = false; + addLineFeed = false; + } + else + { + containsPreChar = false; + addLineFeed = true; + } + } + return aResult.toString(); + } }; class PDFiumSignatureImpl final : public PDFiumSignature @@ -1077,9 +1142,8 @@ PDFiumPageObjectImpl::PDFiumPageObjectImpl(FPDF_PAGEOBJECT pPageObject) OUString PDFiumPageObjectImpl::getText(std::unique_ptr<PDFiumTextPage> const& rTextPage) { auto pTextPage = static_cast<PDFiumTextPageImpl*>(rTextPage.get()); - return getUnicodeString([this, pTextPage](FPDF_WCHAR* buffer, unsigned long length) { - return FPDFTextObj_GetText(mpPageObject, pTextPage->getPointer(), buffer, length); - }); + // FPDFTextObj_GetText may report some hyphens as 0x2 + return pTextPage->getText(mpPageObject); } PDFPageObjectType PDFiumPageObjectImpl::getType()
