sdext/source/pdfimport/filterdet.cxx | 61 ++++++++++++++++++++++++++++++++- sdext/source/pdfimport/filterdet.hxx | 37 +++++++++++++++++++- sdext/source/pdfimport/pdfiadaptor.cxx | 8 +++- 3 files changed, 103 insertions(+), 3 deletions(-)
New commits: commit 9b401a09c5cde0250457f45d9540ee15ab8e43f9 Author: Dr. David Alan Gilbert <d...@treblig.org> AuthorDate: Thu Apr 10 00:53:32 2025 +0100 Commit: Tomaž Vajngerl <qui...@gmail.com> CommitDate: Mon Apr 21 08:42:50 2025 +0200 tdf#55425, tdf#66580: sdext,pdfimport: Start detecting newer hybrids There are two mechanisms in Hybrid PDFs to mark the data: one uses a non-standardised entry in the PDF trailer; another, newer one uses a well-known name for a standard PDF embedded file. The import filter currently just detects the non-standard trailer addition, not the embedded file case; however, the export code doesn't write the old trailer for some formats, e.g. PDF/UA, and there's also a desire to get rid of the non-standard stuff (see tdf#66580). Add a detection mechanism for the newer embeddedFile form; keep the old trailer detection to be able to load ancient files that were generated before the embeddedFile format was added. This is the first step of opening the file and wiring up PDFium. Change-Id: I44c0ffeac5e62dc1345c604bd695a2c2da37260a Reviewed-on: https://gerrit.libreoffice.org/c/core/+/183765 Reviewed-by: Tomaž Vajngerl <qui...@gmail.com> Tested-by: Jenkins diff --git a/sdext/source/pdfimport/filterdet.cxx b/sdext/source/pdfimport/filterdet.cxx index 0654cc8beec6..8560679fd5bd 100644 --- a/sdext/source/pdfimport/filterdet.cxx +++ b/sdext/source/pdfimport/filterdet.cxx @@ -37,6 +37,7 @@ #include <cppuhelper/supportsservice.hxx> #include <comphelper/diagnose_ex.hxx> #include <tools/stream.hxx> +#include <vcl/filter/PDFiumLibrary.hxx> #include <memory> #include <utility> #include <string.h> @@ -284,6 +285,55 @@ bool copyToTemp(uno::Reference<io::XInputStream> const& xInput, oslFileHandle& r } // end anonymous namespace +// Check for a hybrid that is stored using the newer method, the standard PDF embedded file +// with a name of Original.o** and the matching MIME type. For this to match there must +// be exactly one embedded file. 
+// This uses PDFium to do the legwork. +uno::Reference<io::XStream> getEmbeddedFile(const OUString& rInPDFFileURL, + OUString& /*rOutMimetype*/, + OUString& /*io_rPwd*/, + const uno::Reference<uno::XComponentContext>& /*xContext*/, + const uno::Sequence<beans::PropertyValue>& /*rFilterData*/, + bool /*bMayUseUI*/) +{ + uno::Reference<io::XStream> xEmbed; + OUString aSysUPath; + auto pPdfium = vcl::pdf::PDFiumLibrary::get(); + if (pPdfium) + { + // Needs rewriting more C++ with autocleanup + // Start by mmaping the file because our pdfium wrapper only wraps the LoadMemDocument + oslFileHandle fileHandle = nullptr; + SAL_INFO("sdext.pdfimport", "getEmbeddedFile prior to openFile" << aSysUPath); + if (osl_openFile(rInPDFFileURL.pData, &fileHandle, osl_File_OpenFlag_Read) + != osl_File_E_None) + { + return xEmbed; + } + + sal_uInt64 nFileSize; + if (osl_getFileSize(fileHandle, &nFileSize) != osl_File_E_None) + { + osl_closeFile(fileHandle); + return xEmbed; + } + + void* pMemRawPdf; + if (osl_mapFile(fileHandle, &pMemRawPdf, nFileSize, 0, osl_File_MapFlag_RandomAccess) + != osl_File_E_None) + { + osl_closeFile(fileHandle); + return xEmbed; + } + + auto pPdfiumDoc = pPdfium->openDocument(pMemRawPdf, nFileSize, OString(/*TODO Pass*/)); + + osl_unmapMappedFile(fileHandle, pMemRawPdf, nFileSize); + osl_closeFile(fileHandle); + } + + return xEmbed; +} // XExtendedFilterDetection OUString SAL_CALL PDFDetector::detect( uno::Sequence< beans::PropertyValue >& rFilterData ) { @@ -338,7 +388,16 @@ OUString SAL_CALL PDFDetector::detect( uno::Sequence< beans::PropertyValue >& rF } OUString aEmbedMimetype; - xEmbedStream = getAdditionalStream(aURL, aEmbedMimetype, aPassword, m_xContext, rFilterData, false); + + // Try testing for the newer embedded file format + xEmbedStream = getEmbeddedFile(aURL, aEmbedMimetype, aPassword, m_xContext, rFilterData, false); + + if (aEmbedMimetype.isEmpty()) + { + // No success with embedd file, try the older trailer based AdditionalStream + 
xEmbedStream = + getAdditionalStream(aURL, aEmbedMimetype, aPassword, m_xContext, rFilterData, false); + } if (aFileHandle) osl_removeFile(aURL.pData); diff --git a/sdext/source/pdfimport/filterdet.hxx b/sdext/source/pdfimport/filterdet.hxx index 48bc8ca4e12a..d5e14b2b7055 100644 --- a/sdext/source/pdfimport/filterdet.hxx +++ b/sdext/source/pdfimport/filterdet.hxx @@ -56,7 +56,7 @@ public: }; -/** Retrieve embedded substream from PDF file +/** Retrieve embedded substream from PDF file using the old trailer system Useful e.g. for hybrid PDF @@ -91,6 +91,41 @@ css::uno::Reference< css::io::XStream > const css::uno::Sequence< css::beans::PropertyValue >& rFilterData, bool bMayUseUI ); +/** Retrieve embedded substream from PDF file using the standard EmbeddedFile + + Useful e.g. for hybrid PDF + + @param rPDFFile + URI of the pdf file + + @param o_rOutMimetype + Output parameter. Receives the mime type of the + substream. Used to distinguish between + draw/impress/writer/calc during import + + @param o_rOutPwd + In/Out parameter. 
If given password is wrong, user is queried + for another (if bMayUseUI is true) + + @param xContext + Component context + + @param rFilterData + Basically used to pass on XFilter::filter properties (function + uses it to retrieve interaction handler) + + @param bMayUseUI + When false, no dialog is opened to query user for alternate + password + */ +css::uno::Reference< css::io::XStream > + getEmbeddedFile( const OUString& rPDFFile, + OUString& o_rOutMimetype, + OUString& io_rOutPwd, + const css::uno::Reference< css::uno::XComponentContext >& xContext, + const css::uno::Sequence< css::beans::PropertyValue >& rFilterData, + bool bMayUseUI ); + bool checkDocChecksum( const OUString& rInPDFFileURL, sal_uInt32 nBytes, diff --git a/sdext/source/pdfimport/pdfiadaptor.cxx b/sdext/source/pdfimport/pdfiadaptor.cxx index 136f6d586d61..45250a911884 100644 --- a/sdext/source/pdfimport/pdfiadaptor.cxx +++ b/sdext/source/pdfimport/pdfiadaptor.cxx @@ -116,7 +116,13 @@ sal_Bool SAL_CALL PDFIHybridAdaptor::filter( const uno::Sequence< beans::Propert { OUString aEmbedMimetype; OUString aOrgPwd( aPwd ); - xSubStream = getAdditionalStream( aURL, aEmbedMimetype, aPwd, m_xContext, rFilterData, true ); + + xSubStream = getEmbeddedFile(aURL, aEmbedMimetype, aPwd, m_xContext, + rFilterData, true); + if (aEmbedMimetype.isEmpty()) { + xSubStream = getAdditionalStream(aURL, aEmbedMimetype, aPwd, m_xContext, + rFilterData, true); + } if( aOrgPwd != aPwd ) bAddPwdProp = true; }