sdext/source/pdfimport/filterdet.cxx   |   61 ++++++++++++++++++++++++++++++++-
 sdext/source/pdfimport/filterdet.hxx   |   37 +++++++++++++++++++-
 sdext/source/pdfimport/pdfiadaptor.cxx |    8 +++-
 3 files changed, 103 insertions(+), 3 deletions(-)

New commits:
commit 9b401a09c5cde0250457f45d9540ee15ab8e43f9
Author:     Dr. David Alan Gilbert <d...@treblig.org>
AuthorDate: Thu Apr 10 00:53:32 2025 +0100
Commit:     Tomaž Vajngerl <qui...@gmail.com>
CommitDate: Mon Apr 21 08:42:50 2025 +0200

    tdf#55425, tdf#66580: sdext,pdfimport: Start detecting newer hybrids
    
    There are two mechanisms in Hybrid PDFs to mark the data: one
    uses a non-standardised entry in the PDF trailer; the other, newer one uses
    a well-known name for a standard PDF embedded file.
    
    The import filter currently just detects the non-standard trailer
    addition, not the embedded file case. However, the export code
    doesn't write the old trailer for some formats, e.g. PDF/UA,
    and there's also a desire to get rid of the non-standard stuff
    (see tdf#66580).
    
    Add a detection mechanism for the newer embeddedFile form, but
    keep the old trailer detection to be able to load ancient files
    that were generated before the embeddedFile format was added.
    
    This is the first step of opening the file and wiring up
    PDFium.
    
    Change-Id: I44c0ffeac5e62dc1345c604bd695a2c2da37260a
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/183765
    Reviewed-by: Tomaž Vajngerl <qui...@gmail.com>
    Tested-by: Jenkins

diff --git a/sdext/source/pdfimport/filterdet.cxx 
b/sdext/source/pdfimport/filterdet.cxx
index 0654cc8beec6..8560679fd5bd 100644
--- a/sdext/source/pdfimport/filterdet.cxx
+++ b/sdext/source/pdfimport/filterdet.cxx
@@ -37,6 +37,7 @@
 #include <cppuhelper/supportsservice.hxx>
 #include <comphelper/diagnose_ex.hxx>
 #include <tools/stream.hxx>
+#include <vcl/filter/PDFiumLibrary.hxx>
 #include <memory>
 #include <utility>
 #include <string.h>
@@ -284,6 +285,55 @@ bool copyToTemp(uno::Reference<io::XInputStream> const& 
xInput, oslFileHandle& r
 
 } // end anonymous namespace
 
+// Check for a hybrid that is stored using the newer method, the standard PDF embedded file
+// with a name of Original.o** and the matching MIME type.  For this to match there must
+// be exactly one embedded file.
+// This uses PDFium to do the legwork.
+//
+// NOTE: this is only the first step. So far the function maps the file, opens it with
+// PDFium and then always returns an empty reference; the MIME type and password
+// parameters are not touched yet (hence the commented-out parameter names).
+//
+// @param rInPDFFileURL  URL of the PDF file to inspect
+// @return the embedded stream; currently always an empty reference
+uno::Reference<io::XStream> getEmbeddedFile(const OUString& rInPDFFileURL,
+                                            OUString& /*rOutMimetype*/,
+                                            OUString& /*io_rPwd*/,
+                                            const uno::Reference<uno::XComponentContext>& /*xContext*/,
+                                            const uno::Sequence<beans::PropertyValue>& /*rFilterData*/,
+                                            bool /*bMayUseUI*/)
+{
+    uno::Reference<io::XStream> xEmbed;
+
+    auto pPdfium = vcl::pdf::PDFiumLibrary::get();
+    if (!pPdfium)
+        return xEmbed;
+
+    // Needs rewriting more C++ with autocleanup
+    // Start by mmaping the file because our pdfium wrapper only wraps the LoadMemDocument
+    oslFileHandle fileHandle = nullptr;
+    SAL_INFO("sdext.pdfimport", "getEmbeddedFile prior to openFile " << rInPDFFileURL);
+    if (osl_openFile(rInPDFFileURL.pData, &fileHandle, osl_File_OpenFlag_Read)
+        != osl_File_E_None)
+    {
+        return xEmbed;
+    }
+
+    sal_uInt64 nFileSize = 0;
+    if (osl_getFileSize(fileHandle, &nFileSize) != osl_File_E_None)
+    {
+        osl_closeFile(fileHandle);
+        return xEmbed;
+    }
+
+    // The size is handed to PDFium as an int below (presumably a 32-bit int parameter,
+    // TODO confirm against PDFiumLibrary.hxx); refuse files whose size would be truncated.
+    if (nFileSize > static_cast<sal_uInt64>(SAL_MAX_INT32))
+    {
+        osl_closeFile(fileHandle);
+        return xEmbed;
+    }
+
+    void* pMemRawPdf = nullptr;
+    if (osl_mapFile(fileHandle, &pMemRawPdf, nFileSize, 0, osl_File_MapFlag_RandomAccess)
+        != osl_File_E_None)
+    {
+        osl_closeFile(fileHandle);
+        return xEmbed;
+    }
+
+    {
+        // Keep the PDFium document in its own scope so that it is destroyed *before* the
+        // mapping is released: the document is loaded from this memory buffer and must
+        // not outlive it.
+        auto pPdfiumDoc = pPdfium->openDocument(pMemRawPdf, static_cast<int>(nFileSize),
+                                                OString(/*TODO Pass*/));
+        // TODO: look up the embedded file in pPdfiumDoc and fill in xEmbed / the MIME type
+    }
+
+    osl_unmapMappedFile(fileHandle, pMemRawPdf, nFileSize);
+    osl_closeFile(fileHandle);
+
+    return xEmbed;
+}
 // XExtendedFilterDetection
 OUString SAL_CALL PDFDetector::detect( uno::Sequence< beans::PropertyValue >& 
rFilterData )
 {
@@ -338,7 +388,16 @@ OUString SAL_CALL PDFDetector::detect( uno::Sequence< 
beans::PropertyValue >& rF
     }
 
     OUString aEmbedMimetype;
-    xEmbedStream = getAdditionalStream(aURL, aEmbedMimetype, aPassword, 
m_xContext, rFilterData, false);
+
+    // Try testing for the newer embedded file format
+    xEmbedStream = getEmbeddedFile(aURL, aEmbedMimetype, aPassword, 
m_xContext, rFilterData, false);
+
+    if (aEmbedMimetype.isEmpty())
+    {
+        // No success with the embedded file, try the older trailer-based AdditionalStream
+        xEmbedStream =
+            getAdditionalStream(aURL, aEmbedMimetype, aPassword, m_xContext, 
rFilterData, false);
+    }
 
     if (aFileHandle)
         osl_removeFile(aURL.pData);
diff --git a/sdext/source/pdfimport/filterdet.hxx 
b/sdext/source/pdfimport/filterdet.hxx
index 48bc8ca4e12a..d5e14b2b7055 100644
--- a/sdext/source/pdfimport/filterdet.hxx
+++ b/sdext/source/pdfimport/filterdet.hxx
@@ -56,7 +56,7 @@ public:
 };
 
 
-/** Retrieve embedded substream from PDF file
+/** Retrieve embedded substream from PDF file using the old trailer system
 
     Useful e.g. for hybrid PDF
 
@@ -91,6 +91,41 @@ css::uno::Reference< css::io::XStream >
                         const css::uno::Sequence< css::beans::PropertyValue >& 
   rFilterData,
                         bool                                                   
                         bMayUseUI );
 
+/** Retrieve embedded substream from PDF file using the standard EmbeddedFile
+
+    Useful e.g. for hybrid PDF
+
+    @param rPDFFile
+    URI of the pdf file
+
+    @param o_rOutMimetype
+    Output parameter. Receives the mime type of the
+    substream. Used to distinguish between
+    draw/impress/writer/calc during import
+
+    @param io_rOutPwd
+    In/Out parameter. If given password is wrong, user is queried
+    for another (if bMayUseUI is true)
+
+    @param xContext
+    Component context
+
+    @param rFilterData
+    Basically used to pass on XFilter::filter properties (function
+    uses it to retrieve interaction handler)
+
+    @param bMayUseUI
+    When false, no dialog is opened to query user for alternate
+    password
+ */
+css::uno::Reference< css::io::XStream >
+   getEmbeddedFile( const OUString&                                            
                rPDFFile,
+                    OUString&                                                  
                o_rOutMimetype,
+                    OUString&                                                  
                io_rOutPwd,
+                    const css::uno::Reference< css::uno::XComponentContext >& 
xContext,
+                    const css::uno::Sequence< css::beans::PropertyValue >&    
rFilterData,
+                    bool                                                       
                     bMayUseUI );
+
 
 bool checkDocChecksum( const OUString& rInPDFFileURL,
                        sal_uInt32           nBytes,
diff --git a/sdext/source/pdfimport/pdfiadaptor.cxx 
b/sdext/source/pdfimport/pdfiadaptor.cxx
index 136f6d586d61..45250a911884 100644
--- a/sdext/source/pdfimport/pdfiadaptor.cxx
+++ b/sdext/source/pdfimport/pdfiadaptor.cxx
@@ -116,7 +116,13 @@ sal_Bool SAL_CALL PDFIHybridAdaptor::filter( const 
uno::Sequence< beans::Propert
                     {
                         OUString aEmbedMimetype;
                         OUString aOrgPwd( aPwd );
-                        xSubStream = getAdditionalStream( aURL, 
aEmbedMimetype, aPwd, m_xContext, rFilterData, true );
+
+                        xSubStream = getEmbeddedFile(aURL, aEmbedMimetype, 
aPwd, m_xContext,
+                                                     rFilterData, true);
+                        if (aEmbedMimetype.isEmpty()) {
+                            xSubStream = getAdditionalStream(aURL, 
aEmbedMimetype, aPwd, m_xContext,
+                                                             rFilterData, 
true);
+                        }
                         if( aOrgPwd != aPwd )
                             bAddPwdProp = true;
                     }

Reply via email to