sc/source/filter/orcus/filterdetect.cxx |   44 ++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 11 deletions(-)

New commits:
commit 272e97c3eb9351af963d9d0ee25de9237918984a
Author:     Kohei Yoshida <kohei.yosh...@collabora.com>
AuthorDate: Wed Sep 17 21:22:24 2025 -0400
Commit:     Kohei Yoshida <kohei.yosh...@collabora.com>
CommitDate: Thu Sep 18 13:49:54 2025 +0200

    Only test the incoming document against the format being tested
    
    The old code would run the orcus detection code against multiple
    possible types each time it was called regardless of what the tested
    format type was, which was not necessarily the most efficient.
    
    With this change, we check the incoming document only against the
    format type being tested.
    
    In addition, if the detected type is either JSON or XML, check if
    it contains at least one linkable range - a repeat structure that
    can be mapped to a range of cells.  If not, then don't try to import
    it as JSON or XML. The document will then most likely be imported
    as CSV.
    
    Change-Id: I2cd56d60c0b0274a0bfaa9317b1a7e85961f0cd1
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/191108
    Tested-by: Jenkins
    Reviewed-by: Kohei Yoshida <ko...@libreoffice.org>

diff --git a/sc/source/filter/orcus/filterdetect.cxx 
b/sc/source/filter/orcus/filterdetect.cxx
index 285d5a01d8cd..ba94beda7782 100644
--- a/sc/source/filter/orcus/filterdetect.cxx
+++ b/sc/source/filter/orcus/filterdetect.cxx
@@ -18,6 +18,8 @@
 #include <orcus_utils.hxx>
 
 #include <orcus/format_detection.hpp>
+#include <orcus/orcus_json.hpp>
+#include <orcus/orcus_xml.hpp>
 
 namespace com::sun::star::uno
 {
@@ -69,29 +71,49 @@ OUString 
OrcusFormatDetect::detect(css::uno::Sequence<css::beans::PropertyValue>
     if (bAborted)
         return OUString();
 
+    OUString aType; // type to test against
+    aMediaDescriptor[utl::MediaDescriptor::PROP_TYPENAME] >>= aType;
+    if (aType.isEmpty())
+        return OUString();
+
+    static const std::unordered_map<OUString, orcus::format_t> aMap = {
+        { "Gnumeric XML", orcus::format_t::gnumeric },
+        { "calc_MS_Excel_2003_XML", orcus::format_t::xls_xml },
+        { "Apache Parquet", orcus::format_t::parquet },
+        { "generic_XML", orcus::format_t::xml },
+        { "generic_JSON", orcus::format_t::json },
+    };
+
+    orcus::format_t eFormat{};
+
+    if (auto it = aMap.find(aType); it != aMap.end())
+        eFormat = it->second;
+    else
+        return OUString();
+
     css::uno::Reference<css::io::XInputStream> xInputStream(
         aMediaDescriptor[utl::MediaDescriptor::PROP_INPUTSTREAM], 
css::uno::UNO_QUERY);
 
     CopiedTempStream aTemp(xInputStream);
     auto aContent = toFileContent(aTemp.getFileName());
-    orcus::format_t eFormat = orcus::detect(aContent.str());
+    bool bValid = orcus::detect(aContent.str(), eFormat);
+    if (!bValid)
+        return OUString();
 
     switch (eFormat)
     {
-        case orcus::format_t::gnumeric:
-            return u"Gnumeric XML"_ustr;
-        case orcus::format_t::xls_xml:
-            return u"calc_MS_Excel_2003_XML"_ustr;
-        case orcus::format_t::parquet:
-            return u"Apache Parquet"_ustr;
-        case orcus::format_t::xml:
-            return u"generic_XML"_ustr;
         case orcus::format_t::json:
-            return u"generic_JSON"_ustr;
+            // make sure this JSON doc has at least one linkable range
+            bValid = orcus::orcus_json::has_range(aContent.str());
+            break;
+        case orcus::format_t::xml:
+            // make sure this XML doc has at least one linkable range
+            bValid = orcus::orcus_xml::has_range(aContent.str());
+            break;
         default:;
     }
 
-    return OUString();
+    return bValid ? aType : OUString();
 }
 }
 

Reply via email to