sc/source/filter/orcus/filterdetect.cxx | 44 ++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 11 deletions(-)
New commits: commit 272e97c3eb9351af963d9d0ee25de9237918984a Author: Kohei Yoshida <kohei.yosh...@collabora.com> AuthorDate: Wed Sep 17 21:22:24 2025 -0400 Commit: Kohei Yoshida <kohei.yosh...@collabora.com> CommitDate: Thu Sep 18 13:49:54 2025 +0200 Only test the incoming document against the format being tested The old code would run the orcus detection code against multiple possible types each time it was called regardless of what the tested format type was, which was not necessarily the most efficient. With this change, we check the incoming document only against the format type being tested. In addition, if the detected type is either JSON or XML, check if it contains at least one linkable range - a repeat structure that can be mapped to a range of cells. If not, then don't try to import it as JSON or XML. The document will then most likely be imported as CSV. Change-Id: I2cd56d60c0b0274a0bfaa9317b1a7e85961f0cd1 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/191108 Tested-by: Jenkins Reviewed-by: Kohei Yoshida <ko...@libreoffice.org> diff --git a/sc/source/filter/orcus/filterdetect.cxx b/sc/source/filter/orcus/filterdetect.cxx index 285d5a01d8cd..ba94beda7782 100644 --- a/sc/source/filter/orcus/filterdetect.cxx +++ b/sc/source/filter/orcus/filterdetect.cxx @@ -18,6 +18,8 @@ #include <orcus_utils.hxx> #include <orcus/format_detection.hpp> +#include <orcus/orcus_json.hpp> +#include <orcus/orcus_xml.hpp> namespace com::sun::star::uno { @@ -69,29 +71,49 @@ OUString OrcusFormatDetect::detect(css::uno::Sequence<css::beans::PropertyValue> if (bAborted) return OUString(); + OUString aType; // type to test against + aMediaDescriptor[utl::MediaDescriptor::PROP_TYPENAME] >>= aType; + if (aType.isEmpty()) + return OUString(); + + static const std::unordered_map<OUString, orcus::format_t> aMap = { + { "Gnumeric XML", orcus::format_t::gnumeric }, + { "calc_MS_Excel_2003_XML", orcus::format_t::xls_xml }, + { "Apache Parquet", orcus::format_t::parquet }, + { 
"generic_XML", orcus::format_t::xml }, + { "generic_JSON", orcus::format_t::json }, + }; + + orcus::format_t eFormat{}; + + if (auto it = aMap.find(aType); it != aMap.end()) + eFormat = it->second; + else + return OUString(); + css::uno::Reference<css::io::XInputStream> xInputStream( aMediaDescriptor[utl::MediaDescriptor::PROP_INPUTSTREAM], css::uno::UNO_QUERY); CopiedTempStream aTemp(xInputStream); auto aContent = toFileContent(aTemp.getFileName()); - orcus::format_t eFormat = orcus::detect(aContent.str()); + bool bValid = orcus::detect(aContent.str(), eFormat); + if (!bValid) + return OUString(); switch (eFormat) { - case orcus::format_t::gnumeric: - return u"Gnumeric XML"_ustr; - case orcus::format_t::xls_xml: - return u"calc_MS_Excel_2003_XML"_ustr; - case orcus::format_t::parquet: - return u"Apache Parquet"_ustr; - case orcus::format_t::xml: - return u"generic_XML"_ustr; case orcus::format_t::json: - return u"generic_JSON"_ustr; + // make sure this JSON doc has at least one linkable range + bValid = orcus::orcus_json::has_range(aContent.str()); + break; + case orcus::format_t::xml: + // make sure this XML doc has at least one linkable range + bValid = orcus::orcus_xml::has_range(aContent.str()); + break; default:; } - return OUString(); + return bValid ? aType : OUString(); } }