(doris) 01/12: [Performance](Variant) Improve load performance for variant type (#33890)

yiguolei Sat, 18 May 2024 03:10:32 -0700

This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


commit 691f3c5ee7a36a4ae374dbef731f25ce5c2bf20d
Author: lihangyu <15605149...@163.com>
AuthorDate: Sat May 11 11:31:24 2024 +0800

    [Performance](Variant) Improve load performance for variant type (#33890)
    
    1. remove phmap for padding rows
    2. add SimpleFieldVisitorToScalarType to short-circuit type deduction for scalar fields
    3. correct type coercion for conflicting types between integers
    4. improve nullable column performance
    5. remove shared_ptr dependency for DataType; use TypeIndex instead
    6. optimize subcolumn lookup by caching the order of fields (which is
    almost always the same) and doing a quick check against the next
    expected field, instead of searching the hash table.
    
    benchmark:
    In clickbench data, load performance:
    12m36.799s -> 7m10.934s, about a 43% latency reduction
    
    In variant_p2/performance.groovy:
    3min44s20 -> 1min15s80, about a 66% latency reduction
---
 be/src/vec/columns/column_object.cpp               | 189 ++++++++++++++++-----
 be/src/vec/columns/column_object.h                 |  28 ++-
 be/src/vec/common/schema_util.cpp                  |  57 ++++---
 be/src/vec/common/schema_util.h                    |   4 +-
 be/src/vec/core/field.h                            |   5 +
 be/src/vec/json/parse2column.cpp                   |  22 +--
 .../suites/variant_p2/performance.groovy           |  36 ++++
 7 files changed, 248 insertions(+), 93 deletions(-)

diff --git a/be/src/vec/columns/column_object.cpp 
b/be/src/vec/columns/column_object.cpp
index ddb5bee6e01..3bae978f4d3 100644
--- a/be/src/vec/columns/column_object.cpp
+++ b/be/src/vec/columns/column_object.cpp
@@ -45,6 +45,7 @@
 #include "util/defer_op.h"
 #include "util/simd/bits.h"
 #include "vec/aggregate_functions/aggregate_function.h"
+#include "vec/aggregate_functions/helpers.h"
 #include "vec/columns/column.h"
 #include "vec/columns/column_array.h"
 #include "vec/columns/column_nullable.h"
@@ -56,6 +57,7 @@
 #include "vec/common/field_visitors.h"
 #include "vec/common/schema_util.h"
 #include "vec/common/string_buffer.hpp"
+#include "vec/common/string_ref.h"
 #include "vec/core/column_with_type_and_name.h"
 #include "vec/core/field.h"
 #include "vec/core/types.h"
@@ -68,6 +70,7 @@
 #include "vec/data_types/data_type_nothing.h"
 #include "vec/data_types/data_type_nullable.h"
 #include "vec/data_types/get_least_supertype.h"
+#include "vec/json/path_in_data.h"
 
 #ifdef __AVX2__
 #include "util/jsonb_parser_simd.h"
@@ -78,23 +81,22 @@
 namespace doris::vectorized {
 namespace {
 
-DataTypePtr create_array_of_type(DataTypePtr type, size_t num_dimensions, bool 
is_nullable) {
-    const DataTypeNullable* nullable = typeid_cast<const 
DataTypeNullable*>(type.get());
-    if ((nullable &&
-         typeid_cast<const 
ColumnObject::MostCommonType*>(nullable->get_nested_type().get())) ||
-        typeid_cast<const ColumnObject::MostCommonType*>(type.get())) {
+DataTypePtr create_array_of_type(TypeIndex type, size_t num_dimensions, bool 
is_nullable) {
+    if (type == ColumnObject::MOST_COMMON_TYPE_ID) {
         // JSONB type MUST NOT wrapped in ARRAY column, it should be top level.
         // So we ignored num_dimensions.
-        return type;
+        return is_nullable ? 
make_nullable(std::make_shared<ColumnObject::MostCommonType>())
+                           : std::make_shared<ColumnObject::MostCommonType>();
     }
+    DataTypePtr result = DataTypeFactory::instance().create_data_type(type, 
is_nullable);
     for (size_t i = 0; i < num_dimensions; ++i) {
-        type = std::make_shared<DataTypeArray>(std::move(type));
+        result = std::make_shared<DataTypeArray>(result);
         if (is_nullable) {
             // wrap array with nullable
-            type = make_nullable(type);
+            result = make_nullable(result);
         }
     }
-    return type;
+    return result;
 }
 
 DataTypePtr get_base_type_of_array(const DataTypePtr& type) {
@@ -149,6 +151,63 @@ public:
     }
 };
 
+// Visitor that allows to get type of scalar field
+// but exclude fields contain complex field.This is a faster version
+// for FieldVisitorToScalarType which does not support complex field.
+class SimpleFieldVisitorToScalarType : public StaticVisitor<size_t> {
+public:
+    size_t operator()(const Array& x) {
+        throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not 
supported");
+    }
+    size_t operator()(const UInt64& x) {
+        if (x <= std::numeric_limits<Int8>::max()) {
+            type = TypeIndex::Int8;
+        } else if (x <= std::numeric_limits<Int16>::max()) {
+            type = TypeIndex::Int16;
+        } else if (x <= std::numeric_limits<Int32>::max()) {
+            type = TypeIndex::Int32;
+        } else {
+            type = TypeIndex::Int64;
+        }
+        return 1;
+    }
+    size_t operator()(const Int64& x) {
+        if (x <= std::numeric_limits<Int8>::max() && x >= 
std::numeric_limits<Int8>::min()) {
+            type = TypeIndex::Int8;
+        } else if (x <= std::numeric_limits<Int16>::max() &&
+                   x >= std::numeric_limits<Int16>::min()) {
+            type = TypeIndex::Int16;
+        } else if (x <= std::numeric_limits<Int32>::max() &&
+                   x >= std::numeric_limits<Int32>::min()) {
+            type = TypeIndex::Int32;
+        } else {
+            type = TypeIndex::Int64;
+        }
+        return 1;
+    }
+    size_t operator()(const JsonbField& x) {
+        type = TypeIndex::JSONB;
+        return 1;
+    }
+    size_t operator()(const Null&) {
+        have_nulls = true;
+        return 1;
+    }
+    template <typename T>
+    size_t operator()(const T&) {
+        type = TypeId<NearestFieldType<T>>::value;
+        return 1;
+    }
+    void get_scalar_type(TypeIndex* data_type) const { *data_type = type; }
+    bool contain_nulls() const { return have_nulls; }
+
+    bool need_convert_field() const { return false; }
+
+private:
+    TypeIndex type = TypeIndex::Nothing;
+    bool have_nulls;
+};
+
 /// Visitor that allows to get type of scalar field
 /// or least common type of scalars in array.
 /// More optimized version of FieldToDataType.
@@ -208,8 +267,10 @@ public:
         type_indexes.insert(TypeId<NearestFieldType<T>>::value);
         return 0;
     }
-    void get_scalar_type(DataTypePtr* type) const {
-        get_least_supertype<LeastSupertypeOnError::Jsonb>(type_indexes, type);
+    void get_scalar_type(TypeIndex* type) const {
+        DataTypePtr data_type;
+        get_least_supertype<LeastSupertypeOnError::Jsonb>(type_indexes, 
&data_type);
+        *type = data_type->get_type_id();
     }
     bool contain_nulls() const { return have_nulls; }
     bool need_convert_field() const { return field_types.size() > 1; }
@@ -221,20 +282,30 @@ private:
 };
 
 } // namespace
-void get_field_info(const Field& field, FieldInfo* info) {
-    FieldVisitorToScalarType to_scalar_type_visitor;
+
+template <typename Visitor>
+void get_field_info_impl(const Field& field, FieldInfo* info) {
+    Visitor to_scalar_type_visitor;
     apply_visitor(to_scalar_type_visitor, field);
-    DataTypePtr type = nullptr;
-    to_scalar_type_visitor.get_scalar_type(&type);
+    TypeIndex type_id;
+    to_scalar_type_visitor.get_scalar_type(&type_id);
     // array item's dimension may missmatch, eg. [1, 2, [1, 2, 3]]
     *info = {
-            type,
+            type_id,
             to_scalar_type_visitor.contain_nulls(),
             to_scalar_type_visitor.need_convert_field(),
             apply_visitor(FieldVisitorToNumberOfDimensions(), field),
     };
 }
 
+void get_field_info(const Field& field, FieldInfo* info) {
+    if (field.is_complex_field()) {
+        get_field_info_impl<FieldVisitorToScalarType>(field, info);
+    } else {
+        get_field_info_impl<SimpleFieldVisitorToScalarType>(field, info);
+    }
+}
+
 ColumnObject::Subcolumn::Subcolumn(MutableColumnPtr&& data_, DataTypePtr type, 
bool is_nullable_,
                                    bool is_root_)
         : least_common_type(type), is_nullable(is_nullable_), 
is_root(is_root_) {
@@ -285,8 +356,8 @@ void 
ColumnObject::Subcolumn::add_new_column_part(DataTypePtr type) {
 }
 
 void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) {
-    auto base_type = std::move(info.scalar_type);
-    if (is_nothing(base_type)) {
+    auto base_type = WhichDataType(info.scalar_type_id);
+    if (base_type.is_nothing()) {
         insertDefault();
         return;
     }
@@ -295,7 +366,7 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo 
info) {
     if (is_nothing(least_common_type.get_base())) {
         column_dim = value_dim;
     }
-    if (is_nothing(base_type)) {
+    if (base_type.is_nothing()) {
         value_dim = column_dim;
     }
     bool type_changed = false;
@@ -305,29 +376,30 @@ void ColumnObject::Subcolumn::insert(Field field, 
FieldInfo info) {
                 "Dimension of types mismatched between inserted value and 
column, "
                 "expected:{}, but meet:{} for type:{}",
                 column_dim, value_dim, least_common_type.get()->get_name());
-        base_type = std::make_shared<MostCommonType>();
+        base_type = MOST_COMMON_TYPE_ID;
         value_dim = 0;
         type_changed = true;
     }
-    if (is_nullable && !is_nothing(base_type)) {
-        base_type = make_nullable(base_type);
-    }
-
-    const auto& least_common_base_type = least_common_type.get_base();
     if (data.empty()) {
-        add_new_column_part(create_array_of_type(std::move(base_type), 
value_dim, is_nullable));
-    } else if (!least_common_base_type->equals(*base_type) && 
!is_nothing(base_type)) {
-        if (!schema_util::is_conversion_required_between_integers(*base_type,
-                                                                  
*least_common_base_type)) {
+        add_new_column_part(create_array_of_type(base_type.idx, value_dim, 
is_nullable));
+    } else if (least_common_type.get_type_id() != base_type.idx && 
!base_type.is_nothing()) {
+        if (schema_util::is_conversion_required_between_integers(base_type.idx,
+                                                                 
least_common_type.get_type_id())) {
+            LOG_EVERY_N(INFO, 100) << "Conversion between " << 
getTypeName(base_type.idx) << " and "
+                                   << 
getTypeName(least_common_type.get_type_id());
+            DataTypePtr base_data_type;
+            TypeIndex base_data_type_id;
             get_least_supertype<LeastSupertypeOnError::Jsonb>(
-                    DataTypes {std::move(base_type), least_common_base_type}, 
&base_type);
+                    TypeIndexSet {base_type.idx, 
least_common_type.get_base_type_id()},
+                    &base_data_type);
             type_changed = true;
+            base_data_type_id = base_data_type->get_type_id();
             if (is_nullable) {
-                base_type = make_nullable(base_type);
+                base_data_type = make_nullable(base_data_type);
             }
-            if (!least_common_base_type->equals(*base_type)) {
+            if (!least_common_type.get_base()->equals(*base_data_type)) {
                 add_new_column_part(
-                        create_array_of_type(std::move(base_type), value_dim, 
is_nullable));
+                        create_array_of_type(base_data_type_id, value_dim, 
is_nullable));
             }
         }
     }
@@ -578,6 +650,14 @@ 
ColumnObject::Subcolumn::LeastCommonType::LeastCommonType(DataTypePtr type_)
     if (!WhichDataType(type).is_nothing()) {
         least_common_type_serder = type->get_serde();
     }
+    type_id = type->is_nullable() ? assert_cast<const 
DataTypeNullable*>(type.get())
+                                            ->get_nested_type()
+                                            ->get_type_id()
+                                  : type->get_type_id();
+    base_type_id = base_type->is_nullable() ? assert_cast<const 
DataTypeNullable*>(base_type.get())
+                                                      ->get_nested_type()
+                                                      ->get_type_id()
+                                            : base_type->get_type_id();
 }
 
 ColumnObject::ColumnObject(bool is_nullable_, bool create_root_)
@@ -677,14 +757,12 @@ void ColumnObject::try_insert(const Field& field) {
         return;
     }
     const auto& object = field.get<const VariantMap&>();
-    phmap::flat_hash_set<std::string> inserted;
     size_t old_size = size();
     for (const auto& [key_str, value] : object) {
         PathInData key;
         if (!key_str.empty()) {
             key = PathInData(key_str);
         }
-        inserted.insert(key_str);
         if (!has_subcolumn(key)) {
             bool succ = add_sub_column(key, old_size);
             if (!succ) {
@@ -700,7 +778,7 @@ void ColumnObject::try_insert(const Field& field) {
         subcolumn->insert(value);
     }
     for (auto& entry : subcolumns) {
-        if (!inserted.contains(entry->path.get_path())) {
+        if (old_size == entry->data.size()) {
             entry->data.insertDefault();
         }
     }
@@ -749,16 +827,6 @@ Status ColumnObject::try_insert_indices_from(const 
IColumn& src, const int* indi
     return Status::OK();
 }
 
-FieldInfo ColumnObject::Subcolumn::get_subcolumn_field_info() const {
-    const auto& base_type = least_common_type.get_base();
-    return FieldInfo {
-            .scalar_type = base_type,
-            .have_nulls = base_type->is_nullable(),
-            .need_convert = false,
-            .num_dimensions = least_common_type.get_dimensions(),
-    };
-}
-
 void ColumnObject::insert_range_from(const IColumn& src, size_t start, size_t 
length) {
 #ifndef NDEBUG
     check_consistency();
@@ -809,6 +877,33 @@ const ColumnObject::Subcolumn* 
ColumnObject::get_subcolumn(const PathInData& key
     return &node->data;
 }
 
+const ColumnObject::Subcolumn* ColumnObject::get_subcolumn_with_cache(const 
PathInData& key,
+                                                                      size_t 
key_index) const {
+    // Optimization by caching the order of fields (which is almost always the 
same)
+    // and a quick check to match the next expected field, instead of 
searching the hash table.
+    if (_prev_positions.size() > key_index && 
_prev_positions[key_index].second != nullptr &&
+        key == _prev_positions[key_index].first) {
+        return _prev_positions[key_index].second;
+    }
+    const auto* subcolumn = get_subcolumn(key);
+    if (key_index >= _prev_positions.size()) {
+        _prev_positions.resize(key_index + 1);
+    }
+    if (subcolumn != nullptr) {
+        _prev_positions[key_index] = std::make_pair(key, subcolumn);
+    }
+    return subcolumn;
+}
+
+ColumnObject::Subcolumn* ColumnObject::get_subcolumn(const PathInData& key, 
size_t key_index) {
+    return const_cast<ColumnObject::Subcolumn*>(get_subcolumn_with_cache(key, 
key_index));
+}
+
+const ColumnObject::Subcolumn* ColumnObject::get_subcolumn(const PathInData& 
key,
+                                                           size_t key_index) 
const {
+    return get_subcolumn_with_cache(key, key_index);
+}
+
 ColumnObject::Subcolumn* ColumnObject::get_subcolumn(const PathInData& key) {
     const auto* node = subcolumns.find_leaf(key);
     if (node == nullptr) {
@@ -1238,6 +1333,7 @@ void ColumnObject::finalize(bool ignore_sparse) {
     }
     std::swap(subcolumns, new_subcolumns);
     doc_structure = nullptr;
+    _prev_positions.clear();
 }
 
 void ColumnObject::finalize() {
@@ -1356,6 +1452,7 @@ void ColumnObject::clear() {
     Subcolumns empty;
     std::swap(empty, subcolumns);
     num_rows = 0;
+    _prev_positions.clear();
 }
 
 void ColumnObject::revise_to(int target_num_rows) {
diff --git a/be/src/vec/columns/column_object.h 
b/be/src/vec/columns/column_object.h
index 8573428ff2b..55abd534dd1 100644
--- a/be/src/vec/columns/column_object.h
+++ b/be/src/vec/columns/column_object.h
@@ -35,6 +35,7 @@
 
 #include "common/status.h"
 #include "olap/tablet_schema.h"
+#include "util/jsonb_document.h"
 #include "vec/columns/column.h"
 #include "vec/columns/subcolumn_tree.h"
 #include "vec/common/cow.h"
@@ -62,8 +63,8 @@ namespace doris::vectorized {
 /// It allows to recreate field with different number
 /// of dimensions or nullability.
 struct FieldInfo {
-    /// The common type of of all scalars in field.
-    DataTypePtr scalar_type;
+    /// The common type id of of all scalars in field.
+    TypeIndex scalar_type_id;
     /// Do we have NULL scalar in field.
     bool have_nulls;
     /// If true then we have scalars with different types in array and
@@ -72,6 +73,7 @@ struct FieldInfo {
     /// Number of dimension in array. 0 if field is scalar.
     size_t num_dimensions;
 };
+
 void get_field_info(const Field& field, FieldInfo* info);
 /** A column that represents object with dynamic set of subcolumns.
  *  Subcolumns are identified by paths in document and are stored in
@@ -91,6 +93,7 @@ public:
 
     // Using jsonb type as most common type, since it's adopted all types of 
json
     using MostCommonType = DataTypeJsonb;
+    constexpr static TypeIndex MOST_COMMON_TYPE_ID = TypeIndex::JSONB;
     class Subcolumn {
     public:
         Subcolumn() = default;
@@ -147,8 +150,6 @@ public:
         /// Returns last inserted field.
         Field get_last_field() const;
 
-        FieldInfo get_subcolumn_field_info() const;
-
         /// Returns single column if subcolumn in finalizes.
         /// Otherwise -- undefined behaviour.
         IColumn& get_finalized_column();
@@ -176,6 +177,10 @@ public:
 
             const DataTypePtr& get_base() const { return base_type; }
 
+            const TypeIndex& get_type_id() const { return type_id; }
+
+            const TypeIndex& get_base_type_id() const { return base_type_id; }
+
             size_t get_dimensions() const { return num_dimensions; }
 
             void remove_nullable() { type = 
doris::vectorized::remove_nullable(type); }
@@ -185,6 +190,8 @@ public:
         private:
             DataTypePtr type;
             DataTypePtr base_type;
+            TypeIndex type_id;
+            TypeIndex base_type_id;
             size_t num_dimensions = 0;
             DataTypeSerDeSPtr least_common_type_serder;
         };
@@ -227,6 +234,10 @@ private:
     // used for quickly row store encoding
     ColumnPtr rowstore_column;
 
+    using SubColumnWithName = std::pair<PathInData, const Subcolumn*>;
+    // Cached search results for previous row (keyed as index in JSON object) 
- used as a hint.
+    mutable std::vector<SubColumnWithName> _prev_positions;
+
 public:
     static constexpr auto COLUMN_NAME_DUMMY = "_dummy";
 
@@ -289,6 +300,9 @@ public:
     // return null if not found
     const Subcolumn* get_subcolumn(const PathInData& key) const;
 
+    // return null if not found
+    const Subcolumn* get_subcolumn(const PathInData& key, size_t index_hint) 
const;
+
     /** More efficient methods of manipulation */
     [[noreturn]] IColumn& get_data() {
         LOG(FATAL) << "Not implemented method get_data()";
@@ -302,6 +316,12 @@ public:
     // return null if not found
     Subcolumn* get_subcolumn(const PathInData& key);
 
+    // return null if not found
+    Subcolumn* get_subcolumn(const PathInData& key, size_t index_hint);
+
+    // return null if not found
+    const Subcolumn* get_subcolumn_with_cache(const PathInData& key, size_t 
index_hint) const;
+
     void incr_num_rows() { ++num_rows; }
 
     void incr_num_rows(size_t n) { num_rows += n; }
diff --git a/be/src/vec/common/schema_util.cpp 
b/be/src/vec/common/schema_util.cpp
index 5c7a2f8482a..2f9e5ded212 100644
--- a/be/src/vec/common/schema_util.cpp
+++ b/be/src/vec/common/schema_util.cpp
@@ -109,36 +109,41 @@ Array create_empty_array_field(size_t num_dimensions) {
     return array;
 }
 
-bool is_conversion_required_between_integers(const IDataType& lhs, const 
IDataType& rhs) {
+size_t get_size_of_interger(TypeIndex type) {
+    switch (type) {
+    case TypeIndex::Int8:
+        return sizeof(int8_t);
+    case TypeIndex::Int16:
+        return sizeof(int16_t);
+    case TypeIndex::Int32:
+        return sizeof(int32_t);
+    case TypeIndex::Int64:
+        return sizeof(int64_t);
+    case TypeIndex::Int128:
+        return sizeof(int128_t);
+    case TypeIndex::UInt8:
+        return sizeof(uint8_t);
+    case TypeIndex::UInt16:
+        return sizeof(uint16_t);
+    case TypeIndex::UInt32:
+        return sizeof(uint32_t);
+    case TypeIndex::UInt64:
+        return sizeof(uint64_t);
+    case TypeIndex::UInt128:
+        return sizeof(uint128_t);
+    default:
+        LOG(FATAL) << "Unknown integer type: " << getTypeName(type);
+        return 0;
+    }
+}
+
+bool is_conversion_required_between_integers(const TypeIndex& lhs, const 
TypeIndex& rhs) {
     WhichDataType which_lhs(lhs);
     WhichDataType which_rhs(rhs);
     bool is_native_int = which_lhs.is_native_int() && 
which_rhs.is_native_int();
     bool is_native_uint = which_lhs.is_native_uint() && 
which_rhs.is_native_uint();
-    return (is_native_int || is_native_uint) &&
-           lhs.get_size_of_value_in_memory() <= 
rhs.get_size_of_value_in_memory();
-}
-
-bool is_conversion_required_between_integers(FieldType lhs, FieldType rhs) {
-    // We only support signed integers for semi-structure data at present
-    // TODO add unsigned integers
-    if (lhs == FieldType::OLAP_FIELD_TYPE_BIGINT) {
-        return !(rhs == FieldType::OLAP_FIELD_TYPE_TINYINT ||
-                 rhs == FieldType::OLAP_FIELD_TYPE_SMALLINT ||
-                 rhs == FieldType::OLAP_FIELD_TYPE_INT || rhs == 
FieldType::OLAP_FIELD_TYPE_BIGINT);
-    }
-    if (lhs == FieldType::OLAP_FIELD_TYPE_INT) {
-        return !(rhs == FieldType::OLAP_FIELD_TYPE_TINYINT ||
-                 rhs == FieldType::OLAP_FIELD_TYPE_SMALLINT ||
-                 rhs == FieldType::OLAP_FIELD_TYPE_INT);
-    }
-    if (lhs == FieldType::OLAP_FIELD_TYPE_SMALLINT) {
-        return !(rhs == FieldType::OLAP_FIELD_TYPE_TINYINT ||
-                 rhs == FieldType::OLAP_FIELD_TYPE_SMALLINT);
-    }
-    if (lhs == FieldType::OLAP_FIELD_TYPE_TINYINT) {
-        return !(rhs == FieldType::OLAP_FIELD_TYPE_TINYINT);
-    }
-    return true;
+    return (!is_native_int && !is_native_uint) ||
+           get_size_of_interger(lhs) > get_size_of_interger(rhs);
 }
 
 Status cast_column(const ColumnWithTypeAndName& arg, const DataTypePtr& type, 
ColumnPtr* result) {
diff --git a/be/src/vec/common/schema_util.h b/be/src/vec/common/schema_util.h
index e6ed60480f5..078081593c5 100644
--- a/be/src/vec/common/schema_util.h
+++ b/be/src/vec/common/schema_util.h
@@ -34,6 +34,7 @@
 #include "vec/columns/column_object.h"
 #include "vec/core/columns_with_type_and_name.h"
 #include "vec/core/field.h"
+#include "vec/core/types.h"
 #include "vec/data_types/data_type.h"
 #include "vec/json/path_in_data.h"
 
@@ -66,8 +67,7 @@ Status cast_column(const ColumnWithTypeAndName& arg, const 
DataTypePtr& type, Co
 /// If both of types are signed/unsigned integers and size of left field type
 /// is less than right type, we don't need to convert field,
 /// because all integer fields are stored in Int64/UInt64.
-bool is_conversion_required_between_integers(const IDataType& lhs, const 
IDataType& rhs);
-bool is_conversion_required_between_integers(FieldType lhs, FieldType rhs);
+bool is_conversion_required_between_integers(const TypeIndex& lhs, const 
TypeIndex& rhs);
 
 struct ExtraInfo {
     // -1 indicates it's not a Frontend generated column
diff --git a/be/src/vec/core/field.h b/be/src/vec/core/field.h
index 356216e7074..de2d544e7e0 100644
--- a/be/src/vec/core/field.h
+++ b/be/src/vec/core/field.h
@@ -493,6 +493,11 @@ public:
         return *this;
     }
 
+    bool is_complex_field() const {
+        return which == Types::Array || which == Types::Map || which == 
Types::Tuple ||
+               which == Types::VariantMap;
+    }
+
     Field& operator=(Field&& rhs) {
         if (this != &rhs) {
             if (which != rhs.which) {
diff --git a/be/src/vec/json/parse2column.cpp b/be/src/vec/json/parse2column.cpp
index cc3c649bb70..0f61e24dad7 100644
--- a/be/src/vec/json/parse2column.cpp
+++ b/be/src/vec/json/parse2column.cpp
@@ -148,36 +148,28 @@ void parse_json_to_variant(IColumn& column, const char* 
src, size_t length,
     }
     auto& [paths, values] = *result;
     assert(paths.size() == values.size());
-    phmap::flat_hash_set<std::string> paths_set;
-    size_t num_rows = column_object.size();
+    size_t old_num_rows = column_object.size();
     for (size_t i = 0; i < paths.size(); ++i) {
         FieldInfo field_info;
         get_field_info(values[i], &field_info);
-        if (is_nothing(field_info.scalar_type)) {
+        if (WhichDataType(field_info.scalar_type_id).is_nothing()) {
             continue;
         }
-        if (!paths_set.insert(paths[i].get_path()).second) {
-            // return Status::DataQualityError(
-            //         fmt::format("Object has ambiguous path {}, {}", 
paths[i].get_path()));
-            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Object has 
ambiguous path {}",
-                                   paths[i].get_path());
-        }
-
-        if (!column_object.has_subcolumn(paths[i])) {
-            column_object.add_sub_column(paths[i], num_rows);
+        if (column_object.get_subcolumn(paths[i], i) == nullptr) {
+            column_object.add_sub_column(paths[i], old_num_rows);
         }
-        auto* subcolumn = column_object.get_subcolumn(paths[i]);
+        auto* subcolumn = column_object.get_subcolumn(paths[i], i);
         if (!subcolumn) {
             throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Failed to 
find sub column {}",
                                    paths[i].get_path());
         }
-        assert(subcolumn->size() == num_rows);
+        DCHECK_EQ(subcolumn->size(), old_num_rows);
         subcolumn->insert(std::move(values[i]), std::move(field_info));
     }
     // /// Insert default values to missed subcolumns.
     const auto& subcolumns = column_object.get_subcolumns();
     for (const auto& entry : subcolumns) {
-        if (!paths_set.contains(entry->path.get_path())) {
+        if (entry->data.size() == old_num_rows) {
             entry->data.insertDefault();
         }
     }
diff --git a/regression-test/suites/variant_p2/performance.groovy 
b/regression-test/suites/variant_p2/performance.groovy
new file mode 100644
index 00000000000..1f10dd90c04
--- /dev/null
+++ b/regression-test/suites/variant_p2/performance.groovy
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("regression_test_variant_performance", "p2"){
+    sql """CREATE TABLE IF NOT EXISTS var_perf (
+                        k bigint,
+                        v variant
+
+                    )
+                    DUPLICATE KEY(`k`)
+                    DISTRIBUTED BY RANDOM BUCKETS 4
+                    properties("replication_num" = "1", 
"disable_auto_compaction" = "false");
+    """
+    sql """
+        insert into var_perf 
+                SELECT *, 
'{"field1":348,"field2":596,"field3":781,"field4":41,"field5":922,"field6":84,"field7":222,"field8":312,"field9":490,"field10":715,"field11":837,"field12":753,"field13":171,"field14":727,"field15":739,"field16":545,"field17":964,"field18":540,"field19":685,"field20":828,"field21":157,"field22":404,"field23":287,"field24":481,"field25":476,"field26":559,"field27":144,"field28":545,"field29":70,"field30":668,"field31":820,"field32":193,"field33":465,"field34":347,
 [...]
+                        from numbers("number" = "10000000")
+                union all
+                SELECT *, 
'{"field1":201,"field2":465,"field3":977,"field4":101112,"field5":131415,"field6":216,"field7":192021,"field8":822324,"field9":525627,"field10":928930,"field11":413233,"field12":243536,"field13":373839,"field14":404142,"field15":434445,"field16":1464748,"field17":495051,"field18":525354,"field19":565657,"field20":1585960,"field21":616263,"field22":646566,"field23":676869,"field24":707172,"field25":737475,"field26":767778,"field27":798081,"field28":828384,"field2
 [...]
+                        from numbers("number" = "10000000")
+        """
+}
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

(doris) 01/12: [Performance](Variant) Improve load performance for variant type (#33890)

Reply via email to