This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch variant-sparse
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/variant-sparse by this push:
     new f853ef7e639 [opt](variant) improve performance for handling nullable column (#50021)
f853ef7e639 is described below

commit f853ef7e639b4c0b649b196e88d89107d3dc663d
Author: lihangyu <[email protected]>
AuthorDate: Wed Apr 16 16:12:01 2025 +0800

    [opt](variant) improve performance for handling nullable column (#50021)
---
 .../segment_v2/variant_column_writer_impl.cpp       |  6 +++---
 be/src/vec/columns/column_object.cpp                | 21 +++++++++------------
 be/src/vec/columns/column_object.h                  |  2 +-
 3 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
index 1a8e8ed38af..4ff1f1f6de3 100644
--- a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
+++ b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
@@ -134,7 +134,7 @@ Status convert_and_write_column(vectorized::OlapBlockDataConvertor* converter,
     const uint8_t* nullmap = converted_column->get_nullmap();
     RETURN_IF_ERROR(writer->append(nullmap, converted_column->get_data(), num_rows));
 
-    converter->clear_source_content();
+    converter->clear_source_content(column_id);
     return Status::OK();
 }
 
@@ -291,8 +291,8 @@ Status VariantColumnWriterImpl::_process_root_column(vectorized::ColumnObject* p
                               .data()
                     : nullptr;
     RETURN_IF_ERROR(_root_writer->append(nullmap, column->get_data(), num_rows));
+    converter->clear_source_content(column_id);
     ++column_id;
-    converter->clear_source_content();
 
     _opts.meta->set_num_rows(num_rows);
     return Status::OK();
@@ -408,8 +408,8 @@ Status VariantColumnWriterImpl::_process_sparse_column(
                           vectorized::ColumnObject::get_sparse_column_type());
     RETURN_IF_ERROR(
             _sparse_column_writer->append(column->get_nullmap(), column->get_data(), num_rows));
+    converter->clear_source_content(column_id);
     ++column_id;
-    converter->clear_source_content();
 
     // get stastics
     // todo: reuse the statics from collected stastics from compaction stage
diff --git a/be/src/vec/columns/column_object.cpp b/be/src/vec/columns/column_object.cpp
index 8662ba5b6cb..b495679d783 100644
--- a/be/src/vec/columns/column_object.cpp
+++ b/be/src/vec/columns/column_object.cpp
@@ -260,7 +260,7 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) {
         if (schema_util::is_conversion_required_between_integers(
                     base_type.idx, least_common_type.get_base_type_id())) {
             VLOG_DEBUG << "Conversion between " << getTypeName(base_type.idx) << " and "
-                       << getTypeName(least_common_type.get_type_id());
+                       << getTypeName(least_common_type.get_base_type_id());
             DataTypePtr base_data_type;
             TypeIndex base_data_type_id;
             get_least_supertype_jsonb(
@@ -792,7 +792,8 @@ void ColumnObject::try_insert(const Field& field) {
     }
     for (auto& entry : subcolumns) {
         if (old_size == entry->data.size()) {
-            bool inserted = try_insert_default_from_nested(entry);
+            bool inserted = UNLIKELY(entry->path.has_nested_part() &&
+                                     try_insert_default_from_nested(entry));
             if (!inserted) {
                 entry->data.insert_default();
             }
@@ -838,7 +839,6 @@ bool ColumnObject::Subcolumn::is_null_at(size_t n) const {
         }
         ind -= part->size();
     }
-
     throw doris::Exception(ErrorCode::OUT_OF_BOUND, "Index ({}) for getting field is out of range",
                            n);
 }
@@ -873,7 +873,6 @@ void ColumnObject::Subcolumn::get(size_t n, Field& res) const {
 
         ind -= part->size();
     }
-
     throw doris::Exception(ErrorCode::OUT_OF_BOUND, "Index ({}) for getting field is out of range",
                            n);
 }
@@ -894,20 +893,18 @@ void ColumnObject::Subcolumn::serialize_to_sparse_column(ColumnString* key, std:
     row -= num_of_defaults_in_prefix;
     for (size_t i = 0; i < data.size(); ++i) {
         const auto& part = data[i];
-        size_t current_column_size = part->size();
+        const auto& nullable_col =
+                assert_cast<const ColumnNullable&, TypeCheckOnRelease::DISABLE>(*part);
+        size_t current_column_size = nullable_col.get_null_map_data().size();
         if (row < current_column_size) {
             // no need null in sparse column
-            if (!assert_cast<const ColumnNullable&, TypeCheckOnRelease::DISABLE>(*part).is_null_at(
-                        row)) {
+            if (!nullable_col.is_null_at(row)) {
                 // insert key
                 key->insert_data(path.data(), path.size());
 
                 // every subcolumn is always Nullable
                 auto nullable_serde =
                         std::static_pointer_cast<DataTypeNullableSerDe>(data_serdes[i]);
-                auto& nullable_col =
-                        assert_cast<const ColumnNullable&, TypeCheckOnRelease::DISABLE>(*part);
-
                 // insert value
                 ColumnString::Chars& chars = value->get_chars();
                 nullable_serde->get_nested_serde()->write_one_cell_to_binary(
@@ -1343,7 +1340,6 @@ size_t ColumnObject::Subcolumn::serialize_text_json(size_t n, BufferWritable& ou
 
         ind -= part->size();
     }
-
     throw doris::Exception(ErrorCode::OUT_OF_BOUND,
                            "Index ({}) for serializing JSON is out of range", n);
 }
@@ -1906,7 +1902,7 @@ Status ColumnObject::finalize(FinalizeMode mode) {
         for (size_t i = 0; i < std::min(size_t(_max_subcolumns_count), sorted_by_size.size());
              ++i) {
             // if too many null values, then consider it as sparse column
-            if ((double)sorted_by_size[i].second < (double)num_rows * 0.95) {
+            if ((double)sorted_by_size[i].second < (double)num_rows * 0.99) {
                 continue;
             }
             selected_path.insert(sorted_by_size[i].first);
@@ -2035,6 +2031,7 @@ void ColumnObject::clear_column_data() {
             (*std::move(part)).clear();
         }
         entry->data.num_of_defaults_in_prefix = 0;
+        entry->data.current_num_of_defaults = 0;
         entry->data.num_rows = 0;
     }
     serialized_sparse_column->clear();
diff --git a/be/src/vec/columns/column_object.h b/be/src/vec/columns/column_object.h
index cbc2bb78c2b..96e5d7b6f69 100644
--- a/be/src/vec/columns/column_object.h
+++ b/be/src/vec/columns/column_object.h
@@ -171,7 +171,7 @@ public:
 
         void reset_current_num_of_defaults() { current_num_of_defaults = 0; }
 
-        size_t cur_num_of_defaults() { return current_num_of_defaults; }
+        size_t cur_num_of_defaults() const { return current_num_of_defaults; }
 
         void insert_many_defaults(size_t length);
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to