This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch variant-sparse
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/variant-sparse by this push:
     new 47b6deb43b4 fix 10 (#45831)
47b6deb43b4 is described below

commit 47b6deb43b40acfe6be485c146216763501c0b21
Author: lihangyu <lihan...@selectdb.com>
AuthorDate: Wed Dec 25 10:18:44 2024 +0800

    fix 10 (#45831)
---
 be/src/olap/rowset/segment_v2/column_reader.cpp    | 32 ++++++++++++---
 be/src/olap/rowset/segment_v2/column_reader.h      |  6 ++-
 .../rowset/segment_v2/hierarchical_data_reader.cpp | 21 +++++++++-
 be/src/olap/rowset/segment_v2/segment.cpp          |  9 +++--
 .../segment_v2/variant_column_writer_impl.cpp      | 10 +++--
 be/src/vec/columns/column_object.cpp               | 45 +++++++++++++++-------
 be/src/vec/columns/column_object.h                 |  2 -
 .../data/variant_p0/compaction/test_compaction.out | 16 ++++----
 8 files changed, 103 insertions(+), 38 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp
index 28fb748a365..c1000df1bff 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -227,6 +227,24 @@ const SubcolumnColumnReaders::Node* 
VariantColumnReader::get_reader_by_path(
     return _subcolumn_readers->find_leaf(relative_path);
 }
 
+int64_t VariantColumnReader::get_metadata_size() const {
+    int64_t size = ColumnReader::get_metadata_size();
+    if (_statistics) {
+        for (const auto& [path, _] : _statistics->subcolumns_non_null_size) {
+            size += path.size() + sizeof(size_t);
+        }
+        for (const auto& [path, _] : _statistics->sparse_column_non_null_size) 
{
+            size += path.size() + sizeof(size_t);
+        }
+    }
+
+    for (const auto& reader : *_subcolumn_readers) {
+        size += reader->data.reader->get_metadata_size();
+        size += reader->path.get_path().size();
+    }
+    return size;
+}
+
 Status VariantColumnReader::new_iterator(ColumnIterator** iterator,
                                          const TabletColumn& target_col) {
     // root column use unique id, leaf column use parent_unique_id
@@ -303,8 +321,12 @@ Status VariantColumnReader::init(const 
ColumnReaderOptions& opts, const SegmentF
         }
         auto relative_path = path.copy_pop_front();
         auto get_data_type_fn = [&]() {
+            // root subcolumn is ColumnObject::MostCommonType which is jsonb
             if (relative_path.empty()) {
-                return 
make_nullable(std::make_unique<vectorized::ColumnObject::MostCommonType>());
+                return self_column_pb.is_nullable()
+                               ? make_nullable(std::make_unique<
+                                               
vectorized::ColumnObject::MostCommonType>())
+                               : 
std::make_unique<vectorized::ColumnObject::MostCommonType>();
             }
             return 
vectorized::DataTypeFactory::instance().create_data_type(column_pb);
         };
@@ -327,11 +349,11 @@ Status VariantColumnReader::init(const 
ColumnReaderOptions& opts, const SegmentF
     if (self_column_pb.has_variant_statistics()) {
         _statistics = std::make_unique<VariantStatistics>();
         const auto& variant_stats = self_column_pb.variant_statistics();
-        for (const auto& [path, _] : 
variant_stats.sparse_column_non_null_size()) {
-            _statistics->sparse_column_non_null_size.emplace(path.data(), 
path.size());
+        for (const auto& [path, size] : 
variant_stats.sparse_column_non_null_size()) {
+            _statistics->sparse_column_non_null_size.emplace(path, size);
         }
-        for (const auto& [path, _] : variant_stats.subcolumn_non_null_size()) {
-            _statistics->subcolumns_non_null_size.emplace(path.data(), 
path.size());
+        for (const auto& [path, size] : 
variant_stats.subcolumn_non_null_size()) {
+            _statistics->subcolumns_non_null_size.emplace(path, size);
         }
     }
     return Status::OK();
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h
index 646e657b162..16a0c91b157 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -217,6 +217,8 @@ public:
 
     virtual FieldType get_meta_type() { return _meta_type; }
 
+    int64_t get_metadata_size() const override;
+
 private:
     ColumnReader(const ColumnReaderOptions& opts, const ColumnMetaPB& meta, 
uint64_t num_rows,
                  io::FileReaderSPtr file_reader);
@@ -255,8 +257,6 @@ private:
 
     Status _calculate_row_ranges(const std::vector<uint32_t>& page_indexes, 
RowRanges* row_ranges);
 
-    int64_t get_metadata_size() const override;
-
 private:
     int64_t _meta_length;
     FieldType _meta_type;
@@ -312,6 +312,8 @@ public:
 
     const VariantStatistics* get_stats() const { return _statistics.get(); }
 
+    int64_t get_metadata_size() const override;
+
 private:
     std::unique_ptr<SubcolumnColumnReaders> _subcolumn_readers;
     std::unique_ptr<ColumnReader> _sparse_column_reader;
diff --git a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp
index de0123a330a..651cfb69655 100644
--- a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp
@@ -257,8 +257,7 @@ Status 
HierarchicalDataReader::_init_container(vectorized::MutableColumnPtr& con
         // auto column = root_var.get_root();
         // auto type = root_var.get_root_type();
         MutableColumnPtr column = _root_reader->column->get_ptr();
-        container_variant.add_sub_column({}, std::move(column),
-                                         ColumnObject::get_most_common_type());
+        container_variant.add_sub_column({}, std::move(column), 
_root_reader->type);
     }
     // parent path -> subcolumns
     std::map<PathInData, PathsWithColumnAndType> nested_subcolumns;
@@ -354,8 +353,26 @@ Status 
HierarchicalDataReader::_process_sparse_column(vectorized::ColumnObject&
                         auto sub_path = get_sub_path(path, path_prefix);
                         sparse_data_paths->insert_data(sub_path.data(), 
sub_path.size());
                         
sparse_data_values->insert_from(src_sparse_data_values, lower_bound_index);
+                    } else {
+                        // insert into root column, example:  access v['b'] and b is in sparse column
+                        // data example:
+                        // {"b" : 123}
+                        // {"b" : {"c" : 456}}
+                        // b may be in sparse column, and b.c is in subcolumn, put `b` into root column to distinguish
+                        // from "" which is empty path and root
+                        if (container_variant.is_null_root()) {
+                            container_variant.add_sub_column({}, 
sparse_data_offsets.size());
+                        }
+                        const auto& data = 
ColumnObject::deserialize_from_sparse_column(
+                                &src_sparse_data_values, lower_bound_index);
+                        
container_variant.get_subcolumn({})->insert(data.first, data.second);
                     }
                 }
+                // if root was created, and not seen in sparse data, insert default
+                if (!container_variant.is_null_root() &&
+                    container_variant.get_subcolumn({})->size() == 
sparse_data_offsets.size()) {
+                    container_variant.get_subcolumn({})->insert_default();
+                }
                 sparse_data_offsets.push_back(sparse_data_paths->size());
             }
         }
diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp
index 9b505e4a4a5..1e6508de0e2 100644
--- a/be/src/olap/rowset/segment_v2/segment.cpp
+++ b/be/src/olap/rowset/segment_v2/segment.cpp
@@ -233,8 +233,11 @@ Status Segment::new_iterator(SchemaSPtr schema, const 
StorageReadOptions& read_o
         if (col.is_extracted_column()) {
             auto relative_path = col.path_info_ptr()->copy_pop_front();
             int32_t unique_id = col.unique_id() > 0 ? col.unique_id() : 
col.parent_unique_id();
-            const auto* node = 
((VariantColumnReader*)(_column_readers.at(unique_id).get()))
-                                       ->get_reader_by_path(relative_path);
+            const auto* node =
+                    _column_readers.contains(unique_id)
+                            ? 
((VariantColumnReader*)(_column_readers.at(unique_id).get()))
+                                      ->get_reader_by_path(relative_path)
+                            : nullptr;
             reader = node != nullptr ? node->data.reader.get() : nullptr;
         } else {
             reader = _column_readers.contains(col.unique_id())
@@ -828,7 +831,7 @@ ColumnReader* Segment::_get_column_reader(const 
TabletColumn& col) {
     if (col.has_path_info() || col.is_variant_type()) {
         auto relative_path = col.path_info_ptr()->copy_pop_front();
         int32_t unique_id = col.unique_id() > 0 ? col.unique_id() : 
col.parent_unique_id();
-        const auto* node = col.has_path_info()
+        const auto* node = col.has_path_info() && 
_column_readers.contains(unique_id)
                                    ? 
((VariantColumnReader*)(_column_readers.at(unique_id).get()))
                                              
->get_reader_by_path(relative_path)
                                    : nullptr;
diff --git a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
index 33499a8e7e2..0326e31f096 100644
--- a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
+++ b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
@@ -110,6 +110,7 @@ Status 
VariantColumnWriterImpl::_get_subcolumn_paths_from_stats(std::set<std::st
         // reserve 1 for root column
         for (const auto& [size, path] : paths_with_sizes) {
             if (paths.size() < vectorized::ColumnObject::MAX_SUBCOLUMNS - 1) {
+                VLOG_DEBUG << "pick " << path << " as subcolumn";
                 paths.emplace(path);
             }
             // // todo : Add all remaining paths into shared data statistics until we reach its max size;
@@ -120,6 +121,7 @@ Status 
VariantColumnWriterImpl::_get_subcolumn_paths_from_stats(std::set<std::st
     } else {
         // Use all dynamic paths from all source columns.
         for (const auto& [path, _] : path_to_total_number_of_non_null_values) {
+            VLOG_DEBUG << "pick " << path << " as subcolumn";
             paths.emplace(path);
         }
     }
@@ -151,9 +153,11 @@ Status 
VariantColumnWriterImpl::_process_root_column(vectorized::ColumnObject* p
         return status;
     }
     const uint8_t* nullmap =
-            
vectorized::check_and_get_column<vectorized::ColumnUInt8>(_null_column.get())
-                    ->get_data()
-                    .data();
+            _null_column
+                    ? 
vectorized::check_and_get_column<vectorized::ColumnUInt8>(_null_column.get())
+                              ->get_data()
+                              .data()
+                    : nullptr;
     RETURN_IF_ERROR(_root_writer->append(nullmap, column->get_data(), 
num_rows));
     ++column_id;
     converter->clear_source_content();
diff --git a/be/src/vec/columns/column_object.cpp b/be/src/vec/columns/column_object.cpp
index 595326839c5..8e2e4e06efb 100644
--- a/be/src/vec/columns/column_object.cpp
+++ b/be/src/vec/columns/column_object.cpp
@@ -641,8 +641,8 @@ MutableColumnPtr ColumnObject::apply_for_columns(Func&& 
func) const {
     }
     auto sparse_column = func(serialized_sparse_column);
     res->serialized_sparse_column = sparse_column->assume_mutable();
-    res->set_num_rows(serialized_sparse_column->size());
-    check_consistency();
+    res->num_rows = res->serialized_sparse_column->size();
+    res->check_consistency();
     return res;
 }
 
@@ -815,11 +815,6 @@ ColumnObject::ColumnObject(bool is_nullable_, bool 
create_root_)
     ENABLE_CHECK_CONSISTENCY(this);
 }
 
-ColumnObject::ColumnObject(MutableColumnPtr&& sparse_column)
-        : is_nullable(true),
-          num_rows(sparse_column->size()),
-          serialized_sparse_column(std::move(sparse_column)) {}
-
 ColumnObject::ColumnObject(bool is_nullable_, DataTypePtr type, 
MutableColumnPtr&& column)
         : is_nullable(is_nullable_), num_rows(0) {
     add_sub_column({}, std::move(column), type);
@@ -994,7 +989,8 @@ bool ColumnObject::Subcolumn::is_null_at(size_t n) const {
     ind -= num_of_defaults_in_prefix;
     for (const auto& part : data) {
         if (ind < part->size()) {
-            return assert_cast<const ColumnNullable&>(*part).is_null_at(ind);
+            const auto* nullable = 
check_and_get_column<ColumnNullable>(part.get());
+            return nullable ? nullable->is_null_at(ind) : false;
         }
         ind -= part->size();
     }
@@ -1061,14 +1057,16 @@ void 
ColumnObject::Subcolumn::serialize_to_sparse_column(ColumnString* key, std:
         const auto& part = data[i];
         if (row < part->size()) {
             // no need null in sparse column
-            if (!assert_cast<const ColumnNullable&>(*part).is_null_at(row)) {
+            if (!assert_cast<const ColumnNullable&, 
TypeCheckOnRelease::DISABLE>(*part).is_null_at(
+                        row)) {
                 // insert key
                 key->insert_data(path.data(), path.size());
 
                 // every subcolumn is always Nullable
                 auto nullable_serde = 
std::static_pointer_cast<DataTypeNullableSerDe>(
                         
data_types[i]->get_serde(CURRENT_SERIALIZE_NESTING_LEVEL));
-                auto& nullable_col = assert_cast<const ColumnNullable&>(*part);
+                auto& nullable_col =
+                        assert_cast<const ColumnNullable&, 
TypeCheckOnRelease::DISABLE>(*part);
 
                 // insert value
                 ColumnString::Chars& chars = value->get_chars();
@@ -1310,6 +1308,7 @@ void ColumnObject::insert_range_from(const IColumn& src, 
size_t start, size_t le
     auto it = src_path_and_subcoumn_for_sparse_column.begin();
     auto end = src_path_and_subcoumn_for_sparse_column.end();
     while (it != end) {
+        VLOG_DEBUG << "pick " << it->first << " as sparse column";
         sorted_src_subcolumn_for_sparse_column.emplace_back(it->first, 
it->second);
         ++it;
     }
@@ -1707,9 +1706,16 @@ bool ColumnObject::is_visible_root_value(size_t nrow) 
const {
     if (subcolumns.get_root()->data.is_null_at(nrow)) {
         return false;
     }
-    nrow = nrow - subcolumns.get_root()->data.num_of_defaults_in_prefix;
-    const auto& nullable = assert_cast<const 
ColumnNullable&>(*subcolumns.get_root()->data.data[0]);
-    return !nullable.get_data_at(nrow).empty();
+    int ind = nrow - subcolumns.get_root()->data.num_of_defaults_in_prefix;
+    for (const auto& part : subcolumns.get_root()->data.data) {
+        if (ind < part->size()) {
+            return !part->get_data_at(ind).empty();
+        }
+        ind -= part->size();
+    }
+
+    throw doris::Exception(ErrorCode::OUT_OF_BOUND, "Index ({}) for getting field is out of range",
+                           nrow);
 }
 
 Status ColumnObject::serialize_one_row_to_json_format(int64_t row_num, 
BufferWritable& output,
@@ -1962,6 +1968,10 @@ Status ColumnObject::finalize(FinalizeMode mode) {
 
         // 3. pick MAX_SUBCOLUMNS selected subcolumns
         for (size_t i = 0; i < std::min(MAX_SUBCOLUMNS, 
sorted_by_size.size()); ++i) {
+            // if too many null values, then consider it as sparse column
+            if (sorted_by_size[i].second < num_rows * 0.95) {
+                continue;
+            }
             selected_path.insert(sorted_by_size[i].first);
         }
         std::map<std::string_view, Subcolumn> remaing_subcolumns;
@@ -1970,6 +1980,7 @@ Status ColumnObject::finalize(FinalizeMode mode) {
             if (selected_path.find(entry->path.get_path()) != 
selected_path.end()) {
                 new_subcolumns.add(entry->path, entry->data);
             } else {
+                VLOG_DEBUG << "pick " << entry->path.get_path() << " as sparse column";
                 remaing_subcolumns.emplace(entry->path.get_path(), 
entry->data);
             }
         }
@@ -2138,7 +2149,15 @@ const DataTypePtr ColumnObject::NESTED_TYPE = 
std::make_shared<vectorized::DataT
         
std::make_shared<vectorized::DataTypeArray>(std::make_shared<vectorized::DataTypeNullable>(
                 std::make_shared<vectorized::DataTypeObject>())));
 
+// const size_t ColumnObject::MAX_SUBCOLUMNS = 5;
+#ifndef NDEBUG
+const size_t ColumnObject::MAX_SUBCOLUMNS = []() -> size_t {
+    std::srand(std::time(nullptr)); // initialize the random seed
+    return 1 + std::rand() % 10;    // random value in the range [1, 10]
+}();
+#else
 const size_t ColumnObject::MAX_SUBCOLUMNS = 5;
+#endif
 
 DataTypePtr ColumnObject::get_root_type() const {
     return subcolumns.get_root()->data.get_least_common_type();
diff --git a/be/src/vec/columns/column_object.h b/be/src/vec/columns/column_object.h
index d1b19dfc6c2..fa207d19c39 100644
--- a/be/src/vec/columns/column_object.h
+++ b/be/src/vec/columns/column_object.h
@@ -274,8 +274,6 @@ public:
 
     explicit ColumnObject(bool is_nullable_, bool create_root = true);
 
-    explicit ColumnObject(MutableColumnPtr&& sparse_column);
-
     explicit ColumnObject(bool is_nullable_, DataTypePtr type, 
MutableColumnPtr&& column);
 
     // create without root, num_rows = size
diff --git a/regression-test/data/variant_p0/compaction/test_compaction.out b/regression-test/data/variant_p0/compaction/test_compaction.out
index 0b905e3930f..7ccf1277bc0 100644
--- a/regression-test/data/variant_p0/compaction/test_compaction.out
+++ b/regression-test/data/variant_p0/compaction/test_compaction.out
@@ -8,8 +8,8 @@
 3      {"x":[3]}
 4      {"y":1}
 4      {"y":1}
-5      {"z":2.0}
-5      {"z":2.0}
+5      {"z":2}
+5      {"z":2}
 6      {"x":111}
 6      {"x":111}
 7      {"m":1}
@@ -96,8 +96,8 @@
 3      {"x":[3]}
 4      {"y":1}
 4      {"y":1}
-5      {"z":2.0}
-5      {"z":2.0}
+5      {"z":2}
+5      {"z":2}
 6      {"x":111}
 6      {"x":111}
 7      {"m":1}
@@ -180,7 +180,7 @@
 2      {"a":"1"}
 3      {"x":[3]}
 4      {"y":1}
-5      {"z":2.0}
+5      {"z":2}
 6      {"x":111}
 7      {"m":1}
 8      {"l":2}
@@ -233,7 +233,7 @@
 2      {"a":"1"}
 3      {"x":[3]}
 4      {"y":1}
-5      {"z":2.0}
+5      {"z":2}
 6      {"x":111}
 7      {"m":1}
 8      {"l":2}
@@ -284,7 +284,7 @@
 2      {"a":"1"}
 3      {"x":[3]}
 4      {"y":1}
-5      {"z":2.0}
+5      {"z":2}
 6      {"x":111}
 7      {"m":1}
 8      {"l":2}
@@ -337,7 +337,7 @@
 2      {"a":"1"}
 3      {"x":[3]}
 4      {"y":1}
-5      {"z":2.0}
+5      {"z":2}
 6      {"x":111}
 7      {"m":1}
 8      {"l":2}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to