This is an automated email from the ASF dual-hosted git repository. eldenmoon pushed a commit to branch variant-sparse in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/variant-sparse by this push: new 47b6deb43b4 fix 10 (#45831) 47b6deb43b4 is described below commit 47b6deb43b40acfe6be485c146216763501c0b21 Author: lihangyu <lihan...@selectdb.com> AuthorDate: Wed Dec 25 10:18:44 2024 +0800 fix 10 (#45831) --- be/src/olap/rowset/segment_v2/column_reader.cpp | 32 ++++++++++++--- be/src/olap/rowset/segment_v2/column_reader.h | 6 ++- .../rowset/segment_v2/hierarchical_data_reader.cpp | 21 +++++++++- be/src/olap/rowset/segment_v2/segment.cpp | 9 +++-- .../segment_v2/variant_column_writer_impl.cpp | 10 +++-- be/src/vec/columns/column_object.cpp | 45 +++++++++++++++------- be/src/vec/columns/column_object.h | 2 - .../data/variant_p0/compaction/test_compaction.out | 16 ++++---- 8 files changed, 103 insertions(+), 38 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index 28fb748a365..c1000df1bff 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -227,6 +227,24 @@ const SubcolumnColumnReaders::Node* VariantColumnReader::get_reader_by_path( return _subcolumn_readers->find_leaf(relative_path); } +int64_t VariantColumnReader::get_metadata_size() const { + int64_t size = ColumnReader::get_metadata_size(); + if (_statistics) { + for (const auto& [path, _] : _statistics->subcolumns_non_null_size) { + size += path.size() + sizeof(size_t); + } + for (const auto& [path, _] : _statistics->sparse_column_non_null_size) { + size += path.size() + sizeof(size_t); + } + } + + for (const auto& reader : *_subcolumn_readers) { + size += reader->data.reader->get_metadata_size(); + size += reader->path.get_path().size(); + } + return size; +} + Status VariantColumnReader::new_iterator(ColumnIterator** iterator, const TabletColumn& target_col) { // root column use unique id, leaf column use parent_unique_id @@ -303,8 +321,12 @@ Status VariantColumnReader::init(const ColumnReaderOptions& opts, const SegmentF } auto relative_path = path.copy_pop_front(); auto get_data_type_fn = [&]() { + // root subcolumn is ColumnObject::MostCommonType which is jsonb if (relative_path.empty()) { - return make_nullable(std::make_unique<vectorized::ColumnObject::MostCommonType>()); + return self_column_pb.is_nullable() + ? make_nullable(std::make_unique< + vectorized::ColumnObject::MostCommonType>()) + : std::make_unique<vectorized::ColumnObject::MostCommonType>(); } return vectorized::DataTypeFactory::instance().create_data_type(column_pb); }; @@ -327,11 +349,11 @@ Status VariantColumnReader::init(const ColumnReaderOptions& opts, const SegmentF if (self_column_pb.has_variant_statistics()) { _statistics = std::make_unique<VariantStatistics>(); const auto& variant_stats = self_column_pb.variant_statistics(); - for (const auto& [path, _] : variant_stats.sparse_column_non_null_size()) { - _statistics->sparse_column_non_null_size.emplace(path.data(), path.size()); + for (const auto& [path, size] : variant_stats.sparse_column_non_null_size()) { + _statistics->sparse_column_non_null_size.emplace(path, size); } - for (const auto& [path, _] : variant_stats.subcolumn_non_null_size()) { - _statistics->subcolumns_non_null_size.emplace(path.data(), path.size()); + for (const auto& [path, size] : variant_stats.subcolumn_non_null_size()) { + _statistics->subcolumns_non_null_size.emplace(path, size); } } return Status::OK(); diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h index 646e657b162..16a0c91b157 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.h +++ b/be/src/olap/rowset/segment_v2/column_reader.h @@ -217,6 +217,8 @@ public: virtual FieldType get_meta_type() { return _meta_type; } + int64_t get_metadata_size() const override; + private: ColumnReader(const ColumnReaderOptions& opts, const ColumnMetaPB& meta, uint64_t num_rows, io::FileReaderSPtr file_reader); @@ -255,8 +257,6 @@ private: Status _calculate_row_ranges(const std::vector<uint32_t>& page_indexes, RowRanges* row_ranges); - int64_t get_metadata_size() const override; - private: int64_t _meta_length; FieldType _meta_type; @@ -312,6 +312,8 @@ public: const VariantStatistics* get_stats() const { return _statistics.get(); } + int64_t get_metadata_size() const override; + private: std::unique_ptr<SubcolumnColumnReaders> _subcolumn_readers; std::unique_ptr<ColumnReader> _sparse_column_reader; diff --git a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp index de0123a330a..651cfb69655 100644 --- a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp +++ b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp @@ -257,8 +257,7 @@ Status HierarchicalDataReader::_init_container(vectorized::MutableColumnPtr& con // auto column = root_var.get_root(); // auto type = root_var.get_root_type(); MutableColumnPtr column = _root_reader->column->get_ptr(); - container_variant.add_sub_column({}, std::move(column), - ColumnObject::get_most_common_type()); + container_variant.add_sub_column({}, std::move(column), _root_reader->type); } // parent path -> subcolumns std::map<PathInData, PathsWithColumnAndType> nested_subcolumns; @@ -354,8 +353,26 @@ Status HierarchicalDataReader::_process_sparse_column(vectorized::ColumnObject& auto sub_path = get_sub_path(path, path_prefix); sparse_data_paths->insert_data(sub_path.data(), sub_path.size()); sparse_data_values->insert_from(src_sparse_data_values, lower_bound_index); + } else { + // insert into root column, example: access v['b'] and b is in sparse column + // data example: + // {"b" : 123} + // {"b" : {"c" : 456}} + // b maybe in sparse column, and b.c is in subolumn, put `b` into root column to distinguish + // from "" which is empty path and root + if (container_variant.is_null_root()) { + container_variant.add_sub_column({}, sparse_data_offsets.size()); + } + const auto& data = ColumnObject::deserialize_from_sparse_column( + &src_sparse_data_values, lower_bound_index); + container_variant.get_subcolumn({})->insert(data.first, data.second); } } + // if root was created, and not seen in sparse data, insert default + if (!container_variant.is_null_root() && + container_variant.get_subcolumn({})->size() == sparse_data_offsets.size()) { + container_variant.get_subcolumn({})->insert_default(); + } sparse_data_offsets.push_back(sparse_data_paths->size()); } } diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp index 9b505e4a4a5..1e6508de0e2 100644 --- a/be/src/olap/rowset/segment_v2/segment.cpp +++ b/be/src/olap/rowset/segment_v2/segment.cpp @@ -233,8 +233,11 @@ Status Segment::new_iterator(SchemaSPtr schema, const StorageReadOptions& read_o if (col.is_extracted_column()) { auto relative_path = col.path_info_ptr()->copy_pop_front(); int32_t unique_id = col.unique_id() > 0 ? col.unique_id() : col.parent_unique_id(); - const auto* node = ((VariantColumnReader*)(_column_readers.at(unique_id).get())) - ->get_reader_by_path(relative_path); + const auto* node = + _column_readers.contains(unique_id) + ? ((VariantColumnReader*)(_column_readers.at(unique_id).get())) + ->get_reader_by_path(relative_path) + : nullptr; reader = node != nullptr ? node->data.reader.get() : nullptr; } else { reader = _column_readers.contains(col.unique_id()) @@ -828,7 +831,7 @@ ColumnReader* Segment::_get_column_reader(const TabletColumn& col) { if (col.has_path_info() || col.is_variant_type()) { auto relative_path = col.path_info_ptr()->copy_pop_front(); int32_t unique_id = col.unique_id() > 0 ? col.unique_id() : col.parent_unique_id(); - const auto* node = col.has_path_info() + const auto* node = col.has_path_info() && _column_readers.contains(unique_id) ? ((VariantColumnReader*)(_column_readers.at(unique_id).get())) ->get_reader_by_path(relative_path) : nullptr; diff --git a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp index 33499a8e7e2..0326e31f096 100644 --- a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp +++ b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp @@ -110,6 +110,7 @@ Status VariantColumnWriterImpl::_get_subcolumn_paths_from_stats(std::set<std::st // reserve 1 for root column for (const auto& [size, path] : paths_with_sizes) { if (paths.size() < vectorized::ColumnObject::MAX_SUBCOLUMNS - 1) { + VLOG_DEBUG << "pick " << path << " as subcolumn"; paths.emplace(path); } // // todo : Add all remaining paths into shared data statistics until we reach its max size; @@ -120,6 +121,7 @@ Status VariantColumnWriterImpl::_get_subcolumn_paths_from_stats(std::set<std::st } else { // Use all dynamic paths from all source columns. for (const auto& [path, _] : path_to_total_number_of_non_null_values) { + VLOG_DEBUG << "pick " << path << " as subcolumn"; paths.emplace(path); } } @@ -151,9 +153,11 @@ Status VariantColumnWriterImpl::_process_root_column(vectorized::ColumnObject* p return status; } const uint8_t* nullmap = - vectorized::check_and_get_column<vectorized::ColumnUInt8>(_null_column.get()) - ->get_data() - .data(); + _null_column + ? vectorized::check_and_get_column<vectorized::ColumnUInt8>(_null_column.get()) + ->get_data() + .data() + : nullptr; RETURN_IF_ERROR(_root_writer->append(nullmap, column->get_data(), num_rows)); ++column_id; converter->clear_source_content(); diff --git a/be/src/vec/columns/column_object.cpp b/be/src/vec/columns/column_object.cpp index 595326839c5..8e2e4e06efb 100644 --- a/be/src/vec/columns/column_object.cpp +++ b/be/src/vec/columns/column_object.cpp @@ -641,8 +641,8 @@ MutableColumnPtr ColumnObject::apply_for_columns(Func&& func) const { } auto sparse_column = func(serialized_sparse_column); res->serialized_sparse_column = sparse_column->assume_mutable(); - res->set_num_rows(serialized_sparse_column->size()); - check_consistency(); + res->num_rows = res->serialized_sparse_column->size(); + res->check_consistency(); return res; } @@ -815,11 +815,6 @@ ColumnObject::ColumnObject(bool is_nullable_, bool create_root_) ENABLE_CHECK_CONSISTENCY(this); } -ColumnObject::ColumnObject(MutableColumnPtr&& sparse_column) - : is_nullable(true), - num_rows(sparse_column->size()), - serialized_sparse_column(std::move(sparse_column)) {} - ColumnObject::ColumnObject(bool is_nullable_, DataTypePtr type, MutableColumnPtr&& column) : is_nullable(is_nullable_), num_rows(0) { add_sub_column({}, std::move(column), type); @@ -994,7 +989,8 @@ bool ColumnObject::Subcolumn::is_null_at(size_t n) const { ind -= num_of_defaults_in_prefix; for (const auto& part : data) { if (ind < part->size()) { - return assert_cast<const ColumnNullable&>(*part).is_null_at(ind); + const auto* nullable = check_and_get_column<ColumnNullable>(part.get()); + return nullable ? nullable->is_null_at(ind) : false; } ind -= part->size(); } @@ -1061,14 +1057,16 @@ void ColumnObject::Subcolumn::serialize_to_sparse_column(ColumnString* key, std: const auto& part = data[i]; if (row < part->size()) { // no need null in sparse column - if (!assert_cast<const ColumnNullable&>(*part).is_null_at(row)) { + if (!assert_cast<const ColumnNullable&, TypeCheckOnRelease::DISABLE>(*part).is_null_at( + row)) { // insert key key->insert_data(path.data(), path.size()); // every subcolumn is always Nullable auto nullable_serde = std::static_pointer_cast<DataTypeNullableSerDe>( data_types[i]->get_serde(CURRENT_SERIALIZE_NESTING_LEVEL)); - auto& nullable_col = assert_cast<const ColumnNullable&>(*part); + auto& nullable_col = + assert_cast<const ColumnNullable&, TypeCheckOnRelease::DISABLE>(*part); // insert value ColumnString::Chars& chars = value->get_chars(); @@ -1310,6 +1308,7 @@ void ColumnObject::insert_range_from(const IColumn& src, size_t start, size_t le auto it = src_path_and_subcoumn_for_sparse_column.begin(); auto end = src_path_and_subcoumn_for_sparse_column.end(); while (it != end) { + VLOG_DEBUG << "pick " << it->first << " as sparse column"; sorted_src_subcolumn_for_sparse_column.emplace_back(it->first, it->second); ++it; } @@ -1707,9 +1706,16 @@ bool ColumnObject::is_visible_root_value(size_t nrow) const { if (subcolumns.get_root()->data.is_null_at(nrow)) { return false; } - nrow = nrow - subcolumns.get_root()->data.num_of_defaults_in_prefix; - const auto& nullable = assert_cast<const ColumnNullable&>(*subcolumns.get_root()->data.data[0]); - return !nullable.get_data_at(nrow).empty(); + int ind = nrow - subcolumns.get_root()->data.num_of_defaults_in_prefix; + for (const auto& part : subcolumns.get_root()->data.data) { + if (ind < part->size()) { + return !part->get_data_at(ind).empty(); + } + ind -= part->size(); + } + + throw doris::Exception(ErrorCode::OUT_OF_BOUND, "Index ({}) for getting field is out of range", + nrow); } Status ColumnObject::serialize_one_row_to_json_format(int64_t row_num, BufferWritable& output, @@ -1962,6 +1968,10 @@ Status ColumnObject::finalize(FinalizeMode mode) { // 3. pick MAX_SUBCOLUMNS selected subcolumns for (size_t i = 0; i < std::min(MAX_SUBCOLUMNS, sorted_by_size.size()); ++i) { + // if too many null values, then consider it as sparse column + if (sorted_by_size[i].second < num_rows * 0.95) { + continue; + } selected_path.insert(sorted_by_size[i].first); } std::map<std::string_view, Subcolumn> remaing_subcolumns; @@ -1970,6 +1980,7 @@ Status ColumnObject::finalize(FinalizeMode mode) { if (selected_path.find(entry->path.get_path()) != selected_path.end()) { new_subcolumns.add(entry->path, entry->data); } else { + VLOG_DEBUG << "pick " << entry->path.get_path() << " as sparse column"; remaing_subcolumns.emplace(entry->path.get_path(), entry->data); } } @@ -2138,7 +2149,15 @@ const DataTypePtr ColumnObject::NESTED_TYPE = std::make_shared<vectorized::DataT std::make_shared<vectorized::DataTypeArray>(std::make_shared<vectorized::DataTypeNullable>( std::make_shared<vectorized::DataTypeObject>()))); +// const size_t ColumnObject::MAX_SUBCOLUMNS = 5; +#ifndef NDEBUG +const size_t ColumnObject::MAX_SUBCOLUMNS = []() -> size_t { + std::srand(std::time(nullptr)); // 初始化随机数种子 + return 1 + std::rand() % 10; // 随机值范围 [1, 10] +}(); +#else const size_t ColumnObject::MAX_SUBCOLUMNS = 5; +#endif DataTypePtr ColumnObject::get_root_type() const { return subcolumns.get_root()->data.get_least_common_type(); diff --git a/be/src/vec/columns/column_object.h b/be/src/vec/columns/column_object.h index d1b19dfc6c2..fa207d19c39 100644 --- a/be/src/vec/columns/column_object.h +++ b/be/src/vec/columns/column_object.h @@ -274,8 +274,6 @@ public: explicit ColumnObject(bool is_nullable_, bool create_root = true); - explicit ColumnObject(MutableColumnPtr&& sparse_column); - explicit ColumnObject(bool is_nullable_, DataTypePtr type, MutableColumnPtr&& column); // create without root, num_rows = size diff --git a/regression-test/data/variant_p0/compaction/test_compaction.out b/regression-test/data/variant_p0/compaction/test_compaction.out index 0b905e3930f..7ccf1277bc0 100644 --- a/regression-test/data/variant_p0/compaction/test_compaction.out +++ b/regression-test/data/variant_p0/compaction/test_compaction.out @@ -8,8 +8,8 @@ 3 {"x":[3]} 4 {"y":1} 4 {"y":1} -5 {"z":2.0} -5 {"z":2.0} +5 {"z":2} +5 {"z":2} 6 {"x":111} 6 {"x":111} 7 {"m":1} @@ -96,8 +96,8 @@ 3 {"x":[3]} 4 {"y":1} 4 {"y":1} -5 {"z":2.0} -5 {"z":2.0} +5 {"z":2} +5 {"z":2} 6 {"x":111} 6 {"x":111} 7 {"m":1} @@ -180,7 +180,7 @@ 2 {"a":"1"} 3 {"x":[3]} 4 {"y":1} -5 {"z":2.0} +5 {"z":2} 6 {"x":111} 7 {"m":1} 8 {"l":2} @@ -233,7 +233,7 @@ 2 {"a":"1"} 3 {"x":[3]} 4 {"y":1} -5 {"z":2.0} +5 {"z":2} 6 {"x":111} 7 {"m":1} 8 {"l":2} @@ -284,7 +284,7 @@ 2 {"a":"1"} 3 {"x":[3]} 4 {"y":1} -5 {"z":2.0} +5 {"z":2} 6 {"x":111} 7 {"m":1} 8 {"l":2} @@ -337,7 +337,7 @@ 2 {"a":"1"} 3 {"x":[3]} 4 {"y":1} -5 {"z":2.0} +5 {"z":2} 6 {"x":111} 7 {"m":1} 8 {"l":2} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org