This is an automated email from the ASF dual-hosted git repository.
eldenmoon pushed a commit to branch variant-sparse
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/variant-sparse by this push:
new f853ef7e639 [opt](variant) improve performance for handling nullable column (#50021)
f853ef7e639 is described below
commit f853ef7e639b4c0b649b196e88d89107d3dc663d
Author: lihangyu <[email protected]>
AuthorDate: Wed Apr 16 16:12:01 2025 +0800
[opt](variant) improve performance for handling nullable column (#50021)
---
.../segment_v2/variant_column_writer_impl.cpp | 6 +++---
be/src/vec/columns/column_object.cpp | 21 +++++++++------------
be/src/vec/columns/column_object.h | 2 +-
3 files changed, 13 insertions(+), 16 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
index 1a8e8ed38af..4ff1f1f6de3 100644
--- a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
+++ b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
@@ -134,7 +134,7 @@ Status convert_and_write_column(vectorized::OlapBlockDataConvertor* converter,
const uint8_t* nullmap = converted_column->get_nullmap();
RETURN_IF_ERROR(writer->append(nullmap, converted_column->get_data(), num_rows));
- converter->clear_source_content();
+ converter->clear_source_content(column_id);
return Status::OK();
}
@@ -291,8 +291,8 @@ Status VariantColumnWriterImpl::_process_root_column(vectorized::ColumnObject* p
.data()
: nullptr;
RETURN_IF_ERROR(_root_writer->append(nullmap, column->get_data(), num_rows));
+ converter->clear_source_content(column_id);
++column_id;
- converter->clear_source_content();
_opts.meta->set_num_rows(num_rows);
return Status::OK();
@@ -408,8 +408,8 @@ Status VariantColumnWriterImpl::_process_sparse_column(
vectorized::ColumnObject::get_sparse_column_type());
RETURN_IF_ERROR(
_sparse_column_writer->append(column->get_nullmap(), column->get_data(), num_rows));
+ converter->clear_source_content(column_id);
++column_id;
- converter->clear_source_content();
// get stastics
// todo: reuse the statics from collected stastics from compaction stage
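For context on the three writer-side hunks above: each call site now clears only the source content of the column it has just flushed, instead of clearing the converter's entire source content. A minimal sketch of that pattern is below; the class and member names are illustrative stand-ins, not Doris's actual OlapBlockDataConvertor API.

    #include <cstddef>
    #include <vector>

    // Illustrative stand-in for a converter that holds per-column source
    // buffers until each column has been written out.
    class ConvertorSketch {
    public:
        explicit ConvertorSketch(size_t num_columns) : _source(num_columns) {}

        // Old pattern: drop every column's source buffer at once.
        void clear_source_content() {
            for (auto& buf : _source) {
                buf.clear();
            }
        }

        // New pattern: drop only the column that was just flushed, leaving the
        // buffers of columns that are still pending untouched.
        void clear_source_content(size_t column_id) { _source[column_id].clear(); }

    private:
        std::vector<std::vector<char>> _source;
    };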
diff --git a/be/src/vec/columns/column_object.cpp b/be/src/vec/columns/column_object.cpp
index 8662ba5b6cb..b495679d783 100644
--- a/be/src/vec/columns/column_object.cpp
+++ b/be/src/vec/columns/column_object.cpp
@@ -260,7 +260,7 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) {
if (schema_util::is_conversion_required_between_integers(
base_type.idx, least_common_type.get_base_type_id())) {
VLOG_DEBUG << "Conversion between " << getTypeName(base_type.idx)
<< " and "
- << getTypeName(least_common_type.get_type_id());
+ << getTypeName(least_common_type.get_base_type_id());
DataTypePtr base_data_type;
TypeIndex base_data_type_id;
get_least_supertype_jsonb(
@@ -792,7 +792,8 @@ void ColumnObject::try_insert(const Field& field) {
}
for (auto& entry : subcolumns) {
if (old_size == entry->data.size()) {
- bool inserted = try_insert_default_from_nested(entry);
+ bool inserted = UNLIKELY(entry->path.has_nested_part() &&
+ try_insert_default_from_nested(entry));
if (!inserted) {
entry->data.insert_default();
}
@@ -838,7 +839,6 @@ bool ColumnObject::Subcolumn::is_null_at(size_t n) const {
}
ind -= part->size();
}
-
throw doris::Exception(ErrorCode::OUT_OF_BOUND, "Index ({}) for getting field is out of range",
n);
}
@@ -873,7 +873,6 @@ void ColumnObject::Subcolumn::get(size_t n, Field& res) const {
ind -= part->size();
}
-
throw doris::Exception(ErrorCode::OUT_OF_BOUND, "Index ({}) for getting field is out of range",
n);
}
@@ -894,20 +893,18 @@ void ColumnObject::Subcolumn::serialize_to_sparse_column(ColumnString* key, std:
row -= num_of_defaults_in_prefix;
for (size_t i = 0; i < data.size(); ++i) {
const auto& part = data[i];
- size_t current_column_size = part->size();
+ const auto& nullable_col =
+ assert_cast<const ColumnNullable&, TypeCheckOnRelease::DISABLE>(*part);
+ size_t current_column_size = nullable_col.get_null_map_data().size();
if (row < current_column_size) {
// no need null in sparse column
- if (!assert_cast<const ColumnNullable&, TypeCheckOnRelease::DISABLE>(*part).is_null_at(
- row)) {
+ if (!nullable_col.is_null_at(row)) {
// insert key
key->insert_data(path.data(), path.size());
// every subcolumn is always Nullable
auto nullable_serde =
std::static_pointer_cast<DataTypeNullableSerDe>(data_serdes[i]);
- auto& nullable_col =
- assert_cast<const ColumnNullable&, TypeCheckOnRelease::DISABLE>(*part);
-
// insert value
ColumnString::Chars& chars = value->get_chars();
nullable_serde->get_nested_serde()->write_one_cell_to_binary(
@@ -1343,7 +1340,6 @@ size_t ColumnObject::Subcolumn::serialize_text_json(size_t n, BufferWritable& ou
ind -= part->size();
}
-
throw doris::Exception(ErrorCode::OUT_OF_BOUND,
"Index ({}) for serializing JSON is out of range",
n);
}
@@ -1906,7 +1902,7 @@ Status ColumnObject::finalize(FinalizeMode mode) {
for (size_t i = 0; i < std::min(size_t(_max_subcolumns_count), sorted_by_size.size());
++i) {
// if too many null values, then consider it as sparse column
- if ((double)sorted_by_size[i].second < (double)num_rows * 0.95) {
+ if ((double)sorted_by_size[i].second < (double)num_rows * 0.99) {
continue;
}
selected_path.insert(sorted_by_size[i].first);
@@ -2035,6 +2031,7 @@ void ColumnObject::clear_column_data() {
(*std::move(part)).clear();
}
entry->data.num_of_defaults_in_prefix = 0;
+ entry->data.current_num_of_defaults = 0;
entry->data.num_rows = 0;
}
serialized_sparse_column->clear();
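For context on the serialize_to_sparse_column hunk above: the ColumnNullable view of each part is now obtained once and reused for the size lookup, the null check, and the value path, instead of repeating the assert_cast inside the hot loop. A simplified sketch of the same idea, using a hypothetical NullableColumnSketch type rather than Doris's ColumnNullable:

    #include <cstddef>
    #include <cstdint>
    #include <memory>
    #include <vector>

    // Simplified nullable column: a null map plus (elided) value storage.
    struct NullableColumnSketch {
        std::vector<uint8_t> null_map; // 1 = null, 0 = non-null
        bool is_null_at(size_t row) const { return null_map[row] != 0; }
        size_t size() const { return null_map.size(); }
    };

    // Resolve the nullable view once per part and reuse it everywhere,
    // rather than re-deriving it at each use site inside the loop.
    void serialize_row(const std::vector<std::shared_ptr<NullableColumnSketch>>& parts,
                       size_t row) {
        for (const auto& part : parts) {
            const NullableColumnSketch& nullable_col = *part; // obtained once
            size_t current_column_size = nullable_col.size();
            if (row < current_column_size) {
                if (!nullable_col.is_null_at(row)) {
                    // ... write the key and the non-null value here ...
                }
                return;
            }
            row -= current_column_size;
        }
    }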
diff --git a/be/src/vec/columns/column_object.h b/be/src/vec/columns/column_object.h
index cbc2bb78c2b..96e5d7b6f69 100644
--- a/be/src/vec/columns/column_object.h
+++ b/be/src/vec/columns/column_object.h
@@ -171,7 +171,7 @@ public:
void reset_current_num_of_defaults() { current_num_of_defaults = 0; }
- size_t cur_num_of_defaults() { return current_num_of_defaults; }
+ size_t cur_num_of_defaults() const { return current_num_of_defaults; }
void insert_many_defaults(size_t length);
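For context on the header hunk above: const-qualifying cur_num_of_defaults() allows it to be called through const references and pointers, presumably from const read paths. A minimal illustration with a hypothetical Counter type:

    #include <cstddef>

    struct Counter {
        size_t current_num_of_defaults = 0;
        // const member function: callable on const Counter& as well.
        size_t cur_num_of_defaults() const { return current_num_of_defaults; }
    };

    // Would not compile if cur_num_of_defaults() were non-const.
    size_t read(const Counter& c) { return c.cur_num_of_defaults(); }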
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]