This is an automated email from the ASF dual-hosted git repository.
eldenmoon pushed a commit to branch variant-sparse
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/variant-sparse by this push:
new a9967223329 fix error stats (#49582)
a9967223329 is described below
commit a9967223329eada676feee5cc48e119affc4c9a6
Author: Sun Chenyang <[email protected]>
AuthorDate: Fri Mar 28 12:21:56 2025 +0800
fix error stats (#49582)
---
be/src/vec/common/schema_util.cpp | 75 ++++++++++++++++++++++-----------------
1 file changed, 43 insertions(+), 32 deletions(-)
diff --git a/be/src/vec/common/schema_util.cpp
b/be/src/vec/common/schema_util.cpp
index 7f775e2b76d..2f8a4de72e7 100644
--- a/be/src/vec/common/schema_util.cpp
+++ b/be/src/vec/common/schema_util.cpp
@@ -746,23 +746,48 @@ Status check_path_stats(const
std::vector<RowsetSharedPtr>& intputs, RowsetShare
}
std::unordered_map<int32_t, PathToNoneNullValues> output_uid_to_path_stats;
RETURN_IF_ERROR(collect_path_stats(output, output_uid_to_path_stats));
- for (const auto& [uid, stats] : original_uid_to_path_stats) {
- if (output_uid_to_path_stats.find(uid) ==
output_uid_to_path_stats.end()) {
+ for (const auto& [uid, stats] : output_uid_to_path_stats) {
+ if (original_uid_to_path_stats.find(uid) ==
original_uid_to_path_stats.end()) {
return Status::InternalError("Path stats not found for uid {},
tablet_id {}", uid,
tablet->tablet_id());
}
- if (stats.size() != output_uid_to_path_stats.at(uid).size()) {
- return Status::InternalError("Path stats size not match for uid
{}, tablet_id {}", uid,
- tablet->tablet_id());
+
+ // In input rowsets, some rowsets may have statistics values exceeding
the maximum limit,
+ // which leads to inaccurate statistics
+ if (stats.size() > VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE)
{
+ // When there is only one segment, we can ensure that the size of
each path in output stats is accurate
+ if (output->num_segments() == 1) {
+ for (const auto& [path, size] : stats) {
+ if (original_uid_to_path_stats.at(uid).at(path) > size) {
+ return Status::InternalError(
+ "Path stats not smaller for uid {} with path
`{}`, input size {}, "
+ "output "
+ "size {}, "
+ "tablet_id {}",
+ uid, path,
original_uid_to_path_stats.at(uid).at(path), size,
+ tablet->tablet_id());
+ }
+ }
+ }
}
- for (const auto& [path, size] : stats) {
- if (output_uid_to_path_stats.at(uid).at(path) != size) {
+ // in this case, input stats is accurate, so we check the stats size
and stats value
+ else {
+ if (stats.size() != original_uid_to_path_stats.at(uid).size()) {
return Status::InternalError(
- "Path stats not match for uid {} with path `{}`, input
size {}, output "
- "size {}, "
- "tablet_id {}",
- uid, path, size,
output_uid_to_path_stats.at(uid).at(path),
- tablet->tablet_id());
+ "Path stats size not match for uid {}, tablet_id {},
input size {}, output "
+ "size {}",
+ uid, tablet->tablet_id(),
original_uid_to_path_stats.at(uid).size(),
+ stats.size());
+ }
+ for (const auto& [path, size] : stats) {
+ if (original_uid_to_path_stats.at(uid).at(path) != size) {
+ return Status::InternalError(
+ "Path stats not match for uid {} with path `{}`,
input size {}, output "
+ "size {}, "
+ "tablet_id {}",
+ uid, path,
original_uid_to_path_stats.at(uid).at(path), size,
+ tablet->tablet_id());
+ }
}
}
}
@@ -831,49 +856,35 @@ Status get_compaction_schema(const
std::vector<RowsetSharedPtr>& rowsets,
void calculate_variant_stats(const IColumn& encoded_sparse_column,
segment_v2::VariantStatisticsPB* stats, size_t
row_pos,
size_t num_rows) {
- size_t limit = VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE -
- stats->sparse_column_non_null_size().size();
// Cast input column to ColumnMap type since sparse column is stored as a
map
const auto& map_column = assert_cast<const
ColumnMap&>(encoded_sparse_column);
- // Map to store path frequencies - tracks how many times each path appears
- std::unordered_map<StringRef, size_t> sparse_data_paths_statistics;
-
// Get the keys column which contains the paths as strings
const auto& sparse_data_paths =
assert_cast<const ColumnString*>(map_column.get_keys_ptr().get());
const auto& serialized_sparse_column_offsets =
assert_cast<const
ColumnArray::Offsets64&>(map_column.get_offsets());
+ auto& count_map = *stats->mutable_sparse_column_non_null_size();
// Iterate through all paths in the sparse column
for (size_t i = row_pos; i != row_pos + num_rows; ++i) {
size_t offset = serialized_sparse_column_offsets[i - 1];
size_t end = serialized_sparse_column_offsets[i];
for (size_t j = offset; j != end; ++j) {
auto path = sparse_data_paths->get_data_at(j);
+
+ const auto& sparse_path = path.to_string();
// If path already exists in statistics, increment its count
- if (auto it = sparse_data_paths_statistics.find(path);
- it != sparse_data_paths_statistics.end()) {
+ if (auto it = count_map.find(sparse_path); it != count_map.end()) {
++it->second;
}
// If path doesn't exist and we haven't hit the max statistics
size limit,
// add it with count 1
- else if (sparse_data_paths_statistics.size() < limit) {
- sparse_data_paths_statistics.emplace(path, 1);
+ else if (count_map.size() <
VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE) {
+ count_map.emplace(sparse_path, 1);
}
}
}
- // Copy the collected statistics into the protobuf stats object
- // This maps each path string to its frequency count
- for (const auto& [path, size] : sparse_data_paths_statistics) {
- const auto& sparse_path = path.to_string();
- auto& count_map = *stats->mutable_sparse_column_non_null_size();
- if (auto it = count_map.find(sparse_path); it != count_map.end()) {
- it->second += size;
- } else {
- count_map.emplace(sparse_path, size);
- }
- }
if (stats->sparse_column_non_null_size().size() >
VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE) {
throw doris::Exception(
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]