github-actions[bot] commented on code in PR #62589: URL: https://github.com/apache/doris/pull/62589#discussion_r3322030867
########## be/src/exec/runtime_filter/runtime_filter_partition_pruner.cpp: ########## @@ -0,0 +1,857 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "exec/runtime_filter/runtime_filter_partition_pruner.h" + +#include <gen_cpp/PlanNodes_types.h> + +#include <optional> +#include <unordered_set> +#include <utility> + +#include "core/block/block.h" +#include "core/column/column.h" +#include "core/column/column_nullable.h" +#include "core/data_type/data_type_nullable.h" +#include "core/field.h" +#include "exprs/bloom_filter_func.h" +#include "exprs/hybrid_set.h" +#include "exprs/runtime_filter_expr.h" +#include "exprs/vexpr.h" +#include "exprs/vexpr_context.h" +#include "exprs/vliteral.h" +#include "exprs/vslot_ref.h" +#include "runtime/descriptors.h" + +namespace doris { + +// NOLINTBEGIN(readability-function-cognitive-complexity,readability-function-size) +// Complexity is inflated by macro expansion for each PrimitiveType case. 
+Status ParsedPartitionBoundaries::parse( + const std::vector<TPartitionBoundary>& boundaries, + const phmap::flat_hash_map<int, SlotDescriptor*>& slot_descs) { + for (const auto& tb : boundaries) { + DORIS_CHECK(tb.__isset.partition_id); + DORIS_CHECK(tb.__isset.slot_id); + SlotId slot_id = tb.slot_id; + + auto slot_it = slot_descs.find(slot_id); + DORIS_CHECK(slot_it != slot_descs.end()); + SlotDescriptor* slot = slot_it->second; + // Reuse the slot's pre-built DataType: walking through VLiteral here + // would cost a `DataTypeFactory::create_data_type(node)` heap allocation + // and a one-row `ColumnConst` allocation per boundary endpoint. With + // thousands of partitions that dominates BuildTasksTime. + const DataTypePtr& slot_type = slot->type(); + PrimitiveType ptype = slot_type->get_primitive_type(); + int precision = cast_set<int>(slot_type->get_precision()); + int scale = cast_set<int>(slot_type->get_scale()); + bool is_nullable = slot->is_nullable(); + + // Store slot data type for potential projection use + _slot_data_types[slot_id] = slot_type; + + ParsedBoundary boundary; + boundary.partition_id = tb.partition_id; + boundary.slot_id = slot_id; + boundary.is_nullable = is_nullable; + + bool parsed_ok = false; + +#define BUILD_BOUNDARY_CVR(NAME) \ + case TYPE_##NAME: { \ + using CppType = typename PrimitiveTypeTraits<TYPE_##NAME>::CppType; \ + bool is_list = tb.__isset.list_values && !tb.list_values.empty(); \ + bool is_range = tb.__isset.range_start || tb.__isset.range_end; \ + DORIS_CHECK(is_list || is_range); \ + ColumnValueRange<TYPE_##NAME> cvr(slot->col_name(), is_nullable, precision, scale); \ + /* Returns nullopt if `node` is a NULL literal; the caller then sets contain_null */ \ + /* on the CVR instead of trying to extract a typed value (which would dereference */ \ + /* a null data pointer for the non-string branch). 
*/ \ + auto parse_texpr_node = [&](const TExprNode& node) -> std::optional<CppType> { \ + if (node.node_type == TExprNodeType::NULL_LITERAL) { \ + return std::nullopt; \ + } \ + /* `Field` value is copied into the CVR by `add_fixed_value` / */ \ + /* `add_range` (both take CppType by const-ref / by value), so the */ \ + /* temporary `Field`'s lifetime ending at this expression's full-statement */ \ + /* boundary is safe -- including for `String` payloads. */ \ + Field field = slot_type->get_field(node); \ + return std::make_optional<CppType>(field.get<TYPE_##NAME>()); \ + }; \ + if (is_list) { \ + auto empty_cvr = ColumnValueRange<TYPE_##NAME>::create_empty_column_value_range( \ + is_nullable, precision, scale); \ + bool list_has_null = false; \ + bool list_has_value = false; \ + for (const auto& node : tb.list_values) { \ + auto parsed = parse_texpr_node(node); \ + if (!parsed) { \ + list_has_null = true; \ + continue; \ + } \ + static_cast<void>(empty_cvr.add_fixed_value(*parsed)); \ + list_has_value = true; \ + } \ + if (list_has_value) { \ + cvr.intersection(empty_cvr); \ + } \ + if (list_has_null && is_nullable) { \ + /* Track NULL membership on ParsedBoundary; calling */ \ + /* cvr.set_contain_null(true) here would invoke */ \ + /* set_empty_value_range() and discard the concrete fixed */ \ + /* values we just inserted, turning {NULL, v} into a */ \ + /* NULL-only boundary. */ \ + boundary.contains_null = true; \ + if (!list_has_value) { \ + boundary.only_null = true; \ + } \ + } \ + } else { \ Review Comment: Nullable RANGE partition columns are still unsafe here. `allow_partition_column_nullable` is enabled by default for RANGE partitions, and `NullLiteral.compareLiteral()` sorts NULL below every concrete value, so a `VALUES LESS THAN (...)` first partition can contain NULL rows. 
FE will still classify a direct partition `SlotRef` as prunable and serialize this boundary with no `range_start`; BE builds only the non-NULL CVR and leaves `ParsedBoundary::contains_null` false. For a null-safe join such as `f.k <=> d.k` with an IN runtime filter whose build side contains only NULL, `rf_contains_null` is true but the first RANGE partition reaches the regular CVR intersection path, intersects an empty non-NULL RF value set, and gets pruned even though its NULL rows should match. Please record nullable MINVALUE-side RANGE boundaries as containing NULL (or conservatively skip RF partition pruning for nullable RANGE partition columns) and add a regression case for a nullable RANGE partition plus `<=>` runtime filter. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
