BiteTheDDDDt commented on code in PR #11468: URL: https://github.com/apache/doris/pull/11468#discussion_r938432418
########## be/src/vec/exec/volap_scan_node.cpp: ########## @@ -2019,4 +1342,535 @@ Status VOlapScanNode::get_hints(TabletSharedPtr table, const TPaloScanRange& sca return Status::OK(); } +template <bool IsNotIn> +bool VOlapScanNode::_should_push_down_in_predicate(VInPredicate* pred, VExprContext* expr_ctx) { + if (pred->is_not_in() != IsNotIn) { + return false; + } + InState* state = reinterpret_cast<InState*>( + expr_ctx->fn_context(pred->fn_context_index()) + ->get_function_state(FunctionContext::FRAGMENT_LOCAL)); + HybridSetBase* set = state->hybrid_set.get(); + + // if there are too many elements in InPredicate, exceed the limit, + // we will not push any condition of this column to storage engine. + // because too many conditions pushed down to storage engine may even + // slow down the query process. + // ATTN: This is just an experience value. You may need to try + // different thresholds to improve performance. + if (set->size() > _max_pushdown_conditions_per_column) { + VLOG_NOTICE << "Predicate value num " << set->size() << " exceed limit " + << _max_pushdown_conditions_per_column; + return false; + } + return true; +} + +bool VOlapScanNode::_should_push_down_function( + VectorizedFnCall* fn_call, const std::function<bool(const std::string&)>& fn_checker) { + return fn_checker(fn_call->fn().name.function_name); +} + +bool VOlapScanNode::_should_push_down_function_filter(VectorizedFnCall* fn_call, + VExprContext* expr_ctx, + std::string* constant_str, + doris_udf::FunctionContext** fn_ctx) { + if (fn_call->fn().name.function_name != "like") { + return false; + } + + auto children = fn_call->children(); + doris_udf::FunctionContext* func_cxt = expr_ctx->fn_context(fn_call->fn_context_index()); + if (!func_cxt) { + return false; + } + if (children.size() != 2) { + return false; + } + for (size_t i = 0; i < children.size(); i++) { + if (VExpr::expr_without_cast(children[i])->node_type() != TExprNodeType::SLOT_REF) { + // not a slot ref(column) + continue; + } 
+ if (!children[1 - i]->is_constant()) { + // only handle constant value + return false; + } else if (children[1 - i]->type().is_string_type()) { + return false; + } else { + if (const ColumnConst* const_column = check_and_get_column<ColumnConst>( + children[1 - i]->get_const_col(expr_ctx)->column_ptr)) { + *constant_str = std::string(const_column->get_data_at(0).data, + const_column->get_data_at(0).size); + } else { + return false; + } + } + } + *fn_ctx = func_cxt; + return true; +} + +bool VOlapScanNode::_should_push_down_binary_predicate( + VectorizedFnCall* fn_call, VExprContext* expr_ctx, StringRef* constant_val, + int* slot_ref_child, const std::function<bool(const std::string&)>& fn_checker) { + if (!fn_checker(fn_call->fn().name.function_name)) { + return false; + } + + auto children = fn_call->children(); + DCHECK(children.size() == 2); + for (size_t i = 0; i < children.size(); i++) { + if (VExpr::expr_without_cast(children[i])->node_type() != TExprNodeType::SLOT_REF) { + // not a slot ref(column) + continue; + } + if (!children[1 - i]->is_constant()) { + // only handle constant value + return false; + } else { + if (const ColumnConst* const_column = check_and_get_column<ColumnConst>( + children[1 - i]->get_const_col(expr_ctx)->column_ptr)) { + *slot_ref_child = i; + *constant_val = const_column->get_data_at(0); + } else { + return false; + } + } + } + return true; +} + +bool VOlapScanNode::_is_predicate_acting_on_slot( + VExpr* expr, + const std::function<bool(const std::vector<VExpr*>&, const VSlotRef**, VExpr**)>& checker, + SlotDescriptor** slot_desc, ColumnValueRangeType** range) { + const VSlotRef* slot_ref = nullptr; + VExpr* child_contains_slot = nullptr; + if (!checker(expr->children(), &slot_ref, &child_contains_slot)) { + // not a slot ref(column) + return false; + } + + auto entry = _id_to_slot_column_value_range.find(slot_ref->slot_id()); + if (_id_to_slot_column_value_range.end() == entry) { + return false; + } + *slot_desc = 
entry->second.first; + DCHECK(child_contains_slot != nullptr); + if (child_contains_slot->type().type != (*slot_desc)->type().type) { + if (!ignore_cast(*slot_desc, child_contains_slot)) { + // the type of predicate not match the slot's type + return false; + } + } + *range = &(entry->second.second); + return true; +} + +template <PrimitiveType T> +Status VOlapScanNode::_normalize_in_and_eq_predicate(VExpr* expr, VExprContext* expr_ctx, + SlotDescriptor* slot, + ColumnValueRange<T>& range, bool* push_down) { + RETURN_IF_PUSH_DOWN + auto temp_range = ColumnValueRange<T>::create_empty_column_value_range(slot->type().precision, + slot->type().scale); + bool effect = false; + // 1. Normalize in conjuncts like 'where col in (v1, v2, v3)' + if (TExprNodeType::IN_PRED == expr->node_type()) { + VInPredicate* pred = static_cast<VInPredicate*>(expr); + if (!_should_push_down_in_predicate<false>(pred, expr_ctx)) { + return Status::OK(); + } + + // begin to push InPredicate value into ColumnValueRange + InState* state = reinterpret_cast<InState*>( + expr_ctx->fn_context(pred->fn_context_index()) + ->get_function_state(FunctionContext::FRAGMENT_LOCAL)); + HybridSetBase::IteratorBase* iter = state->hybrid_set->begin(); + while (iter->has_next()) { + // column in (nullptr) is always false so continue to + // dispose next item + if (nullptr == iter->get_value()) { + iter->next(); + continue; + } + auto value = const_cast<void*>(iter->get_value()); + RETURN_IF_ERROR(change_value_range<true>(temp_range, value, + ColumnValueRange<T>::add_fixed_value_range)); + iter->next(); + } + + range.intersection(temp_range); + effect = true; + } else if (TExprNodeType::BINARY_PRED == expr->node_type()) { + DCHECK(expr->children().size() == 2); + auto eq_checker = [&](const std::string& fn_name) { return fn_name == "eq"; }; + + StringRef value; + int slot_ref_child = -1; + if (_should_push_down_binary_predicate(reinterpret_cast<VectorizedFnCall*>(expr), expr_ctx, + &value, &slot_ref_child, 
eq_checker)) { + DCHECK(slot_ref_child >= 0); + // where A = nullptr should return empty result set + if (value.data != nullptr) { + if constexpr (T == TYPE_CHAR || T == TYPE_VARCHAR || T == TYPE_STRING || + T == TYPE_HLL) { + auto val = StringValue(value.data, value.size); + RETURN_IF_ERROR( + change_value_range<true>(temp_range, reinterpret_cast<void*>(&val), + ColumnValueRange<T>::add_fixed_value_range)); + } else { + RETURN_IF_ERROR(change_value_range<true>( + temp_range, reinterpret_cast<void*>(const_cast<char*>(value.data)), + ColumnValueRange<T>::add_fixed_value_range)); + } + range.intersection(temp_range); + effect = true; + } + } + } + + // exceed limit, no conditions will be pushed down to storage engine. + if (range.get_fixed_value_size() > _max_pushdown_conditions_per_column) { + range.set_whole_value_range(); + } else { + *push_down = effect; + } + return Status::OK(); +} + +template <PrimitiveType T> +Status VOlapScanNode::_normalize_not_in_and_not_eq_predicate(VExpr* expr, VExprContext* expr_ctx, + SlotDescriptor* slot, + ColumnValueRange<T>& range, + bool* push_down) { + RETURN_IF_PUSH_DOWN + bool is_fixed_range = range.is_fixed_value_range(); + auto not_in_range = ColumnValueRange<T>::create_empty_column_value_range(range.column_name()); + bool effect = false; + // 1. 
Normalize in conjuncts like 'where col in (v1, v2, v3)' + if (TExprNodeType::IN_PRED == expr->node_type()) { + VInPredicate* pred = static_cast<VInPredicate*>(expr); + if (!_should_push_down_in_predicate<true>(pred, expr_ctx)) { + return Status::OK(); + } + + // begin to push InPredicate value into ColumnValueRange + InState* state = reinterpret_cast<InState*>( + expr_ctx->fn_context(pred->fn_context_index()) + ->get_function_state(FunctionContext::FRAGMENT_LOCAL)); + HybridSetBase::IteratorBase* iter = state->hybrid_set->begin(); + while (iter->has_next()) { + // column not in (nullptr) is always true + if (nullptr == iter->get_value()) { + continue; + } + auto value = const_cast<void*>(iter->get_value()); + if (is_fixed_range) { + RETURN_IF_ERROR(change_value_range<true>( + range, value, ColumnValueRange<T>::remove_fixed_value_range)); + } else { + RETURN_IF_ERROR(change_value_range<true>( + not_in_range, value, ColumnValueRange<T>::add_fixed_value_range)); + } + iter->next(); + } + effect = true; + } else if (TExprNodeType::BINARY_PRED == expr->node_type()) { + DCHECK(expr->children().size() == 2); + + auto ne_checker = [&](const std::string& fn_name) { return fn_name == "ne"; }; + StringRef value; + int slot_ref_child = -1; + if (_should_push_down_binary_predicate(reinterpret_cast<VectorizedFnCall*>(expr), expr_ctx, + &value, &slot_ref_child, ne_checker)) { + DCHECK(slot_ref_child >= 0); + // where A = nullptr should return empty result set + if (value.data != nullptr) { + if constexpr (T == TYPE_CHAR || T == TYPE_VARCHAR || T == TYPE_STRING || + T == TYPE_HLL) { + auto val = StringValue(value.data, value.size); + if (is_fixed_range) { + RETURN_IF_ERROR(change_value_range<true>( + range, reinterpret_cast<void*>(&val), + ColumnValueRange<T>::remove_fixed_value_range)); + } else { + RETURN_IF_ERROR(change_value_range<true>( + not_in_range, reinterpret_cast<void*>(&val), + ColumnValueRange<T>::add_fixed_value_range)); + } + } else { + if (is_fixed_range) { + 
RETURN_IF_ERROR(change_value_range<true>( + range, reinterpret_cast<void*>(const_cast<char*>(value.data)), + ColumnValueRange<T>::remove_fixed_value_range)); + } else { + RETURN_IF_ERROR(change_value_range<true>( + not_in_range, + reinterpret_cast<void*>(const_cast<char*>(value.data)), + ColumnValueRange<T>::add_fixed_value_range)); + } + } + effect = true; + } + } + } + + if (is_fixed_range || + not_in_range.get_fixed_value_size() <= _max_pushdown_conditions_per_column) { + if (!is_fixed_range) { + // push down not in condition to storage engine + not_in_range.to_in_condition(_olap_filter, false); + } + *push_down = effect; + } + return Status::OK(); +} + +template <PrimitiveType T> +Status VOlapScanNode::_normalize_is_null_predicate(VExpr* expr, VExprContext* expr_ctx, + SlotDescriptor* slot, ColumnValueRange<T>& range, + bool* push_down) { + RETURN_IF_PUSH_DOWN + auto is_null_checker = [&](const std::string& fn_name) { return fn_name == "is_null_pred"; }; + auto is_not_null_checker = [&](const std::string& fn_name) { + return fn_name == "is_not_null_pred"; + }; + if (TExprNodeType::FUNCTION_CALL == expr->node_type()) { + if (_should_push_down_function(reinterpret_cast<VectorizedFnCall*>(expr), + is_null_checker)) { + auto temp_range = ColumnValueRange<T>::create_empty_column_value_range( + slot->type().precision, slot->type().scale); + temp_range.set_contain_null(true); + range.intersection(temp_range); + *push_down = true; + } else if (_should_push_down_function(reinterpret_cast<VectorizedFnCall*>(expr), + is_not_null_checker)) { + auto temp_range = ColumnValueRange<T>::create_empty_column_value_range( + slot->type().precision, slot->type().scale); + temp_range.set_contain_null(false); + range.intersection(temp_range); + *push_down = true; + } + } + return Status::OK(); +} + +template <PrimitiveType T> +Status VOlapScanNode::_normalize_noneq_binary_predicate(VExpr* expr, VExprContext* expr_ctx, + SlotDescriptor* slot, + ColumnValueRange<T>& range, + bool* 
push_down) { + RETURN_IF_PUSH_DOWN + if (TExprNodeType::BINARY_PRED == expr->node_type()) { + DCHECK(expr->children().size() == 2); + + auto noneq_checker = [&](const std::string& fn_name) { + return fn_name != "ne" && fn_name != "eq"; + }; + StringRef value; + int slot_ref_child = -1; + if (_should_push_down_binary_predicate(reinterpret_cast<VectorizedFnCall*>(expr), expr_ctx, + &value, &slot_ref_child, noneq_checker)) { + DCHECK(slot_ref_child >= 0); + std::string fn_name = + reinterpret_cast<VectorizedFnCall*>(expr)->fn().name.function_name; + + // where A = nullptr should return empty result set + if (value.data != nullptr) { + *push_down = true; + if constexpr (T == TYPE_CHAR || T == TYPE_VARCHAR || T == TYPE_STRING || + T == TYPE_HLL) { + auto val = StringValue(value.data, value.size); + RETURN_IF_ERROR(change_value_range<false>(range, reinterpret_cast<void*>(&val), + ColumnValueRange<T>::add_value_range, + fn_name, slot_ref_child)); + } else { + RETURN_IF_ERROR(change_value_range<false>( + range, reinterpret_cast<void*>(const_cast<char*>(value.data)), + ColumnValueRange<T>::add_value_range, fn_name, slot_ref_child)); + } + } + } + } + return Status::OK(); +} + +Status VOlapScanNode::_normalize_bloom_filter(VExpr* expr, VExprContext* expr_ctx, + SlotDescriptor* slot, bool* push_down) { + RETURN_IF_PUSH_DOWN + if (TExprNodeType::BLOOM_PRED == expr->node_type()) { + DCHECK(expr->children().size() == 1); + _bloom_filters_push_down.emplace_back( + slot->col_name(), + (reinterpret_cast<const VBloomPredicate*>( + (reinterpret_cast<VRuntimeFilterWrapper*>(expr))->impl())) + ->get_bloom_filter_func()); + *push_down = true; + } + return Status::OK(); +} + +Status VOlapScanNode::_normalize_function_filters(VExpr* expr, VExprContext* expr_ctx, + SlotDescriptor* slot, bool* push_down) { + RETURN_IF_PUSH_DOWN + bool opposite = false; + VExpr* fn_expr = expr; + if (TExprNodeType::COMPOUND_PRED == expr->node_type() && + expr->fn().name.function_name == "not") { + fn_expr = 
fn_expr->children()[0]; + opposite = true; + } + + std::string str; Review Comment: It would be better to move this declaration into the inner loop. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org