This is an automated email from the ASF dual-hosted git repository.
dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 80eae07e043 branch-3.0:[fix](parquet/orc) Disable string dictionary
filtering when the predicate expression is neither a binary predicate nor
an IN predicate (#50749) (#51267)
80eae07e043 is described below
commit 80eae07e0439983f2ae37329f0be3b1441127bf9
Author: Socrates <[email protected]>
AuthorDate: Wed Jun 11 10:52:58 2025 +0800
branch-3.0:[fix](parquet/orc) Disable string dictionary filtering when
the predicate expression is neither a binary predicate nor an IN predicate
(#50749) (#51267)
cherry-pick: #50749
---
be/src/vec/exec/format/orc/vorc_reader.cpp | 34 ++++++------------
.../exec/format/parquet/vparquet_group_reader.cpp | 38 +++++++--------------
.../hive/test_string_dict_filter.out | Bin 11747 -> 20131 bytes
.../hive/test_string_dict_filter.groovy | 36 +++++++++++++++++++
4 files changed, 60 insertions(+), 48 deletions(-)
diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp
b/be/src/vec/exec/format/orc/vorc_reader.cpp
index 193756bc64d..891ec81e992 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -2068,29 +2068,17 @@ bool OrcReader::_can_filter_by_dict(int slot_id) {
return false;
}
- std::function<bool(const VExpr* expr)> visit_function_call = [&](const
VExpr* expr) {
- // TODO: The current implementation of dictionary filtering does not
take into account
- // the implementation of NULL values because the dictionary itself
does not contain
- // NULL value encoding. As a result, many NULL-related functions or
expressions
- // cannot work properly, such as is null, is not null, coalesce, etc.
- // Here we first disable dictionary filtering when predicate expr is
not slot.
- // Implementation of NULL value dictionary filtering will be carried
out later.
- if (expr->node_type() != TExprNodeType::SLOT_REF) {
- return false;
- }
- for (auto& child : expr->children()) {
- if (!visit_function_call(child.get())) {
- return false;
- }
- }
- return true;
- };
- for (auto& ctx : _slot_id_to_filter_conjuncts->at(slot_id)) {
- if (!visit_function_call(ctx->root().get())) {
- return false;
- }
- }
- return true;
+ // TODO: The current implementation of dictionary filtering does not take
into account
+ // the implementation of NULL values because the dictionary itself does
not contain
+ // NULL value encoding. As a result, many NULL-related functions or
expressions
+ // cannot work properly, such as is null, is not null, coalesce, etc.
+ // Here we check if the predicate expr is IN or BINARY_PRED.
+ // Implementation of NULL value dictionary filtering will be carried out
later.
+ return std::ranges::all_of(_slot_id_to_filter_conjuncts->at(slot_id),
[&](const auto& ctx) {
+ return (ctx->root()->node_type() == TExprNodeType::IN_PRED ||
+ ctx->root()->node_type() == TExprNodeType::BINARY_PRED) &&
+ ctx->root()->children()[0]->node_type() ==
TExprNodeType::SLOT_REF;
+ });
}
Status OrcReader::on_string_dicts_loaded(
diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
index a18626066b1..5c5489d3f86 100644
--- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
@@ -199,37 +199,25 @@ bool RowGroupReader::_can_filter_by_dict(int slot_id,
return false;
}
- if (_slot_id_to_filter_conjuncts->find(slot_id) ==
_slot_id_to_filter_conjuncts->end()) {
+ if (!is_dictionary_encoded(column_metadata)) {
return false;
}
- if (!is_dictionary_encoded(column_metadata)) {
+ if (_slot_id_to_filter_conjuncts->find(slot_id) ==
_slot_id_to_filter_conjuncts->end()) {
return false;
}
- std::function<bool(const VExpr* expr)> visit_function_call = [&](const
VExpr* expr) {
- // TODO: The current implementation of dictionary filtering does not
take into account
- // the implementation of NULL values because the dictionary itself
does not contain
- // NULL value encoding. As a result, many NULL-related functions or
expressions
- // cannot work properly, such as is null, is not null, coalesce, etc.
- // Here we first disable dictionary filtering when predicate is not
slot.
- // Implementation of NULL value dictionary filtering will be carried
out later.
- if (expr->node_type() != TExprNodeType::SLOT_REF) {
- return false;
- }
- for (auto& child : expr->children()) {
- if (!visit_function_call(child.get())) {
- return false;
- }
- }
- return true;
- };
- for (auto& ctx : _slot_id_to_filter_conjuncts->at(slot_id)) {
- if (!visit_function_call(ctx->root().get())) {
- return false;
- }
- }
- return true;
+ // TODO: The current implementation of dictionary filtering does not take
into account
+ // the implementation of NULL values because the dictionary itself does
not contain
+ // NULL value encoding. As a result, many NULL-related functions or
expressions
+ // cannot work properly, such as is null, is not null, coalesce, etc.
+ // Here we check if the predicate expr is IN or BINARY_PRED.
+ // Implementation of NULL value dictionary filtering will be carried out
later.
+ return std::ranges::all_of(_slot_id_to_filter_conjuncts->at(slot_id),
[&](const auto& ctx) {
+ return (ctx->root()->node_type() == TExprNodeType::IN_PRED ||
+ ctx->root()->node_type() == TExprNodeType::BINARY_PRED) &&
+ ctx->root()->children()[0]->node_type() ==
TExprNodeType::SLOT_REF;
+ });
}
// This function is copied from
diff --git
a/regression-test/data/external_table_p0/hive/test_string_dict_filter.out
b/regression-test/data/external_table_p0/hive/test_string_dict_filter.out
index 2a8cebd8723..97b1fb2ff4c 100644
Binary files
a/regression-test/data/external_table_p0/hive/test_string_dict_filter.out and
b/regression-test/data/external_table_p0/hive/test_string_dict_filter.out differ
diff --git
a/regression-test/suites/external_table_p0/hive/test_string_dict_filter.groovy
b/regression-test/suites/external_table_p0/hive/test_string_dict_filter.groovy
index 1929c813c55..18e62570ad7 100644
---
a/regression-test/suites/external_table_p0/hive/test_string_dict_filter.groovy
+++
b/regression-test/suites/external_table_p0/hive/test_string_dict_filter.groovy
@@ -62,6 +62,24 @@ suite("test_string_dict_filter",
"p0,external,hive,external_docker,external_dock
qt_q15 """
select count(o_orderpriority) from ( select (case when o_orderpriority
= 'x' then '1' when o_orderpriority = 'y' then '2' else '0' end) as
o_orderpriority from test_string_dict_filter_parquet ) as A where
o_orderpriority = '0';
"""
+ qt_q16 """
+ select * from test_string_dict_filter_parquet where cast(o_orderstatus
as string) = 'F';
+ """
+ qt_q17 """
+ select * from test_string_dict_filter_parquet where cast(o_orderstatus
as string) = 'O';
+ """
+ qt_q18 """
+ select * from test_string_dict_filter_parquet where cast(o_orderstatus
as string) in ('O', 'F');
+ """
+ qt_q19 """
+ select * from test_string_dict_filter_parquet where
cast(o_orderpriority as string) is null;
+ """
+ qt_q20 """
+ select * from test_string_dict_filter_parquet where
cast(o_orderpriority as string) is not null;
+ """
+ qt_q21 """
+ select * from test_string_dict_filter_parquet where
cast(o_orderpriority as string) in ('5-LOW', NULL);
+ """
}
def q_orc = {
qt_q01 """
@@ -109,6 +127,24 @@ suite("test_string_dict_filter",
"p0,external,hive,external_docker,external_dock
qt_q15 """
select count(o_orderpriority) from ( select (case when o_orderpriority
= 'x' then '1' when o_orderpriority = 'y' then '2' else '0' end) as
o_orderpriority from test_string_dict_filter_orc ) as A where o_orderpriority =
'0';
"""
+ qt_q16 """
+ select * from test_string_dict_filter_orc where cast(o_orderstatus as
string) = 'F';
+ """
+ qt_q17 """
+ select * from test_string_dict_filter_orc where cast(o_orderstatus as
string) = 'O';
+ """
+ qt_q18 """
+ select * from test_string_dict_filter_orc where cast(o_orderstatus as
string) in ('O', 'F');
+ """
+ qt_q19 """
+ select * from test_string_dict_filter_orc where cast(o_orderpriority
as string) is null;
+ """
+ qt_q20 """
+ select * from test_string_dict_filter_orc where cast(o_orderpriority
as string) is not null;
+ """
+ qt_q21 """
+ select * from test_string_dict_filter_orc where cast(o_orderpriority
as string) in ('5-LOW', NULL);
+ """
}
String enabled = context.config.otherConfigs.get("enableHiveTest")
if (enabled == null || !enabled.equalsIgnoreCase("true")) {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]