This is an automated email from the ASF dual-hosted git repository.
michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new c53987480 IMPALA-13193: RuntimeFilter on parquet dictionary should
evaluate NULL values
c53987480 is described below
commit c53987480726b114e0c3537c71297df2834a4962
Author: ttttttz <[email protected]>
AuthorDate: Sun Jul 7 12:02:17 2024 +0800
IMPALA-13193: RuntimeFilter on parquet dictionary should evaluate NULL
values
NULL values are not stored in the Parquet dictionary. If the column contains
(or may contain) NULL values, the dictionary filters are now also evaluated
against NULL.
Testing:
- Added a test case in parquet-dictionary-runtime-filter.test
Change-Id: I0f69405c0c08feb47141d080a828847e5094163f
Reviewed-on: http://gerrit.cloudera.org:8080/21566
Reviewed-by: Impala Public Jenkins <[email protected]>
Tested-by: Impala Public Jenkins <[email protected]>
---
be/src/exec/parquet/hdfs-parquet-scanner.cc | 29 ++++++++++++++++++
.../parquet-dictionary-runtime-filter.test | 34 ++++++++++++++++++++++
2 files changed, 63 insertions(+)
diff --git a/be/src/exec/parquet/hdfs-parquet-scanner.cc
b/be/src/exec/parquet/hdfs-parquet-scanner.cc
index 29df05da0..208d0e14a 100644
--- a/be/src/exec/parquet/hdfs-parquet-scanner.cc
+++ b/be/src/exec/parquet/hdfs-parquet-scanner.cc
@@ -1921,6 +1921,35 @@ Status HdfsParquetScanner::EvalDictionaryFilters(const
parquet::RowGroup& row_gr
break; // Passed the conjunct and runtime filter does not exist.
}
}
+
+ // NULL values are not included in the parquet dictionary. If the column
contains
+ // NULL values, add evaluating for NULL values.
+ bool has_set_null_count =
+ col_metadata.__isset.statistics &&
col_metadata.statistics.__isset.null_count;
+ bool should_eval_null_value = !has_set_null_count
+ || (has_set_null_count && col_metadata.statistics.null_count > 0);
+ if (!column_has_match && should_eval_null_value) {
+ dict_filter_tuple->SetNull(slot_desc->null_indicator_offset());
+ TupleRow row;
+ row.SetTuple(0, dict_filter_tuple);
+ // Although the FE guarantees that dict_filter_conjunct evaluates
+ // to false on NULL, this condition is added for safety.
+ if (dict_filter_conjunct_evals == nullptr
+ || (dict_filter_conjunct_evals != nullptr
+ && ExecNode::EvalConjuncts(dict_filter_conjunct_evals->data(),
+ dict_filter_conjunct_evals->size(), &row))) {
+ column_has_match = true;
+ if (runtime_filters != nullptr && should_eval_runtime_filter) {
+ for (int rf_idx = 0; rf_idx < runtime_filters->size(); rf_idx++) {
+ if (!runtime_filters->at(rf_idx)->Eval(&row)) {
+ column_has_match = false;
+ break;
+ }
+ }
+ }
+ }
+ }
+
// Free all expr result allocations now that we're done with the filter.
context_->expr_results_pool()->Clear();
diff --git
a/testdata/workloads/functional-query/queries/QueryTest/parquet-dictionary-runtime-filter.test
b/testdata/workloads/functional-query/queries/QueryTest/parquet-dictionary-runtime-filter.test
index 452978fdb..e78487611 100644
---
a/testdata/workloads/functional-query/queries/QueryTest/parquet-dictionary-runtime-filter.test
+++
b/testdata/workloads/functional-query/queries/QueryTest/parquet-dictionary-runtime-filter.test
@@ -128,4 +128,38 @@ SELECT * FROM parquet_dict_runtime_filter_partitioned a
---- RUNTIME_PROFILE
aggregation(SUM, NumDictFilteredRowGroups): 2
aggregation(SUM, RowGroups total): 2
+====
+---- QUERY
+CREATE TABLE parquet_dict_runtime_filter_null_values (
+ col_1 BIGINT,
+ col_2 STRING
+)
+STORED AS PARQUET;
+====
+---- QUERY
+INSERT INTO parquet_dict_runtime_filter_null_values
+ VALUES (0, "abc"), (1, NULL), (2, NULL), (3, "abc");
+====
+---- QUERY
+CREATE TABLE null_values_dim_tbl (
+ col_1 STRING
+)
+STORED AS PARQUET;
+====
+---- QUERY
+INSERT INTO null_values_dim_tbl VALUES (NULL);
+====
+---- QUERY
+SET RUNTIME_FILTER_WAIT_TIME_MS=$RUNTIME_FILTER_WAIT_TIME_MS;
+SELECT * FROM parquet_dict_runtime_filter_null_values a
+ JOIN null_values_dim_tbl b
+ ON IFNULL(a.col_2, '') = IFNULL(b.col_1, '');
+---- RESULTS
+1,'NULL','NULL'
+2,'NULL','NULL'
+---- TYPES
+BIGINT,STRING,STRING
+---- RUNTIME_PROFILE
+aggregation(SUM, NumDictFilteredRowGroups): 0
+aggregation(SUM, RowGroups total): 1
====
\ No newline at end of file