(impala) branch master updated: IMPALA-13193: RuntimeFilter on parquet dictionary should evaluate NULL values

michaelsmith Wed, 10 Jul 2024 16:24:14 -0700

This is an automated email from the ASF dual-hosted git repository.

michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git



The following commit(s) were added to refs/heads/master by this push:
     new c53987480 IMPALA-13193: RuntimeFilter on parquet dictionary should evaluate NULL values
c53987480 is described below

commit c53987480726b114e0c3537c71297df2834a4962
Author: ttttttz <[email protected]>
AuthorDate: Sun Jul 7 12:02:17 2024 +0800

    IMPALA-13193: RuntimeFilter on parquet dictionary should evaluate NULL values
    
    NULL values are not included in the parquet dictionary. If the column
    may contain NULL values, also evaluate the filters against a NULL value.
    
    Testing:
    - Added a test case in parquet-dictionary-runtime-filter.test
    
    Change-Id: I0f69405c0c08feb47141d080a828847e5094163f
    Reviewed-on: http://gerrit.cloudera.org:8080/21566
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 be/src/exec/parquet/hdfs-parquet-scanner.cc        | 29 ++++++++++++++++++
 .../parquet-dictionary-runtime-filter.test         | 34 ++++++++++++++++++++++
 2 files changed, 63 insertions(+)

diff --git a/be/src/exec/parquet/hdfs-parquet-scanner.cc b/be/src/exec/parquet/hdfs-parquet-scanner.cc
index 29df05da0..208d0e14a 100644
--- a/be/src/exec/parquet/hdfs-parquet-scanner.cc
+++ b/be/src/exec/parquet/hdfs-parquet-scanner.cc
@@ -1921,6 +1921,35 @@ Status HdfsParquetScanner::EvalDictionaryFilters(const parquet::RowGroup& row_gr
         break; // Passed the conjunct and runtime filter does not exist.
       }
     }
+
+    // NULL values are not included in the parquet dictionary. If the column contains
+    // NULL values, add evaluating for NULL values.
+    bool has_set_null_count =
+        col_metadata.__isset.statistics && 
col_metadata.statistics.__isset.null_count;
+    bool should_eval_null_value = !has_set_null_count
+        || (has_set_null_count && col_metadata.statistics.null_count > 0);
+    if (!column_has_match && should_eval_null_value) {
+      dict_filter_tuple->SetNull(slot_desc->null_indicator_offset());
+      TupleRow row;
+      row.SetTuple(0, dict_filter_tuple);
+      // Although the FE guarantees that dict_filter_conjunct evaluates
+      // to false on NULL, this condition is added for safety.
+      if (dict_filter_conjunct_evals == nullptr
+          || (dict_filter_conjunct_evals != nullptr
+          && ExecNode::EvalConjuncts(dict_filter_conjunct_evals->data(),
+              dict_filter_conjunct_evals->size(), &row))) {
+        column_has_match = true;
+        if (runtime_filters != nullptr && should_eval_runtime_filter) {
+          for (int rf_idx = 0; rf_idx < runtime_filters->size(); rf_idx++) {
+            if (!runtime_filters->at(rf_idx)->Eval(&row)) {
+              column_has_match = false;
+              break;
+            }
+          }
+        }
+      }
+    }
+
     // Free all expr result allocations now that we're done with the filter.
     context_->expr_results_pool()->Clear();
 
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-dictionary-runtime-filter.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-dictionary-runtime-filter.test
index 452978fdb..e78487611 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/parquet-dictionary-runtime-filter.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-dictionary-runtime-filter.test
@@ -128,4 +128,38 @@ SELECT * FROM parquet_dict_runtime_filter_partitioned a
 ---- RUNTIME_PROFILE
 aggregation(SUM, NumDictFilteredRowGroups): 2
 aggregation(SUM, RowGroups total): 2
+====
+---- QUERY
+CREATE TABLE parquet_dict_runtime_filter_null_values (
+  col_1 BIGINT,
+  col_2 STRING
+)
+STORED AS PARQUET;
+====
+---- QUERY
+INSERT INTO parquet_dict_runtime_filter_null_values
+  VALUES (0, "abc"), (1, NULL), (2, NULL), (3, "abc");
+====
+---- QUERY
+CREATE TABLE null_values_dim_tbl (
+  col_1 STRING
+)
+STORED AS PARQUET;
+====
+---- QUERY
+INSERT INTO null_values_dim_tbl VALUES (NULL);
+====
+---- QUERY
+SET RUNTIME_FILTER_WAIT_TIME_MS=$RUNTIME_FILTER_WAIT_TIME_MS;
+SELECT * FROM parquet_dict_runtime_filter_null_values a
+  JOIN null_values_dim_tbl b
+  ON IFNULL(a.col_2, '') = IFNULL(b.col_1, '');
+---- RESULTS
+1,'NULL','NULL'
+2,'NULL','NULL'
+---- TYPES
+BIGINT,STRING,STRING
+---- RUNTIME_PROFILE
+aggregation(SUM, NumDictFilteredRowGroups): 0
+aggregation(SUM, RowGroups total): 1
 ====
\ No newline at end of file

(impala) branch master updated: IMPALA-13193: RuntimeFilter on parquet dictionary should evaluate NULL values

Reply via email to