This is an automated email from the ASF dual-hosted git repository. joemcdonnell pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit a056808bc2b6d22c88022321cc5e007dbea01036 Author: Xuebin Su <[email protected]> AuthorDate: Fri May 30 11:32:44 2025 +0800 IMPALA-14110: Avoid decoding values for counting columns For a counting column, its slot descriptor is null and its data decoder is not initialized. Therefore, trying to decode the values when skipping them will lead to a check failure. This patch fixes the issue by returning early from the value-skipping path when the current column is a counting column, so that no attempt is made to decode any value. Testing: - Passed TestZippingUnnest in exhaustive mode. - Added test cases to make sure that page filtering works for counting columns. Change-Id: Ia707335c50cc0653097f375aae3f10609e0eb091 Reviewed-on: http://gerrit.cloudera.org:8080/22974 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- be/src/exec/parquet/hdfs-parquet-scanner.cc | 2 +- be/src/exec/parquet/parquet-column-readers.cc | 5 +++++ .../QueryTest/nested-types-parquet-page-index.test | 23 ++++++++++++++++++++++ .../QueryTest/zipping-unnest-in-from-clause.test | 2 ++ 4 files changed, 31 insertions(+), 1 deletion(-) diff --git a/be/src/exec/parquet/hdfs-parquet-scanner.cc b/be/src/exec/parquet/hdfs-parquet-scanner.cc index de286314f..d56935821 100644 --- a/be/src/exec/parquet/hdfs-parquet-scanner.cc +++ b/be/src/exec/parquet/hdfs-parquet-scanner.cc @@ -2976,7 +2976,7 @@ Status HdfsParquetScanner::CreateColumnReaders(const TupleDescriptor& tuple_desc // handled in ProcessFooter()), or no materialized columns appear in this file // (e.g. due to schema evolution, or if there's only a position slot). Create a single // column reader that we will use to count the number of tuples we should output. We - // will not read any values from this reader. + // will not read or skip any values from this reader. 
ParquetColumnReader* reader; RETURN_IF_ERROR(CreateCountingReader( tuple_desc.tuple_path(), schema_resolver, &reader)); diff --git a/be/src/exec/parquet/parquet-column-readers.cc b/be/src/exec/parquet/parquet-column-readers.cc index 03f548904..6f31df188 100644 --- a/be/src/exec/parquet/parquet-column-readers.cc +++ b/be/src/exec/parquet/parquet-column-readers.cc @@ -393,6 +393,11 @@ Status ScalarColumnReader<bool, parquet::Type::BOOLEAN, true>::InitDataDecoder( template <typename InternalType, parquet::Type::type PARQUET_TYPE, bool MATERIALIZED> bool ScalarColumnReader<InternalType, PARQUET_TYPE, MATERIALIZED>::SkipEncodedValuesInPage(int64_t num_values) { + // Return true if this is a counting column before decoding any value. + if (!MATERIALIZED) { + DCHECK_EQ(slot_desc_, nullptr); + return true; + } if (bool_decoder_) { return bool_decoder_->SkipValues(num_values); } diff --git a/testdata/workloads/functional-query/queries/QueryTest/nested-types-parquet-page-index.test b/testdata/workloads/functional-query/queries/QueryTest/nested-types-parquet-page-index.test index 497b13cbb..972be7c65 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/nested-types-parquet-page-index.test +++ b/testdata/workloads/functional-query/queries/QueryTest/nested-types-parquet-page-index.test @@ -749,3 +749,26 @@ group by l_shipmode, o_orderpriority ---- TYPES STRING, STRING, BIGINT ==== +---- QUERY +# Test if page filtering works for counting columns when late materialization +# is disabled. +set PARQUET_LATE_MATERIALIZATION_THRESHOLD = -1; +select d_38 from nested_decimals n, n.arr where d_38 > 7; +---- RESULTS +8 +8 +8 +---- TYPES +DECIMAL +==== +---- QUERY +# Test if page filtering works for counting columns when late materialization +# is enabled (the default). 
+set PARQUET_LATE_MATERIALIZATION_THRESHOLD = ""; +select d_38 from nested_decimals n, n.arr where d_38 > 7; +---- RESULTS +8 +8 +8 +---- TYPES +DECIMAL diff --git a/testdata/workloads/functional-query/queries/QueryTest/zipping-unnest-in-from-clause.test b/testdata/workloads/functional-query/queries/QueryTest/zipping-unnest-in-from-clause.test index 591e36af2..89236341c 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/zipping-unnest-in-from-clause.test +++ b/testdata/workloads/functional-query/queries/QueryTest/zipping-unnest-in-from-clause.test @@ -286,6 +286,8 @@ INT ==== ---- QUERY # Similar as above but there is a where clause on a non-array field. +# When PARQUET_LATE_MATERIALIZATION_THRESHOLD > 0, it tests if we skip rows +# correctly for a counting column. select id from complextypes_arrays t, unnest(t.arr1) where id = 7; ---- RESULTS 7
