This is an automated email from the ASF dual-hosted git repository. csringhofer pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 9cd593840fdfac5d64ddd0bd71d3942d8f872e2c Author: Zoltan Borok-Nagy <[email protected]> AuthorDate: Thu Nov 28 20:08:47 2024 +0100 IMPALA-13589: SELECT INPUT__FILE__NAME can crash Impala If the user queries only the virtual column INPUT__FILE__NAME from a table backed by text files, and the last row doesn't end with the row delimiter (e.g. '\n'), then Impala crashes. In HdfsTextScanner::FinishScanRange() there is specific code to deal with the last row if it doesn't end with the row delimiter, and we fill the last tuple there. This code wasn't active when we only read INPUT__FILE__NAME, so the last tuple contained garbage, which caused a segfault later. The fix is to always fill the last tuple if we have a template tuple, as that means we have either partition expressions or file-level virtual columns like INPUT__FILE__NAME. Other file-level virtual columns apply only to Iceberg tables, which don't support text data files, so those are not affected by this bug. Testing * added e2e tests Change-Id: I0ea8e7fed77cbc9ae90a858eafeee9dcfd73d143 Reviewed-on: http://gerrit.cloudera.org:8080/22141 Tested-by: Impala Public Jenkins <[email protected]> Reviewed-by: Gabor Kaszab <[email protected]> --- be/src/exec/text/hdfs-text-scanner.cc | 2 +- .../virtual-column-input-file-name-in-table.test | 30 ++++++++++++++++++++++ .../QueryTest/virtual-column-input-file-name.test | 15 +++++++++++ 3 files changed, 46 insertions(+), 1 deletion(-) diff --git a/be/src/exec/text/hdfs-text-scanner.cc b/be/src/exec/text/hdfs-text-scanner.cc index d7abf960e..2add0c857 100644 --- a/be/src/exec/text/hdfs-text-scanner.cc +++ b/be/src/exec/text/hdfs-text-scanner.cc @@ -320,7 +320,7 @@ Status HdfsTextScanner::FinishScanRange(RowBatch* row_batch) { !boundary_row_.IsEmpty() || (delimited_text_parser_->HasUnfinishedTuple() && (!scan_node_->materialized_slots().empty() || - scan_node_->num_materialized_partition_keys() > 0))) { + template_tuple_ != nullptr))) { // There is data in the partial 
column because there is a missing row delimiter // at the end of the file. Copy the data into a new string buffer that gets // memory from the row batch pool, so that the boundary pool could be freed. diff --git a/testdata/workloads/functional-query/queries/QueryTest/virtual-column-input-file-name-in-table.test b/testdata/workloads/functional-query/queries/QueryTest/virtual-column-input-file-name-in-table.test index 2eb25b1c9..2ed1d5145 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/virtual-column-input-file-name-in-table.test +++ b/testdata/workloads/functional-query/queries/QueryTest/virtual-column-input-file-name-in-table.test @@ -23,3 +23,33 @@ select input__file__name, * from i_n_f; ---- TYPES STRING, STRING ==== +---- QUERY +# IMPALA-13589: Select INPUT_FILE__NAME only +select input__file__name from functional.table_no_newline; +---- RESULTS +regex:'$NAMENODE/test-warehouse/table_no_newline/table_no_newline.csv' +regex:'$NAMENODE/test-warehouse/table_no_newline/table_no_newline.csv' +regex:'$NAMENODE/test-warehouse/table_no_newline/table_no_newline.csv' +regex:'$NAMENODE/test-warehouse/table_no_newline/table_no_newline.csv' +regex:'$NAMENODE/test-warehouse/table_no_newline/table_no_newline.csv' +---- TYPES +STRING +==== +---- QUERY +# IMPALA-13589: Select INPUT_FILE__NAME only in partitioned table +select input__file__name from functional.table_no_newline_part; +---- RESULTS +regex:'$NAMENODE/test-warehouse/table_no_newline_part/year=2010/month=3/table_no_newline.csv' +regex:'$NAMENODE/test-warehouse/table_no_newline_part/year=2010/month=3/table_no_newline.csv' +regex:'$NAMENODE/test-warehouse/table_no_newline_part/year=2010/month=3/table_no_newline.csv' +regex:'$NAMENODE/test-warehouse/table_no_newline_part/year=2010/month=3/table_no_newline.csv' +regex:'$NAMENODE/test-warehouse/table_no_newline_part/year=2010/month=3/table_no_newline.csv' +regex:'$NAMENODE/test-warehouse/table_no_newline_part/year=2015/month=3/table_missing_columns.csv' 
+regex:'$NAMENODE/test-warehouse/table_no_newline_part/year=2015/month=3/table_missing_columns.csv' +regex:'$NAMENODE/test-warehouse/table_no_newline_part/year=2015/month=3/table_missing_columns.csv' +regex:'$NAMENODE/test-warehouse/table_no_newline_part/year=2015/month=3/table_missing_columns.csv' +regex:'$NAMENODE/test-warehouse/table_no_newline_part/year=2015/month=3/table_missing_columns.csv' +regex:'$NAMENODE/test-warehouse/table_no_newline_part/year=2015/month=3/table_missing_columns.csv' +---- TYPES +STRING +==== diff --git a/testdata/workloads/functional-query/queries/QueryTest/virtual-column-input-file-name.test b/testdata/workloads/functional-query/queries/QueryTest/virtual-column-input-file-name.test index e97445814..97a02939e 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/virtual-column-input-file-name.test +++ b/testdata/workloads/functional-query/queries/QueryTest/virtual-column-input-file-name.test @@ -1,5 +1,20 @@ ==== ---- QUERY +# Select INPUT_FILE__NAME only +select input__file__name from alltypestiny; +---- RESULTS +regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=1(/base_\d*/|/).*' +regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=1(/base_\d*/|/).*' +regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=2(/base_\d*/|/).*' +regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=2(/base_\d*/|/).*' +regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=3(/base_\d*/|/).*' +regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=3(/base_\d*/|/).*' +regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=4(/base_\d*/|/).*' +regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=4(/base_\d*/|/).*' +---- TYPES +STRING +==== +---- QUERY # Select INPUT_FILE__NAME plus all cols select input__file__name, * from 
alltypestiny; ---- RESULTS
