This is an automated email from the ASF dual-hosted git repository.

csringhofer pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 9cd593840fdfac5d64ddd0bd71d3942d8f872e2c
Author: Zoltan Borok-Nagy <[email protected]>
AuthorDate: Thu Nov 28 20:08:47 2024 +0100

    IMPALA-13589: SELECT INPUT__FILE__NAME can crash Impala
    
    If the user queries only the virtual column INPUT__FILE__NAME
    from a table backed by text files, and the last row doesn't
    end with the row delimiter (e.g. '\n'), then Impala crashes.
    
    HdfsTextScanner::FinishScanRange() has specific code to deal
    with a last row that doesn't end with the row delimiter; this
    is where the last tuple gets filled. That code wasn't active
    when we only read INPUT__FILE__NAME, so the last tuple
    contained garbage, which caused a segfault later.
    
    The fix is to always fill the last tuple if we have a template
    tuple, as its presence means we have either partition expressions
    or file-level virtual columns like INPUT__FILE__NAME.
    
    Other file-level virtual columns only apply to Iceberg tables
    which don't support text data files, so those are not affected
    by this bug.
    
    Testing
     * added e2e tests
    
    Change-Id: I0ea8e7fed77cbc9ae90a858eafeee9dcfd73d143
    Reviewed-on: http://gerrit.cloudera.org:8080/22141
    Tested-by: Impala Public Jenkins <[email protected]>
    Reviewed-by: Gabor Kaszab <[email protected]>
---
 be/src/exec/text/hdfs-text-scanner.cc              |  2 +-
 .../virtual-column-input-file-name-in-table.test   | 30 ++++++++++++++++++++++
 .../QueryTest/virtual-column-input-file-name.test  | 15 +++++++++++
 3 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/be/src/exec/text/hdfs-text-scanner.cc b/be/src/exec/text/hdfs-text-scanner.cc
index d7abf960e..2add0c857 100644
--- a/be/src/exec/text/hdfs-text-scanner.cc
+++ b/be/src/exec/text/hdfs-text-scanner.cc
@@ -320,7 +320,7 @@ Status HdfsTextScanner::FinishScanRange(RowBatch* row_batch) {
           !boundary_row_.IsEmpty() ||
           (delimited_text_parser_->HasUnfinishedTuple() &&
               (!scan_node_->materialized_slots().empty() ||
-                  scan_node_->num_materialized_partition_keys() > 0))) {
+                  template_tuple_ != nullptr))) {
         // There is data in the partial column because there is a missing row delimiter
         // at the end of the file. Copy the data into a new string buffer that gets
         // memory from the row batch pool, so that the boundary pool could be freed.
diff --git a/testdata/workloads/functional-query/queries/QueryTest/virtual-column-input-file-name-in-table.test b/testdata/workloads/functional-query/queries/QueryTest/virtual-column-input-file-name-in-table.test
index 2eb25b1c9..2ed1d5145 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/virtual-column-input-file-name-in-table.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/virtual-column-input-file-name-in-table.test
@@ -23,3 +23,33 @@ select input__file__name, * from i_n_f;
 ---- TYPES
 STRING, STRING
 ====
+---- QUERY
+# IMPALA-13589: Select INPUT__FILE__NAME only
+select input__file__name from functional.table_no_newline;
+---- RESULTS
+regex:'$NAMENODE/test-warehouse/table_no_newline/table_no_newline.csv'
+regex:'$NAMENODE/test-warehouse/table_no_newline/table_no_newline.csv'
+regex:'$NAMENODE/test-warehouse/table_no_newline/table_no_newline.csv'
+regex:'$NAMENODE/test-warehouse/table_no_newline/table_no_newline.csv'
+regex:'$NAMENODE/test-warehouse/table_no_newline/table_no_newline.csv'
+---- TYPES
+STRING
+====
+---- QUERY
+# IMPALA-13589: Select INPUT__FILE__NAME only in partitioned table
+select input__file__name from functional.table_no_newline_part;
+---- RESULTS
+regex:'$NAMENODE/test-warehouse/table_no_newline_part/year=2010/month=3/table_no_newline.csv'
+regex:'$NAMENODE/test-warehouse/table_no_newline_part/year=2010/month=3/table_no_newline.csv'
+regex:'$NAMENODE/test-warehouse/table_no_newline_part/year=2010/month=3/table_no_newline.csv'
+regex:'$NAMENODE/test-warehouse/table_no_newline_part/year=2010/month=3/table_no_newline.csv'
+regex:'$NAMENODE/test-warehouse/table_no_newline_part/year=2010/month=3/table_no_newline.csv'
+regex:'$NAMENODE/test-warehouse/table_no_newline_part/year=2015/month=3/table_missing_columns.csv'
+regex:'$NAMENODE/test-warehouse/table_no_newline_part/year=2015/month=3/table_missing_columns.csv'
+regex:'$NAMENODE/test-warehouse/table_no_newline_part/year=2015/month=3/table_missing_columns.csv'
+regex:'$NAMENODE/test-warehouse/table_no_newline_part/year=2015/month=3/table_missing_columns.csv'
+regex:'$NAMENODE/test-warehouse/table_no_newline_part/year=2015/month=3/table_missing_columns.csv'
+regex:'$NAMENODE/test-warehouse/table_no_newline_part/year=2015/month=3/table_missing_columns.csv'
+---- TYPES
+STRING
+====
diff --git a/testdata/workloads/functional-query/queries/QueryTest/virtual-column-input-file-name.test b/testdata/workloads/functional-query/queries/QueryTest/virtual-column-input-file-name.test
index e97445814..97a02939e 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/virtual-column-input-file-name.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/virtual-column-input-file-name.test
@@ -1,5 +1,20 @@
 ====
 ---- QUERY
+# Select INPUT__FILE__NAME only
+select input__file__name from alltypestiny;
+---- RESULTS
+regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=1(/base_\d*/|/).*'
+regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=1(/base_\d*/|/).*'
+regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=2(/base_\d*/|/).*'
+regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=2(/base_\d*/|/).*'
+regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=3(/base_\d*/|/).*'
+regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=3(/base_\d*/|/).*'
+regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=4(/base_\d*/|/).*'
+regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=4(/base_\d*/|/).*'
+---- TYPES
+STRING
+====
+---- QUERY
 # Select INPUT_FILE__NAME plus all cols
 select input__file__name, * from alltypestiny;
 ---- RESULTS

Reply via email to