This is an automated email from the ASF dual-hosted git repository.

dbecker pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new 23c1f0d4e IMPALA-12903: Querying virtual column FILE__POSITION for 
TEXT and JSON tables crashes Impala
23c1f0d4e is described below

commit 23c1f0d4e195c066b965fbf6b4da95bade12f2dd
Author: Zoltan Borok-Nagy <[email protected]>
AuthorDate: Thu Mar 14 17:09:51 2024 +0100

    IMPALA-12903: Querying virtual column FILE__POSITION for TEXT and JSON 
tables crashes Impala
    
    Impala generates segmentation fault when it queries the virtual column
    FILE__POSITION for TEXT or JSON tables. When the scanners that do not
    support the FILE__POSITION virtual column detect its presence they
    try to report an error and close themselves. The segfault is in the
    scanners' Close() method when they try to dereference a NULL stream
    object.
    
    This patch simply adds NULL-checks in Close().
    
    Alternatively we could detect the presence of FILE__POSITION during
    planning in the HdfsScanNode, but doing it in the scanners lets us
    handle more queries, e.g. queries that dynamically prune partitions
    and the surviving partitions all have file formats that support
    FILE__POSITION.
    
    Testing:
     * added negative tests to properly report the errors
     * added tests for mixed file format tables
    
    Change-Id: I8e1af8d526f9046aceddb5944da9e6f9c63768b0
    Reviewed-on: http://gerrit.cloudera.org:8080/21148
    Tested-by: Impala Public Jenkins <[email protected]>
    Reviewed-by: Zoltan Borok-Nagy <[email protected]>
---
 be/src/exec/json/hdfs-json-scanner.cc              |  6 ++--
 be/src/exec/text/hdfs-text-scanner.cc              |  2 +-
 .../virtual-column-file-position-generic.test      | 31 ++++++++++++++++
 .../virtual-column-file-position-negative.test     | 41 ++++++++++++++++++++++
 tests/query_test/test_scanners.py                  | 17 +++++++++
 5 files changed, 94 insertions(+), 3 deletions(-)

diff --git a/be/src/exec/json/hdfs-json-scanner.cc 
b/be/src/exec/json/hdfs-json-scanner.cc
index 90023130e..7644a8671 100644
--- a/be/src/exec/json/hdfs-json-scanner.cc
+++ b/be/src/exec/json/hdfs-json-scanner.cc
@@ -95,8 +95,10 @@ void HdfsJsonScanner::Close(RowBatch* row_batch) {
   // Verify all resources (if any) have been transferred or freed.
   DCHECK_EQ(template_tuple_pool_.get()->total_allocated_bytes(), 0);
   DCHECK_EQ(data_buffer_pool_.get()->total_allocated_bytes(), 0);
-  scan_node_->RangeComplete(THdfsFileFormat::JSON,
-      stream_->file_desc()->file_compression);
+  if (stream_ != nullptr) {
+    scan_node_->RangeComplete(THdfsFileFormat::JSON,
+        stream_->file_desc()->file_compression);
+  }
   CloseInternal();
 }
 
diff --git a/be/src/exec/text/hdfs-text-scanner.cc 
b/be/src/exec/text/hdfs-text-scanner.cc
index db2fcf5f8..d7abf960e 100644
--- a/be/src/exec/text/hdfs-text-scanner.cc
+++ b/be/src/exec/text/hdfs-text-scanner.cc
@@ -209,7 +209,7 @@ void HdfsTextScanner::Close(RowBatch* row_batch) {
   DCHECK_EQ(template_tuple_pool_.get()->total_allocated_bytes(), 0);
   DCHECK_EQ(data_buffer_pool_.get()->total_allocated_bytes(), 0);
   DCHECK_EQ(boundary_pool_.get()->total_allocated_bytes(), 0);
-  if (!only_parsing_header_) {
+  if (!only_parsing_header_ && stream_ != nullptr) {
     scan_node_->RangeComplete(THdfsFileFormat::TEXT,
         stream_->file_desc()->file_compression);
   }
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/virtual-column-file-position-generic.test
 
b/testdata/workloads/functional-query/queries/QueryTest/virtual-column-file-position-generic.test
index 45100a434..ff6a6050d 100644
--- 
a/testdata/workloads/functional-query/queries/QueryTest/virtual-column-file-position-generic.test
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/virtual-column-file-position-generic.test
@@ -155,3 +155,34 @@ order by id;
 ---- TYPES
 BIGINT, BIGINT, INT
 ====
+---- QUERY
+# Regression test for IMPALA-12903. The following query uses static pruning. 
The surviving
+# partitions have file formats that support virtual column FILE__POSITION.
+select file__position, year, month, id, date_string_col
+from functional.alltypesmixedformat
+where year=2009 and month = 4 and id < 905
+order by id;
+---- RESULTS
+regex:\d+,2009,4,900,'04/01/09'
+regex:\d+,2009,4,901,'04/01/09'
+regex:\d+,2009,4,902,'04/01/09'
+regex:\d+,2009,4,903,'04/01/09'
+regex:\d+,2009,4,904,'04/01/09'
+---- TYPES
+BIGINT,INT,INT,INT,STRING
+====
+---- QUERY
+# Regression test for IMPALA-12903. The following query uses dynamic pruning. 
The
+# surviving partitions have file formats that support virtual column 
FILE__POSITION.
+select straight_join lhs.file__position, lhs.year, lhs.month, lhs.id
+from functional.alltypesmixedformat lhs, functional.alltypes rhs
+where lhs.id = rhs.id and lhs.year = rhs.year and lhs.month = rhs.month and
+    rhs.id > 900 and rhs.id < 900 + rhs.month
+order by id;
+---- RESULTS
+regex:\d+,2009,4,901
+regex:\d+,2009,4,902
+regex:\d+,2009,4,903
+---- TYPES
+BIGINT,INT,INT,INT
+====
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/virtual-column-file-position-negative.test
 
b/testdata/workloads/functional-query/queries/QueryTest/virtual-column-file-position-negative.test
new file mode 100644
index 000000000..8afd5ad52
--- /dev/null
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/virtual-column-file-position-negative.test
@@ -0,0 +1,41 @@
+====
+---- QUERY
+select file__position from functional.alltypes
+---- CATCH
+Virtual column FILE__POSITION is not supported for TEXT files.
+====
+---- QUERY
+select file__position from functional_avro.alltypes
+---- CATCH
+Virtual column FILE__POSITION is not supported for AVRO files.
+====
+---- QUERY
+select file__position from functional_json.alltypes
+---- CATCH
+Virtual column FILE__POSITION is not supported for JSON files.
+====
+---- QUERY
+select file__position from functional_seq.alltypes
+---- CATCH
+Virtual column FILE__POSITION is not supported for SEQUENCE_FILE files.
+====
+---- QUERY
+select file__position from functional_rc.alltypes
+---- CATCH
+Virtual column FILE__POSITION is not supported for RC_FILE files.
+====
+---- QUERY
+select file__position, year, month, id, date_string_col from 
functional.alltypesmixedformat where year=2009 and month = 3;
+---- CATCH
+Virtual column FILE__POSITION is not supported for RC_FILE files.
+====
+---- QUERY
+select file__position, year, month, id, date_string_col from 
functional.alltypesmixedformat where year=2009 and month = 1;
+---- CATCH
+Virtual column FILE__POSITION is not supported for TEXT files.
+====
+---- QUERY
+select file__position, year, month, id, date_string_col from 
functional.alltypesmixedformat;
+---- CATCH
+Virtual column FILE__POSITION is not supported
+====
diff --git a/tests/query_test/test_scanners.py 
b/tests/query_test/test_scanners.py
index bac1516a5..6b2bb4fbe 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -175,6 +175,23 @@ class TestScannersVirtualColumns(ImpalaTestSuite):
     self.run_test_case('QueryTest/mixing-virtual-columns', vector, 
unique_database)
 
 
+class TestScannersVirtualColumnsNegative(ImpalaTestSuite):
+  @classmethod
+  def get_workload(cls):
+    return 'functional-query'
+
+  @classmethod
+  def add_test_dimensions(cls):
+    super(TestScannersVirtualColumnsNegative, cls).add_test_dimensions()
+    # In the tests we explicitly refer to the databases, i.e. no need to
+    # run this test with multiple file formats.
+    cls.ImpalaTestMatrix.add_dimension(
+        create_uncompressed_text_dimension(cls.get_workload()))
+
+  def test_virtual_column_file_position_negative(self, vector):
+    self.run_test_case('QueryTest/virtual-column-file-position-negative', 
vector)
+
+
 class TestIcebergVirtualColumns(ImpalaTestSuite):
 
   @classmethod

Reply via email to