This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch branch-3.4.2
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 763b378f4f55757364273b39f30732b9cc486953
Author: ttttttz <[email protected]>
AuthorDate: Fri May 27 14:49:11 2022 +0800

    IMPALA-11296: Fix infinite loop when reading orc files
    
    When querying an ORC table, selecting only fields that are missing
    from the ORC files causes the query to run indefinitely, and the
    corresponding execution node shows resident threads that occupy
    CPU abnormally. The cause is as follows: when
    OrcComplexColumnReader.children_.empty() is true,
    OrcComplexColumnReader.row_idx_ remains constant, causing an
    infinite loop at HdfsOrcScanner::TransferTuples().
    We should allow empty 'children_' for original files.
    
    Testing:
    - Added a test to test_scanners.py that ensures the query can be
      executed successfully when selecting only the missing fields of
      ORC files.
    Change-Id: Ic7ecf5e9c94ffcc02d3ca6c2ec8d55a685ec3968
    Reviewed-on: http://gerrit.cloudera.org:8080/18571
    Reviewed-by: Quanlong Huang <[email protected]>
    Reviewed-by: Zoltan Borok-Nagy <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 be/src/exec/orc-column-readers.cc |  7 +++++++
 tests/query_test/test_scanners.py | 25 +++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/be/src/exec/orc-column-readers.cc b/be/src/exec/orc-column-readers.cc
index 40f441853..c2c7438c3 100644
--- a/be/src/exec/orc-column-readers.cc
+++ b/be/src/exec/orc-column-readers.cc
@@ -406,6 +406,13 @@ Status OrcStructReader::TopLevelReadValueBatch(ScratchTupleBatch* scratch_batch,
     }
   }
   row_idx_ += scratch_batch->num_tuples - scratch_batch_idx;
+  if (children_.empty()) {
+    DCHECK_EQ(scratch_batch_idx, scratch_batch->num_tuples);
+    int num_to_fake_read = std::min(scratch_batch->capacity - scratch_batch->num_tuples,
+                                    (int)batch_->numElements - row_idx_);
+    scratch_batch->num_tuples += num_to_fake_read;
+    row_idx_ += num_to_fake_read;
+  }
   return Status::OK();
 }
 
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index 4a2cf66fd..9cb169a33 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -1419,6 +1419,31 @@ class TestOrc(ImpalaTestSuite):
 
     self.run_test_case('QueryTest/hive2-pre-gregorian-date-orc', vector, unique_database)
 
+  @SkipIfABFS.hive
+  @SkipIfADLS.hive
+  @SkipIfIsilon.hive
+  @SkipIfLocal.hive
+  @SkipIfS3.hive
+  def test_missing_field_orc(self, unique_database):
+    """Test scanning orc files with missing fields in file meta."""
+    orc_tbl_name = unique_database + ".missing_field_orc"
+    self.client.execute("create table %s (f0 int) stored as orc" % orc_tbl_name)
+    self.run_stmt_in_hive("insert into table %s select 1" % orc_tbl_name)
+    self.client.execute("refresh %s" % orc_tbl_name)
+    self.client.execute("alter table %s add columns(f1 int)" % orc_tbl_name)
+    result = self.client.execute("select f1 from %s " % orc_tbl_name)
+    assert result.data == ['NULL']
+
+    orc_tbl_name = unique_database + ".lineitem_orc_ext"
+    test_file = "/test-warehouse/tpch.lineitem_orc_def"
+    create_sql = "create external table %s like tpch_orc_def.lineitem " \
+                 "location '%s'" % (orc_tbl_name, test_file)
+    self.client.execute(create_sql)
+    self.client.execute("alter table %s add columns (new_col int)" % orc_tbl_name)
+    result = self.execute_query("select count(*) from %s where new_col is null"
+                                % orc_tbl_name)
+    assert len(result.data) == 1
+    assert "6001215" in result.data
 
 class TestScannerReservation(ImpalaTestSuite):
   @classmethod

Reply via email to