This is an automated email from the ASF dual-hosted git repository. stigahuang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 5c4e771241a7f847d2349ae248bc268243e071ed Author: stiga-huang <[email protected]> AuthorDate: Sun Jun 30 21:17:40 2024 +0800 IMPALA-13161: Fix column index overflow in DelimitedTextParser DelimitedTextParser tracks the index of the current column within the row that is being parsed. A row can have an arbitrary number of fields. The index, 'column_idx_', is defined as an int, which can overflow when there are more than 2^31 fields in the row. This index is only used to check whether the current column should be materialized, so it doesn't make sense to keep tracking it once it exceeds the number of columns of the table. This patch fixes the overflow issue by only bumping 'column_idx_' when it's smaller than the number of columns of the table. Tests - Add e2e test Change-Id: I527a8971e92e270d5576c2155e4622dd6d43d745 Reviewed-on: http://gerrit.cloudera.org:8080/21559 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- be/src/exec/delimited-text-parser.inline.h | 4 +++- tests/query_test/test_delimited_text.py | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/be/src/exec/delimited-text-parser.inline.h b/be/src/exec/delimited-text-parser.inline.h index aa6b5c4bd..86e5e3f4e 100644 --- a/be/src/exec/delimited-text-parser.inline.h +++ b/be/src/exec/delimited-text-parser.inline.h @@ -71,7 +71,9 @@ inline Status DelimitedTextParser<DELIMITED_TUPLES>::AddColumn(int64_t len, } if (PROCESS_ESCAPES) current_column_has_escape_ = false; *next_column_start += len + 1; - ++column_idx_; + // No need to keep bumping 'column_idx_' if it's already 'num_cols_'. Otherwise, + // a large file with full of field delimiters might lead to 'column_idx_' overflow. 
+ if (column_idx_ < num_cols_) ++column_idx_; return Status::OK(); } diff --git a/tests/query_test/test_delimited_text.py b/tests/query_test/test_delimited_text.py index 34bd93e04..8fb5b3d4b 100644 --- a/tests/query_test/test_delimited_text.py +++ b/tests/query_test/test_delimited_text.py @@ -20,6 +20,7 @@ # from __future__ import absolute_import, division, print_function +from subprocess import check_call from tests.common.impala_test_suite import ImpalaTestSuite from tests.common.test_dimensions import ( create_single_exec_option_dimension, @@ -77,3 +78,18 @@ class TestDelimitedText(ImpalaTestSuite): cleanup/setup""" self.run_test_case('QueryTest/delimited-latin-text', vector, unique_database, encoding="latin-1") + + def test_large_file_of_field_delimiters(self, vector, unique_database): + """IMPALA-13161: Verifies reading a large file which has full of field delimiters + won't causing crash due to overflows""" + tbl = unique_database + ".tbl" + self.execute_query("create table {}(i int)".format(tbl)) + table_loc = self._get_table_location(tbl, vector) + # Generate a 3GB data file that has full of '\x00' (the default field delimiter) + with open("data.txt", "wb") as f: + long_str = "\x00" * 1024 * 1024 * 3 + [f.write(long_str) for i in range(1024)] + check_call(["hdfs", "dfs", "-put", "data.txt", table_loc]) + self.execute_query("refresh " + tbl) + res = self.execute_query("select count(*) from " + tbl) + assert res.data == ["1"]
