This is an automated email from the ASF dual-hosted git repository. boroknagyz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 49aaaa2cd5a379f6f0ebbb4ee8015f1fd297b56d Author: Zoltan Borok-Nagy <[email protected]> AuthorDate: Wed Apr 2 16:10:36 2025 +0200 IMPALA-13927: Fix crash on invalid BINARY data in TEXT tables BINARY data in text files are expected to be Base64 encoded. TextConverter::WriteSlot has a bug when it decodes base64 code, it does not set the NULL-indicator bit to NULL for the slots of the invalid BINARY values. Therefore later Tuple::CopyStrings can try to copy invalid StringValue objects. This patch fixes TextConverter::WriteSlot to set the NULL-indicator bit in case of Base64 parse errors. Testing * e2e test added Change-Id: I79b712e2abe8ce6ecfbce508fd9e4e93fd63c964 Reviewed-on: http://gerrit.cloudera.org:8080/22721 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- be/src/exec/text-converter.inline.h | 15 ++++-- bin/rat_exclude_files.txt | 1 + testdata/data/README | 3 ++ testdata/data/invalid_binary_data.txt | 2 + .../queries/QueryTest/invalid-binary-type.test | 53 ++++++++++++++++++++++ tests/data_errors/test_data_errors.py | 27 +++++++++++ 6 files changed, 98 insertions(+), 3 deletions(-) diff --git a/be/src/exec/text-converter.inline.h b/be/src/exec/text-converter.inline.h index 67c6f3eae..6455126be 100644 --- a/be/src/exec/text-converter.inline.h +++ b/be/src/exec/text-converter.inline.h @@ -71,7 +71,10 @@ inline bool TextConverter::WriteSlot(const SlotDescriptor* slot_desc, Tuple* tup base64_decode = true; reuse_data = false; int64_t out_len; - if (!Base64DecodeBufLen(data, len, &out_len)) return false; + if (!Base64DecodeBufLen(data, len, &out_len)) { + parse_result = StringParser::PARSE_FAILURE; + break; + } buffer_len = out_len; } @@ -92,10 +95,16 @@ inline bool TextConverter::WriteSlot(const SlotDescriptor* slot_desc, Tuple* tup str.ptr = type.IsVarLenStringType() ? reinterpret_cast<char*>(pool->TryAllocateUnaligned(buffer_len)) : reinterpret_cast<char*>(slot); - if (UNLIKELY(str.ptr == nullptr)) return false; + if (UNLIKELY(str.ptr == nullptr)) { + parse_result = StringParser::PARSE_FAILURE; + break; + } if (base64_decode) { unsigned out_len; - if(!Base64Decode(data, len, buffer_len, str.ptr, &out_len)) return false; + if(!Base64Decode(data, len, buffer_len, str.ptr, &out_len)) { + parse_result = StringParser::PARSE_FAILURE; + break; + } DCHECK_LE(out_len, buffer_len); str.len = out_len; } else if (need_escape) { diff --git a/bin/rat_exclude_files.txt b/bin/rat_exclude_files.txt index fdefb4a82..fb94a13c6 100644 --- a/bin/rat_exclude_files.txt +++ b/bin/rat_exclude_files.txt @@ -156,6 +156,7 @@ testdata/data/decimal-tiny.txt testdata/data/decimal_tbl.txt testdata/data/decimal_rtf_tiny_tbl.txt testdata/data/decimal_rtf_tbl.txt +testdata/data/invalid_binary_data.txt testdata/data/overflow.txt testdata/data/text-comma-backslash-newline.txt testdata/data/text-dollar-hash-pipe.txt diff --git a/testdata/data/README b/testdata/data/README index d20db0976..6db023867 100644 --- a/testdata/data/README +++ b/testdata/data/README @@ -1330,3 +1330,6 @@ Generated by ORC C++ library using the following code writer->add(*batch); writer->close(); + +invalid_binary_data.txt: +Hand-written file where BINARY values are not Base64 encoded. diff --git a/testdata/data/invalid_binary_data.txt b/testdata/data/invalid_binary_data.txt new file mode 100644 index 000000000..8b36759ea --- /dev/null +++ b/testdata/data/invalid_binary_data.txt @@ -0,0 +1,2 @@ +3933 0a 25 25 45 4f 46 0a2016-10-31 20:46:09.871 +4065 66 0a 33 38 32 32 30 0a 25 25 45 4f 46 0a2016-10-31 20:46:10.137 diff --git a/testdata/workloads/functional-query/queries/QueryTest/invalid-binary-type.test b/testdata/workloads/functional-query/queries/QueryTest/invalid-binary-type.test new file mode 100644 index 000000000..47bc2aced --- /dev/null +++ b/testdata/workloads/functional-query/queries/QueryTest/invalid-binary-type.test @@ -0,0 +1,53 @@ +==== +---- QUERY +SELECT * FROM invalid_binary; +---- ERRORS +Error converting column: 1 to BINARY +Error converting column: 1 to BINARY +Error parsing row: file: __HDFS_FILENAME__, before offset: 123 +Error parsing row: file: __HDFS_FILENAME__, before offset: 123 +---- RESULTS +39,'NULL','2016-10-31 20:46:09.871' +40,'NULL','2016-10-31 20:46:10.137' +---- TYPES +BIGINT,BINARY,STRING +==== +---- QUERY +SELECT bindata FROM invalid_binary; +---- ERRORS +Error converting column: 1 to BINARY +Error converting column: 1 to BINARY +Error parsing row: file: __HDFS_FILENAME__, before offset: 123 +Error parsing row: file: __HDFS_FILENAME__, before offset: 123 +---- RESULTS +'NULL' +'NULL' +---- TYPES +BINARY +==== +---- QUERY +SELECT id, bindata FROM invalid_binary; +---- ERRORS +Error converting column: 1 to BINARY +Error converting column: 1 to BINARY +Error parsing row: file: __HDFS_FILENAME__, before offset: 123 +Error parsing row: file: __HDFS_FILENAME__, before offset: 123 +---- RESULTS +39,'NULL' +40,'NULL' +---- TYPES +BIGINT,BINARY +==== +---- QUERY +SELECT bindata, date_str FROM invalid_binary; +---- ERRORS +Error converting column: 1 to BINARY +Error converting column: 1 to BINARY +Error parsing row: file: __HDFS_FILENAME__, before offset: 123 +Error parsing row: file: __HDFS_FILENAME__, before offset: 123 +---- RESULTS +'NULL','2016-10-31 20:46:09.871' +'NULL','2016-10-31 20:46:10.137' +---- TYPES +BINARY,STRING +==== diff --git a/tests/data_errors/test_data_errors.py b/tests/data_errors/test_data_errors.py index f58a3a9f2..cd701141b 100644 --- a/tests/data_errors/test_data_errors.py +++ b/tests/data_errors/test_data_errors.py @@ -24,6 +24,7 @@ import pytest import subprocess from tests.common.impala_connection import IMPALA_CONNECTION_EXCEPTION +from tests.common.file_utils import create_table_and_copy_files from tests.common.impala_test_suite import ImpalaTestSuite from tests.common.skip import SkipIf, SkipIfFS from tests.common.test_dimensions import create_exec_option_dimension @@ -154,6 +155,32 @@ class TestHdfsRcFileScanNodeErrors(TestHdfsScanNodeErrors): self.run_test_case('DataErrorsTest/hdfs-rcfile-scan-node-errors', vector) +class TestBinaryTypeInText(ImpalaTestSuite): + @classmethod + def get_workload(cls): + return 'functional-query' + + @classmethod + def add_test_dimensions(cls): + super(TestBinaryTypeInText, cls).add_test_dimensions() + cls.ImpalaTestMatrix.add_constraint( + lambda v: v.get_value('table_format').file_format == 'text') + + def test_invalid_binary_type(self, vector, unique_database): + """Test scanning text files with invalid BINARY data.""" + vector.get_value('exec_option')['abort_on_error'] = 0 + tbl_name = "invalid_binary" + create_sql = """create table {}.{} (id BIGINT, bindata BINARY, date_str STRING) + ROW FORMAT DELIMITED FIELDS TERMINATED BY '\u0001' + WITH SERDEPROPERTIES ('field.delim'='\u0001', 'line.delim'='\n', + 'serialization.format'='\u0001') + STORED AS textfile""".format( + unique_database, tbl_name) + create_table_and_copy_files(self.client, create_sql, unique_database, tbl_name, + ["/testdata/data/invalid_binary_data.txt"]) + self.run_test_case('QueryTest/invalid-binary-type', vector, unique_database) + + @SkipIfFS.qualified_path class TestHdfsJsonScanNodeErrors(TestHdfsScanNodeErrors): @classmethod
