This is an automated email from the ASF dual-hosted git repository.

boroknagyz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 49aaaa2cd5a379f6f0ebbb4ee8015f1fd297b56d
Author: Zoltan Borok-Nagy <[email protected]>
AuthorDate: Wed Apr 2 16:10:36 2025 +0200

    IMPALA-13927: Fix crash on invalid BINARY data in TEXT tables
    
    BINARY data in text files are expected to be Base64 encoded.
    TextConverter::WriteSlot has a bug when it decodes base64 code,
    it does not set the NULL-indicator bit to NULL for the slots of
    the invalid BINARY values. Therefore later Tuple::CopyStrings can
    try to copy invalid StringValue objects.
    
    This patch fixes TextConverter::WriteSlot to set the NULL-indicator
    bit in case of Base64 parse errors.
    
    Testing
     * e2e test added
    
    Change-Id: I79b712e2abe8ce6ecfbce508fd9e4e93fd63c964
    Reviewed-on: http://gerrit.cloudera.org:8080/22721
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 be/src/exec/text-converter.inline.h                | 15 ++++--
 bin/rat_exclude_files.txt                          |  1 +
 testdata/data/README                               |  3 ++
 testdata/data/invalid_binary_data.txt              |  2 +
 .../queries/QueryTest/invalid-binary-type.test     | 53 ++++++++++++++++++++++
 tests/data_errors/test_data_errors.py              | 27 +++++++++++
 6 files changed, 98 insertions(+), 3 deletions(-)

diff --git a/be/src/exec/text-converter.inline.h 
b/be/src/exec/text-converter.inline.h
index 67c6f3eae..6455126be 100644
--- a/be/src/exec/text-converter.inline.h
+++ b/be/src/exec/text-converter.inline.h
@@ -71,7 +71,10 @@ inline bool TextConverter::WriteSlot(const SlotDescriptor* 
slot_desc, Tuple* tup
         base64_decode = true;
         reuse_data = false;
         int64_t out_len;
-        if (!Base64DecodeBufLen(data, len, &out_len)) return false;
+        if (!Base64DecodeBufLen(data, len, &out_len)) {
+          parse_result = StringParser::PARSE_FAILURE;
+          break;
+        }
         buffer_len = out_len;
       }
 
@@ -92,10 +95,16 @@ inline bool TextConverter::WriteSlot(const SlotDescriptor* 
slot_desc, Tuple* tup
         str.ptr = type.IsVarLenStringType() ?
             reinterpret_cast<char*>(pool->TryAllocateUnaligned(buffer_len)) :
             reinterpret_cast<char*>(slot);
-        if (UNLIKELY(str.ptr == nullptr)) return false;
+        if (UNLIKELY(str.ptr == nullptr)) {
+          parse_result = StringParser::PARSE_FAILURE;
+          break;
+        }
         if (base64_decode) {
           unsigned out_len;
-          if(!Base64Decode(data, len, buffer_len, str.ptr, &out_len)) return 
false;
+          if(!Base64Decode(data, len, buffer_len, str.ptr, &out_len)) {
+            parse_result = StringParser::PARSE_FAILURE;
+            break;
+          }
           DCHECK_LE(out_len, buffer_len);
           str.len = out_len;
         } else if (need_escape) {
diff --git a/bin/rat_exclude_files.txt b/bin/rat_exclude_files.txt
index fdefb4a82..fb94a13c6 100644
--- a/bin/rat_exclude_files.txt
+++ b/bin/rat_exclude_files.txt
@@ -156,6 +156,7 @@ testdata/data/decimal-tiny.txt
 testdata/data/decimal_tbl.txt
 testdata/data/decimal_rtf_tiny_tbl.txt
 testdata/data/decimal_rtf_tbl.txt
+testdata/data/invalid_binary_data.txt
 testdata/data/overflow.txt
 testdata/data/text-comma-backslash-newline.txt
 testdata/data/text-dollar-hash-pipe.txt
diff --git a/testdata/data/README b/testdata/data/README
index d20db0976..6db023867 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -1330,3 +1330,6 @@ Generated by ORC C++ library using the following code
 
   writer->add(*batch);
   writer->close();
+
+invalid_binary_data.txt:
+Hand-written file where BINARY values are not Base64 encoded.
diff --git a/testdata/data/invalid_binary_data.txt 
b/testdata/data/invalid_binary_data.txt
new file mode 100644
index 000000000..8b36759ea
--- /dev/null
+++ b/testdata/data/invalid_binary_data.txt
@@ -0,0 +1,2 @@
+3933 0a 25 25 45 4f 46 0a2016-10-31 20:46:09.871
+4065 66 0a 33 38 32 32 30 0a 25 25 45 4f 46 0a2016-10-31 20:46:10.137
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/invalid-binary-type.test
 
b/testdata/workloads/functional-query/queries/QueryTest/invalid-binary-type.test
new file mode 100644
index 000000000..47bc2aced
--- /dev/null
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/invalid-binary-type.test
@@ -0,0 +1,53 @@
+====
+---- QUERY
+SELECT * FROM invalid_binary;
+---- ERRORS
+Error converting column: 1 to BINARY
+Error converting column: 1 to BINARY
+Error parsing row: file: __HDFS_FILENAME__, before offset: 123
+Error parsing row: file: __HDFS_FILENAME__, before offset: 123
+---- RESULTS
+39,'NULL','2016-10-31 20:46:09.871'
+40,'NULL','2016-10-31 20:46:10.137'
+---- TYPES
+BIGINT,BINARY,STRING
+====
+---- QUERY
+SELECT bindata FROM invalid_binary;
+---- ERRORS
+Error converting column: 1 to BINARY
+Error converting column: 1 to BINARY
+Error parsing row: file: __HDFS_FILENAME__, before offset: 123
+Error parsing row: file: __HDFS_FILENAME__, before offset: 123
+---- RESULTS
+'NULL'
+'NULL'
+---- TYPES
+BINARY
+====
+---- QUERY
+SELECT id, bindata FROM invalid_binary;
+---- ERRORS
+Error converting column: 1 to BINARY
+Error converting column: 1 to BINARY
+Error parsing row: file: __HDFS_FILENAME__, before offset: 123
+Error parsing row: file: __HDFS_FILENAME__, before offset: 123
+---- RESULTS
+39,'NULL'
+40,'NULL'
+---- TYPES
+BIGINT,BINARY
+====
+---- QUERY
+SELECT bindata, date_str FROM invalid_binary;
+---- ERRORS
+Error converting column: 1 to BINARY
+Error converting column: 1 to BINARY
+Error parsing row: file: __HDFS_FILENAME__, before offset: 123
+Error parsing row: file: __HDFS_FILENAME__, before offset: 123
+---- RESULTS
+'NULL','2016-10-31 20:46:09.871'
+'NULL','2016-10-31 20:46:10.137'
+---- TYPES
+BINARY,STRING
+====
diff --git a/tests/data_errors/test_data_errors.py 
b/tests/data_errors/test_data_errors.py
index f58a3a9f2..cd701141b 100644
--- a/tests/data_errors/test_data_errors.py
+++ b/tests/data_errors/test_data_errors.py
@@ -24,6 +24,7 @@ import pytest
 import subprocess
 
 from tests.common.impala_connection import IMPALA_CONNECTION_EXCEPTION
+from tests.common.file_utils import create_table_and_copy_files
 from tests.common.impala_test_suite import ImpalaTestSuite
 from tests.common.skip import SkipIf, SkipIfFS
 from tests.common.test_dimensions import create_exec_option_dimension
@@ -154,6 +155,32 @@ class TestHdfsRcFileScanNodeErrors(TestHdfsScanNodeErrors):
     self.run_test_case('DataErrorsTest/hdfs-rcfile-scan-node-errors', vector)
 
 
+class TestBinaryTypeInText(ImpalaTestSuite):
+  @classmethod
+  def get_workload(cls):
+    return 'functional-query'
+
+  @classmethod
+  def add_test_dimensions(cls):
+    super(TestBinaryTypeInText, cls).add_test_dimensions()
+    cls.ImpalaTestMatrix.add_constraint(
+      lambda v: v.get_value('table_format').file_format == 'text')
+
+  def test_invalid_binary_type(self, vector, unique_database):
+    """Test scanning text files with invalid BINARY data."""
+    vector.get_value('exec_option')['abort_on_error'] = 0
+    tbl_name = "invalid_binary"
+    create_sql = """create table {}.{} (id BIGINT, bindata BINARY, date_str 
STRING)
+        ROW FORMAT DELIMITED FIELDS TERMINATED BY '\u0001'
+        WITH SERDEPROPERTIES ('field.delim'='\u0001', 'line.delim'='\n',
+                              'serialization.format'='\u0001')
+        STORED AS textfile""".format(
+        unique_database, tbl_name)
+    create_table_and_copy_files(self.client, create_sql, unique_database, 
tbl_name,
+        ["/testdata/data/invalid_binary_data.txt"])
+    self.run_test_case('QueryTest/invalid-binary-type', vector, 
unique_database)
+
+
 @SkipIfFS.qualified_path
 class TestHdfsJsonScanNodeErrors(TestHdfsScanNodeErrors):
   @classmethod

Reply via email to