This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git

The following commit(s) were added to refs/heads/master by this push:
     new e40275ae44a [Fix](parquet-reader) Fix parquet reader crash in 
set_dict(). (#40643)
e40275ae44a is described below

commit e40275ae44ae7aa7424aa1672305a96954cb39e1
Author: Qi Chen <kaka11.c...@gmail.com>
AuthorDate: Fri Sep 13 10:52:00 2024 +0800

    [Fix](parquet-reader) Fix parquet reader crash in set_dict(). (#40643)
    
    ## Proposed changes
    
    ### Issue
    ```
    *** is nereids: 1 ***
    tablet id: 4
    Abort at 1725864966 (unix time) try "date -d @1725864966" if you are using 
GNU date ***
    *** Set a breakpoint in static void __GI_abort() to debug ***
    PC: @ 0x7f007fb4090a04
    *** SIGSEGV (address not mapped to object 0xa0fa868a41d6) received by PID 
404737 (TID 274135 OR 0x7ece29df700) from PID 1755584205; stack trace: ***
    #0 __GI_raise
    #1 __GI_abort
    #2 sig_handler
    #3 _sigaction
    #4 JVM_handle_linux_signal
    #5 _sigaction
    #6 
doris::vectorized::ByteArrayDictDecoder::set_dict(std::unique_ptr<unsigned 
char[], std::default_delete<unsigned char[]>> &&, int, unsigned long)
    at 
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp:41
    #7 doris::vectorized::ColumnChunkReader::_decode_dict_page() at 
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp:258
    #8 doris::vectorized::ColumnChunkReader::next_page() at 
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp:105
    #9 
doris::vectorized::ParquetColumnReader::_read_column_data(doris::vectorized::Block*,
 bool*) at 
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:508
    #10 
doris::vectorized::ScalarColumnReader::_next_value(doris::vectorized::IColumn*,
 unsigned long, unsigned long*, bool*) at 
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:699
    #11 
doris::vectorized::RowGroupReader::_read_column_data(doris::vectorized::Block*, 
std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>> 
&, std::vector<doris::vectorized::ColumnSelectVector>*, unsigned long, unsigned 
long*, bool*) at 
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp:425
    #12 
doris::vectorized::RowGroupReader::get_next_block(doris::vectorized::Block*, 
bool*) at 
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp:311
    #13 doris::vectorized::ParquetReader::get_next(doris::vectorized::Block*, 
bool*) at 
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_reader.cpp:533
    #14 
doris::vectorized::VFileScanner::_get_next_reader_block(doris::RuntimeState*, 
doris::vectorized::Block*, bool*) at 
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/scan/vfile_scanner.cpp:368
    #15 doris::vectorized::VFileScanner::_get_block_impl(doris::RuntimeState*, 
doris::vectorized::Block*, bool*) at 
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/scan/vfile_scanner.cpp:411
    #16 doris::vectorized::VScanner::get_block(doris::RuntimeState*, 
doris::vectorized::Block*, bool*) at 
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/scan/vscanner.cpp:431
    #17 doris::vectorized::VScanner::get_block(doris::RuntimeState*, 
doris::vectorized::Block*, bool*) at 
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/scan/vscanner.cpp:96
    #18 
doris::vectorized::ScannerScheduler::submit(doris::vectorized::ScannerContext*, 
std::shared_ptr<doris::vectorized::ScanTask>) at 
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/scan/scanner_context.cpp:96
    #19 doris::Thread::supervise_thread(void*) at 
/mnt/disk1/yy/git/enterprise-core/be/src/util/thread.cpp:499
    #20 start_thread
    #21 clone in /lib64/libc.so.6
    ```
    
    ### Solution
    The root cause of the null parquet dictionary page in this case is not
    yet known, but the null page makes set_dict() crash. This PR adds a
    defensive null check so that the reader returns a Corruption status
    instead of crashing.
---
 be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp | 3 +++
 be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp 
b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp
index 7d9f708011c..4be7cb8b667 100644
--- a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp
+++ b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp
@@ -32,6 +32,9 @@ namespace doris::vectorized {
 Status ByteArrayDictDecoder::set_dict(std::unique_ptr<uint8_t[]>& dict, 
int32_t length,
                                       size_t num_values) {
     _dict = std::move(dict);
+    if (_dict == nullptr) {
+        return Status::Corruption("Wrong dictionary data for byte array type, 
dict is null.");
+    }
     _dict_items.reserve(num_values);
     uint32_t offset_cursor = 0;
     char* dict_item_address = reinterpret_cast<char*>(_dict.get());
diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp 
b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
index 0bcc0bd5e73..6e7d3c7b99d 100644
--- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
+++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
@@ -107,6 +107,9 @@ protected:
             return Status::Corruption("Wrong dictionary data for fixed length 
type");
         }
         _dict = std::move(dict);
+        if (_dict == nullptr) {
+            return Status::Corruption("Wrong dictionary data for byte array 
type, dict is null.");
+        }
         char* dict_item_address = reinterpret_cast<char*>(_dict.get());
         _dict_items.resize(num_values);
         for (size_t i = 0; i < num_values; ++i) {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to