This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new b4663062da [feature-wip](parquet-reader) bug fix, parquet footer 
buffer is small when containing many columns (#12477)
b4663062da is described below

commit b4663062da972d95297e374155c13a6de8d16ab0
Author: Ashin Gau <ashin...@users.noreply.github.com>
AuthorDate: Fri Sep 9 09:12:34 2022 +0800

    [feature-wip](parquet-reader) bug fix, parquet footer buffer is small when 
containing many columns (#12477)
    
    Reading a parquet file with many columns (>1600) fails:
    
    mysql> select int_col from types_sf100_r100w limit 5;
    ERROR 1105 (HY000): errCode = 2, detailMessage = Couldn't deserialize 
thrift msg:
    TProtocolException: Invalid data
    parse_thrift_footer uses a fixed-length buffer (64 KB) to read the parquet footer, 
but the metadata of a parquet file with 1600 columns can exceed 5 MB.
    
    Therefore, the buffer must be sized according to the actual metadata 
length read from the footer.
---
 .../vec/exec/format/parquet/parquet_thrift_util.h  | 24 ++++++++++------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/be/src/vec/exec/format/parquet/parquet_thrift_util.h 
b/be/src/vec/exec/format/parquet/parquet_thrift_util.h
index 7852926509..5811f034bd 100644
--- a/be/src/vec/exec/format/parquet/parquet_thrift_util.h
+++ b/be/src/vec/exec/format/parquet/parquet_thrift_util.h
@@ -31,29 +31,24 @@
 namespace doris::vectorized {
 
 constexpr uint8_t PARQUET_VERSION_NUMBER[4] = {'P', 'A', 'R', '1'};
-constexpr int64_t PARQUET_FOOTER_READ_SIZE = 64 * 1024;
 constexpr uint32_t PARQUET_FOOTER_SIZE = 8;
 
 static Status parse_thrift_footer(FileReader* file, 
std::shared_ptr<FileMetaData>& file_metadata) {
-    // try with buffer on stack
-    uint8_t buff[PARQUET_FOOTER_READ_SIZE];
+    uint8_t footer[PARQUET_FOOTER_SIZE];
     int64_t file_size = file->size();
-    // read footer bytes
-    uint64_t footer_read_size = std::min(file_size, PARQUET_FOOTER_READ_SIZE);
-
     int64_t bytes_read = 0;
-    RETURN_IF_ERROR(
-            file->readat(file_size - footer_read_size, footer_read_size, 
&bytes_read, buff));
+    RETURN_IF_ERROR(file->readat(file_size - PARQUET_FOOTER_SIZE, 
PARQUET_FOOTER_SIZE, &bytes_read,
+                                 footer));
+    DCHECK_EQ(bytes_read, PARQUET_FOOTER_SIZE);
 
     // validate magic
-    uint8_t* magic_ptr = buff + footer_read_size - 
sizeof(PARQUET_VERSION_NUMBER);
+    uint8_t* magic_ptr = footer + PARQUET_FOOTER_SIZE - 
sizeof(PARQUET_VERSION_NUMBER);
     if (memcmp(magic_ptr, PARQUET_VERSION_NUMBER, 
sizeof(PARQUET_VERSION_NUMBER)) != 0) {
         return Status::Corruption("Invalid magic number in parquet file");
     }
 
     // get metadata_size
-    uint8_t* footer_buff = buff + footer_read_size - PARQUET_FOOTER_SIZE;
-    uint32_t metadata_size = decode_fixed32_le(footer_buff);
+    uint32_t metadata_size = decode_fixed32_le(footer);
     if (metadata_size > file_size - PARQUET_FOOTER_SIZE) {
         Status::Corruption("Parquet file size is ", file_size,
                            " bytes, smaller than the size reported by footer's 
(", metadata_size,
@@ -61,8 +56,11 @@ static Status parse_thrift_footer(FileReader* file, 
std::shared_ptr<FileMetaData
     }
     tparquet::FileMetaData t_metadata;
     // deserialize footer
-    RETURN_IF_ERROR(
-            deserialize_thrift_msg(footer_buff - metadata_size, 
&metadata_size, true, &t_metadata));
+    uint8_t meta_buff[metadata_size];
+    RETURN_IF_ERROR(file->readat(file_size - PARQUET_FOOTER_SIZE - 
metadata_size, metadata_size,
+                                 &bytes_read, meta_buff));
+    DCHECK_EQ(bytes_read, metadata_size);
+    RETURN_IF_ERROR(deserialize_thrift_msg(meta_buff, &metadata_size, true, 
&t_metadata));
     file_metadata.reset(new FileMetaData(t_metadata));
     RETURN_IF_ERROR(file_metadata->init_schema());
     return Status::OK();


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to