This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new b4663062da [feature-wip](parquet-reader) bug fix, parquet footer buffer is small when containing many columns (#12477) b4663062da is described below commit b4663062da972d95297e374155c13a6de8d16ab0 Author: Ashin Gau <ashin...@users.noreply.github.com> AuthorDate: Fri Sep 9 09:12:34 2022 +0800 [feature-wip](parquet-reader) bug fix, parquet footer buffer is small when containing many columns (#12477) Failed when reading parquet file with many columns(>1600). mysql> select int_col from types_sf100_r100w limit 5; ERROR 1105 (HY000): errCode = 2, detailMessage = Couldn't deserialize thrift msg: TProtocolException: Invalid data parse_thrift_footer uses fixed length buffer(=64k) to read parquet footer, but the meta data of a parquet file with 1600 columns can exceed 5MB. Therefore, the buffer size needs to be applied according to the actual length. --- .../vec/exec/format/parquet/parquet_thrift_util.h | 24 ++++++++++------------ 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/be/src/vec/exec/format/parquet/parquet_thrift_util.h b/be/src/vec/exec/format/parquet/parquet_thrift_util.h index 7852926509..5811f034bd 100644 --- a/be/src/vec/exec/format/parquet/parquet_thrift_util.h +++ b/be/src/vec/exec/format/parquet/parquet_thrift_util.h @@ -31,29 +31,24 @@ namespace doris::vectorized { constexpr uint8_t PARQUET_VERSION_NUMBER[4] = {'P', 'A', 'R', '1'}; -constexpr int64_t PARQUET_FOOTER_READ_SIZE = 64 * 1024; constexpr uint32_t PARQUET_FOOTER_SIZE = 8; static Status parse_thrift_footer(FileReader* file, std::shared_ptr<FileMetaData>& file_metadata) { - // try with buffer on stack - uint8_t buff[PARQUET_FOOTER_READ_SIZE]; + uint8_t footer[PARQUET_FOOTER_SIZE]; int64_t file_size = file->size(); - // read footer bytes - uint64_t footer_read_size = std::min(file_size, PARQUET_FOOTER_READ_SIZE); - int64_t bytes_read = 0; - RETURN_IF_ERROR( - file->readat(file_size - footer_read_size, footer_read_size, &bytes_read, buff)); + RETURN_IF_ERROR(file->readat(file_size - PARQUET_FOOTER_SIZE, PARQUET_FOOTER_SIZE, &bytes_read, + footer)); + DCHECK_EQ(bytes_read, PARQUET_FOOTER_SIZE); // validate magic - uint8_t* magic_ptr = buff + footer_read_size - sizeof(PARQUET_VERSION_NUMBER); + uint8_t* magic_ptr = footer + PARQUET_FOOTER_SIZE - sizeof(PARQUET_VERSION_NUMBER); if (memcmp(magic_ptr, PARQUET_VERSION_NUMBER, sizeof(PARQUET_VERSION_NUMBER)) != 0) { return Status::Corruption("Invalid magic number in parquet file"); } // get metadata_size - uint8_t* footer_buff = buff + footer_read_size - PARQUET_FOOTER_SIZE; - uint32_t metadata_size = decode_fixed32_le(footer_buff); + uint32_t metadata_size = decode_fixed32_le(footer); if (metadata_size > file_size - PARQUET_FOOTER_SIZE) { Status::Corruption("Parquet file size is ", file_size, " bytes, smaller than the size reported by footer's (", metadata_size, @@ -61,8 +56,11 @@ static Status parse_thrift_footer(FileReader* file, std::shared_ptr<FileMetaData } tparquet::FileMetaData t_metadata; // deserialize footer - RETURN_IF_ERROR( - deserialize_thrift_msg(footer_buff - metadata_size, &metadata_size, true, &t_metadata)); + uint8_t meta_buff[metadata_size]; + RETURN_IF_ERROR(file->readat(file_size - PARQUET_FOOTER_SIZE - metadata_size, metadata_size, + &bytes_read, meta_buff)); + DCHECK_EQ(bytes_read, metadata_size); + RETURN_IF_ERROR(deserialize_thrift_msg(meta_buff, &metadata_size, true, &t_metadata)); file_metadata.reset(new FileMetaData(t_metadata)); RETURN_IF_ERROR(file_metadata->init_schema()); return Status::OK(); --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org