This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch branch-1.2-lts in repository https://gitbox.apache.org/repos/asf/doris.git
commit 29e6480bc8be1fc882c3e7a1f28b7164a3b71c97 Author: Ashin Gau <ashin...@users.noreply.github.com> AuthorDate: Thu Feb 2 09:22:09 2023 +0800 [fix](multi-catalog) remove the eof check among parquet columns (#16302) Read parquet file failed: ``` ERROR 1105 (HY000): errCode = 2, detailMessage = [INTERNAL_ERROR]Read parquet file xxx failed, reason = [CORRUPTION]The number of rows are not equal among parquet columns ``` This error may be thrown when reading non-predicate columns in lazy-read, for example: A row group with 1000 rows has two non-predicate columns. Column A has one page, Column B has two pages with 500 rows for each page. The read range of `ParquetColumnReader` is [0, 400), and the rows between [0, 450) are all filtered by predicate columns. So column A can skip the first page, and reach the EOF, while column B can also skip the first page, but doesn't reach the EOF. --- be/src/vec/exec/format/parquet/vparquet_group_reader.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp index 34b478114e..5b1c0fd828 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp @@ -134,7 +134,6 @@ Status RowGroupReader::_read_column_data(Block* block, const std::vector<std::st ColumnSelectVector& select_vector) { size_t batch_read_rows = 0; bool has_eof = false; - int col_idx = 0; for (auto& read_col : columns) { auto& column_with_type_and_name = block->get_by_name(read_col); auto& column_ptr = column_with_type_and_name.column; @@ -150,15 +149,13 @@ Status RowGroupReader::_read_column_data(Block* block, const std::vector<std::st &col_eof)); col_read_rows += loop_rows; } - if (col_idx > 0 && (has_eof ^ col_eof)) { - return Status::Corruption("The number of rows are not equal among parquet columns"); - } if (batch_read_rows > 0 && batch_read_rows != 
col_read_rows) { return Status::Corruption("Can't read the same number of rows among parquet columns"); } batch_read_rows = col_read_rows; - has_eof = col_eof; - col_idx++; + if (col_eof) { + has_eof = true; + } } *read_rows = batch_read_rows; *batch_eof = has_eof; --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org