This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push: new adb7730b64b [fix](parquet) the end offset of column chunk may be wrong in parquet metadata #28891 (#28893) adb7730b64b is described below commit adb7730b64b0a0d882fec6517af9c2dc73cdb7b5 Author: Ashin Gau <ashin...@users.noreply.github.com> AuthorDate: Sat Dec 23 22:23:07 2023 +0800 [fix](parquet) the end offset of column chunk may be wrong in parquet metadata #28891 (#28893) backport: #28891 --- .../vec/exec/format/parquet/vparquet_column_chunk_reader.cpp | 2 ++ .../vec/exec/format/parquet/vparquet_column_chunk_reader.h | 11 ++++++++++- be/src/vec/exec/format/parquet/vparquet_page_reader.h | 5 +++++ regression-test/data/external_table_p2/tvf/test_tvf_p2.out | 12 ++++++++++++ .../suites/external_table_p2/tvf/test_tvf_p2.groovy | 6 ++++++ 5 files changed, 35 insertions(+), 1 deletion(-) diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp index aca5ae96e81..579bfa5a4ae 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp @@ -98,10 +98,12 @@ Status ColumnChunkReader::next_page() { return next_page(); } else if (_page_reader->get_page_header()->type == tparquet::PageType::DATA_PAGE_V2) { _remaining_num_values = _page_reader->get_page_header()->data_page_header_v2.num_values; + _chunk_parsed_values += _remaining_num_values; _state = HEADER_PARSED; return Status::OK(); } else { _remaining_num_values = _page_reader->get_page_header()->data_page_header.num_values; + _chunk_parsed_values += _remaining_num_values; _state = HEADER_PARSED; return Status::OK(); } diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h index 9f62fddce70..aa30af1e94c 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h @@ -91,9 +91,17 @@ public: Status init(); // Whether the chunk reader has a more page to read. - bool has_next_page() { return _page_reader->has_next_page(); } + bool has_next_page() { return _chunk_parsed_values < _metadata.num_values; } + // Deprecated // Seek to the specific page, page_header_offset must be the start offset of the page header. + // _end_offset may exceed the actual data area, so we can only use the number of parsed values + // to determine whether there are remaining pages to read. That's to say we can't use the + // PageLocation in parquet metadata to seek to the specified page. We should call next_page() + // and skip_page() to skip pages one by one. + // todo: change this interface to seek_to_page(int64_t page_header_offset, size_t num_parsed_values) + // and set _chunk_parsed_values = num_parsed_values + // [[deprecated]] void seek_to_page(int64_t page_header_offset) { _remaining_num_values = 0; _page_reader->seek_to_page(page_header_offset); @@ -201,6 +209,7 @@ private: LevelDecoder _rep_level_decoder; LevelDecoder _def_level_decoder; + size_t _chunk_parsed_values = 0; uint32_t _remaining_num_values = 0; Slice _page_data; std::unique_ptr<uint8_t[]> _decompress_buf; diff --git a/be/src/vec/exec/format/parquet/vparquet_page_reader.h b/be/src/vec/exec/format/parquet/vparquet_page_reader.h index 089409786f4..3b816629c6e 100644 --- a/be/src/vec/exec/format/parquet/vparquet_page_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_page_reader.h @@ -45,6 +45,11 @@ public: uint64_t length); ~PageReader() = default; + // Deprecated + // Parquet file may not be standardized, + // _end_offset may exceed the actual data area. + // ColumnChunkReader::has_next_page() use the number of parsed values for judgment + // [[deprecated]] bool has_next_page() const { return _offset < _end_offset; } Status next_page_header(); diff --git a/regression-test/data/external_table_p2/tvf/test_tvf_p2.out b/regression-test/data/external_table_p2/tvf/test_tvf_p2.out index 6a44b7322dc..53b454df858 100644 --- a/regression-test/data/external_table_p2/tvf/test_tvf_p2.out +++ b/regression-test/data/external_table_p2/tvf/test_tvf_p2.out @@ -50,6 +50,18 @@ 32.024 64.0000 128.901468 32.024 64.0000 128.901468 2023-07-07 2023-07-07 2021-07-07T19:15:31.123456 2023-07-07 2023-07-07 2021-07-07T19:15:31.123456 32.689 64.2580 128.745382 32.689 64.2580 128.745382 2023-11-11 2023-11-11 2022-11-11T16:35:37.123456 2023-11-11 2023-11-11 2022-11-11T16:35:37.123456 +-- !wrong_page_header -- +53587 38687 2689589 99480 2971 1999262 218 386 5265 86 33.14 56.33 40.55 0.00 3487.30 2850.04 4844.38 69.74 0.00 3487.30 3557.04 637.26 +53587 47417 2689589 99480 2971 1999262 218 1216 5265 75 36.98 54.36 46.74 736.15 3505.50 2773.50 4077.00 138.46 736.15 2769.35 2907.81 -4.15 +53587 72713 2689589 99480 2971 1999262 218 1250 5265 15 54.97 85.75 75.46 0.00 1131.90 824.55 1286.25 56.59 0.00 1131.90 1188.49 307.35 +\N 124637 2689589 99480 \N \N 218 196 5265 \N 17.56 33.01 \N \N \N \N 3069.93 17.18 \N 245.52 \N \N +\N 132020 2689589 \N 2971 \N 218 \N 5265 \N \N 132.06 \N \N \N 8965.20 12281.58 255.41 \N \N 3448.10 \N +53587 132623 2689589 99480 2971 1999262 218 31 5265 74 62.58 90.74 77.12 0.00 5706.88 4630.92 6714.76 57.06 0.00 5706.88 5763.94 1075.96 +53587 298010 2689589 99480 2971 1999262 218 802 5265 59 83.41 159.31 70.09 0.00 4135.31 4921.19 9399.29 289.47 0.00 4135.31 4424.78 -785.88 +47821 4657 8720303 1194037 2971 4208705 574 656 5960 55 68.94 91.69 82.52 0.00 4538.60 3791.70 5042.95 408.47 0.00 4538.60 4947.07 746.90 +47821 50869 8720303 1194037 2971 4208705 574 1284 5960 77 2.87 2.92 0.37 0.00 28.49 220.99 224.84 0.00 0.00 28.49 28.49 -192.50 +47821 51494 8720303 1194037 2971 4208705 574 145 5960 18 69.14 94.72 14.20 0.00 255.60 1244.52 1704.96 7.66 0.00 255.60 263.26 -988.92 + -- !viewfs -- 25001 25001 25001 diff --git a/regression-test/suites/external_table_p2/tvf/test_tvf_p2.groovy b/regression-test/suites/external_table_p2/tvf/test_tvf_p2.groovy index 6564d074097..4f8aa482450 100644 --- a/regression-test/suites/external_table_p2/tvf/test_tvf_p2.groovy +++ b/regression-test/suites/external_table_p2/tvf/test_tvf_p2.groovy @@ -60,6 +60,12 @@ suite("test_tvf_p2", "p2") { "format" = "parquet"); """ + // test for wrong page header + qt_wrong_page_header """select * from hdfs( + "uri" = "hdfs://${nameNodeHost}:${hdfsPort}/catalog/tvf/parquet/wrong_page_header.parquet", + "format" = "parquet") order by ss_ticket_number,ss_item_sk limit 10; + """ + // viewfs qt_viewfs """select count(id), count(m1), count(m2) from hdfs( --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org