This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 96d4778f2ec [fix](parquet) the end offset of column chunk may be wrong in parquet metadata (#28891)
96d4778f2ec is described below

commit 96d4778f2ec837eef5bfe94b18bde4ccd0400c23
Author: Ashin Gau <ashin...@users.noreply.github.com>
AuthorDate: Sat Dec 23 22:21:04 2023 +0800

    [fix](parquet) the end offset of column chunk may be wrong in parquet metadata (#28891)
---
 .../vec/exec/format/parquet/vparquet_column_chunk_reader.cpp |  2 ++
 .../vec/exec/format/parquet/vparquet_column_chunk_reader.h   | 11 ++++++++++-
 be/src/vec/exec/format/parquet/vparquet_page_reader.h        |  5 +++++
 regression-test/data/external_table_p2/tvf/test_tvf_p2.out   | 12 ++++++++++++
 .../suites/external_table_p2/tvf/test_tvf_p2.groovy          |  6 ++++++
 5 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp 
b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp
index 928b5ae70bd..6feb9bc1025 100644
--- a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp
@@ -97,10 +97,12 @@ Status ColumnChunkReader::next_page() {
         return next_page();
     } else if (_page_reader->get_page_header()->type == 
tparquet::PageType::DATA_PAGE_V2) {
         _remaining_num_values = 
_page_reader->get_page_header()->data_page_header_v2.num_values;
+        _chunk_parsed_values += _remaining_num_values;
         _state = HEADER_PARSED;
         return Status::OK();
     } else {
         _remaining_num_values = 
_page_reader->get_page_header()->data_page_header.num_values;
+        _chunk_parsed_values += _remaining_num_values;
         _state = HEADER_PARSED;
         return Status::OK();
     }
diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h 
b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h
index 21ee808e48f..c8a49e098a5 100644
--- a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h
+++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h
@@ -91,9 +91,17 @@ public:
     Status init();
 
     // Whether the chunk reader has a more page to read.
-    bool has_next_page() { return _page_reader->has_next_page(); }
+    bool has_next_page() { return _chunk_parsed_values < _metadata.num_values; 
}
 
+    // Deprecated
     // Seek to the specific page, page_header_offset must be the start offset of the page header.
+    // _end_offset may exceed the actual data area, so we can only use the number of parsed values
+    // to determine whether there are remaining pages to read. That's to say we can't use the
+    // PageLocation in parquet metadata to seek to the specified page. We should call next_page()
+    // and skip_page() to skip pages one by one.
+    // todo: change this interface to seek_to_page(int64_t page_header_offset, size_t num_parsed_values)
+    // and set _chunk_parsed_values = num_parsed_values
+    // [[deprecated]]
     void seek_to_page(int64_t page_header_offset) {
         _remaining_num_values = 0;
         _page_reader->seek_to_page(page_header_offset);
@@ -201,6 +209,7 @@ private:
 
     LevelDecoder _rep_level_decoder;
     LevelDecoder _def_level_decoder;
+    size_t _chunk_parsed_values = 0;
     uint32_t _remaining_num_values = 0;
     Slice _page_data;
     std::unique_ptr<uint8_t[]> _decompress_buf;
diff --git a/be/src/vec/exec/format/parquet/vparquet_page_reader.h 
b/be/src/vec/exec/format/parquet/vparquet_page_reader.h
index 730b9a3001b..bdd0a8d0f5f 100644
--- a/be/src/vec/exec/format/parquet/vparquet_page_reader.h
+++ b/be/src/vec/exec/format/parquet/vparquet_page_reader.h
@@ -45,6 +45,11 @@ public:
                uint64_t length);
     ~PageReader() = default;
 
+    // Deprecated
+    // Parquet file may not be standardized,
+    // _end_offset may exceed the actual data area.
+    // ColumnChunkReader::has_next_page() use the number of parsed values for judgment
+    // [[deprecated]]
     bool has_next_page() const { return _offset < _end_offset; }
 
     Status next_page_header();
diff --git a/regression-test/data/external_table_p2/tvf/test_tvf_p2.out 
b/regression-test/data/external_table_p2/tvf/test_tvf_p2.out
index 6a44b7322dc..53b454df858 100644
--- a/regression-test/data/external_table_p2/tvf/test_tvf_p2.out
+++ b/regression-test/data/external_table_p2/tvf/test_tvf_p2.out
@@ -50,6 +50,18 @@
 32.024 64.0000 128.901468      32.024  64.0000 128.901468      2023-07-07      
2023-07-07      2021-07-07T19:15:31.123456      2023-07-07      2023-07-07      
2021-07-07T19:15:31.123456
 32.689 64.2580 128.745382      32.689  64.2580 128.745382      2023-11-11      
2023-11-11      2022-11-11T16:35:37.123456      2023-11-11      2023-11-11      
2022-11-11T16:35:37.123456
 
+-- !wrong_page_header --
+53587  38687   2689589 99480   2971    1999262 218     386     5265    86      
33.14   56.33   40.55   0.00    3487.30 2850.04 4844.38 69.74   0.00    3487.30 
3557.04 637.26
+53587  47417   2689589 99480   2971    1999262 218     1216    5265    75      
36.98   54.36   46.74   736.15  3505.50 2773.50 4077.00 138.46  736.15  2769.35 
2907.81 -4.15
+53587  72713   2689589 99480   2971    1999262 218     1250    5265    15      
54.97   85.75   75.46   0.00    1131.90 824.55  1286.25 56.59   0.00    1131.90 
1188.49 307.35
+\N     124637  2689589 99480   \N      \N      218     196     5265    \N      
17.56   33.01   \N      \N      \N      \N      3069.93 17.18   \N      245.52  
\N      \N
+\N     132020  2689589 \N      2971    \N      218     \N      5265    \N      
\N      132.06  \N      \N      \N      8965.20 12281.58        255.41  \N      
\N      3448.10 \N
+53587  132623  2689589 99480   2971    1999262 218     31      5265    74      
62.58   90.74   77.12   0.00    5706.88 4630.92 6714.76 57.06   0.00    5706.88 
5763.94 1075.96
+53587  298010  2689589 99480   2971    1999262 218     802     5265    59      
83.41   159.31  70.09   0.00    4135.31 4921.19 9399.29 289.47  0.00    4135.31 
4424.78 -785.88
+47821  4657    8720303 1194037 2971    4208705 574     656     5960    55      
68.94   91.69   82.52   0.00    4538.60 3791.70 5042.95 408.47  0.00    4538.60 
4947.07 746.90
+47821  50869   8720303 1194037 2971    4208705 574     1284    5960    77      
2.87    2.92    0.37    0.00    28.49   220.99  224.84  0.00    0.00    28.49   
28.49   -192.50
+47821  51494   8720303 1194037 2971    4208705 574     145     5960    18      
69.14   94.72   14.20   0.00    255.60  1244.52 1704.96 7.66    0.00    255.60  
263.26  -988.92
+
 -- !viewfs --
 25001  25001   25001
 
diff --git a/regression-test/suites/external_table_p2/tvf/test_tvf_p2.groovy 
b/regression-test/suites/external_table_p2/tvf/test_tvf_p2.groovy
index 39015b7b76f..96626af32b7 100644
--- a/regression-test/suites/external_table_p2/tvf/test_tvf_p2.groovy
+++ b/regression-test/suites/external_table_p2/tvf/test_tvf_p2.groovy
@@ -60,6 +60,12 @@ suite("test_tvf_p2", 
"p2,external,tvf,external_remote,external_remote_tvf") {
             "format" = "parquet");
         """
 
+        // test for wrong page header
+        qt_wrong_page_header """select * from hdfs(
+            "uri" = 
"hdfs://${nameNodeHost}:${hdfsPort}/catalog/tvf/parquet/wrong_page_header.parquet",
+            "format" = "parquet") order by ss_ticket_number,ss_item_sk limit 
10;
+        """
+
         // viewfs
         qt_viewfs """select count(id), count(m1), count(m2)
             from hdfs(


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to