This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 1b7b6bc6fbb258d985665f9f79bc3901e13ac2d4
Author: Qi Chen <kaka11.c...@gmail.com>
AuthorDate: Thu Aug 17 23:26:11 2023 +0800

    [Fix](orc-reader) Fix filling partition or missing column used incorrect 
row count. (#23096)
    
    [Fix](orc-reader) Fix the incorrect row count used when filling partition 
or missing columns.
    
    `_row_reader->nextBatch` returns the number of rows read. When ORC lazy 
materialization is turned on, that number includes filtered rows, 
so the caller must look at `numElements` in the row batch to determine how
    many rows were not filtered, i.e. how many rows will be filled into the block.
    
    In this case, filling the partition or missing columns used an incorrect row 
count, which causes the BE to crash on `filter.size() != offsets.size()` in the 
filter-column step.
    
    When ORC lazy materialization is turned off, also call 
`_convert_dict_cols_to_string_cols(block, nullptr)` when `block->rows() == 0`.
---
 be/src/vec/exec/format/orc/vorc_reader.cpp         | 14 +++---
 .../hive/test_external_catalog_hive_partition.out  | 50 ++++++++++++++++++++++
 .../test_external_catalog_hive_partition.groovy    |  2 +
 3 files changed, 61 insertions(+), 5 deletions(-)

diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp 
b/be/src/vec/exec/format/orc/vorc_reader.cpp
index 1dd000bd0f..a1b1aa34b5 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -1413,8 +1413,10 @@ Status OrcReader::get_next_block(Block* block, size_t* 
read_rows, bool* eof) {
         }
         *read_rows = rr;
 
-        RETURN_IF_ERROR(_fill_partition_columns(block, rr, 
_lazy_read_ctx.partition_columns));
-        RETURN_IF_ERROR(_fill_missing_columns(block, rr, 
_lazy_read_ctx.missing_columns));
+        RETURN_IF_ERROR(_fill_partition_columns(block, _batch->numElements,
+                                                
_lazy_read_ctx.partition_columns));
+        RETURN_IF_ERROR(
+                _fill_missing_columns(block, _batch->numElements, 
_lazy_read_ctx.missing_columns));
 
         if (block->rows() == 0) {
             *eof = true;
@@ -1487,16 +1489,18 @@ Status OrcReader::get_next_block(Block* block, size_t* 
read_rows, bool* eof) {
         }
         *read_rows = rr;
 
+        RETURN_IF_ERROR(_fill_partition_columns(block, _batch->numElements,
+                                                
_lazy_read_ctx.partition_columns));
         RETURN_IF_ERROR(
-                _fill_partition_columns(block, *read_rows, 
_lazy_read_ctx.partition_columns));
-        RETURN_IF_ERROR(_fill_missing_columns(block, *read_rows, 
_lazy_read_ctx.missing_columns));
+                _fill_missing_columns(block, _batch->numElements, 
_lazy_read_ctx.missing_columns));
 
         if (block->rows() == 0) {
+            _convert_dict_cols_to_string_cols(block, nullptr);
             *eof = true;
             return Status::OK();
         }
 
-        _build_delete_row_filter(block, rr);
+        _build_delete_row_filter(block, _batch->numElements);
 
         std::vector<uint32_t> columns_to_filter;
         int column_to_keep = block->columns();
diff --git 
a/regression-test/data/external_table_p2/hive/test_external_catalog_hive_partition.out
 
b/regression-test/data/external_table_p2/hive/test_external_catalog_hive_partition.out
index 5608999eb5..c823189e68 100644
--- 
a/regression-test/data/external_table_p2/hive/test_external_catalog_hive_partition.out
+++ 
b/regression-test/data/external_table_p2/hive/test_external_catalog_hive_partition.out
@@ -23,6 +23,31 @@
 -- !q06 --
 2023-01-03T00:00       100     0.3     test3
 
+-- !q07 --
+1994   50063846        1820677
+1995   58220229        1820677
+1995   66859335        1820677
+1997   77350500        1820677
+1995   98899109        1820677
+1996   122310373       1820677
+1996   138664326       1820677
+1995   145803300       1820677
+1998   187514084       1820677
+1994   197627203       1820677
+1993   216217095       1820677
+1997   260737890       1820677
+1998   279581856       1820677
+1992   296560224       1820677
+1993   306190854       1820677
+1997   329189126       1820677
+1992   389043491       1820677
+1997   435247522       1820677
+1998   449388167       1820677
+1994   526241665       1820677
+1998   533034534       1820677
+1996   576018657       1820677
+1997   582732039       1820677
+
 -- !q01 --
 0.1    test1   2023-01-01T00:00        \N
 0.2    test2   2023-01-02T00:00        \N
@@ -47,6 +72,31 @@
 -- !q06 --
 2023-01-03T00:00       100     0.3     test3
 
+-- !q07 --
+1994   50063846        1820677
+1995   58220229        1820677
+1995   66859335        1820677
+1997   77350500        1820677
+1995   98899109        1820677
+1996   122310373       1820677
+1996   138664326       1820677
+1995   145803300       1820677
+1998   187514084       1820677
+1994   197627203       1820677
+1993   216217095       1820677
+1997   260737890       1820677
+1998   279581856       1820677
+1992   296560224       1820677
+1993   306190854       1820677
+1997   329189126       1820677
+1992   389043491       1820677
+1997   435247522       1820677
+1998   449388167       1820677
+1994   526241665       1820677
+1998   533034534       1820677
+1996   576018657       1820677
+1997   582732039       1820677
+
 -- !q01 --
 0.1    test1   2023-01-01T00:00        \N
 0.2    test2   2023-01-02T00:00        \N
diff --git 
a/regression-test/suites/external_table_p2/hive/test_external_catalog_hive_partition.groovy
 
b/regression-test/suites/external_table_p2/hive/test_external_catalog_hive_partition.groovy
index fc6e7fbc23..642121c22e 100644
--- 
a/regression-test/suites/external_table_p2/hive/test_external_catalog_hive_partition.groovy
+++ 
b/regression-test/suites/external_table_p2/hive/test_external_catalog_hive_partition.groovy
@@ -39,6 +39,7 @@ suite("test_external_catalog_hive_partition", "p2") {
             qt_q04 """ select * from multi_catalog.parquet_partitioned_columns 
order by t_float """
             qt_q05 """ select * from multi_catalog.parquet_partitioned_columns 
where t_int is null order by t_float """
             qt_q06 """ select * from multi_catalog.parquet_partitioned_columns 
where t_int is not null order by t_float """
+            qt_q07 """ select  o_orderyear, o_orderkey, o_custkey from 
multi_catalog.orders_par_parquet where o_custkey=1820677 order by o_orderkey """
         }
         // test orc format
         def q01_orc = {
@@ -48,6 +49,7 @@ suite("test_external_catalog_hive_partition", "p2") {
             qt_q04 """ select * from multi_catalog.orc_partitioned_columns 
order by t_float """
             qt_q05 """ select * from multi_catalog.orc_partitioned_columns 
where t_int is null order by t_float """
             qt_q06 """ select * from multi_catalog.orc_partitioned_columns 
where t_int is not null order by t_float """
+            qt_q07 """ select  o_orderyear, o_orderkey, o_custkey from 
multi_catalog.orders_par_orc where o_custkey=1820677 order by o_orderkey """
         }
         // test text format
         def q01_text = {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to