github-actions[bot] commented on code in PR #59263:
URL: https://github.com/apache/doris/pull/59263#discussion_r3468127296


##########
be/src/storage/segment/column_reader.cpp:
##########
@@ -1262,18 +1363,33 @@ void 
MapFileColumnIterator::remove_pruned_sub_iterators() {
 
 Status MapFileColumnIterator::set_access_paths(const TColumnAccessPaths& 
all_access_paths,
                                                const TColumnAccessPaths& 
predicate_access_paths) {
-    if (all_access_paths.empty()) {
+    if (all_access_paths.empty() && predicate_access_paths.empty()) {
         return Status::OK();
     }
 
     if (!predicate_access_paths.empty()) {
-        set_reading_flag(ReadingFlag::READING_FOR_PREDICATE);
+        set_reading_flag_self(ReadingFlag::READING_FOR_PREDICATE);
         DLOG(INFO) << "Map column iterator set sub-column " << _column_name
                    << " to READING_FOR_PREDICATE";
     }
 
+    const bool has_current_level_data_path =
+            std::ranges::any_of(all_access_paths, [this](const 
TColumnAccessPath& path) {
+                return is_current_level_data_access_path(path, _column_name);
+            });
     auto sub_all_access_paths = 
DORIS_TRY(_get_sub_access_paths(all_access_paths));
-    auto sub_predicate_access_paths = 
DORIS_TRY(_get_sub_access_paths(predicate_access_paths));
+    auto sub_predicate_access_paths =
+            DORIS_TRY(_get_sub_access_paths(predicate_access_paths, true));
+    if (has_current_level_data_path) {
+        remove_current_level_meta_access_paths(sub_all_access_paths);
+    }
+
+    if (sub_predicate_access_paths.empty() && _reading_flag == 
ReadingFlag::READING_FOR_PREDICATE) {
+        // if no sub-column in predicate_access_paths, but current column is 
READING_FOR_PREDICATE,
+        // then we should set key/value iterator to READING_FOR_PREDICATE too.
+        _key_iterator->set_reading_flag(ReadingFlag::READING_FOR_PREDICATE);
+        _val_iterator->set_reading_flag(ReadingFlag::READING_FOR_PREDICATE);
+    }
 
     if (sub_all_access_paths.empty()) {

Review Comment:
   When the projection needs the whole map but the predicate needs a nested map 
element, this return drops the predicate subpaths. For example, `select map_col 
from t where map_col['a'] is null` can give BE all-access paths collapsed to 
`[map_col]`, while the predicate paths still contain the key/value subpaths for 
`map_col['a']`. `_get_sub_access_paths(all_access_paths)` consumes the root 
path and marks the key/value iterators as `NEED_TO_READ`, then this branch 
returns before the loop below can promote the predicate key/value paths. Since 
`SegmentIterator` will put this column in `PREDICATE` mode, `NEED_TO_READ` 
children do not read in the first phase and the predicate is evaluated on 
placeholders. Please keep routing `sub_predicate_access_paths` even when 
`sub_all_access_paths` is empty, or disable the lazy split for this shape.



##########
be/src/storage/segment/column_reader.cpp:
##########
@@ -2021,18 +2317,27 @@ Status StringFileColumnIterator::set_access_paths(
 FileColumnIterator::FileColumnIterator(std::shared_ptr<ColumnReader> reader) : 
_reader(reader) {}
 
 void ColumnIterator::_check_and_set_meta_read_mode(const TColumnAccessPaths& 
sub_all_access_paths) {
+    bool has_offset_path = false;
+    bool has_null_path = false;
     for (const auto& path : sub_all_access_paths) {
-        if (!path.data_access_path.path.empty()) {
-            if (StringCaseEqual()(path.data_access_path.path[0], 
ACCESS_OFFSET)) {
-                _read_mode = ReadMode::OFFSET_ONLY;
-                return;
-            } else if (StringCaseEqual()(path.data_access_path.path[0], 
ACCESS_NULL)) {
-                _read_mode = ReadMode::NULL_MAP_ONLY;
-                return;
-            }
+        if (!is_current_level_meta_access_path(path)) {
+            _read_mode = ReadMode::DEFAULT;
+            return;
+        }
+        const auto& component = path.data_access_path.path[0];
+        if (StringCaseEqual()(component, ACCESS_OFFSET)) {
+            has_offset_path = true;
+        } else {
+            has_null_path = true;
         }
     }
-    _read_mode = ReadMode::DEFAULT;
+    if (has_offset_path == has_null_path) {

Review Comment:
   This makes queries that now carry both metadata paths lose the metadata-only 
path entirely. The PR preserves cases such as `length(str_col) ... OR str_col 
IS NULL` as both `str_col.OFFSET` and `str_col.NULL`, but this branch converts 
OFFSET+NULL to `DEFAULT`, so arrays/maps read child data and strings stop using 
`only_read_offsets`. The OFFSET_ONLY paths already materialize the nullable 
parent null map while skipping child/string data, so OFFSET+NULL can still be 
served as metadata-only instead of falling back to full reads.



##########
regression-test/suites/datatype_p0/complex_types/test_pruned_columns.groovy:
##########
@@ -16,65 +16,169 @@
 // under the License.
 
 suite("test_pruned_columns") {
+    sql "set batch_size = 32;"
     sql """DROP TABLE IF EXISTS `tbl_test_pruned_columns`"""
     sql """
         CREATE TABLE `tbl_test_pruned_columns` (
             `id` int NULL,
             `s` struct<city:text,data:array<map<int,struct<a:int,b:double>>>, 
value:int> NULL
         ) ENGINE=OLAP
         DUPLICATE KEY(`id`)
-        DISTRIBUTED BY RANDOM BUCKETS AUTO
+        DISTRIBUTED BY RANDOM BUCKETS 2
         PROPERTIES (
         "replication_allocation" = "tag.location.default: 1"
         );
     """
 
     sql """
-        insert into `tbl_test_pruned_columns` values
-            (1, named_struct('city', 'beijing', 'data', array(map(1, 
named_struct('a', 10, 'b', 20.0), 2, named_struct('a', 30, 'b', 40))), 'value', 
1)),
-            (2, named_struct('city', 'shanghai', 'data', array(map(2, 
named_struct('a', 50, 'b', 40.0), 1, named_struct('a', 70, 'b', 80))), 'value', 
2)),
-            (3, named_struct('city', 'guangzhou', 'data', array(map(1, 
named_struct('a', 90, 'b', 60.0), 2, named_struct('a', 110, 'b', 40))), 
'value', 3)),
-            (4, named_struct('city', 'shenzhen', 'data', array(map(2, 
named_struct('a', 130, 'b', 20.0), 1, named_struct('a', 150, 'b', 40))), 
'value', 4)),
-            (5, named_struct('city', 'hangzhou', 'data', array(map(1, 
named_struct('a', 170, 'b', 80.0), 2, named_struct('a', 190, 'b', 40))), 
'value', 5)),
-            (6, named_struct('city', 'nanjing', 'data', array(map(2, 
named_struct('a', 210, 'b', 60.0), 1, named_struct('a', 230, 'b', 40))), 
'value', 6)),
-            (7, named_struct('city', 'tianjin', 'data', array(map(1, 
named_struct('a', 250, 'b', 20.0), 2, named_struct('a', 270, 'b', 40))), 
'value', 7)),
-            (8, named_struct('city', 'chongqing', 'data', array(map(2, 
named_struct('a', 290, 'b', 80.0), 1, named_struct('a', 310, 'b', 40))), 
'value', 8)),
-            (9, named_struct('city', 'wuhan', 'data', array(map(1, 
named_struct('a', 330, 'b', 60.0), 2, named_struct('a', 350, 'b', 40))), 
'value', 9)),
-            (10, named_struct('city', 'xian', 'data', array(map(2, 
named_struct('a', 370, 'b', 20.0), 1, named_struct('a', 390, 'b', 40))), 
'value', 10)),
-            (11, named_struct('city', 'changsha', 'data', array(map(1, 
named_struct('a', 410, 'b', 80.0), 2, named_struct('a', 430, 'b', 40))), 
'value', 11)),
-            (12, named_struct('city', 'qingdao', 'data', array(map(2, 
named_struct('a', 450, 'b', 60.0), 1, named_struct('a', 470, 'b', 40))), 
'value', 12)),
-            (13, named_struct('city', 'dalian', 'data', array(map(1, 
named_struct('a', 490, 'b', 20.0), 2, named_struct('a', 510, 'b', 40))), 
'value', 13));
+        insert into `tbl_test_pruned_columns`
+        select
+            number as id,
+            named_struct(
+                'city',
+                case (number % 10)
+                    when 0 then 'beijing'
+                    when 1 then 'shanghai'
+                    when 2 then 'shenzhen'
+                    when 3 then 'guangzhou'
+                    when 4 then 'hangzhou'
+                    when 5 then 'chengdu'
+                    when 6 then 'wuhan'
+                    when 7 then 'xian'
+                    when 8 then 'nanjing'
+                    else null
+                end,
+                'data',
+                array(
+                    map(
+                        1, named_struct('a', number * 10, 'b', (number * 10 + 
number % 5) * 1.0),
+                        2, named_struct('a', number * 10 + 20, 'b', (number % 
10 + 1) * 10.0)
+                    ),
+                    map(
+                        (number % 3 + 1), named_struct('a', number * 5, 'b', 
number * 2.5),
+                        (number % 5 + 2), named_struct('a', number * 3, 'b', 
number * 1.5)
+                    )
+                ),
+                'value',
+                number
+            ) as s
+        from numbers("number" = "3000");
     """
 
     qt_sql """
-        select * from `tbl_test_pruned_columns` order by 1;
+        select element_at(s, 'city'), count() from `tbl_test_pruned_columns` 
group by element_at(s, 'city') order by 1, 2;
     """
 
     qt_sql1 """
-        select b.id, array_map(x -> element_at(map_values(x)[1], 'a'), 
element_at(s, 'data')) from `tbl_test_pruned_columns` t join (select 1 id) b on 
t.id = b.id order by 1;
+        select
+            b.id
+            , array_map(x -> element_at(map_values(x)[1], 'a')
+            , element_at(s, 'data'))
+        from `tbl_test_pruned_columns` t join (select 1 id) b on t.id = b.id
+        order by 1, 2 limit 0, 20;
+    """
+
+    qt_sql1_1 """
+        select
+            b.id
+            , array_map(x -> element_at(map_values(x)[1], 'a')
+            , element_at(s, 'data'))
+        from `tbl_test_pruned_columns` t join (select 1 id) b on t.id = b.id
+        order by 1, 2 limit 100, 20;
+    """
+
+    qt_sql1_2 """
+        select
+            b.id
+            , array_map(x -> element_at(map_values(x)[1], 'a')
+            , element_at(s, 'data'))
+        from `tbl_test_pruned_columns` t join (select 1 id) b on t.id = b.id
+        order by 1 desc, 2 limit 100, 20;
     """
 
     qt_sql2 """
-        select id, element_at(s, 'city') from `tbl_test_pruned_columns` order 
by 1;
+        select id, element_at(s, 'city') from `tbl_test_pruned_columns` order 
by 1 limit 0, 20;
+    """
+
+    qt_sql2_1 """
+        select id, element_at(s, 'city') from `tbl_test_pruned_columns` order 
by 1 limit 100, 20;
+    """
+
+    qt_sql2_2 """
+        select id, element_at(s, 'city') from `tbl_test_pruned_columns` order 
by 1 desc limit 0, 20;
     """
 
     qt_sql3 """
-        select id, element_at(s, 'data') from `tbl_test_pruned_columns` order 
by 1;
+        select id, element_at(s, 'data') from `tbl_test_pruned_columns` order 
by 1 limit 0, 20;
+    """
+
+    qt_sql3_1 """
+        select id, element_at(s, 'data') from `tbl_test_pruned_columns` order 
by 1 limit 200, 20;
+    """
+
+    qt_sql3_2 """
+        select id, element_at(s, 'data') from `tbl_test_pruned_columns` order 
by 1 desc limit 0, 20;
     """
 
     qt_sql4 """
-        select id, element_at(s, 'data') from `tbl_test_pruned_columns` where 
element_at(element_at(s, 'data')[1][2], 'b') = 40 order by 1;
+        select
+            id
+            , element_at(s, 'data')
+        from `tbl_test_pruned_columns`
+        where element_at(element_at(s, 'data')[1][2], 'b') = 40
+        order by 1 limit 0, 20;
+    """
+
+    qt_sql4_1 """
+        select
+            id
+            , element_at(s, 'data')
+        from `tbl_test_pruned_columns`
+        where element_at(element_at(s, 'data')[1][2], 'b') = 40
+        order by 1 limit 100, 20;
+    """
+
+    qt_sql4_2 """
+        select
+            id
+            , element_at(s, 'data')
+        from `tbl_test_pruned_columns`
+        where element_at(element_at(s, 'data')[1][2], 'b') = 40
+        order by 1 desc limit 0, 20;
     """
 
     qt_sql5 """
-        select id, element_at(s, 'city') from `tbl_test_pruned_columns` where 
element_at(element_at(s, 'data')[1][2], 'b') = 40 order by 1;
+        select
+            id
+            , element_at(s, 'city')
+        from `tbl_test_pruned_columns`
+        where element_at(element_at(s, 'data')[1][2], 'b') = 40
+        order by 1, 2 limit 0, 20;
     """
 
     qt_sql5_1 """
-        select /*+ set enable_prune_nested_column = 1; */ sum(s.value) from 
`tbl_test_pruned_columns` where id in(1,2,3,4,8,9,10,11,13);
+        select
+            id
+            , element_at(s, 'city')
+        from `tbl_test_pruned_columns`
+        where element_at(element_at(s, 'data')[1][2], 'b') = 40
+        order by 1, 2 limit 100, 20;
     """
 
     qt_sql5_2 """
+        select
+            id
+            , element_at(s, 'city')
+        from `tbl_test_pruned_columns`
+        where element_at(element_at(s, 'data')[1][2], 'b') = 40
+        order by 1 desc, 2 limit 0, 20;
+    """
+
+    qt_sql5_3 """
+        select /*+ set enable_prune_nested_column = 1; */ sum(s.value) from 
`tbl_test_pruned_columns` where id in(1,2,3,4,8,9,10,11,13);
+    """
+
+    qt_sql5_4 """
         select /*+ set enable_prune_nested_column = 0; */ sum(s.value) from 
`tbl_test_pruned_columns` where id in(1,2,3,4,8,9,10,11,13);

Review Comment:
   This hint does not set the session variable. The Nereids parser only applies 
per-query variables through `SET_VAR(...)`, so `/*+ set 
enable_prune_nested_column = 0; */` is ignored and this case still runs with 
the default `enable_prune_nested_column=true`. Please use `/*+ 
SET_VAR(enable_prune_nested_column=false) */` or explicit `set 
enable_prune_nested_column=false` / restore statements so the disabled path is 
actually covered.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to