This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch vector-index-dev
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/vector-index-dev by this push:
new 706c8d6b4fb fix delete condition with virtual column (#52240)
706c8d6b4fb is described below
commit 706c8d6b4fbb54d4214b61a53a88aa58d78d3ea9
Author: zhiqiang <[email protected]>
AuthorDate: Wed Jun 25 14:52:30 2025 +0800
fix delete condition with virtual column (#52240)
### What problem does this PR solve?
Issue Number: close #xxx
Related PR: #xxx
Problem Summary:
### Release note
None
### Check List (For Author)
- Test <!-- At least one of them must be included. -->
- [ ] Regression test
- [ ] Unit Test
- [ ] Manual test (add detailed scripts or steps below)
- [ ] No need to test or manual test. Explain why:
- [ ] This is a refactor/code format and no logic has been changed.
- [ ] Previous test can cover this change.
- [ ] No code files have been changed.
- [ ] Other reason <!-- Add your reason? -->
- Behavior changed:
- [ ] No.
- [ ] Yes. <!-- Explain the behavior change -->
- Does this need documentation?
- [ ] No.
- [ ] Yes. <!-- Add document PR link here. eg:
https://github.com/apache/doris-website/pull/1214 -->
### Check List (For Reviewer who merge this PR)
- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR
should merge into -->
---
be/src/olap/rowset/beta_rowset_reader.cpp | 9 +-
be/src/olap/rowset/segment_v2/segment_iterator.cpp | 103 ++++++++++++++-------
regression-test/data/ann_index_p0/delete_where.out | Bin 0 -> 253 bytes
.../suites/ann_index_p0/delete_where.groovy | 49 ++++++++++
4 files changed, 122 insertions(+), 39 deletions(-)
diff --git a/be/src/olap/rowset/beta_rowset_reader.cpp
b/be/src/olap/rowset/beta_rowset_reader.cpp
index 4265a9b1255..ad2913e6956 100644
--- a/be/src/olap/rowset/beta_rowset_reader.cpp
+++ b/be/src/olap/rowset/beta_rowset_reader.cpp
@@ -128,8 +128,9 @@ Status
BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context
std::vector<ColumnId> read_columns;
// std::vector<SlotDescriptor*>* return_columns =
_read_context->return_columns;
- std::set<uint32_t> read_columns_set;
- std::set<uint32_t> delete_columns_set;
+ std::set<ColumnId> read_columns_set;
+ std::set<ColumnId> delete_columns_set;
+ // all columns used fro delete condition all added to the read columns.
read_columns.insert(read_columns.end(),
_read_context->return_columns->begin(),
_read_context->return_columns->end());
read_columns_set.insert(_read_context->return_columns->begin(),
@@ -141,8 +142,8 @@ Status
BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context
}
}
VLOG_NOTICE << "read columns size: " << read_columns.size();
- LOG_INFO("Tablet columns {}, read columns size: {}",
- _read_context->tablet_schema->num_columns(), read_columns.size());
+ LOG_INFO("After add delete predicates, tablet columns count {}, read
columns cid [{}]",
+ _read_context->tablet_schema->num_columns(),
fmt::join(read_columns, ", "));
_input_schema =
std::make_shared<Schema>(_read_context->tablet_schema->columns(), read_columns);
if (_read_context->predicates != nullptr) {
_read_options.column_predicates.insert(_read_options.column_predicates.end(),
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 790c9362ffe..3e8120e3ce5 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -418,6 +418,9 @@ Status SegmentIterator::_lazy_init() {
if (!_opts.row_ranges.is_empty()) {
_row_bitmap &= RowRanges::ranges_to_roaring(_opts.row_ranges);
}
+
+ RETURN_IF_ERROR(_apply_ann_topn_predicate());
+
if (_opts.read_orderby_key_reverse) {
_range_iter.reset(new BackwardBitmapRangeIterator(_row_bitmap));
} else {
@@ -604,8 +607,6 @@ Status
SegmentIterator::_get_row_ranges_by_column_conditions() {
_opts.stats->rows_conditions_filtered += (pre_size -
_row_bitmap.cardinality());
}
- RETURN_IF_ERROR(_apply_ann_topn_predicate());
-
// TODO(hkp): calculate filter rate to decide whether to
// use zone map/bloom filter/secondary index or not.
return Status::OK();
@@ -620,14 +621,15 @@ Status SegmentIterator::_apply_ann_topn_predicate() {
size_t src_col_idx = _ann_topn_runtime->get_src_column_idx();
ColumnId src_cid = _schema->column_id(src_col_idx);
IndexIterator* ann_index_iterator = _index_iterators[src_cid].get();
-
- if (ann_index_iterator == nullptr || !_common_expr_ctxs_push_down.empty()
||
- !_col_predicates.empty()) {
+ bool has_ann_index = ann_index_iterator != nullptr;
+ bool has_common_expr_push_down = !_common_expr_ctxs_push_down.empty();
+ bool has_column_predicate = std::any_of(_is_pred_column.begin(),
_is_pred_column.end(),
+ [](bool is_pred) { return is_pred;
});
+ if (!has_ann_index || has_common_expr_push_down || has_column_predicate) {
VLOG_DEBUG << fmt::format(
- "Can not apply ann topn, has index iterators: {}, has common
expr ctxs "
- "push down: {}, has column predicates: {}",
- _index_iterators.size(), !_common_expr_ctxs_push_down.empty(),
- !_col_predicates.empty());
+ "Ann topn can not be evaluated by ann index, has_ann_index:
{}, "
+ "has_common_expr_push_down: {}, has_column_predicate: {}",
+ has_ann_index, has_common_expr_push_down,
has_column_predicate);
return Status::OK();
}
@@ -867,6 +869,9 @@ bool SegmentIterator::_is_literal_node(const
TExprNodeType::type& node_type) {
}
// TODO:Unit test.
+// Get all slot refs from expr, and mark them as common expr columns.
+// If expr is a virtual slot ref, get the slot ref from the column expr,
virtual slot ref it self is not
+// regarded as common expr column.
Status SegmentIterator::_extract_common_expr_columns(const
vectorized::VExprSPtr& expr) {
auto& children = expr->children();
for (int i = 0; i < children.size(); ++i) {
@@ -1586,9 +1591,14 @@ Status SegmentIterator::_vec_init_lazy_materialization()
{
_is_need_short_eval = true;
}
- // make _schema_block_id_map
// ColumnId to column index in block
- _schema_block_id_map.resize(_schema->columns().size());
+ // ColumnId will contail all columns in tablet schema, including virtual
columns and global rowid column,
+ _schema_block_id_map.resize(_schema->columns().size(), -1);
+ // Use cols read by query to initialize _schema_block_id_map.
+ // We need to know the index of each column in the block.
+ // There is an assumption here that the columns in the block are in the
same order as in the read schema.
+ // TODO: A probelm is that, delete condition columns will exist in
_schema->column_ids but not in block if
+ // delete column is not read by the query.
for (int i = 0; i < _schema->num_column_ids(); i++) {
auto cid = _schema->column_id(i);
_schema_block_id_map[cid] = i;
@@ -1699,10 +1709,12 @@ Status
SegmentIterator::_vec_init_lazy_materialization() {
"_cols_read_by_column_predicate: [{}], "
"_cols_not_included_by_any_predicates: [{}], "
"_cols_read_by_common_expr: [{}], "
- "columns_to_filter: [{}]",
+ "columns_to_filter: [{}], "
+ "_schema_block_id_map: [{}]",
_lazy_materialization_read,
fmt::join(_cols_read_by_column_predicate, ","),
fmt::join(_cols_not_included_by_any_predicates, ","),
- fmt::join(_cols_read_by_common_expr, ","),
fmt::join(_columns_to_filter, ","));
+ fmt::join(_cols_read_by_common_expr, ","),
fmt::join(_columns_to_filter, ","),
+ fmt::join(_schema_block_id_map, ","));
return Status::OK();
}
@@ -1856,7 +1868,8 @@ Status
SegmentIterator::_init_return_columns(vectorized::Block* block, uint32_t
for (auto entry : _virtual_column_exprs) {
auto cid = entry.first;
- _current_return_columns[cid] =
vectorized::ColumnNothing::create(nrows_read_limit);
+ _current_return_columns[cid] = vectorized::ColumnNothing::create(0);
+ _current_return_columns[cid]->reserve(nrows_read_limit);
}
return Status::OK();
@@ -1864,8 +1877,17 @@ Status
SegmentIterator::_init_return_columns(vectorized::Block* block, uint32_t
void SegmentIterator::_output_non_pred_columns(vectorized::Block* block) {
SCOPED_RAW_TIMER(&_opts.stats->output_col_ns);
+ VLOG_DEBUG << fmt::format(
+ "Output non-predicate columns,
_cols_not_included_by_any_predicates: [{}], "
+ "_schema_block_id_map: [{}]",
+ fmt::join(_cols_not_included_by_any_predicates, ","),
+ fmt::join(_schema_block_id_map, ","));
for (auto cid : _cols_not_included_by_any_predicates) {
auto loc = _schema_block_id_map[cid];
+ if (vectorized::check_and_get_column<const vectorized::ColumnNothing>(
+ _current_return_columns[cid].get())) {
+ VLOG_DEBUG << fmt::format("Column {} of pos {} will be
ColumnNothing.", cid, loc);
+ }
// if loc > block->columns() means the column is delete column and
should
// not output by block, so just skip the column.
if (loc < block->columns()) {
@@ -1901,7 +1923,10 @@ Status SegmentIterator::_read_columns_by_index(uint32_t
nrows_read_limit, uint32
nrows_read = _range_iter->read_batch_rowids(_block_rowids.data(),
nrows_read_limit);
bool is_continuous = (nrows_read > 1) &&
(_block_rowids[nrows_read - 1] - _block_rowids[0] ==
nrows_read - 1);
- LOG_INFO("nrows_read from range iterator: {}, is_continus {}", nrows_read,
is_continuous);
+ VLOG_DEBUG << fmt::format(
+ "nrows_read from range iterator: {}, is_continus {},
_cols_read_by_column_predicate "
+ "[{}]",
+ nrows_read, is_continuous,
fmt::join(_cols_read_by_column_predicate, ","));
for (auto cid : _cols_read_by_column_predicate) {
auto& column = _current_return_columns[cid];
@@ -2401,7 +2426,7 @@ Status
SegmentIterator::_next_batch_internal(vectorized::Block* block) {
// to reduce cost of read short circuit columns.
// In SSB test, it make no difference; So need more
scenarios to test
selected_size =
_evaluate_short_circuit_predicate(_sel_rowid_idx.data(), selected_size);
-
+ LOG_INFO("After evaluate predicates, selected size: {} ",
selected_size);
if (selected_size > 0) {
// step 3.1: output short circuit and predicate column
// when lazy materialization enables, _predicate_column_ids =
distinct(_short_cir_pred_column_ids + _vec_pred_column_ids)
@@ -2424,19 +2449,13 @@ Status
SegmentIterator::_next_batch_internal(vectorized::Block* block) {
_cols_read_by_common_expr.end()) {
_replace_version_col(selected_size);
}
+
RETURN_IF_ERROR(_convert_to_expected_type(_cols_read_by_common_expr));
for (auto cid : _cols_read_by_common_expr) {
auto loc = _schema_block_id_map[cid];
block->replace_by_position(loc,
std::move(_current_return_columns[cid]));
}
-
- for (const auto pair : _vir_cid_to_idx_in_block) {
- auto cid = pair.first;
- auto loc = pair.second;
- block->replace_by_position(loc,
-
std::move(_current_return_columns[cid]));
- }
}
DCHECK(block->columns() >
_schema_block_id_map[*_common_expr_columns.begin()]);
@@ -2465,17 +2484,31 @@ Status
SegmentIterator::_next_batch_internal(vectorized::Block* block) {
_execute_common_expr(_sel_rowid_idx.data(),
selected_size, block));
}
}
- } else if (_is_need_expr_eval) {
-
RETURN_IF_ERROR(_convert_to_expected_type(_cols_read_by_common_expr));
- for (auto cid : _cols_read_by_common_expr) {
- auto loc = _schema_block_id_map[cid];
- block->replace_by_position(loc,
std::move(_current_return_columns[cid]));
- }
-
+ } else {
+ // If column_predicate filters out all rows, the corresponding
column in _current_return_columns[cid] must be a ColumnNothing.
+ // Because:
+ // 1. Before each batch, _init_return_columns is called to
initialize _current_return_columns, and virtual columns in
_current_return_columns are initialized as ColumnNothing.
+ // 2. When select_size == 0, the read method of
VirtualColumnIterator will definitely not be called, so the corresponding
Column remains a ColumnNothing
for (const auto pair : _vir_cid_to_idx_in_block) {
auto cid = pair.first;
- auto loc = pair.second;
- block->replace_by_position(loc,
std::move(_current_return_columns[cid]));
+ auto pos = pair.second;
+ const vectorized::ColumnNothing* nothing_col =
+
vectorized::check_and_get_column<vectorized::ColumnNothing>(
+ _current_return_columns[cid].get());
+ DCHECK(nothing_col != nullptr)
+ << fmt::format("ColumnNothing expected, but got
{}, cid: {}, pos: {}",
+
_current_return_columns[cid]->get_name(), cid, pos);
+ _current_return_columns[cid] =
_opts.vir_col_idx_to_type[pos]->create_column();
+ }
+
+ if (_is_need_expr_eval) {
+ // rows of this batch are all filtered by column
predicates.
+
RETURN_IF_ERROR(_convert_to_expected_type(_cols_read_by_common_expr));
+
+ for (auto cid : _cols_read_by_common_expr) {
+ auto loc = _schema_block_id_map[cid];
+ block->replace_by_position(loc,
std::move(_current_return_columns[cid]));
+ }
}
}
} else if (_is_need_expr_eval) {
@@ -2868,8 +2901,8 @@ bool SegmentIterator::_can_opt_topn_reads() {
return all_true;
}
+// Before get next batch. make sure all virtual columns in block has type
ColumnNothing.
void SegmentIterator::_init_virtual_columns(vectorized::Block* block) {
- // Before get next batch. make sure all virtual columns has type
ColumnNothing.
for (const auto& pair : _vir_cid_to_idx_in_block) {
auto& col_with_type_and_name = block->get_by_position(pair.second);
col_with_type_and_name.column = vectorized::ColumnNothing::create(0);
@@ -2906,8 +2939,8 @@ Status
SegmentIterator::_materialization_of_virtual_column(vectorized::Block* bl
if (vectorized::check_and_get_column<const vectorized::ColumnNothing>(
block->get_by_position(idx_in_block).column.get())) {
- LOG_INFO("Virtual column is doing materialization, cid {},
column_expr {}", cid,
- column_expr->root()->debug_string());
+ LOG_INFO("Virtual column is doing materialization, cid {}, col idx
{}", cid,
+ idx_in_block);
int result_cid = -1;
RETURN_IF_ERROR(column_expr->execute(block, &result_cid));
diff --git a/regression-test/data/ann_index_p0/delete_where.out
b/regression-test/data/ann_index_p0/delete_where.out
new file mode 100644
index 00000000000..9f88b492f3f
Binary files /dev/null and b/regression-test/data/ann_index_p0/delete_where.out
differ
diff --git a/regression-test/suites/ann_index_p0/delete_where.groovy
b/regression-test/suites/ann_index_p0/delete_where.groovy
new file mode 100644
index 00000000000..fd9a54a6a15
--- /dev/null
+++ b/regression-test/suites/ann_index_p0/delete_where.groovy
@@ -0,0 +1,49 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("delete_where_with_ann") {
+ sql "drop table if exists delete_where_with_ann"
+ test {
+ sql """
+ CREATE TABLE delete_where_with_ann (
+ id INT NOT NULL COMMENT "",
+ vec ARRAY<FLOAT> NOT NULL COMMENT "",
+ value INT NULL COMMENT "",
+ INDEX ann_idx (vec) USING ANN PROPERTIES(
+ "index_type" = "hnsw",
+ "metric_type" = "l2_distance",
+ "dim" = "3"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id) COMMENT "OLAP"
+ DISTRIBUTED BY HASH(id) BUCKETS AUTO
+ PROPERTIES (
+ "replication_num" = "1"
+ );
+ """
+ }
+
+ sql "insert into delete_where_with_ann values (1, [1.0, 2.0, 3.0], 11),(2,
[4.0, 5.0, 6.0], 22),(3, [7.0, 8.0, 9.0], 33)"
+
+ qt_sql_1 "select * from delete_where_with_ann order by id"
+
+ sql "delete from delete_where_with_ann where id = 1"
+
+ qt_sql_2 "select * from delete_where_with_ann order by id"
+
+ qt_sql_3 "select id, l2_distance_approximate(vec, [1.0, 2.0, 3.0]) as dist
from delete_where_with_ann order by dist limit 2;"
+}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]