This is an automated email from the ASF dual-hosted git repository.
csun5285 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new d6747286c9c [fix](scan) read delete-condition columns in full to avoid
leaking deleted rows (#64601)
d6747286c9c is described below
commit d6747286c9c8ef6486e90fc47533b9413ce81890
Author: Chenyang Sun <[email protected]>
AuthorDate: Thu Jun 18 18:43:03 2026 +0800
[fix](scan) read delete-condition columns in full to avoid leaking deleted
rows (#64601)
Strip meta-only (OFFSET/NULL) access paths for delete-condition columns
in TabletReader so those columns fall back to a full read and the delete
predicate is evaluated against the real values.
---
be/src/storage/tablet/tablet_reader.cpp | 27 +++++
be/src/storage/tablet/tablet_reader.h | 7 ++
be/test/storage/tablet_reader_test.cpp | 131 +++++++++++++++++++++
.../delete_p0/test_offset_only_delete_leak.out | 22 ++++
.../delete_p0/test_offset_only_delete_leak.groovy | 57 +++++++++
5 files changed, 244 insertions(+)
diff --git a/be/src/storage/tablet/tablet_reader.cpp
b/be/src/storage/tablet/tablet_reader.cpp
index 46fcfc8c5b4..2e7e73a3676 100644
--- a/be/src/storage/tablet/tablet_reader.cpp
+++ b/be/src/storage/tablet/tablet_reader.cpp
@@ -27,7 +27,9 @@
#include <memory>
#include <numeric>
#include <ostream>
+#include <set>
#include <shared_mutex>
+#include <unordered_map>
#include "common/compiler_util.h" // IWYU pragma: keep
#include "common/config.h"
@@ -43,10 +45,12 @@
#include "runtime/query_context.h"
#include "runtime/runtime_predicate.h"
#include "runtime/runtime_state.h"
+#include "storage/delete/delete_handler.h"
#include "storage/index/bloom_filter/bloom_filter.h"
#include "storage/itoken_extractor.h"
#include "storage/olap_common.h"
#include "storage/olap_define.h"
+#include "storage/predicate/block_column_predicate.h"
#include "storage/predicate/column_predicate.h"
#include "storage/predicate/like_column_predicate.h"
#include "storage/predicate/predicate_creator.h"
@@ -77,6 +81,21 @@ Status TabletReader::init(const ReaderParams& read_params) {
return res;
}
+void TabletReader::remove_delete_columns_from_access_paths(
+ const DeleteHandler& delete_handler, const TabletSchema& tablet_schema,
+ std::map<int32_t, TColumnAccessPaths>& all_access_paths) {
+ auto delete_predicates = AndBlockColumnPredicate::create_shared();
+ std::unordered_map<int32_t, std::vector<std::shared_ptr<const
ColumnPredicate>>>
+ del_predicates_for_zone_map;
+ delete_handler.get_delete_conditions_after_version(0,
delete_predicates.get(),
+
&del_predicates_for_zone_map);
+ std::set<ColumnId> delete_column_ids;
+ delete_predicates->get_all_column_ids(delete_column_ids);
+ for (auto cid : delete_column_ids) {
+ all_access_paths.erase(tablet_schema.column(cid).unique_id());
+ }
+}
+
Status TabletReader::_capture_rs_readers(const ReaderParams& read_params) {
SCOPED_RAW_TIMER(&_stats.tablet_reader_capture_rs_readers_timer_ns);
if (read_params.rs_splits.empty()) {
@@ -199,6 +218,14 @@ Status TabletReader::_capture_rs_readers(const
ReaderParams& read_params) {
_reader_context.all_access_paths = read_params.all_access_paths;
_reader_context.predicate_access_paths =
read_params.predicate_access_paths;
+ // Force a full read of delete-condition columns: the FE can't see storage
deletes and may
+ // mark them meta-only (OFFSET/NULL), whose content-less read makes the
delete predicate
+ // match nothing and leak deleted rows.
+ if (!_delete_handler.empty() && !_reader_context.all_access_paths.empty())
{
+ remove_delete_columns_from_access_paths(_delete_handler,
*_tablet_schema,
+
_reader_context.all_access_paths);
+ }
+
// Propagate general read limit for DUP_KEYS and UNIQUE_KEYS with MOW
_reader_context.general_read_limit = read_params.general_read_limit;
diff --git a/be/src/storage/tablet/tablet_reader.h
b/be/src/storage/tablet/tablet_reader.h
index aad3920639f..3d03caf6d74 100644
--- a/be/src/storage/tablet/tablet_reader.h
+++ b/be/src/storage/tablet/tablet_reader.h
@@ -274,6 +274,13 @@ public:
const std::vector<RowsetSharedPtr>& input_rowsets,
TabletReader::ReaderParams* reader_params, Block* block);
+ // Remove the delete-condition columns from `all_access_paths` so they
fall back to a full
+ // read (a meta-only read would make the storage delete predicate match
nothing and leak
+ // deleted rows).
+ static void remove_delete_columns_from_access_paths(
+ const DeleteHandler& delete_handler, const TabletSchema&
tablet_schema,
+ std::map<int32_t, TColumnAccessPaths>& all_access_paths);
+
protected:
friend class VCollectIterator;
friend class DeleteHandler;
diff --git a/be/test/storage/tablet_reader_test.cpp
b/be/test/storage/tablet_reader_test.cpp
new file mode 100644
index 00000000000..6c7db4fcb0c
--- /dev/null
+++ b/be/test/storage/tablet_reader_test.cpp
@@ -0,0 +1,131 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "storage/tablet/tablet_reader.h"
+
+#include <gen_cpp/olap_file.pb.h>
+#include <gtest/gtest.h>
+
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "storage/delete/delete_handler.h"
+#include "storage/rowset/rowset_meta.h"
+#include "storage/tablet/tablet_schema.h"
+
+namespace doris {
+
+class TabletReaderTest : public testing::Test {
+protected:
+ static TabletSchemaSPtr create_schema(
+ const std::vector<std::pair<std::string, int32_t>>& name_and_uid) {
+ TabletSchemaPB schema_pb;
+ schema_pb.set_keys_type(KeysType::DUP_KEYS);
+ bool first = true;
+ for (const auto& [name, uid] : name_and_uid) {
+ auto* col = schema_pb.add_column();
+ col->set_unique_id(uid);
+ col->set_name(name);
+ col->set_type("INT");
+ col->set_is_key(first);
+ col->set_is_nullable(!first);
+ first = false;
+ }
+ auto schema = std::make_shared<TabletSchema>();
+ schema->init_from_pb(schema_pb);
+ return schema;
+ }
+
+ // Build a DeleteHandler initialized with a single delete-predicate rowset.
+ static void init_delete_handler(DeleteHandler& handler, const
TabletSchemaSPtr& schema,
+ const DeletePredicatePB& delete_predicate)
{
+ auto rs_meta = std::make_shared<RowsetMeta>();
+ rs_meta->set_tablet_schema(schema);
+ rs_meta->set_version(Version(2, 2));
+ rs_meta->set_delete_predicate(delete_predicate);
+ ASSERT_TRUE(handler.init(schema, {rs_meta}, /*version=*/100).ok());
+ }
+};
+
+// The delete columns (resolved by the delete handler to field ids) are mapped
back to their
+// column unique ids and stripped from all_access_paths; unrelated columns
keep their paths.
+TEST_F(TabletReaderTest, remove_delete_columns_from_access_paths) {
+ auto schema = create_schema({{"k1", 10}, {"k2", 11}, {"v1", 12}, {"v2",
13}, {"v4", 15}});
+
+ DeletePredicatePB delete_predicate;
+ auto* p1 = delete_predicate.add_sub_predicates_v2(); // k1 -> erase 10
+ p1->set_column_name("k1");
+ p1->set_column_unique_id(10);
+ p1->set_op("=");
+ p1->set_cond_value("1");
+ auto* p2 = delete_predicate.add_sub_predicates_v2(); // v1 -> erase 12
+ p2->set_column_name("v1");
+ p2->set_column_unique_id(12);
+ p2->set_op("=");
+ p2->set_cond_value("2");
+ auto* in1 = delete_predicate.add_in_predicates(); // k2 IN (...) -> erase
11
+ in1->set_column_name("k2");
+ in1->set_column_unique_id(11);
+ in1->set_is_not_in(false);
+ in1->add_values("3");
+ in1->add_values("4");
+
+ DeleteHandler handler;
+ init_delete_handler(handler, schema, delete_predicate);
+
+ std::map<int32_t, TColumnAccessPaths> access_paths;
+ for (int32_t uid : {10, 11, 12, 13, 15}) {
+ access_paths[uid] = TColumnAccessPaths {};
+ }
+
+ TabletReader::remove_delete_columns_from_access_paths(handler, *schema,
access_paths);
+
+ EXPECT_EQ(size_t(2), access_paths.size());
+ EXPECT_EQ(size_t(1), access_paths.count(13)) << "non-delete column must
keep its access path";
+ EXPECT_EQ(size_t(1), access_paths.count(15)) << "non-delete column must
keep its access path";
+ for (int32_t uid : {10, 11, 12}) {
+ EXPECT_EQ(size_t(0), access_paths.count(uid)) << "delete column " <<
uid << " not erased";
+ }
+}
+
+// A delete condition whose column has no access path leaves the map untouched.
+TEST_F(TabletReaderTest, remove_delete_columns_keeps_unrelated_paths) {
+ auto schema = create_schema({{"k1", 10}, {"v1", 12}});
+
+ DeletePredicatePB delete_predicate;
+ auto* p1 = delete_predicate.add_sub_predicates_v2();
+ p1->set_column_name("k1");
+ p1->set_column_unique_id(10);
+ p1->set_op("=");
+ p1->set_cond_value("1");
+
+ DeleteHandler handler;
+ init_delete_handler(handler, schema, delete_predicate);
+
+ std::map<int32_t, TColumnAccessPaths> access_paths;
+ access_paths[12] = TColumnAccessPaths {};
+ access_paths[15] = TColumnAccessPaths {};
+
+ TabletReader::remove_delete_columns_from_access_paths(handler, *schema,
access_paths);
+
+ EXPECT_EQ(size_t(2), access_paths.size());
+}
+
+} // namespace doris
diff --git a/regression-test/data/delete_p0/test_offset_only_delete_leak.out
b/regression-test/data/delete_p0/test_offset_only_delete_leak.out
new file mode 100644
index 00000000000..69a018babd7
--- /dev/null
+++ b/regression-test/data/delete_p0/test_offset_only_delete_leak.out
@@ -0,0 +1,22 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !count --
+7
+
+-- !length_ge_0 --
+7
+
+-- !length_gt_0 --
+5
+
+-- !empty_string --
+2
+
+-- !full_read --
+10 aa
+2 aa
+4 bb
+5
+6 aa
+8 cc
+9
+
diff --git
a/regression-test/suites/delete_p0/test_offset_only_delete_leak.groovy
b/regression-test/suites/delete_p0/test_offset_only_delete_leak.groovy
new file mode 100644
index 00000000000..e3c8d4b0672
--- /dev/null
+++ b/regression-test/suites/delete_p0/test_offset_only_delete_leak.groovy
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// When a query only needs a column's length (length(v), or v='' which is
rewritten to
+// length=0), a dict-encoded varchar column is read in OFFSET_ONLY mode, which
only
+// materializes lengths and leaves the char content empty. If that same column
also
+// carries a DELETE condition, the storage-side delete predicate compares the
empty
+// content, matches nothing, and the deleted rows leak back into the scan
output.
+suite("test_offset_only_delete_leak", "p0") {
+ sql "drop table if exists test_offset_only_delete_leak"
+ sql """
+ create table test_offset_only_delete_leak (
+ k int,
+ v varchar(50)
+ ) duplicate key(k)
+ distributed by hash(k) buckets 1
+ properties("replication_num" = "1")
+ """
+
+ // low cardinality -> the column gets dictionary encoded
+ sql """
+ insert into test_offset_only_delete_leak values
+ (1,'--'),(2,'aa'),(3,'--'),(4,'bb'),(5,''),
+ (6,'aa'),(7,'--'),(8,'cc'),(9,''),(10,'aa')
+ """
+
+ // delete the three '--' rows (k = 1,3,7); kept as a delete predicate
applied at read time.
+ sql "delete from test_offset_only_delete_leak where v = '--'"
+
+ // plain count does not read v.
+ qt_count "select count(*) from test_offset_only_delete_leak"
+
+ // length(v) only needs v's length -> OFFSET_ONLY read; the delete on v
must still apply,
+ // so the deleted '--' rows must not be counted.
+ qt_length_ge_0 "select count(*) from test_offset_only_delete_leak where
length(v) >= 0"
+ qt_length_gt_0 "select count(*) from test_offset_only_delete_leak where
length(v) > 0"
+
+ // v = '' is rewritten to length(v) = 0 -> also OFFSET_ONLY.
+ qt_empty_string "select count(*) from test_offset_only_delete_leak where v
= ''"
+
+ // outputting v forces a full read; the deleted rows must never appear
either way.
+ order_qt_full_read "select k, v from test_offset_only_delete_leak where
length(v) >= 0 order by k, v"
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]