This is an automated email from the ASF dual-hosted git repository.

zhangchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 2bedb15b04b [Fix](cloud-mow) Remove potential existing split delete 
bitmap KVs before update them in schema change (#51353)
2bedb15b04b is described below

commit 2bedb15b04b7bf10f636a57796bb2daa126416b2
Author: bobhan1 <[email protected]>
AuthorDate: Wed Jun 4 23:39:13 2025 +0800

    [Fix](cloud-mow) Remove potential existing split delete bitmap KVs before 
update them in schema change (#51353)
    
    ### What problem does this PR solve?
    
    If a schema change job failed and retry on BE, the previous failed sc
    job will not clear its delete bitmap KVs written in MS. So later retry
    may update existing delete bitmap KVs on new tablet. Considering that
    delete bitmap KVs will be split into multiple KVs when too large, we may
    get wrong delete bitmaps if we don't remove the previous KVs before put
    them.
    This PR use pending delete bitmaps to avoid this situation for SC.
---
 cloud/src/meta-service/meta_service.cpp     |   4 +-
 cloud/src/meta-service/meta_service_job.cpp |   6 ++
 cloud/test/meta_service_job_test.cpp        |  34 +++++++++
 cloud/test/meta_service_test.cpp            | 111 ++++++++++++++++++++++++++++
 4 files changed, 153 insertions(+), 2 deletions(-)

diff --git a/cloud/src/meta-service/meta_service.cpp 
b/cloud/src/meta-service/meta_service.cpp
index f2baa7fd5a9..3dcf166c0b2 100644
--- a/cloud/src/meta-service/meta_service.cpp
+++ b/cloud/src/meta-service/meta_service.cpp
@@ -2208,13 +2208,13 @@ void 
MetaServiceImpl::update_delete_bitmap(google::protobuf::RpcController* cont
         }
     }
 
-    // no need to record pending key for compaction or schema change,
+    // no need to record pending key for compaction,
     // because delete bitmap will attach to new rowset, just delete new rowset 
if failed
     // lock_id > 0 : load
     // lock_id = -1 : compaction
     // lock_id = -2 : schema change
     // lock_id = -3 : compaction update delete bitmap without lock
-    if (request->lock_id() > 0) {
+    if (request->lock_id() > 0 || request->lock_id() == -2) {
         std::string pending_val;
         if (is_explicit_txn && !is_first_sub_txn) {
             // put current delete bitmap keys and previous sub txns' into 
tablet's pending delete bitmap KV
diff --git a/cloud/src/meta-service/meta_service_job.cpp 
b/cloud/src/meta-service/meta_service_job.cpp
index 8ebaf0d9baa..0ff10ed5a86 100644
--- a/cloud/src/meta-service/meta_service_job.cpp
+++ b/cloud/src/meta-service/meta_service_job.cpp
@@ -30,6 +30,7 @@
 #include "common/logging.h"
 #include "common/util.h"
 #include "cpp/sync_point.h"
+#include "keys.h"
 #include "meta-service/keys.h"
 #include "meta-service/meta_service_helper.h"
 #include "meta-service/meta_service_tablet_stats.h"
@@ -1407,6 +1408,11 @@ void process_schema_change_job(MetaServiceCode& code, 
std::string& msg, std::str
         if (!success) {
             return;
         }
+
+        std::string pending_key = meta_pending_delete_bitmap_key({instance_id, 
new_tablet_id});
+        txn->remove(pending_key);
+        LOG(INFO) << "xxx sc remove delete bitmap pending key, pending_key=" 
<< hex(pending_key)
+                  << " tablet_id=" << new_tablet_id << "job_id=" << 
schema_change.id();
     }
 
     for (size_t i = 0; i < schema_change.txn_ids().size(); ++i) {
diff --git a/cloud/test/meta_service_job_test.cpp 
b/cloud/test/meta_service_job_test.cpp
index 9bee02691e9..f0e5cb8fe3d 100644
--- a/cloud/test/meta_service_job_test.cpp
+++ b/cloud/test/meta_service_job_test.cpp
@@ -3387,12 +3387,46 @@ TEST(MetaServiceJobTest, SchemaChangeJobWithMoWTest) {
 
         res_code = get_delete_bitmap_lock(meta_service.get(), table_id, -2, 
12345);
         ASSERT_EQ(res_code, MetaServiceCode::OK);
+
+        std::string pending_key = meta_pending_delete_bitmap_key({instance_id, 
new_tablet_id});
+        std::string pending_val;
+        std::unique_ptr<Transaction> txn;
+        ASSERT_EQ(meta_service->txn_kv()->create_txn(&txn), 
TxnErrorCode::TXN_OK);
+        ASSERT_EQ(txn->get(pending_key, &pending_val), 
TxnErrorCode::TXN_KEY_NOT_FOUND);
+
+        res_code = update_delete_bitmap(meta_service.get(), table_id, 
partition_id, new_tablet_id,
+                                        -2, 12345);
+        ASSERT_EQ(res_code, MetaServiceCode::OK);
+
+        // schema change job should write pending delete bitmap key
+        ASSERT_EQ(meta_service->txn_kv()->create_txn(&txn), 
TxnErrorCode::TXN_OK);
+        ASSERT_EQ(txn->get(pending_key, &pending_val), TxnErrorCode::TXN_OK);
+        PendingDeleteBitmapPB pending_info;
+        ASSERT_TRUE(pending_info.ParseFromString(pending_val));
+        ASSERT_EQ(pending_info.delete_bitmap_keys_size(), 3);
+        for (int i = 0; i < 3; ++i) {
+            std::string_view k1 = pending_info.delete_bitmap_keys(i);
+            k1.remove_prefix(1);
+            std::vector<std::tuple<std::variant<int64_t, std::string>, int, 
int>> out;
+            decode_key(&k1, &out);
+            // 0x01 "meta" ${instance_id} "delete_bitmap" ${tablet_id} 
${rowset_id} ${version} ${segment_id} -> roaringbitmap
+            ASSERT_EQ(std::get<std::int64_t>(std::get<0>(out[3])), 
new_tablet_id);
+            ASSERT_EQ(std::get<std::string>(std::get<0>(out[4])),
+                      "0200000003ea308a3647dbea83220ed4b8897f2288244a91");
+            ASSERT_EQ(std::get<std::int64_t>(std::get<0>(out[5])), i);
+            ASSERT_EQ(std::get<std::int64_t>(std::get<0>(out[6])), 0);
+        }
+
         finish_schema_change_job(meta_service.get(), tablet_id, new_tablet_id, 
"job1", "be1",
                                  output_rowsets, res);
         ASSERT_EQ(res.status().code(), MetaServiceCode::OK);
         res_code = remove_delete_bitmap_lock(meta_service.get(), table_id, -2, 
12345);
         ASSERT_EQ(res_code, MetaServiceCode::LOCK_EXPIRED);
         res.Clear();
+
+        // pending delete bitmap key on new tablet should be removed after 
schema change job finishes
+        ASSERT_EQ(meta_service->txn_kv()->create_txn(&txn), 
TxnErrorCode::TXN_OK);
+        ASSERT_EQ(txn->get(pending_key, &pending_val), 
TxnErrorCode::TXN_KEY_NOT_FOUND);
     }
 
     {
diff --git a/cloud/test/meta_service_test.cpp b/cloud/test/meta_service_test.cpp
index c0acad685ce..31dd25eab75 100644
--- a/cloud/test/meta_service_test.cpp
+++ b/cloud/test/meta_service_test.cpp
@@ -5542,6 +5542,117 @@ TEST(MetaServiceTest, UpdateDeleteBitmapFailCase) {
     ASSERT_EQ(get_delete_bitmap_res.segment_delete_bitmaps(0), data1);
 }
 
+TEST(MetaServiceTest, UpdateDeleteBitmapScOverrideExistingKey) {
+    auto meta_service = get_meta_service();
+    brpc::Controller cntl;
+    size_t split_size = 90 * 1000; // see cloud/src/common/util.h
+
+    extern std::string get_instance_id(const std::shared_ptr<ResourceManager>& 
rc_mgr,
+                                       const std::string& cloud_unique_id);
+    auto instance_id = get_instance_id(meta_service->resource_mgr(), 
"test_cloud_unique_id");
+
+    {
+        // schema change should use pending delete bitmap to clear previous 
failed trials
+        int64_t db_id = 99999;
+        int64_t table_id = 1801;
+        int64_t index_id = 4801;
+        int64_t t1p1 = 2001;
+        int64_t tablet_id = 3001;
+        int64_t txn_id;
+        ASSERT_NO_FATAL_FAILURE(create_tablet_with_db_id(meta_service.get(), 
db_id, table_id,
+                                                         index_id, t1p1, 
tablet_id));
+        begin_txn_and_commit_rowset(meta_service.get(), "label11", db_id, 
table_id, t1p1, tablet_id,
+                                    &txn_id);
+        int64_t lock_id = -2;
+        int64_t initiator = 1009;
+        int64_t version = 100;
+
+        get_delete_bitmap_update_lock(meta_service.get(), table_id, t1p1, 
lock_id, initiator);
+
+        {
+            UpdateDeleteBitmapRequest update_delete_bitmap_req;
+            UpdateDeleteBitmapResponse update_delete_bitmap_res;
+            // will be splited and stored in 5 KVs
+            std::string data1(split_size * 5, 'c');
+            update_delete_bitmap(meta_service.get(), update_delete_bitmap_req,
+                                 update_delete_bitmap_res, table_id, t1p1, 
lock_id, initiator,
+                                 tablet_id, txn_id, version, data1);
+            ASSERT_EQ(update_delete_bitmap_res.status().code(), 
MetaServiceCode::OK);
+
+            GetDeleteBitmapRequest get_delete_bitmap_req;
+            GetDeleteBitmapResponse get_delete_bitmap_res;
+            get_delete_bitmap_req.set_cloud_unique_id("test_cloud_unique_id");
+            get_delete_bitmap_req.set_tablet_id(tablet_id);
+            get_delete_bitmap_req.add_rowset_ids("123");
+            get_delete_bitmap_req.add_begin_versions(0);
+            get_delete_bitmap_req.add_end_versions(version);
+            meta_service->get_delete_bitmap(
+                    reinterpret_cast<google::protobuf::RpcController*>(&cntl),
+                    &get_delete_bitmap_req, &get_delete_bitmap_res, nullptr);
+            ASSERT_EQ(get_delete_bitmap_res.status().code(), 
MetaServiceCode::OK);
+            ASSERT_EQ(get_delete_bitmap_res.rowset_ids_size(), 1);
+            ASSERT_EQ(get_delete_bitmap_res.versions_size(), 1);
+            ASSERT_EQ(get_delete_bitmap_res.segment_ids_size(), 1);
+            ASSERT_EQ(get_delete_bitmap_res.segment_delete_bitmaps_size(), 1);
+            ASSERT_EQ(get_delete_bitmap_res.segment_delete_bitmaps(0), data1);
+        }
+
+        {
+            std::string pending_key = 
meta_pending_delete_bitmap_key({instance_id, tablet_id});
+            std::string pending_val;
+            std::unique_ptr<Transaction> txn;
+            ASSERT_EQ(meta_service->txn_kv()->create_txn(&txn), 
TxnErrorCode::TXN_OK);
+            ASSERT_EQ(txn->get(pending_key, &pending_val), 
TxnErrorCode::TXN_OK);
+            PendingDeleteBitmapPB pending_info;
+            ASSERT_TRUE(pending_info.ParseFromString(pending_val));
+            ASSERT_EQ(pending_info.delete_bitmap_keys_size(), 1);
+
+            std::string_view k1 = pending_info.delete_bitmap_keys(0);
+            k1.remove_prefix(1);
+            std::vector<std::tuple<std::variant<int64_t, std::string>, int, 
int>> out;
+            decode_key(&k1, &out);
+            // 0x01 "meta" ${instance_id} "delete_bitmap" ${tablet_id} 
${rowset_id} ${version} ${segment_id} -> roaringbitmap
+            auto encoded_tablet_id = 
std::get<std::int64_t>(std::get<0>(out[3]));
+            ASSERT_EQ(encoded_tablet_id, tablet_id);
+            auto encoded_rowset_id = 
std::get<std::string>(std::get<0>(out[4]));
+            ASSERT_EQ(encoded_rowset_id, "123");
+            auto encoded_version = std::get<std::int64_t>(std::get<0>(out[5]));
+            ASSERT_EQ(encoded_version, version);
+            auto encoded_segment_id = 
std::get<std::int64_t>(std::get<0>(out[6]));
+            ASSERT_EQ(encoded_segment_id, 0);
+        }
+
+        {
+            UpdateDeleteBitmapRequest update_delete_bitmap_req;
+            UpdateDeleteBitmapResponse update_delete_bitmap_res;
+            // will be splited and stored in 3 KVs
+            // if we don't remove previous splited KVs, will crash when reading
+            std::string data2(split_size * 3, 'a');
+            update_delete_bitmap(meta_service.get(), update_delete_bitmap_req,
+                                 update_delete_bitmap_res, table_id, t1p1, 
lock_id, initiator,
+                                 tablet_id, txn_id, version, data2);
+            ASSERT_EQ(update_delete_bitmap_res.status().code(), 
MetaServiceCode::OK);
+
+            GetDeleteBitmapRequest get_delete_bitmap_req;
+            GetDeleteBitmapResponse get_delete_bitmap_res;
+            get_delete_bitmap_req.set_cloud_unique_id("test_cloud_unique_id");
+            get_delete_bitmap_req.set_tablet_id(tablet_id);
+            get_delete_bitmap_req.add_rowset_ids("123");
+            get_delete_bitmap_req.add_begin_versions(0);
+            get_delete_bitmap_req.add_end_versions(version);
+            meta_service->get_delete_bitmap(
+                    reinterpret_cast<google::protobuf::RpcController*>(&cntl),
+                    &get_delete_bitmap_req, &get_delete_bitmap_res, nullptr);
+            ASSERT_EQ(get_delete_bitmap_res.status().code(), 
MetaServiceCode::OK);
+            ASSERT_EQ(get_delete_bitmap_res.rowset_ids_size(), 1);
+            ASSERT_EQ(get_delete_bitmap_res.versions_size(), 1);
+            ASSERT_EQ(get_delete_bitmap_res.segment_ids_size(), 1);
+            ASSERT_EQ(get_delete_bitmap_res.segment_delete_bitmaps_size(), 1);
+            ASSERT_EQ(get_delete_bitmap_res.segment_delete_bitmaps(0), data2);
+        }
+    }
+}
+
 TEST(MetaServiceTest, UpdateDeleteBitmap) {
     auto meta_service = get_meta_service();
     remove_delete_bitmap_lock(meta_service.get(), 112);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to