This is an automated email from the ASF dual-hosted git repository.

hellostephen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new f7ed94624e9 branch-3.0-pick: [Fix](cloud) Should consider tablet state 
change whether to skip `sync_rowsets` in publish phase (#48400) (#48667)
f7ed94624e9 is described below

commit f7ed94624e9625948acc7e7b272df3e54149f6a1
Author: bobhan1 <bao...@selectdb.com>
AuthorDate: Mon Mar 10 15:13:15 2025 +0800

    branch-3.0-pick: [Fix](cloud) Should consider tablet state change whether 
to skip `sync_rowsets` in publish phase (#48400) (#48667)
    
    pick https://github.com/apache/doris/pull/48400
---
 .../cloud/cloud_engine_calc_delete_bitmap_task.cpp |  20 ++-
 .../cloud/cloud_engine_calc_delete_bitmap_task.h   |   3 +
 cloud/src/meta-service/meta_service.cpp            |  34 ++++-
 cloud/test/meta_service_test.cpp                   |  23 ++-
 .../transaction/CloudGlobalTransactionMgr.java     |  18 ++-
 .../transaction/DeleteBitmapUpdateLockContext.java |   6 +
 .../org/apache/doris/master/ReportHandler.java     |  16 ++
 gensrc/proto/cloud.proto                           |   1 +
 gensrc/thrift/AgentService.thrift                  |   2 +
 .../test_tablet_state_change_in_publish_phase.out  | Bin 0 -> 227 bytes
 ...est_tablet_state_change_in_publish_phase.groovy | 161 +++++++++++++++++++++
 11 files changed, 270 insertions(+), 14 deletions(-)

diff --git a/be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp 
b/be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp
index e85b160cf2f..c4ae5513001 100644
--- a/be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp
+++ b/be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp
@@ -22,6 +22,7 @@
 #include <memory>
 #include <random>
 #include <thread>
+#include <type_traits>
 
 #include "cloud/cloud_meta_mgr.h"
 #include "cloud/cloud_tablet.h"
@@ -75,6 +76,7 @@ Status CloudEngineCalcDeleteBitmapTask::execute() {
         bool has_compaction_stats = partition.__isset.base_compaction_cnts &&
                                     
partition.__isset.cumulative_compaction_cnts &&
                                     partition.__isset.cumulative_points;
+        bool has_tablet_states = partition.__isset.tablet_states;
         for (size_t i = 0; i < partition.tablet_ids.size(); i++) {
             auto tablet_id = partition.tablet_ids[i];
             auto tablet_calc_delete_bitmap_ptr = 
std::make_shared<CloudTabletCalcDeleteBitmapTask>(
@@ -84,6 +86,9 @@ Status CloudEngineCalcDeleteBitmapTask::execute() {
                         partition.base_compaction_cnts[i], 
partition.cumulative_compaction_cnts[i],
                         partition.cumulative_points[i]);
             }
+            if (has_tablet_states) {
+                
tablet_calc_delete_bitmap_ptr->set_tablet_state(partition.tablet_states[i]);
+            }
             auto submit_st = token->submit_func([=]() {
                 auto st = tablet_calc_delete_bitmap_ptr->handle();
                 if (!st.ok()) {
@@ -128,6 +133,9 @@ void 
CloudTabletCalcDeleteBitmapTask::set_compaction_stats(int64_t ms_base_compa
     _ms_cumulative_compaction_cnt = ms_cumulative_compaction_cnt;
     _ms_cumulative_point = ms_cumulative_point;
 }
+void CloudTabletCalcDeleteBitmapTask::set_tablet_state(int64_t tablet_state) {
+    _ms_tablet_state = tablet_state;
+}
 
 Status CloudTabletCalcDeleteBitmapTask::handle() const {
     VLOG_DEBUG << "start calculate delete bitmap on tablet " << _tablet_id;
@@ -146,7 +154,10 @@ Status CloudTabletCalcDeleteBitmapTask::handle() const {
     int64_t max_version = tablet->max_version_unlocked();
     int64_t t2 = MonotonicMicros();
 
-    auto should_sync_rowsets_produced_by_compaction = [&]() {
+    auto should_sync_rowsets = [&]() {
+        if (_version != max_version + 1) {
+            return true;
+        }
         if (_ms_base_compaction_cnt == -1) {
             return true;
         }
@@ -156,9 +167,12 @@ Status CloudTabletCalcDeleteBitmapTask::handle() const {
         std::shared_lock rlock(tablet->get_header_lock());
         return _ms_base_compaction_cnt > tablet->base_compaction_cnt() ||
                _ms_cumulative_compaction_cnt > 
tablet->cumulative_compaction_cnt() ||
-               _ms_cumulative_point > tablet->cumulative_layer_point();
+               _ms_cumulative_point > tablet->cumulative_layer_point() ||
+               (_ms_tablet_state.has_value() &&
+                _ms_tablet_state.value() != // an SC job finished on other BEs 
during this load job
+                        
static_cast<std::underlying_type_t<TabletState>>(tablet->tablet_state()));
     };
-    if (_version != max_version + 1 || 
should_sync_rowsets_produced_by_compaction()) {
+    if (should_sync_rowsets()) {
         auto sync_st = tablet->sync_rowsets();
         if (!sync_st.ok()) {
             LOG(WARNING) << "failed to sync rowsets. tablet_id=" << _tablet_id
diff --git a/be/src/cloud/cloud_engine_calc_delete_bitmap_task.h 
b/be/src/cloud/cloud_engine_calc_delete_bitmap_task.h
index e3733d3e696..62bd91b0a8a 100644
--- a/be/src/cloud/cloud_engine_calc_delete_bitmap_task.h
+++ b/be/src/cloud/cloud_engine_calc_delete_bitmap_task.h
@@ -18,6 +18,7 @@
 #pragma once
 
 #include <memory>
+#include <optional>
 
 #include "cloud/cloud_storage_engine.h"
 #include "cloud/cloud_tablet.h"
@@ -39,6 +40,7 @@ public:
 
     void set_compaction_stats(int64_t ms_base_compaction_cnt, int64_t 
ms_cumulative_compaction_cnt,
                               int64_t ms_cumulative_point);
+    void set_tablet_state(int64_t tablet_state);
 
     Status handle() const;
 
@@ -53,6 +55,7 @@ private:
     int64_t _ms_base_compaction_cnt {-1};
     int64_t _ms_cumulative_compaction_cnt {-1};
     int64_t _ms_cumulative_point {-1};
+    std::optional<int64_t> _ms_tablet_state;
     std::shared_ptr<MemTrackerLimiter> _mem_tracker;
 };
 
diff --git a/cloud/src/meta-service/meta_service.cpp 
b/cloud/src/meta-service/meta_service.cpp
index bc8af94496a..cc74384decf 100644
--- a/cloud/src/meta-service/meta_service.cpp
+++ b/cloud/src/meta-service/meta_service.cpp
@@ -2310,6 +2310,7 @@ void 
MetaServiceImpl::get_delete_bitmap_update_lock(google::protobuf::RpcControl
     }
 
     for (const auto& tablet_idx : request->tablet_indexes()) {
+        // 1. get compaction cnts
         TabletStatsPB tablet_stat;
         std::string stats_key =
                 stats_tablet_key({instance_id, tablet_idx.table_id(), 
tablet_idx.index_id(),
@@ -2343,16 +2344,43 @@ void 
MetaServiceImpl::get_delete_bitmap_update_lock(google::protobuf::RpcControl
         response->add_base_compaction_cnts(tablet_stat.base_compaction_cnt());
         
response->add_cumulative_compaction_cnts(tablet_stat.cumulative_compaction_cnt());
         response->add_cumulative_points(tablet_stat.cumulative_point());
+
+        // 2. get tablet states
+        std::string tablet_meta_key =
+                meta_tablet_key({instance_id, tablet_idx.table_id(), 
tablet_idx.index_id(),
+                                 tablet_idx.partition_id(), 
tablet_idx.tablet_id()});
+        std::string tablet_meta_val;
+        err = txn->get(tablet_meta_key, &tablet_meta_val);
+        if (err != TxnErrorCode::TXN_OK) {
+            ss << "failed to get tablet meta"
+               << (err == TxnErrorCode::TXN_KEY_NOT_FOUND ? " (not found)" : 
"")
+               << " instance_id=" << instance_id << " tablet_id=" << 
tablet_idx.tablet_id()
+               << " key=" << hex(tablet_meta_key) << " err=" << err;
+            msg = ss.str();
+            code = err == TxnErrorCode::TXN_KEY_NOT_FOUND ? 
MetaServiceCode::TABLET_NOT_FOUND
+                                                          : 
cast_as<ErrCategory::READ>(err);
+            return;
+        }
+        doris::TabletMetaCloudPB tablet_meta;
+        if (!tablet_meta.ParseFromString(tablet_meta_val)) {
+            code = MetaServiceCode::PROTOBUF_PARSE_ERR;
+            msg = "malformed tablet meta";
+            return;
+        }
+        response->add_tablet_states(
+                
static_cast<std::underlying_type_t<TabletStatePB>>(tablet_meta.tablet_state()));
     }
 
     read_stats_sw.pause();
-    LOG(INFO) << fmt::format("tablet_idxes.size()={}, read tablet compaction 
cnts cost={} ms",
-                             request->tablet_indexes().size(), 
read_stats_sw.elapsed_us() / 1000);
+    LOG(INFO) << fmt::format(
+            "tablet_idxes.size()={}, read tablet compaction cnts and tablet 
states cost={} ms",
+            request->tablet_indexes().size(), read_stats_sw.elapsed_us() / 
1000);
 
     DeleteBitmapUpdateLockPB lock_info_tmp;
     if (!check_delete_bitmap_lock(code, msg, ss, txn, table_id, 
request->lock_id(),
                                   request->initiator(), lock_key, 
lock_info_tmp)) {
-        LOG(WARNING) << "failed to check delete bitmap lock after get tablet 
stats, table_id="
+        LOG(WARNING) << "failed to check delete bitmap lock after get tablet 
stats and tablet "
+                        "states, table_id="
                      << table_id << " request lock_id=" << request->lock_id()
                      << " request initiator=" << request->initiator() << " 
code=" << code
                      << " msg=" << msg;
diff --git a/cloud/test/meta_service_test.cpp b/cloud/test/meta_service_test.cpp
index 9eed77271f2..10a5b3c6f18 100644
--- a/cloud/test/meta_service_test.cpp
+++ b/cloud/test/meta_service_test.cpp
@@ -277,7 +277,7 @@ static void insert_rowset(MetaServiceProxy* meta_service, 
int64_t db_id, const s
     commit_txn(meta_service, db_id, txn_id, label);
 }
 
-static void add_tablet_stats(MetaServiceProxy* meta_service, std::string 
instance_id,
+static void add_tablet_metas(MetaServiceProxy* meta_service, std::string 
instance_id,
                              int64_t table_id, int64_t index_id,
                              const std::vector<std::array<int64_t, 2>>& 
tablet_idxes) {
     std::unique_ptr<Transaction> txn;
@@ -293,6 +293,17 @@ static void add_tablet_stats(MetaServiceProxy* 
meta_service, std::string instanc
         stats.set_cumulative_compaction_cnt(20);
         stats.set_cumulative_point(30);
         txn->put(stats_key, stats.SerializeAsString());
+
+        doris::TabletMetaCloudPB tablet_pb;
+        tablet_pb.set_table_id(table_id);
+        tablet_pb.set_index_id(index_id);
+        tablet_pb.set_partition_id(partition_id);
+        tablet_pb.set_tablet_id(tablet_id);
+        tablet_pb.set_tablet_state(doris::TabletStatePB::PB_RUNNING);
+        auto tablet_meta_key =
+                meta_tablet_key({instance_id, table_id, index_id, 
partition_id, tablet_id});
+        auto tablet_meta_val = tablet_pb.SerializeAsString();
+        txn->put(tablet_meta_key, tablet_meta_val);
     }
     ASSERT_EQ(txn->commit(), TxnErrorCode::TXN_OK);
 }
@@ -4659,7 +4670,7 @@ TEST(MetaServiceTest, 
GetDeleteBitmapUpdateLockTabletStatsNormal) {
     // [(partition_id, tablet_id)]
     std::vector<std::array<int64_t, 2>> tablet_idxes {{70001, 12345}, {80001, 
3456}, {90001, 6789}};
 
-    add_tablet_stats(meta_service.get(), instance_id, table_id, index_id, 
tablet_idxes);
+    add_tablet_metas(meta_service.get(), instance_id, table_id, index_id, 
tablet_idxes);
 
     GetDeleteBitmapUpdateLockResponse res;
     get_delete_bitmap_update_lock(meta_service.get(), res, db_id, table_id, 
index_id, tablet_idxes,
@@ -4713,7 +4724,7 @@ TEST(MetaServiceTest, 
GetDeleteBitmapUpdateLockTabletStatsLockExpired) {
         std::vector<std::array<int64_t, 2>> tablet_idxes {
                 {70001, 12345}, {80001, 3456}, {90001, 6789}};
 
-        add_tablet_stats(meta_service.get(), instance_id, table_id, index_id, 
tablet_idxes);
+        add_tablet_metas(meta_service.get(), instance_id, table_id, index_id, 
tablet_idxes);
 
         GetDeleteBitmapUpdateLockResponse res;
         get_delete_bitmap_update_lock(meta_service.get(), res, db_id, 
table_id, index_id,
@@ -4754,7 +4765,7 @@ TEST(MetaServiceTest, 
GetDeleteBitmapUpdateLockTabletStatsLockExpired) {
         std::vector<std::array<int64_t, 2>> tablet_idxes {
                 {70001, 12345}, {80001, 3456}, {90001, 6789}};
 
-        add_tablet_stats(meta_service.get(), instance_id, table_id, index_id, 
tablet_idxes);
+        add_tablet_metas(meta_service.get(), instance_id, table_id, index_id, 
tablet_idxes);
 
         GetDeleteBitmapUpdateLockResponse res;
         get_delete_bitmap_update_lock(meta_service.get(), res, db_id, 
table_id, index_id,
@@ -4796,7 +4807,7 @@ TEST(MetaServiceTest, 
GetDeleteBitmapUpdateLockTabletStatsError) {
         std::vector<std::array<int64_t, 2>> tablet_idxes {
                 {70001, 12345}, {80001, 3456}, {90001, 6789}};
 
-        add_tablet_stats(meta_service.get(), instance_id, table_id, index_id, 
tablet_idxes);
+        add_tablet_metas(meta_service.get(), instance_id, table_id, index_id, 
tablet_idxes);
 
         GetDeleteBitmapUpdateLockResponse res;
         get_delete_bitmap_update_lock(meta_service.get(), res, db_id, 
table_id, index_id,
@@ -4841,7 +4852,7 @@ TEST(MetaServiceTest, 
GetDeleteBitmapUpdateLockTabletStatsError) {
             tablet_idxes.push_back({partition_id, tablet_id});
         }
 
-        add_tablet_stats(meta_service.get(), instance_id, table_id, index_id, 
tablet_idxes);
+        add_tablet_metas(meta_service.get(), instance_id, table_id, index_id, 
tablet_idxes);
 
         GetDeleteBitmapUpdateLockResponse res;
         get_delete_bitmap_update_lock(meta_service.get(), res, db_id, 
table_id, index_id,
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/CloudGlobalTransactionMgr.java
 
b/fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/CloudGlobalTransactionMgr.java
index bee81365628..188173b32a9 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/CloudGlobalTransactionMgr.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/CloudGlobalTransactionMgr.java
@@ -780,17 +780,26 @@ public class CloudGlobalTransactionMgr implements 
GlobalTransactionMgrIface {
                 if (!lockContext.getBaseCompactionCnts().isEmpty()
                         && !lockContext.getCumulativeCompactionCnts().isEmpty()
                         && !lockContext.getCumulativePoints().isEmpty()) {
+                    boolean hasTabletStats = 
!lockContext.getTabletStates().isEmpty();
+
                     List<Long> reqBaseCompactionCnts = Lists.newArrayList();
                     List<Long> reqCumulativeCompactionCnts = 
Lists.newArrayList();
                     List<Long> reqCumulativePoints = Lists.newArrayList();
+                    List<Long> reqTabletStates = Lists.newArrayList();
                     for (long tabletId : tabletList) {
                         
reqBaseCompactionCnts.add(lockContext.getBaseCompactionCnts().get(tabletId));
                         
reqCumulativeCompactionCnts.add(lockContext.getCumulativeCompactionCnts().get(tabletId));
                         
reqCumulativePoints.add(lockContext.getCumulativePoints().get(tabletId));
+                        if (hasTabletStats) {
+                            
reqTabletStates.add(lockContext.getTabletStates().get(tabletId));
+                        }
                     }
                     partitionInfo.setBaseCompactionCnts(reqBaseCompactionCnts);
                     
partitionInfo.setCumulativeCompactionCnts(reqCumulativeCompactionCnts);
                     partitionInfo.setCumulativePoints(reqCumulativePoints);
+                    if (hasTabletStats) {
+                        partitionInfo.setTabletStates(reqTabletStates);
+                    }
                 }
                 partitionInfos.add(partitionInfo);
             }
@@ -917,19 +926,24 @@ public class CloudGlobalTransactionMgr implements 
GlobalTransactionMgrIface {
             List<Long> respBaseCompactionCnts = 
response.getBaseCompactionCntsList();
             List<Long> respCumulativeCompactionCnts = 
response.getCumulativeCompactionCntsList();
             List<Long> respCumulativePoints = 
response.getCumulativePointsList();
+            List<Long> respTabletStates = response.getTabletStatesList();
             int size1 = respBaseCompactionCnts.size();
             int size2 = respCumulativeCompactionCnts.size();
             int size3 = respCumulativePoints.size();
-            if (size1 != tabletList.size() || size2 != tabletList.size() || 
size3 != tabletList.size()) {
+            int size4 = respTabletStates.size();
+            if (size1 != tabletList.size() || size2 != tabletList.size() || 
size3 != tabletList.size()
+                    || (size4 > 0 && size4 != tabletList.size())) {
                 throw new UserException("The size of returned compaction cnts 
can't match the size of tabletList, "
                         + "tabletList.size()=" + tabletList.size() + ", 
respBaseCompactionCnts.size()=" + size1
-                        + ", respCumulativeCompactionCnts.size()=" + size2 + 
", respCumulativePoints.size()=" + size3);
+                        + ", respCumulativeCompactionCnts.size()=" + size2 + 
", respCumulativePoints.size()=" + size3
+                        + ", respTabletStates.size()=" + size4);
             }
             for (int i = 0; i < tabletList.size(); i++) {
                 long tabletId = tabletList.get(i);
                 lockContext.getBaseCompactionCnts().put(tabletId, 
respBaseCompactionCnts.get(i));
                 lockContext.getCumulativeCompactionCnts().put(tabletId, 
respCumulativeCompactionCnts.get(i));
                 lockContext.getCumulativePoints().put(tabletId, 
respCumulativePoints.get(i));
+                lockContext.getTabletStates().put(tabletId, 
respTabletStates.get(i));
             }
             totalRetryTime += retryTime;
         }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/DeleteBitmapUpdateLockContext.java
 
b/fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/DeleteBitmapUpdateLockContext.java
index 02886f63427..120715d6276 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/DeleteBitmapUpdateLockContext.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/DeleteBitmapUpdateLockContext.java
@@ -30,6 +30,7 @@ public class DeleteBitmapUpdateLockContext {
     private Map<Long, Long> baseCompactionCnts;
     private Map<Long, Long> cumulativeCompactionCnts;
     private Map<Long, Long> cumulativePoints;
+    private Map<Long, Long> tabletStates;
     private Map<Long, Set<Long>> tableToPartitions;
     private Map<Long, Partition> partitions;
     private Map<Long, Map<Long, List<Long>>> backendToPartitionTablets;
@@ -40,6 +41,7 @@ public class DeleteBitmapUpdateLockContext {
         baseCompactionCnts = Maps.newHashMap();
         cumulativeCompactionCnts = Maps.newHashMap();
         cumulativePoints = Maps.newHashMap();
+        tabletStates = Maps.newHashMap();
         tableToPartitions = Maps.newHashMap();
         partitions = Maps.newHashMap();
         backendToPartitionTablets = Maps.newHashMap();
@@ -63,6 +65,10 @@ public class DeleteBitmapUpdateLockContext {
         return cumulativePoints;
     }
 
+    public Map<Long, Long> getTabletStates() {
+        return tabletStates;
+    }
+
     public Map<Long, Map<Long, List<Long>>> getBackendToPartitionTablets() {
         return backendToPartitionTablets;
     }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java 
b/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java
index 06047e2cf16..e104ed288b3 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java
@@ -44,6 +44,7 @@ import org.apache.doris.common.Config;
 import org.apache.doris.common.MetaNotFoundException;
 import org.apache.doris.common.Pair;
 import org.apache.doris.common.util.Daemon;
+import org.apache.doris.common.util.DebugPointUtil;
 import org.apache.doris.common.util.NetUtils;
 import org.apache.doris.common.util.TimeUtils;
 import org.apache.doris.cooldown.CooldownConf;
@@ -593,7 +594,22 @@ public class ReportHandler extends Daemon {
         LOG.info("finished to handle tablet report from backend[{}] cost: {} 
ms", backendId, (end - start));
     }
 
+    private static void debugBlock() {
+        if (DebugPointUtil.isEnable("ReportHandler.block")) {
+            LOG.info("debug point: block at ReportHandler.block");
+            while (DebugPointUtil.isEnable("ReportHandler.block")) {
+                try {
+                    Thread.sleep(1000);
+                } catch (InterruptedException e) {
+                    LOG.info("error ", e);
+                }
+            }
+            LOG.info("debug point: leave ReportHandler.block");
+        }
+    }
+
     private static void taskReport(long backendId, Map<TTaskType, Set<Long>> 
runningTasks) {
+        debugBlock();
         if (LOG.isDebugEnabled()) {
             LOG.debug("begin to handle task report from backend {}", 
backendId);
         }
diff --git a/gensrc/proto/cloud.proto b/gensrc/proto/cloud.proto
index dd85f6ec2a2..ff0279990ee 100644
--- a/gensrc/proto/cloud.proto
+++ b/gensrc/proto/cloud.proto
@@ -1470,6 +1470,7 @@ message GetDeleteBitmapUpdateLockResponse {
     repeated int64 base_compaction_cnts = 2;
     repeated int64 cumulative_compaction_cnts = 3;
     repeated int64 cumulative_points = 4;
+    repeated int64 tablet_states = 5;
 }
 
 message RemoveDeleteBitmapUpdateLockRequest {
diff --git a/gensrc/thrift/AgentService.thrift 
b/gensrc/thrift/AgentService.thrift
index abffd176ef8..aa1ee2f5b9f 100644
--- a/gensrc/thrift/AgentService.thrift
+++ b/gensrc/thrift/AgentService.thrift
@@ -440,6 +440,8 @@ struct TCalcDeleteBitmapPartitionInfo {
     4: optional list<i64> base_compaction_cnts
     5: optional list<i64> cumulative_compaction_cnts
     6: optional list<i64> cumulative_points
+    7: optional list<i64> sub_txn_ids
+    8: optional list<i64> tablet_states
 }
 
 struct TCalcDeleteBitmapRequest {
diff --git 
a/regression-test/data/fault_injection_p0/cloud/test_tablet_state_change_in_publish_phase.out
 
b/regression-test/data/fault_injection_p0/cloud/test_tablet_state_change_in_publish_phase.out
new file mode 100644
index 00000000000..0ece86d0fb4
Binary files /dev/null and 
b/regression-test/data/fault_injection_p0/cloud/test_tablet_state_change_in_publish_phase.out
 differ
diff --git 
a/regression-test/suites/fault_injection_p0/cloud/test_tablet_state_change_in_publish_phase.groovy
 
b/regression-test/suites/fault_injection_p0/cloud/test_tablet_state_change_in_publish_phase.groovy
new file mode 100644
index 00000000000..6b7102ed243
--- /dev/null
+++ 
b/regression-test/suites/fault_injection_p0/cloud/test_tablet_state_change_in_publish_phase.groovy
@@ -0,0 +1,161 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import org.apache.doris.regression.suite.ClusterOptions
+
+suite("test_tablet_state_change_in_publish_phase", "docker") {
+    def options = new ClusterOptions()
+    options.setFeNum(1)
+    options.setBeNum(2)
+    options.cloudMode = true
+    options.feConfigs += [
+        'cloud_cluster_check_interval_second=1',
+        'sys_log_verbose_modules=org',
+        'heartbeat_interval_second=1'
+    ]
+    options.enableDebugPoints()
+
+    docker(options) {
+        GetDebugPoint().clearDebugPointsForAllFEs()
+        GetDebugPoint().clearDebugPointsForAllBEs()
+
+        def table1 = "test_tablet_state_change_in_publish_phase"
+        sql "DROP TABLE IF EXISTS ${table1} FORCE;"
+        sql """ CREATE TABLE IF NOT EXISTS ${table1} (
+                `k1` int NOT NULL,
+                `c1` int,
+                `c2` int
+                )UNIQUE KEY(k1)
+            DISTRIBUTED BY HASH(k1) BUCKETS 1
+            PROPERTIES (
+                "enable_unique_key_merge_on_write" = "true",
+                "disable_auto_compaction" = "true",
+                "replication_num" = "1"); """
+
+        sql "insert into ${table1} values(1,1,1);"
+        sql "insert into ${table1} values(2,2,2);"
+        sql "insert into ${table1} values(3,3,3);"
+        sql "sync;"
+        qt_sql "select * from ${table1} order by k1;"
+
+        def beNodes = sql_return_maparray("show backends;")
+        def tabletStat = sql_return_maparray("show tablets from 
${table1};").get(0)
+        def tabletBackendId = tabletStat.BackendId
+        def tabletId = tabletStat.TabletId
+        def be1
+        for (def be : beNodes) {
+            if (be.BackendId == tabletBackendId) {
+                be1 = be
+            }
+        }
+        logger.info("tablet ${tabletId} on backend ${be1.Host} with 
backendId=${be1.BackendId}");
+        logger.info("backends: ${cluster.getBackends()}")
+        int beIndex = 1
+        for (def backend : cluster.getBackends()) {
+            if (backend.host == be1.Host) {
+                beIndex = backend.index
+                break
+            }
+        }
+        assert cluster.getBeByIndex(beIndex).backendId as String == 
tabletBackendId
+
+        try {
+            
GetDebugPoint().enableDebugPointForAllBEs("CloudSchemaChangeJob::_convert_historical_rowsets.block")
+            sql "alter table ${table1} modify column c1 varchar(100);"
+            Thread.sleep(1000)
+
+            cluster.stopBackends(beIndex)
+
+            Thread.sleep(1000)
+
+            // let tablet be on another BE
+            sql "insert into ${table1} values(10,88,88);"
+            qt_sql "select * from ${table1} order by k1;"
+            assert sql_return_maparray("show tablets from 
${table1};").get(0).BackendId as String != tabletBackendId
+
+            // block FE's task report handler to avoid alter task re-sended to 
BE before we enable debug points for SC
+            GetDebugPoint().enableDebugPointForAllFEs("ReportHandler.block")
+            cluster.startBackends(beIndex)
+            
GetDebugPoint().enableDebugPointForAllBEs("CloudSchemaChangeJob::_convert_historical_rowsets.block")
+            
GetDebugPoint().enableDebugPointForAllBEs("CloudSchemaChangeJob.process_alter_tablet.sleep")
+            GetDebugPoint().disableDebugPointForAllFEs("ReportHandler.block")
+
+            def newThreadInDocker = { Closure actionSupplier ->
+                def connInfo = context.threadLocalConn.get()
+                return Thread.start {
+                    connect(connInfo.username, connInfo.password, 
connInfo.conn.getMetaData().getURL(), actionSupplier)
+                }
+            }
+
+            // let load 1 block before publish
+            
GetDebugPoint().enableDebugPointForAllFEs("CloudGlobalTransactionMgr.getDeleteBitmapUpdateLock.enable_spin_wait")
+            
GetDebugPoint().enableDebugPointForAllFEs("CloudGlobalTransactionMgr.getDeleteBitmapUpdateLock.block")
+            def t1 = newThreadInDocker {
+                // load 1 will not see any historical data when flush
+                // and will skip to calculate delete bitmaps in later phase 
becase the tablet's state is NOT_READY
+                sql "insert into ${table1} values(1,88,88);"
+            }
+            Thread.sleep(800)
+
+            // let sc finish converting historical data
+            
GetDebugPoint().disableDebugPointForAllBEs("CloudSchemaChangeJob::_convert_historical_rowsets.block")
+            Thread.sleep(1000)
+
+            // let load 1 publish
+            
GetDebugPoint().disableDebugPointForAllFEs("CloudGlobalTransactionMgr.getDeleteBitmapUpdateLock.enable_spin_wait")
+            
GetDebugPoint().disableDebugPointForAllFEs("CloudGlobalTransactionMgr.getDeleteBitmapUpdateLock.block")
+            t1.join()
+
+            // load 2
+            sql "insert into ${table1} values(1,77,77);"
+
+
+            // let load 3 block before publish
+            
GetDebugPoint().enableDebugPointForAllFEs("CloudGlobalTransactionMgr.getDeleteBitmapUpdateLock.enable_spin_wait")
+            
GetDebugPoint().enableDebugPointForAllFEs("CloudGlobalTransactionMgr.getDeleteBitmapUpdateLock.block")
+            def t2 = newThreadInDocker {
+                sql "insert into ${table1} values(1,99,99);"
+            }
+            Thread.sleep(1000)
+
+            // let sc finish
+            
GetDebugPoint().disableDebugPointForAllBEs("CloudSchemaChangeJob.process_alter_tablet.sleep")
+
+            dockerAwaitUntil(30) {
+                def res = sql_return_maparray """ SHOW ALTER TABLE COLUMN 
WHERE TableName='${table1}' ORDER BY createtime DESC LIMIT 1 """
+                logger.info("alter status: ${res}")
+                res[0].State as String == "FINISHED"
+            }
+            // tablet state has changed to NORMAL in MS
+
+            // let load 3 publish
+            
GetDebugPoint().disableDebugPointForAllFEs("CloudGlobalTransactionMgr.getDeleteBitmapUpdateLock.enable_spin_wait")
+            
GetDebugPoint().disableDebugPointForAllFEs("CloudGlobalTransactionMgr.getDeleteBitmapUpdateLock.block")
+            t2.join()
+
+            qt_dup_key_count "select k1,count() as cnt from ${table1} group by 
k1 having cnt>1;"
+            qt_sql "select * from ${table1} order by k1;"
+
+        } catch(Exception e) {
+            logger.info(e.getMessage())
+            throw e
+        } finally {
+            GetDebugPoint().clearDebugPointsForAllFEs()
+            GetDebugPoint().clearDebugPointsForAllBEs()
+        }
+    }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to