This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 9ede6bcb384c3d5be1fdab1087206b9a97222f84
Author: github-actions[bot] 
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Tue Mar 17 09:05:30 2026 +0800

    branch-4.0: [enhance](load) exclude version-gap replicas from success 
counting in quorum success #60953 (#61359)
    
    Cherry-picked from #60953
    
    Co-authored-by: hui lai <[email protected]>
---
 be/src/exec/tablet_info.cpp                        |  8 +++++
 be/src/exec/tablet_info.h                          |  2 ++
 be/src/vec/sink/writer/vtablet_writer.cpp          | 13 +++++++-
 be/src/vec/sink/writer/vtablet_writer.h            |  4 +++
 be/src/vec/sink/writer/vtablet_writer_v2.cpp       | 38 ++++++++++++++++------
 be/src/vec/sink/writer/vtablet_writer_v2.h         |  3 ++
 .../java/org/apache/doris/catalog/OlapTable.java   | 22 +++++++++++++
 .../org/apache/doris/planner/OlapTableSink.java    |  8 +++++
 gensrc/thrift/Descriptors.thrift                   |  3 ++
 9 files changed, 90 insertions(+), 11 deletions(-)

diff --git a/be/src/exec/tablet_info.cpp b/be/src/exec/tablet_info.cpp
index 85b5b871382..7f61e1ac5ec 100644
--- a/be/src/exec/tablet_info.cpp
+++ b/be/src/exec/tablet_info.cpp
@@ -747,6 +747,14 @@ Status 
VOlapTablePartitionParam::generate_partition_from(const TOlapTablePartiti
     if (t_part.__isset.load_required_replica_num) {
         part_result->load_required_replica_num = 
t_part.load_required_replica_num;
     }
+    if (t_part.__isset.tablet_version_gap_backends) {
+        for (const auto& [tablet_id, backend_ids] : 
t_part.tablet_version_gap_backends) {
+            auto& gap_set = 
part_result->tablet_version_gap_backends[tablet_id];
+            for (auto backend_id : backend_ids) {
+                gap_set.insert(backend_id);
+            }
+        }
+    }
     return Status::OK();
 }
 
diff --git a/be/src/exec/tablet_info.h b/be/src/exec/tablet_info.h
index 1d13abec44a..91885c3ab5e 100644
--- a/be/src/exec/tablet_info.h
+++ b/be/src/exec/tablet_info.h
@@ -167,6 +167,8 @@ struct VOlapTablePartition {
     int64_t load_tablet_idx = -1;
     int total_replica_num = 0;
     int load_required_replica_num = 0;
+    // tablet_id -> set of backend_ids that have version gaps
+    std::unordered_map<int64_t, std::unordered_set<int64_t>> 
tablet_version_gap_backends;
 
     VOlapTablePartition(vectorized::Block* partition_block)
             // the default value of partition bound is -1.
diff --git a/be/src/vec/sink/writer/vtablet_writer.cpp 
b/be/src/vec/sink/writer/vtablet_writer.cpp
index 270b710634c..c8cb3496d2c 100644
--- a/be/src/vec/sink/writer/vtablet_writer.cpp
+++ b/be/src/vec/sink/writer/vtablet_writer.cpp
@@ -425,7 +425,13 @@ bool IndexChannel::_quorum_success(const 
std::unordered_set<int64_t>& unfinished
             continue;
         }
         for (const auto& tablet_id : _tablets_by_channel[node_id]) {
-            finished_tablets_replica[tablet_id]++;
+            // Only count non-gap backends for quorum success.
+            // Gap backends' success doesn't count toward majority write.
+            auto gap_it = 
_parent->_tablet_version_gap_backends.find(tablet_id);
+            if (gap_it == _parent->_tablet_version_gap_backends.end() ||
+                gap_it->second.find(node_id) == gap_it->second.end()) {
+                finished_tablets_replica[tablet_id]++;
+            }
         }
     }
 
@@ -1717,6 +1723,11 @@ void VTabletWriter::_build_tablet_replica_info(const 
int64_t tablet_id,
                                                  : 
partition->load_required_replica_num;
         _tablet_replica_info.emplace(
                 tablet_id, std::make_pair(total_replicas_num, 
load_required_replicas_num));
+        // Copy version gap backends info for this tablet
+        if (auto it = partition->tablet_version_gap_backends.find(tablet_id);
+            it != partition->tablet_version_gap_backends.end()) {
+            _tablet_version_gap_backends[tablet_id] = it->second;
+        }
     } else {
         _tablet_replica_info.emplace(tablet_id,
                                      std::make_pair(_num_replicas, 
(_num_replicas + 1) / 2));
diff --git a/be/src/vec/sink/writer/vtablet_writer.h 
b/be/src/vec/sink/writer/vtablet_writer.h
index 039dd10a2c7..6249febf8ad 100644
--- a/be/src/vec/sink/writer/vtablet_writer.h
+++ b/be/src/vec/sink/writer/vtablet_writer.h
@@ -775,5 +775,9 @@ private:
 
     // tablet_id -> <total replicas num, load required replicas num>
     std::unordered_map<int64_t, std::pair<int, int>> _tablet_replica_info;
+
+    // tablet_id -> set of backend_ids that have version gaps
+    // these backends' success should not be counted for majority write
+    std::unordered_map<int64_t, std::unordered_set<int64_t>> 
_tablet_version_gap_backends;
 };
 } // namespace doris::vectorized
diff --git a/be/src/vec/sink/writer/vtablet_writer_v2.cpp 
b/be/src/vec/sink/writer/vtablet_writer_v2.cpp
index 2017099a4ef..794cd99a30e 100644
--- a/be/src/vec/sink/writer/vtablet_writer_v2.cpp
+++ b/be/src/vec/sink/writer/vtablet_writer_v2.cpp
@@ -371,6 +371,11 @@ void VTabletWriterV2::_build_tablet_replica_info(const 
int64_t tablet_id,
                                                  : 
partition->load_required_replica_num;
         _tablet_replica_info[tablet_id] =
                 std::make_pair(total_replicas_num, load_required_replicas_num);
+        // Copy version gap backends info for this tablet
+        if (auto it = partition->tablet_version_gap_backends.find(tablet_id);
+            it != partition->tablet_version_gap_backends.end()) {
+            _tablet_version_gap_backends[tablet_id] = it->second;
+        }
     } else {
         _tablet_replica_info[tablet_id] = std::make_pair(_num_replicas, 
(_num_replicas + 1) / 2);
     }
@@ -876,7 +881,13 @@ bool VTabletWriterV2::_quorum_success(
             continue;
         }
         for (const auto& tablet_id : _tablets_by_node[dst_id]) {
-            finished_tablets_replica[tablet_id]++;
+            // Only count non-gap backends for quorum success.
+            // Gap backends' success doesn't count toward majority write.
+            auto gap_it = _tablet_version_gap_backends.find(tablet_id);
+            if (gap_it == _tablet_version_gap_backends.end() ||
+                gap_it->second.find(dst_id) == gap_it->second.end()) {
+                finished_tablets_replica[tablet_id]++;
+            }
         }
     }
 
@@ -1015,13 +1026,15 @@ void VTabletWriterV2::_calc_tablets_to_commit() {
 
 Status VTabletWriterV2::_create_commit_info(std::vector<TTabletCommitInfo>& 
tablet_commit_infos,
                                             std::shared_ptr<LoadStreamMap> 
load_stream_map) {
-    std::unordered_map<int64_t, int> failed_tablets;
+    // Track per-tablet non-gap success count and failure reasons
+    std::unordered_map<int64_t, int> success_tablets_replica;
+    std::unordered_set<int64_t> failed_tablets;
     std::unordered_map<int64_t, Status> failed_reason;
     load_stream_map->for_each([&](int64_t dst_id, LoadStreamStubs& streams) {
         size_t num_success_tablets = 0;
         size_t num_failed_tablets = 0;
         for (auto [tablet_id, reason] : streams.failed_tablets()) {
-            failed_tablets[tablet_id]++;
+            failed_tablets.insert(tablet_id);
             failed_reason[tablet_id] = reason;
             num_failed_tablets++;
         }
@@ -1030,20 +1043,25 @@ Status 
VTabletWriterV2::_create_commit_info(std::vector<TTabletCommitInfo>& tabl
             commit_info.tabletId = tablet_id;
             commit_info.backendId = dst_id;
             tablet_commit_infos.emplace_back(std::move(commit_info));
+            // Only count non-gap backends toward success
+            auto gap_it = _tablet_version_gap_backends.find(tablet_id);
+            if (gap_it == _tablet_version_gap_backends.end() ||
+                gap_it->second.find(dst_id) == gap_it->second.end()) {
+                success_tablets_replica[tablet_id]++;
+            }
             num_success_tablets++;
         }
         LOG(INFO) << "streams to dst_id: " << dst_id << ", success tablets: " 
<< num_success_tablets
                   << ", failed tablets: " << num_failed_tablets;
     });
 
-    for (auto [tablet_id, replicas] : failed_tablets) {
-        auto [total_replicas_num, load_required_replicas_num] = 
_tablet_replica_info[tablet_id];
-        int max_failed_replicas = total_replicas_num == 0
-                                          ? (_num_replicas - 1) / 2
-                                          : total_replicas_num - 
load_required_replicas_num;
-        if (replicas > max_failed_replicas) {
+    for (auto tablet_id : failed_tablets) {
+        int succ_count = success_tablets_replica[tablet_id];
+        int required = _load_required_replicas_num(tablet_id);
+        if (succ_count < required) {
             LOG(INFO) << "tablet " << tablet_id
-                      << " failed on majority backends: " << 
failed_reason[tablet_id];
+                      << " failed on majority backends (success=" << succ_count
+                      << ", required=" << required << "): " << 
failed_reason[tablet_id];
             return Status::InternalError("tablet {} failed on majority 
backends: {}", tablet_id,
                                          failed_reason[tablet_id]);
         }
diff --git a/be/src/vec/sink/writer/vtablet_writer_v2.h 
b/be/src/vec/sink/writer/vtablet_writer_v2.h
index e92084ebcd5..550501ad92e 100644
--- a/be/src/vec/sink/writer/vtablet_writer_v2.h
+++ b/be/src/vec/sink/writer/vtablet_writer_v2.h
@@ -258,6 +258,9 @@ private:
     // tablet_id -> <total replicas num, load required replicas num>
     std::unordered_map<int64_t, std::pair<int, int>> _tablet_replica_info;
 
+    // tablet_id -> set of backend_ids that have version gaps
+    std::unordered_map<int64_t, std::unordered_set<int64_t>> 
_tablet_version_gap_backends;
+
     std::atomic<int64_t> _load_back_pressure_version_block_ms {0};
 };
 
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java
index 3c9da127b40..9301b908905 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java
@@ -2525,6 +2525,28 @@ public class OlapTable extends Table implements 
MTMVRelatedTableIf, GsonPostProc
         return 
partitionInfo.getReplicaAllocation(partitionId).getTotalReplicaNum();
     }
 
+    public Map<Long, List<Long>> getPartitionVersionGapBackends(long 
partitionId) {
+        Map<Long, List<Long>> result = new HashMap<>();
+        Partition partition = getPartition(partitionId);
+        if (partition == null) {
+            return result;
+        }
+        for (MaterializedIndex index : 
partition.getMaterializedIndices(IndexExtState.ALL)) {
+            for (Tablet tablet : index.getTablets()) {
+                List<Long> gapBackends = new ArrayList<>();
+                for (Replica replica : tablet.getReplicas()) {
+                    if (replica.getLastFailedVersion() >= 0) {
+                        
gapBackends.add(replica.getBackendIdWithoutException());
+                    }
+                }
+                if (!gapBackends.isEmpty()) {
+                    result.put(tablet.getId(), gapBackends);
+                }
+            }
+        }
+        return result;
+    }
+
     public int getLoadRequiredReplicaNum(long partitionId) {
         int totalReplicaNum = getPartitionTotalReplicasNum(partitionId);
         int minLoadReplicaNum = getMinLoadReplicaNum();
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/planner/OlapTableSink.java 
b/fe/fe-core/src/main/java/org/apache/doris/planner/OlapTableSink.java
index 72e244d3f1f..8520249dd97 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/planner/OlapTableSink.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/planner/OlapTableSink.java
@@ -221,6 +221,10 @@ public class OlapTableSink extends DataSink {
         for (TOlapTablePartition partition : 
tOlapTablePartitionParam.getPartitions()) {
             
partition.setTotalReplicaNum(dstTable.getPartitionTotalReplicasNum(partition.getId()));
             
partition.setLoadRequiredReplicaNum(dstTable.getLoadRequiredReplicaNum(partition.getId()));
+            Map<Long, List<Long>> gapBackends = 
dstTable.getPartitionVersionGapBackends(partition.getId());
+            if (!gapBackends.isEmpty()) {
+                partition.setTabletVersionGapBackends(gapBackends);
+            }
         }
         tOlapTableLocationParams = createLocation(tSink.getDbId(), dstTable);
 
@@ -275,6 +279,10 @@ public class OlapTableSink extends DataSink {
         for (TOlapTablePartition partition : 
tOlapTablePartitionParam.getPartitions()) {
             
partition.setTotalReplicaNum(dstTable.getPartitionTotalReplicasNum(partition.getId()));
             
partition.setLoadRequiredReplicaNum(dstTable.getLoadRequiredReplicaNum(partition.getId()));
+            Map<Long, List<Long>> gapBackends = 
dstTable.getPartitionVersionGapBackends(partition.getId());
+            if (!gapBackends.isEmpty()) {
+                partition.setTabletVersionGapBackends(gapBackends);
+            }
         }
         tOlapTableLocationParams = createLocation(tSink.getDbId(), dstTable);
 
diff --git a/gensrc/thrift/Descriptors.thrift b/gensrc/thrift/Descriptors.thrift
index 3be28d58089..51f4fcaf998 100644
--- a/gensrc/thrift/Descriptors.thrift
+++ b/gensrc/thrift/Descriptors.thrift
@@ -274,6 +274,9 @@ struct TOlapTablePartition {
     11: optional i64 load_tablet_idx
     12: optional i32 total_replica_num
     13: optional i32 load_required_replica_num
+    // tablet_id -> list of backend_ids that have version gaps 
(lastFailedVersion >= 0)
+    // used by BE to exclude these backends from success counting in majority 
write
+    14: optional map<i64, list<i64>> tablet_version_gap_backends
 }
 
 struct TOlapTablePartitionParam {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to