This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 156f7b7699 [improvement](transaction) make commit txn fail hint more understandable (#23227)
156f7b7699 is described below

commit 156f7b76999a09585b2cb906540fdb90cbc8b67f
Author: yujun <yu.jun.re...@gmail.com>
AuthorDate: Wed Aug 23 21:50:24 2023 +0800

    [improvement](transaction) make commit txn fail hint more understandable (#23227)
---
 .../org/apache/doris/planner/OlapTableSink.java    |  3 +-
 .../doris/transaction/DatabaseTransactionMgr.java  | 61 ++++++++++++++++++++--
 .../transaction/TabletQuorumFailedException.java   | 23 +-------
 3 files changed, 62 insertions(+), 25 deletions(-)

diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/OlapTableSink.java b/fe/fe-core/src/main/java/org/apache/doris/planner/OlapTableSink.java
index 8f15f4ea7a..2012475083 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/planner/OlapTableSink.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/planner/OlapTableSink.java
@@ -410,7 +410,8 @@ public class OlapTableSink extends DataSink {
                     Multimap<Long, Long> bePathsMap = tablet.getNormalReplicaBackendPathMap();
                     if (bePathsMap.keySet().size() < quorum) {
                         throw new UserException(InternalErrorCode.REPLICA_FEW_ERR,
-                                "tablet " + tablet.getId() + " has few replicas: " + bePathsMap.keySet().size()
+                                "tablet " + tablet.getId() + " alive replica num " + bePathsMap.keySet().size()
+                                        + " < quorum replica num " + quorum
                                         + ", alive backends: [" + StringUtils.join(bePathsMap.keySet(), ",") + "]");
                     }
 
diff --git a/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java b/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java
index 293b49368a..bd125d1dac 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java
@@ -81,6 +81,7 @@ import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
+import java.util.function.Function;
 import java.util.stream.Collectors;
 
 /**
@@ -434,6 +435,7 @@ public class DatabaseTransactionMgr {
                                    Set<Long> errorReplicaIds, Map<Long, Set<Long>> tableToPartition,
                                    Set<Long> totalInvolvedBackends) throws UserException {
 
+        long transactionId = transactionState.getTransactionId();
         Database db = env.getInternalCatalog().getDbOrMetaException(dbId);
 
         // update transaction state extra if exists
@@ -490,6 +492,33 @@ public class DatabaseTransactionMgr {
             }
             tabletToBackends.get(tabletId).add(tabletCommitInfos.get(i).getBackendId());
         }
+        List<String> tabletSuccReplicas = Lists.newArrayList();
+        List<String> tabletWriteFailedReplicas = Lists.newArrayList();
+        List<String> tabletVersionFailedReplicas = Lists.newArrayList();
+        Function<Replica, String> getReplicaInfo = replica -> {
+            StringBuilder strBuffer = new StringBuilder("[replicaId=");
+            strBuffer.append(replica.getId());
+            strBuffer.append(", backendId=");
+            strBuffer.append(replica.getBackendId());
+            strBuffer.append(", backendAlive=");
+            strBuffer.append(Env.getCurrentSystemInfo().checkBackendAlive(replica.getBackendId()));
+            strBuffer.append(", version=");
+            strBuffer.append(replica.getVersion());
+            if (replica.getLastFailedVersion() >= 0) {
+                strBuffer.append(", lastFailedVersion=");
+                strBuffer.append(replica.getLastFailedVersion());
+                strBuffer.append(", lastSuccessVersion=");
+                strBuffer.append(replica.getLastSuccessVersion());
+                strBuffer.append(", lastFailedTimestamp=");
+                strBuffer.append(replica.getLastFailedTimestamp());
+            }
+            strBuffer.append(", state=");
+            strBuffer.append(replica.getState().name());
+            strBuffer.append("]");
+
+            return strBuffer.toString();
+        };
+
         for (long tableId : tableToPartition.keySet()) {
             OlapTable table = (OlapTable) db.getTableOrMetaException(tableId);
             for (Partition partition : table.getAllPartitions()) {
@@ -533,6 +562,9 @@ public class DatabaseTransactionMgr {
                         .getReplicaAllocation(partition.getId()).getTotalReplicaNum() / 2 + 1;
                 for (MaterializedIndex index : allIndices) {
                     for (Tablet tablet : index.getTablets()) {
+                        tabletSuccReplicas.clear();
+                        tabletWriteFailedReplicas.clear();
+                        tabletVersionFailedReplicas.clear();
                         int successReplicaNum = 0;
                         long tabletId = tablet.getId();
                         Set<Long> tabletBackends = tablet.getBackendIds();
@@ -558,11 +590,14 @@ public class DatabaseTransactionMgr {
                                 // for example, a replica is in clone state
                                 if (replica.getLastFailedVersion() < 0) {
                                     ++successReplicaNum;
+                                    tabletSuccReplicas.add(getReplicaInfo.apply(replica));
                                 } else {
                                     errorReplicaInfo += " replica [" + replica.getId() + "], lastFailedVersion ["
                                                         + replica.getLastFailedVersion() + "]";
+                                    tabletVersionFailedReplicas.add(getReplicaInfo.apply(replica));
                                 }
                             } else {
+                                tabletWriteFailedReplicas.add(getReplicaInfo.apply(replica));
                                 errorBackendIdsForTablet.add(tabletBackend);
                                 errorReplicaIds.add(replica.getId());
                                 // not remove rollup task here, because the commit maybe failed
@@ -580,9 +615,29 @@ public class DatabaseTransactionMgr {
                                     transactionState.getTransactionId(), tablet.getId(), successReplicaNum,
                                     quorumReplicaNum, Joiner.on(",").join(errorBackendIdsForTablet),
                                     errorReplicaInfo, commitBackends);
-                            throw new TabletQuorumFailedException(transactionState.getTransactionId(), tablet.getId(),
-                                    successReplicaNum, quorumReplicaNum,
-                                    errorBackendIdsForTablet);
+
+                            String replicasDetailMsg = "";
+                            if (!tabletSuccReplicas.isEmpty()) {
+                                replicasDetailMsg += String.format("%s replicas final succ: { %s }; ",
+                                        tabletSuccReplicas.size(), Joiner.on(", ").join(tabletSuccReplicas));
+                            }
+                            if (!tabletWriteFailedReplicas.isEmpty()) {
+                                replicasDetailMsg += String.format("%s replicas write data failed: { %s }; ",
+                                        tabletWriteFailedReplicas.size(),
+                                        Joiner.on(", ").join(tabletWriteFailedReplicas));
+                            }
+                            if (!tabletVersionFailedReplicas.isEmpty()) {
+                                replicasDetailMsg += String.format("%s replicas write data succ but miss previous "
+                                                + "version: { %s }.",
+                                        tabletVersionFailedReplicas.size(),
+                                        Joiner.on(", ").join(tabletVersionFailedReplicas));
+                            }
+
+                            throw new TabletQuorumFailedException(transactionId, String.format(
+                                        "Failed to commit txn %s, cause tablet %s succ replica num %s < quorum "
+                                                + " replica num %s. table %s, partition %s, this tablet detail: %s",
+                                        transactionId, tablet.getId(), successReplicaNum, quorumReplicaNum, tableId,
+                                        partition.getId(), replicasDetailMsg));
                         }
                     }
                 }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/transaction/TabletQuorumFailedException.java b/fe/fe-core/src/main/java/org/apache/doris/transaction/TabletQuorumFailedException.java
index 741babff4b..aef45cdcfd 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/transaction/TabletQuorumFailedException.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/transaction/TabletQuorumFailedException.java
@@ -17,27 +17,8 @@
 
 package org.apache.doris.transaction;
 
-import com.google.common.base.Joiner;
-import com.google.common.collect.Sets;
-
-import java.util.Set;
-
 public class TabletQuorumFailedException extends TransactionException {
-
-    private static final String TABLET_QUORUM_FAILED_MSG = "Failed to commit txn %s. "
-            + "Tablet [%s] success replica num %s is less than quorum "
-            + "replica num %s while error backends %s";
-
-    private long tabletId;
-    private Set<Long> errorBackendIdsForTablet = Sets.newHashSet();
-
-    public TabletQuorumFailedException(long transactionId, long tabletId,
-                                       int successReplicaNum, int quorumReplicaNum,
-                                       Set<Long> errorBackendIdsForTablet) {
-        super(String.format(TABLET_QUORUM_FAILED_MSG, transactionId, tabletId,
-                successReplicaNum, quorumReplicaNum,
-                Joiner.on(",").join(errorBackendIdsForTablet)), transactionId);
-        this.tabletId = tabletId;
-        this.errorBackendIdsForTablet = errorBackendIdsForTablet;
+    public TabletQuorumFailedException(long transactionId, String message) {
+        super(message, transactionId);
     }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to