This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 156f7b7699 [improvement](transaction) make commit txn fail hint more understandable (#23227) 156f7b7699 is described below commit 156f7b76999a09585b2cb906540fdb90cbc8b67f Author: yujun <yu.jun.re...@gmail.com> AuthorDate: Wed Aug 23 21:50:24 2023 +0800 [improvement](transaction) make commit txn fail hint more understandable (#23227) --- .../org/apache/doris/planner/OlapTableSink.java | 3 +- .../doris/transaction/DatabaseTransactionMgr.java | 61 ++++++++++++++++++++-- .../transaction/TabletQuorumFailedException.java | 23 +------- 3 files changed, 62 insertions(+), 25 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/OlapTableSink.java b/fe/fe-core/src/main/java/org/apache/doris/planner/OlapTableSink.java index 8f15f4ea7a..2012475083 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/OlapTableSink.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/OlapTableSink.java @@ -410,7 +410,8 @@ public class OlapTableSink extends DataSink { Multimap<Long, Long> bePathsMap = tablet.getNormalReplicaBackendPathMap(); if (bePathsMap.keySet().size() < quorum) { throw new UserException(InternalErrorCode.REPLICA_FEW_ERR, - "tablet " + tablet.getId() + " has few replicas: " + bePathsMap.keySet().size() + "tablet " + tablet.getId() + " alive replica num " + bePathsMap.keySet().size() + + " < quorum replica num " + quorum + ", alive backends: [" + StringUtils.join(bePathsMap.keySet(), ",") + "]"); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java b/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java index 293b49368a..bd125d1dac 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java @@ -81,6 +81,7 @@ import java.util.Map; import java.util.Set; import 
java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.function.Function; import java.util.stream.Collectors; /** @@ -434,6 +435,7 @@ public class DatabaseTransactionMgr { Set<Long> errorReplicaIds, Map<Long, Set<Long>> tableToPartition, Set<Long> totalInvolvedBackends) throws UserException { + long transactionId = transactionState.getTransactionId(); Database db = env.getInternalCatalog().getDbOrMetaException(dbId); // update transaction state extra if exists @@ -490,6 +492,33 @@ public class DatabaseTransactionMgr { } tabletToBackends.get(tabletId).add(tabletCommitInfos.get(i).getBackendId()); } + List<String> tabletSuccReplicas = Lists.newArrayList(); + List<String> tabletWriteFailedReplicas = Lists.newArrayList(); + List<String> tabletVersionFailedReplicas = Lists.newArrayList(); + Function<Replica, String> getReplicaInfo = replica -> { + StringBuilder strBuffer = new StringBuilder("[replicaId="); + strBuffer.append(replica.getId()); + strBuffer.append(", backendId="); + strBuffer.append(replica.getBackendId()); + strBuffer.append(", backendAlive="); + strBuffer.append(Env.getCurrentSystemInfo().checkBackendAlive(replica.getBackendId())); + strBuffer.append(", version="); + strBuffer.append(replica.getVersion()); + if (replica.getLastFailedVersion() >= 0) { + strBuffer.append(", lastFailedVersion="); + strBuffer.append(replica.getLastFailedVersion()); + strBuffer.append(", lastSuccessVersion="); + strBuffer.append(replica.getLastSuccessVersion()); + strBuffer.append(", lastFailedTimestamp="); + strBuffer.append(replica.getLastFailedTimestamp()); + } + strBuffer.append(", state="); + strBuffer.append(replica.getState().name()); + strBuffer.append("]"); + + return strBuffer.toString(); + }; + for (long tableId : tableToPartition.keySet()) { OlapTable table = (OlapTable) db.getTableOrMetaException(tableId); for (Partition partition : table.getAllPartitions()) { @@ -533,6 +562,9 @@ public class 
DatabaseTransactionMgr { .getReplicaAllocation(partition.getId()).getTotalReplicaNum() / 2 + 1; for (MaterializedIndex index : allIndices) { for (Tablet tablet : index.getTablets()) { + tabletSuccReplicas.clear(); + tabletWriteFailedReplicas.clear(); + tabletVersionFailedReplicas.clear(); int successReplicaNum = 0; long tabletId = tablet.getId(); Set<Long> tabletBackends = tablet.getBackendIds(); @@ -558,11 +590,14 @@ public class DatabaseTransactionMgr { // for example, a replica is in clone state if (replica.getLastFailedVersion() < 0) { ++successReplicaNum; + tabletSuccReplicas.add(getReplicaInfo.apply(replica)); } else { errorReplicaInfo += " replica [" + replica.getId() + "], lastFailedVersion [" + replica.getLastFailedVersion() + "]"; + tabletVersionFailedReplicas.add(getReplicaInfo.apply(replica)); } } else { + tabletWriteFailedReplicas.add(getReplicaInfo.apply(replica)); errorBackendIdsForTablet.add(tabletBackend); errorReplicaIds.add(replica.getId()); // not remove rollup task here, because the commit maybe failed @@ -580,9 +615,29 @@ public class DatabaseTransactionMgr { transactionState.getTransactionId(), tablet.getId(), successReplicaNum, quorumReplicaNum, Joiner.on(",").join(errorBackendIdsForTablet), errorReplicaInfo, commitBackends); - throw new TabletQuorumFailedException(transactionState.getTransactionId(), tablet.getId(), - successReplicaNum, quorumReplicaNum, - errorBackendIdsForTablet); + + String replicasDetailMsg = ""; + if (!tabletSuccReplicas.isEmpty()) { + replicasDetailMsg += String.format("%s replicas final succ: { %s }; ", + tabletSuccReplicas.size(), Joiner.on(", ").join(tabletSuccReplicas)); + } + if (!tabletWriteFailedReplicas.isEmpty()) { + replicasDetailMsg += String.format("%s replicas write data failed: { %s }; ", + tabletWriteFailedReplicas.size(), + Joiner.on(", ").join(tabletWriteFailedReplicas)); + } + if (!tabletVersionFailedReplicas.isEmpty()) { + replicasDetailMsg += String.format("%s replicas write data succ but miss 
previous " + + "version: { %s }.", + tabletVersionFailedReplicas.size(), + Joiner.on(", ").join(tabletVersionFailedReplicas)); + } + + throw new TabletQuorumFailedException(transactionId, String.format( + "Failed to commit txn %s, cause tablet %s succ replica num %s < quorum " + + " replica num %s. table %s, partition %s, this tablet detail: %s", + transactionId, tablet.getId(), successReplicaNum, quorumReplicaNum, tableId, + partition.getId(), replicasDetailMsg)); } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/transaction/TabletQuorumFailedException.java b/fe/fe-core/src/main/java/org/apache/doris/transaction/TabletQuorumFailedException.java index 741babff4b..aef45cdcfd 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/transaction/TabletQuorumFailedException.java +++ b/fe/fe-core/src/main/java/org/apache/doris/transaction/TabletQuorumFailedException.java @@ -17,27 +17,8 @@ package org.apache.doris.transaction; -import com.google.common.base.Joiner; -import com.google.common.collect.Sets; - -import java.util.Set; - public class TabletQuorumFailedException extends TransactionException { - - private static final String TABLET_QUORUM_FAILED_MSG = "Failed to commit txn %s. 
" - + "Tablet [%s] success replica num %s is less than quorum " - + "replica num %s while error backends %s"; - - private long tabletId; - private Set<Long> errorBackendIdsForTablet = Sets.newHashSet(); - - public TabletQuorumFailedException(long transactionId, long tabletId, - int successReplicaNum, int quorumReplicaNum, - Set<Long> errorBackendIdsForTablet) { - super(String.format(TABLET_QUORUM_FAILED_MSG, transactionId, tabletId, - successReplicaNum, quorumReplicaNum, - Joiner.on(",").join(errorBackendIdsForTablet)), transactionId); - this.tabletId = tabletId; - this.errorBackendIdsForTablet = errorBackendIdsForTablet; + public TabletQuorumFailedException(long transactionId, String message) { + super(message, transactionId); } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org