This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new 6d6473efaed branch-2.1: [fix](tabletScheduler) Fix addTablet dead lock 
in tabletScheduler #45298 (#45769)
6d6473efaed is described below

commit 6d6473efaed7b75237f0abc5931dae33e542d4f9
Author: github-actions[bot] 
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Tue Dec 24 21:44:30 2024 +0800

    branch-2.1: [fix](tabletScheduler) Fix addTablet dead lock in 
tabletScheduler #45298 (#45769)
    
    Cherry-picked from #45298
    
    Co-authored-by: deardeng <deng...@selectdb.com>
---
 be/src/olap/task/engine_clone_task.cpp             | 10 +++++
 .../org/apache/doris/clone/TabletSchedCtx.java     | 31 ++-----------
 .../org/apache/doris/clone/TabletScheduler.java    |  9 ++--
 .../org/apache/doris/clone/TabletHealthTest.java   | 52 ++++++++++++++++++++++
 .../apache/doris/utframe/MockedBackendFactory.java |  5 +++
 5 files changed, 74 insertions(+), 33 deletions(-)

diff --git a/be/src/olap/task/engine_clone_task.cpp 
b/be/src/olap/task/engine_clone_task.cpp
index a88945ed675..259b1f7108f 100644
--- a/be/src/olap/task/engine_clone_task.cpp
+++ b/be/src/olap/task/engine_clone_task.cpp
@@ -168,6 +168,16 @@ Status EngineCloneTask::_do_clone() {
         auto duration = std::chrono::milliseconds(dp->param("duration", 10 * 
1000));
         std::this_thread::sleep_for(duration);
     });
+
+    DBUG_EXECUTE_IF("EngineCloneTask.failed_clone", {
+        LOG_WARNING("EngineCloneTask.failed_clone")
+                .tag("tablet_id", _clone_req.tablet_id)
+                .tag("replica_id", _clone_req.replica_id)
+                .tag("version", _clone_req.version);
+        return Status::InternalError(
+                "in debug point, EngineCloneTask.failed_clone tablet={}, 
replica={}, version={}",
+                _clone_req.tablet_id, _clone_req.replica_id, 
_clone_req.version);
+    });
     Status status = Status::OK();
     string src_file_path;
     TBackend src_host;
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletSchedCtx.java 
b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletSchedCtx.java
index 5e29adbb6da..d8f7bd3fb09 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletSchedCtx.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletSchedCtx.java
@@ -62,6 +62,7 @@ import java.util.Comparator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.concurrent.TimeUnit;
 
 /*
  * TabletSchedCtx contains all information which is created during tablet 
scheduler processing.
@@ -69,28 +70,6 @@ import java.util.Set;
 public class TabletSchedCtx implements Comparable<TabletSchedCtx> {
     private static final Logger LOG = 
LogManager.getLogger(TabletSchedCtx.class);
 
-    /*
-     * SCHED_FAILED_COUNTER_THRESHOLD:
-     *    threshold of times a tablet failed to be scheduled
-     *
-     * MIN_ADJUST_PRIORITY_INTERVAL_MS:
-     *    min interval time of adjusting a tablet's priority
-     *
-     * MAX_NOT_BEING_SCHEDULED_INTERVAL_MS:
-     *    max gap time of a tablet NOT being scheduled.
-     *
-     * These 3 params is for adjusting priority.
-     * If a tablet being scheduled failed for more than 
SCHED_FAILED_COUNTER_THRESHOLD times, its priority
-     * will be downgraded. And the interval between adjustment is larger than 
MIN_ADJUST_PRIORITY_INTERVAL_MS,
-     * to avoid being downgraded too soon.
-     * And if a tablet is not being scheduled longer than 
MAX_NOT_BEING_SCHEDULED_INTERVAL_MS, its priority
-     * will be upgraded, to avoid starvation.
-     *
-     */
-    private static final int SCHED_FAILED_COUNTER_THRESHOLD = 5;
-    private static final long MIN_ADJUST_PRIORITY_INTERVAL_MS = 5 * 60 * 
1000L; // 5 min
-    private static final long MAX_NOT_BEING_SCHEDULED_INTERVAL_MS = 30 * 60 * 
1000L; // 30 min
-
     /*
      * A clone task timeout is between Config.min_clone_task_timeout_sec and 
Config.max_clone_task_timeout_sec,
      * estimated by tablet size / MIN_CLONE_SPEED_MB_PER_SECOND.
@@ -450,10 +429,6 @@ public class TabletSchedCtx implements 
Comparable<TabletSchedCtx> {
         schedFailedCode = code;
     }
 
-    public CloneTask getCloneTask() {
-        return cloneTask;
-    }
-
     public long getCopySize() {
         return copySize;
     }
@@ -932,12 +907,14 @@ public class TabletSchedCtx implements 
Comparable<TabletSchedCtx> {
         }
         if (cloneTask != null) {
             AgentTaskQueue.removeTask(cloneTask.getBackendId(), 
TTaskType.CLONE, cloneTask.getSignature());
+            cloneTask = null;
 
             // clear all CLONE replicas
             Database db = Env.getCurrentInternalCatalog().getDbNullable(dbId);
             if (db != null) {
                 Table table = db.getTableNullable(tblId);
-                if (table != null && table.writeLockIfExist()) {
+                // try get table write lock, if failed TabletScheduler will 
try next time
+                if (table != null && 
table.tryWriteLockIfExist(Table.TRY_LOCK_TIMEOUT_MS, TimeUnit.MILLISECONDS)) {
                     try {
                         List<Replica> cloneReplicas = Lists.newArrayList();
                         tablet.getReplicas().stream().filter(r -> r.getState() 
== ReplicaState.CLONE).forEach(r -> {
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java 
b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java
index 724d51c1791..ba10e5edf86 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java
@@ -104,9 +104,6 @@ import java.util.stream.Collectors;
 public class TabletScheduler extends MasterDaemon {
     private static final Logger LOG = 
LogManager.getLogger(TabletScheduler.class);
 
-    // handle at most BATCH_NUM tablets in one loop
-    private static final int MIN_BATCH_NUM = 50;
-
     // the minimum interval of updating cluster statistics and priority of 
tablet info
     private static final long STAT_UPDATE_INTERVAL_MS = 20 * 1000; // 20s
 
@@ -150,7 +147,7 @@ public class TabletScheduler extends MasterDaemon {
         ADDED, // success to add
         ALREADY_IN, // already added, skip
         LIMIT_EXCEED, // number of pending tablets exceed the limit
-        REPLACE_ADDED,  // succ to add, and envit a lowest task
+        REPLACE_ADDED,  // succ to add, and evict a lowest task
         DISABLED // scheduler has been disabled.
     }
 
@@ -285,7 +282,7 @@ public class TabletScheduler extends MasterDaemon {
             addResult = AddResult.REPLACE_ADDED;
             pendingTablets.pollLast();
             finalizeTabletCtx(lowestPriorityTablet, 
TabletSchedCtx.State.CANCELLED, Status.UNRECOVERABLE,
-                    "envit lower priority sched tablet because pending queue 
is full");
+                    "evict lower priority sched tablet because pending queue 
is full");
         }
 
         if (!contains || tablet.getType() == TabletSchedCtx.Type.REPAIR) {
@@ -1845,9 +1842,9 @@ public class TabletScheduler extends MasterDaemon {
                 tabletCtx.increaseFailedRunningCounter();
                 if (!tabletCtx.isExceedFailedRunningLimit()) {
                     stat.counterCloneTaskFailed.incrementAndGet();
+                    tabletCtx.setState(TabletSchedCtx.State.PENDING);
                     tabletCtx.releaseResource(this);
                     tabletCtx.resetFailedSchedCounter();
-                    tabletCtx.setState(TabletSchedCtx.State.PENDING);
                     addBackToPendingTablets(tabletCtx);
                     return false;
                 } else {
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/clone/TabletHealthTest.java 
b/fe/fe-core/src/test/java/org/apache/doris/clone/TabletHealthTest.java
index b22925e5d89..320bff45229 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/clone/TabletHealthTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/clone/TabletHealthTest.java
@@ -40,12 +40,14 @@ import org.apache.doris.utframe.TestWithFeService;
 import com.google.common.base.Joiner;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
+import com.google.common.collect.MinMaxPriorityQueue;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
 
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.concurrent.TimeUnit;
 import java.util.stream.Collectors;
 
 public class TabletHealthTest extends TestWithFeService {
@@ -78,6 +80,8 @@ public class TabletHealthTest extends TestWithFeService {
 
     @Override
     protected void runBeforeEach() throws Exception {
+        // set back to default value
+        Config.max_scheduling_tablets = 2000;
         for (Table table : db.getTables()) {
             dropTable(table.getName(), true);
         }
@@ -358,4 +362,52 @@ public class TabletHealthTest extends TestWithFeService {
 
         dropTable(table.getName(), true);
     }
+
+    @Test
+    public void testAddTabletNoDeadLock() throws Exception {
+        Config.max_scheduling_tablets = 1;
+        createTable("CREATE TABLE tbl3 (k INT) DISTRIBUTED BY HASH(k) BUCKETS 
2"
+                + " PROPERTIES ('replication_num' = '3')");
+        
DebugPointUtil.addDebugPoint("MockedBackendFactory.handleCloneTablet.failed");
+        OlapTable table = (OlapTable) db.getTableOrMetaException("tbl3");
+        Partition partition = table.getPartitions().iterator().next();
+        List<Tablet> tablets = 
partition.getMaterializedIndices(IndexExtState.ALL).iterator().next().getTablets();
+        Assertions.assertEquals(2, tablets.size());
+
+        partition.updateVisibleVersion(10L);
+        tablets.forEach(tablet -> tablet.getReplicas().forEach(replica -> 
replica.updateVersion(10)));
+
+        Tablet tabletA = tablets.get(0);
+        Tablet tabletB = tablets.get(1);
+        TabletScheduler scheduler = Env.getCurrentEnv().getTabletScheduler();
+        tabletA.getReplicas().get(0).adminUpdateVersionInfo(8L, null, null, 
0L);
+        checkTabletStatus(tabletA, TabletStatus.VERSION_INCOMPLETE, table, 
partition);
+        Env.getCurrentEnv().getTabletChecker().runAfterCatalogReady();
+        Env.getCurrentEnv().getTabletScheduler().runAfterCatalogReady();
+        Thread.sleep(1000);
+        MinMaxPriorityQueue<TabletSchedCtx> queue = 
scheduler.getPendingTabletQueue();
+        TabletSchedCtx tabletACtx = queue.peekFirst();
+        Assertions.assertNotNull(tabletACtx);
+        tabletACtx.setLastVisitedTime(System.currentTimeMillis() + 3600 * 
1000L);
+        tabletB.getReplicas().get(0).adminUpdateVersionInfo(8L, null, null, 
0L);
+        checkTabletStatus(tabletB, TabletStatus.VERSION_INCOMPLETE, table, 
partition);
+        Thread thread = new Thread(() -> {
+            try {
+                Env.getCurrentEnv().getTabletChecker().runAfterCatalogReady();
+                
Env.getCurrentEnv().getTabletScheduler().runAfterCatalogReady();
+            } catch (Exception e) {
+                e.printStackTrace();
+            }
+        });
+        thread.start();
+        Thread.sleep(1000);
+        Assertions.assertTrue(table.tryWriteLock(2, TimeUnit.SECONDS));
+        table.writeUnlock();
+        DebugPointUtil.clearDebugPoints();
+        doRepair();
+        Thread.sleep(1000);
+        doRepair();
+        checkTabletIsHealth(tabletA, table, partition);
+        checkTabletIsHealth(tabletB, table, partition);
+    }
 }
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/utframe/MockedBackendFactory.java 
b/fe/fe-core/src/test/java/org/apache/doris/utframe/MockedBackendFactory.java
index 93ce8b6766b..4e2a9fa056b 100644
--- 
a/fe/fe-core/src/test/java/org/apache/doris/utframe/MockedBackendFactory.java
+++ 
b/fe/fe-core/src/test/java/org/apache/doris/utframe/MockedBackendFactory.java
@@ -83,6 +83,7 @@ import io.grpc.stub.StreamObserver;
 import org.apache.thrift.TException;
 
 import java.io.IOException;
+import java.util.Collections;
 import java.util.List;
 import java.util.Random;
 import java.util.concurrent.BlockingQueue;
@@ -293,6 +294,10 @@ public class MockedBackendFactory {
                     tabletInfo.setPathHash(pathHash);
                     tabletInfo.setUsed(true);
                     tabletInfos.add(tabletInfo);
+                    if 
(DebugPointUtil.isEnable("MockedBackendFactory.handleCloneTablet.failed")) {
+                        finishTaskRequest.setTaskStatus(new 
TStatus(TStatusCode.CANCELLED));
+                        
finishTaskRequest.getTaskStatus().setErrorMsgs(Collections.singletonList("debug 
point set"));
+                    }
                     finishTaskRequest.setFinishTabletInfos(tabletInfos);
                 }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to