This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.1 by this push:
     new 6f6da3250b4 [fix](balance) Fix PartitionRebalancer generating invalid 
moves to BEs without required storage medium (#62206) (#63756)
6f6da3250b4 is described below

commit 6f6da3250b49f4893fb70ac70618db515c887579
Author: deardeng <[email protected]>
AuthorDate: Thu May 28 13:39:31 2026 +0800

    [fix](balance) Fix PartitionRebalancer generating invalid moves to BEs 
without required storage medium (#62206) (#63756)
    
    pick from https://github.com/apache/doris/pull/62206
    
    When tablet_rebalancer_type=Partition, adding a new BE with only HDD
    disks to a cluster where tables use SSD storage medium causes the
    PartitionRebalancer to generate invalid moves (SSD tablets -> HDD-only
    BE), resulting in infinite "paths has no available balance slot: []"
    scheduling failures.
    
    Root cause:
    1. In LoadStatisticForTag.init(), beByTotalReplicaCount for each medium
    includes ALL available BEs without checking hasMedium(). This causes the
    greedy algorithm to consider HDD-only BEs as valid destinations for SSD
    tablets.
    2. In LocalTabletInvertedIndex.buildPartitionInfoBySkew(), the countMap
    initialization uses all availableBeIds without medium filtering, so
    HDD-only BEs get counted with 0 replicas for SSD partitions, making them
    appear as the "least loaded" and preferred move target.
    
    Fix:
    1. Add hasMedium() filter in LoadStatisticForTag.init() when building
    beByTotalReplicaCount, so only BEs that actually have the required
    medium are considered for balancing.
    2. Add availableBeIdsByMedium parameter to buildPartitionInfoBySkew()
    and use it to initialize countMap with only medium-matching BEs,
    preventing BEs without the required medium from appearing in the skew
    calculation.
    
    ### What problem does this PR solve?
    
    Issue Number: close #xxx
    
    Related PR: #62206
    
    Problem Summary:
    
    ### Release note
    
    None
    
    ### Check List (For Author)
    
    - Test <!-- At least one of them must be included. -->
        - [ ] Regression test
        - [ ] Unit Test
        - [ ] Manual test (add detailed scripts or steps below)
        - [ ] No need to test or manual test. Explain why:
            - [ ] This is a refactor/code format and no logic has been changed.
            - [ ] Previous test can cover this change.
            - [ ] No code files have been changed.
            - [ ] Other reason <!-- Add your reason?  -->
    
    - Behavior changed:
        - [ ] No.
        - [ ] Yes. <!-- Explain the behavior change -->
    
    - Does this need documentation?
        - [ ] No.
        - [ ] Yes. <!-- Add document PR link here. eg:
    https://github.com/apache/doris-website/pull/1214 -->
    
    ### Check List (For Reviewer who merge this PR)
    
    - [ ] Confirm the release note
    - [ ] Confirm test cases
    - [ ] Confirm document
    - [ ] Add branch pick label <!-- Add branch pick label that this PR
    should merge into -->
---
 docker/runtime/doris-compose/command.py            |  15 +-
 .../doris/catalog/LocalTabletInvertedIndex.java    |  13 +-
 .../apache/doris/catalog/TabletInvertedIndex.java  |   4 +-
 .../apache/doris/clone/LoadStatisticForTag.java    |  13 +-
 .../apache/doris/clone/PartitionRebalancer.java    |   2 +-
 .../java/org/apache/doris/clone/RebalanceTest.java |  81 ++++++++++
 .../org/apache/doris/clone/RebalancerTestUtil.java |  16 ++
 .../doris/regression/suite/SuiteCluster.groovy     |  31 ++--
 ...est_partition_rebalancer_medium_mismatch.groovy | 178 +++++++++++++++++++++
 9 files changed, 332 insertions(+), 21 deletions(-)

diff --git a/docker/runtime/doris-compose/command.py 
b/docker/runtime/doris-compose/command.py
index 6acbe479532..c7883454899 100644
--- a/docker/runtime/doris-compose/command.py
+++ b/docker/runtime/doris-compose/command.py
@@ -391,7 +391,7 @@ class UpCommand(Command):
         )
         group1.add_argument("--be-disks",
                             nargs="*",
-                            default=["HDD=1"],
+                            default=None,
                             type=str,
                             help="Specify each be disks, each group is 
\"disk_type=disk_num[,disk_capactity]\", "\
                                     "disk_type is HDD or SSD, disk_capactity 
is capactity limit in gb. default: HDD=1. "\
@@ -628,7 +628,8 @@ class UpCommand(Command):
                 args.NAME, args.IMAGE, args.cloud, args.root, args.fe_config,
                 args.be_config, args.ms_config, args.recycle_config,
                 args.remote_master_fe, args.local_network_ip, args.fe_follower,
-                args.be_disks, args.be_cluster, args.reg_be, args.extra_hosts,
+                args.be_disks if args.be_disks is not None else ["HDD=1"],
+                args.be_cluster, args.reg_be, args.extra_hosts,
                 args.coverage_dir, cloud_store_config, args.sql_mode_node_mgr,
                 args.be_metaservice_endpoint, args.be_cluster_id, args.tde_ak, 
args.tde_sk)
             LOG.info("Create new cluster {} succ, cluster path is {}".format(
@@ -669,9 +670,19 @@ class UpCommand(Command):
                 related_nodes.append(node)
                 add_ids.append(node.id)
 
+        # If --be-disks is explicitly provided for an existing cluster,
+        # temporarily override cluster.be_disks so newly added BEs use
+        # the specified disk config instead of the original cluster config.
+        saved_be_disks = cluster.be_disks
+        if args.be_disks is not None:
+            cluster.be_disks = args.be_disks
+
         for node_type, add_num, add_ids in add_type_nums:
             do_add_node(node_type, add_num, add_ids)
 
+        # Restore original be_disks to avoid side effects
+        cluster.be_disks = saved_be_disks
+
         if args.IMAGE:
             for node in related_nodes:
                 node.set_image(args.IMAGE)
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/LocalTabletInvertedIndex.java
 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/LocalTabletInvertedIndex.java
index 27d1b6ba675..44a14322243 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/LocalTabletInvertedIndex.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/LocalTabletInvertedIndex.java
@@ -925,7 +925,9 @@ public class LocalTabletInvertedIndex extends 
TabletInvertedIndex {
     // Only build from available bes, exclude colocate tables
     @Override
     public Map<TStorageMedium, TreeMultimap<Long, PartitionBalanceInfo>> 
buildPartitionInfoBySkew(
-            List<Long> availableBeIds, Map<Long, Pair<TabletMove, Long>> 
movesInProgress) {
+            List<Long> availableBeIds,
+            Map<TStorageMedium, List<Long>> availableBeIdsByMedium,
+            Map<Long, Pair<TabletMove, Long>> movesInProgress) {
         Set<Long> dbIds = Sets.newHashSet();
         Set<Long> tableIds = Sets.newHashSet();
         Set<Long> partitionIds = Sets.newHashSet();
@@ -986,11 +988,14 @@ public class LocalTabletInvertedIndex extends 
TabletInvertedIndex {
                     Map<Long, Long> countMap = partitionReplicasInfo.get(
                             tabletMeta.getPartitionId(), 
tabletMeta.getIndexId());
                     if (countMap == null) {
-                        // If one be doesn't have any replica of one 
partition, it should be counted too.
-                        countMap = 
availableBeIds.stream().collect(Collectors.toMap(i -> i, i -> 0L));
+                        // If one be doesn't have any replica of one partition,
+                        // it should be counted too.
+                        List<Long> availableBeIdsForMedium = 
availableBeIdsByMedium.getOrDefault(
+                                medium, Lists.newArrayList());
+                        countMap = 
availableBeIdsForMedium.stream().collect(Collectors.toMap(i -> i, i -> 0L));
                     }
 
-                    Long count = countMap.get(beId);
+                    Long count = countMap.getOrDefault(beId, 0L);
                     countMap.put(beId, count + 1L);
                     partitionReplicasInfo.put(tabletMeta.getPartitionId(), 
tabletMeta.getIndexId(), countMap);
                     partitionReplicasInfoMaps.put(medium, 
partitionReplicasInfo);
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/TabletInvertedIndex.java 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/TabletInvertedIndex.java
index 420c1087066..0e1e68ef9a7 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/TabletInvertedIndex.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/TabletInvertedIndex.java
@@ -199,7 +199,9 @@ public abstract class TabletInvertedIndex {
     }
 
     public Map<TStorageMedium, TreeMultimap<Long, PartitionBalanceInfo>> 
buildPartitionInfoBySkew(
-            List<Long> availableBeIds, Map<Long, Pair<TabletMove, Long>> 
movesInProgress) {
+            List<Long> availableBeIds,
+            Map<TStorageMedium, List<Long>> availableBeIdsByMedium,
+            Map<Long, Pair<TabletMove, Long>> movesInProgress) {
         throw new UnsupportedOperationException("buildPartitionInfoBySkew is 
not supported in TabletInvertedIndex");
     }
 
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/clone/LoadStatisticForTag.java 
b/fe/fe-core/src/main/java/org/apache/doris/clone/LoadStatisticForTag.java
index 60a0d147917..e731d0701e0 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/clone/LoadStatisticForTag.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/clone/LoadStatisticForTag.java
@@ -159,7 +159,8 @@ public class LoadStatisticForTag {
         // Only count the available be
         for (TStorageMedium medium : TStorageMedium.values()) {
             TreeMultimap<Long, Long> beByTotalReplicaCount = 
TreeMultimap.create();
-            
beLoadStatistics.stream().filter(BackendLoadStatistic::isAvailable).forEach(beStat
 ->
+            beLoadStatistics.stream().filter(BackendLoadStatistic::isAvailable)
+                    .filter(beStat -> beStat.hasMedium(medium)).forEach(beStat 
->
                     beByTotalReplicaCount.put(beStat.getReplicaNum(medium), 
beStat.getBeId()));
             beByTotalReplicaCountMaps.put(medium, beByTotalReplicaCount);
         }
@@ -173,9 +174,17 @@ public class LoadStatisticForTag {
                     .filter(BackendLoadStatistic::isAvailable)
                     .map(BackendLoadStatistic::getBeId)
                     .collect(Collectors.toList());
+            Map<TStorageMedium, List<Long>> availableBeIdsByMedium = 
Maps.newHashMap();
+            for (TStorageMedium medium : TStorageMedium.values()) {
+                availableBeIdsByMedium.put(medium, beLoadStatistics.stream()
+                        .filter(BackendLoadStatistic::isAvailable)
+                        .filter(be -> be.hasMedium(medium))
+                        .map(BackendLoadStatistic::getBeId)
+                        .collect(Collectors.toList()));
+            }
             Map<Long, Pair<TabletMove, Long>> movesInProgress = rebalancer == 
null ? Maps.newHashMap()
                     : ((PartitionRebalancer) rebalancer).getMovesInProgress();
-            skewMaps = invertedIndex.buildPartitionInfoBySkew(availableBeIds, 
movesInProgress);
+            skewMaps = invertedIndex.buildPartitionInfoBySkew(availableBeIds, 
availableBeIdsByMedium, movesInProgress);
         }
     }
 
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/clone/PartitionRebalancer.java 
b/fe/fe-core/src/main/java/org/apache/doris/clone/PartitionRebalancer.java
index 30a7a76b920..96db70cf76f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/clone/PartitionRebalancer.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/clone/PartitionRebalancer.java
@@ -305,7 +305,7 @@ public class PartitionRebalancer extends Rebalancer {
 
             List<RootPathLoadStatistic> paths = beStat.getPathStatistics();
             List<Long> availPath = paths.stream().filter(path -> 
path.getStorageMedium() == tabletCtx.getStorageMedium()
-                            && path.isFit(tabletCtx.getTabletSize(), false) == 
BalanceStatus.OK)
+                            && path.isFit(tabletCtx.getTabletSize(), 
false).ok())
                     
.map(RootPathLoadStatistic::getPathHash).collect(Collectors.toList());
             long pathHash = slot.takeAnAvailBalanceSlotFrom(availPath, 
tabletCtx.getTag(),
                     tabletCtx.getStorageMedium());
diff --git a/fe/fe-core/src/test/java/org/apache/doris/clone/RebalanceTest.java 
b/fe/fe-core/src/test/java/org/apache/doris/clone/RebalanceTest.java
index 4e91ec5be3b..87ece62b757 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/clone/RebalanceTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/clone/RebalanceTest.java
@@ -323,6 +323,87 @@ public class RebalanceTest {
         Assert.assertEquals(needCheckTablets.size(), succeeded.get());
     }
 
+    // Test for OPENSOURCE-192: PartitionRebalancer should not generate moves
+    // targeting a BE that lacks the required storage medium.
+    // Scenario: SSD tablets on BE 20001/20002, new BE 20003 has only HDD.
+    // Without the fix, the algorithm would pick BE 20003 (0 SSD replicas) as 
the
+    // "least loaded" destination for SSD tablets, causing infinite scheduling 
failures.
+    @Test
+    public void testPartitionRebalancerSkipBEWithoutMedium() {
+        Configurator.setLevel("org.apache.doris.clone.PartitionRebalancer", 
Level.DEBUG);
+
+        // Add backends: 20001, 20002 have SSD; 20003 has only HDD
+        systemInfoService.addBackend(
+                RebalancerTestUtil.createBackend(20001L, 2048, 0, 
TStorageMedium.SSD));
+        systemInfoService.addBackend(
+                RebalancerTestUtil.createBackend(20002L, 2048, 0, 
TStorageMedium.SSD));
+        systemInfoService.addBackend(
+                RebalancerTestUtil.createBackend(20003L, 2048, 0, 
TStorageMedium.HDD));
+
+        // Create a table with SSD partition
+        OlapTable ssdTable = new OlapTable(3, "ssd table", new ArrayList<>(),
+                KeysType.DUP_KEYS, new RangePartitionInfo(), new 
HashDistributionInfo());
+        db.registerTable(ssdTable);
+
+        MaterializedIndex ssdIndex = new MaterializedIndex(ssdTable.getId(), 
null);
+        long partId = 41;
+        Partition partition = new Partition(partId, "p0", ssdIndex, new 
HashDistributionInfo());
+        ssdTable.addPartition(partition);
+        ssdTable.getPartitionInfo().addPartition(partId, new 
DataProperty(TStorageMedium.SSD),
+                ReplicaAllocation.DEFAULT_ALLOCATION, false, true);
+        ssdTable.setIndexMeta(ssdIndex.getId(), "ssd index", 
Lists.newArrayList(new Column()),
+                0, 0, (short) 0, TStorageType.COLUMN, KeysType.DUP_KEYS);
+
+        // Create SSD tablets: 3 replicas on BE 20001, 1 on BE 20002
+        // This creates skew = 3 - 1 = 2 among SSD BEs (with fix),
+        // or skew = 3 - 0 = 3 counting HDD-only BEs (without fix)
+        RebalancerTestUtil.createTablet(invertedIndex, db, ssdTable, "p0", 
TStorageMedium.SSD,
+                80001, Lists.newArrayList(20001L));
+        RebalancerTestUtil.createTablet(invertedIndex, db, ssdTable, "p0", 
TStorageMedium.SSD,
+                80002, Lists.newArrayList(20001L));
+        RebalancerTestUtil.createTablet(invertedIndex, db, ssdTable, "p0", 
TStorageMedium.SSD,
+                80003, Lists.newArrayList(20001L));
+        RebalancerTestUtil.createTablet(invertedIndex, db, ssdTable, "p0", 
TStorageMedium.SSD,
+                80004, Lists.newArrayList(20002L));
+
+        // Regenerate statistics with partition rebalancer
+        Config.tablet_rebalancer_type = "partition";
+        LoadStatisticForTag loadStatistic = new LoadStatisticForTag(
+                Tag.DEFAULT_BACKEND_TAG, systemInfoService, invertedIndex, 
null);
+        loadStatistic.init();
+        Map<Tag, LoadStatisticForTag> ssdStatMap = Maps.newHashMap();
+        ssdStatMap.put(Tag.DEFAULT_BACKEND_TAG, loadStatistic);
+
+        PartitionRebalancer rebalancer = new 
PartitionRebalancer(Env.getCurrentSystemInfo(),
+                Env.getCurrentInvertedIndex(), null);
+        rebalancer.updateLoadStatistic(ssdStatMap);
+        rebalancer.selectAlternativeTablets();
+
+        // Verify: moves were generated (test is meaningful)
+        Map<Long, Pair<PartitionRebalancer.TabletMove, Long>> moves = 
rebalancer.getMovesInProgress();
+        Assert.assertFalse("Should generate moves for skewed SSD partition", 
moves.isEmpty());
+
+        // Verify: no move targets BE 20003 (HDD-only) or any of the HDD BEs 
from setUp (10001-10004)
+        for (Map.Entry<Long, Pair<PartitionRebalancer.TabletMove, Long>> entry 
: moves.entrySet()) {
+            PartitionRebalancer.TabletMove move = entry.getValue().first;
+            Assert.assertNotEquals("Move should not target HDD-only BE for SSD 
tablet",
+                    Long.valueOf(20003L), move.toBe);
+            Assert.assertFalse("Move should not target any BE without SSD",
+                    move.toBe == 10001L || move.toBe == 10002L
+                            || move.toBe == 10003L || move.toBe == 10004L);
+        }
+
+        // Verify: all moves go from BE 20001 (most loaded) to BE 20002 (least 
loaded with SSD)
+        for (Map.Entry<Long, Pair<PartitionRebalancer.TabletMove, Long>> entry 
: moves.entrySet()) {
+            PartitionRebalancer.TabletMove move = entry.getValue().first;
+            Assert.assertEquals("Source should be the most loaded SSD BE",
+                    Long.valueOf(20001L), move.fromBe);
+            Assert.assertEquals("Dest should be the least loaded SSD BE",
+                    Long.valueOf(20002L), move.toBe);
+        }
+        LOG.info("testPartitionRebalancerSkipBEWithoutMedium success");
+    }
+
     @Test
     public void testMoveInProgressMap() {
         Configurator.setLevel("org.apache.doris.clone.MovesInProgressCache", 
Level.DEBUG);
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/clone/RebalancerTestUtil.java 
b/fe/fe-core/src/test/java/org/apache/doris/clone/RebalancerTestUtil.java
index 6f1d14ebf59..0864a996ee8 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/clone/RebalancerTestUtil.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/clone/RebalancerTestUtil.java
@@ -51,6 +51,22 @@ public class RebalancerTestUtil {
         return createBackend(id, totalCap, Lists.newArrayList(usedCap), 1);
     }
 
+    // Add only one path with specified storage medium, PathHash:id
+    public static Backend createBackend(long id, long totalCap, long usedCap, 
TStorageMedium medium) {
+        Backend be = new Backend(id, "192.168.0." + id, 9051);
+        Map<String, DiskInfo> disks = Maps.newHashMap();
+        DiskInfo diskInfo = new DiskInfo("/path1");
+        diskInfo.setPathHash(id);
+        diskInfo.setTotalCapacityB(totalCap);
+        diskInfo.setDataUsedCapacityB(usedCap);
+        diskInfo.setAvailableCapacityB(totalCap - usedCap);
+        diskInfo.setStorageMedium(medium);
+        disks.put(diskInfo.getRootPath(), diskInfo);
+        be.setDisks(ImmutableMap.copyOf(disks));
+        be.setAlive(true);
+        return be;
+    }
+
     /**
      * size of usedCaps should equal to diskNum.
      */
diff --git 
a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteCluster.groovy
 
b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteCluster.groovy
index dd1c2b8f2fd..da4617ff5d3 100644
--- 
a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteCluster.groovy
+++ 
b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteCluster.groovy
@@ -524,10 +524,15 @@ class SuiteCluster {
     }
 
     List<Integer> addFrontend(int num, boolean followerMode=false) throws 
Exception {
-        def result = add(num, 0, null, followerMode)
+        def result = add(0, num, '', false, null)
         return result.first
     }
 
+    List<Integer> addBackend(int num, List<String> beDisks) throws Exception {
+        def result = add(0, num, '', false, beDisks)
+        return result.second
+    }
+
     List<Integer> addBackend(int num, String ClusterName='') throws Exception {
         def result = add(0, num, ClusterName)
         return result.second
@@ -535,29 +540,33 @@ class SuiteCluster {
 
     // ATTN: clusterName just used for cloud mode, 1 cluster has n bes
     // ATTN: followerMode just used for cloud mode
-    Tuple2<List<Integer>, List<Integer>> add(int feNum, int beNum, String 
clusterName, boolean followerMode=false) throws Exception {
+    // ATTN: beDisks just used for not cloud mode
+    Tuple2<List<Integer>, List<Integer>> add(int feNum, int beNum, String 
clusterName, boolean followerMode=false, List<String> beDisks=null) throws 
Exception {
         assert feNum > 0 || beNum > 0
 
-        def sb = new StringBuilder()
-        sb.append('up ' + name + ' ')
+        def cmd = ['up', name]
         if (feNum > 0) {
-            sb.append('--add-fe-num ' + feNum + ' ')
+            cmd += ['--add-fe-num', String.valueOf(feNum)]
             if (followerMode) {
-                sb.append('--fe-follower' + ' ')
+                cmd += ['--fe-follower']
             }
             if (sqlModeNodeMgr) {
-                sb.append('--sql-mode-node-mgr' + ' ')
+                cmd += ['--sql-mode-node-mgr']
             }
         }
         if (beNum > 0) {
-            sb.append('--add-be-num ' + beNum + ' ')
+            cmd += ['--add-be-num', String.valueOf(beNum)]
             if (clusterName != null && !clusterName.isEmpty()) {
-                sb.append(' --be-cluster ' + clusterName + ' ')
+                cmd += ['--be-cluster', clusterName]
             }
         }
-        sb.append('--wait-timeout 60')
+        if (beDisks != null && !beDisks.isEmpty()) {
+            cmd += ['--be-disks']
+            cmd += beDisks
+        }
+        cmd += ['--wait-timeout', '60']
 
-        def data = (Map<String, Map<String, Object>>) runCmd(sb.toString(), 
180)
+        def data = (Map<String, Map<String, Object>>) runCmdList(cmd, 180)
         def newFrontends = (List<Integer>) data.get('fe').get('add_list')
         def newBackends = (List<Integer>) data.get('be').get('add_list')
 
diff --git 
a/regression-test/suites/storage_medium_p0/test_partition_rebalancer_medium_mismatch.groovy
 
b/regression-test/suites/storage_medium_p0/test_partition_rebalancer_medium_mismatch.groovy
new file mode 100644
index 00000000000..5aceb148914
--- /dev/null
+++ 
b/regression-test/suites/storage_medium_p0/test_partition_rebalancer_medium_mismatch.groovy
@@ -0,0 +1,178 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import org.apache.doris.regression.suite.ClusterOptions
+
+/**
+ * Reproduce OPENSOURCE-192:
+ * When tablet_rebalancer_type=Partition, adding a new BE with only HDD disks
+ * to a cluster where tables are created with storage_medium=SSD causes
+ * the PartitionRebalancer to generate invalid moves (SSD tablets -> HDD-only 
BE),
+ * resulting in infinite "paths has no available balance slot: []" errors.
+ *
+ * Root cause: In LoadStatisticForTag.init(), the beByTotalReplicaCount map
+ * for each medium includes ALL available BEs without checking hasMedium().
+ * Similarly, TabletInvertedIndex.buildPartitionInfoBySkew() includes all
+ * availableBeIds in countMap without medium filtering. This causes the
+ * greedy algorithm to generate moves targeting BEs that lack the required
+ * storage medium.
+ *
+ * Setup:
+ *   - 3 initial BEs with SSD + HDD disks
+ *   - Table created with storage_medium = SSD (explicitly specified)
+ *   - Add 1 new BE with HDD only (via addBackend with custom beDisks)
+ *   - PartitionRebalancer generates invalid moves to the HDD-only BE
+ */
+suite('test_partition_rebalancer_medium_mismatch', 'docker') {
+    if (isCloudMode()) {
+        return
+    }
+
+    def options = new ClusterOptions()
+    options.feConfigs += [
+        'tablet_rebalancer_type=Partition',
+        'schedule_slot_num_per_hdd_path=8',
+        'balance_slot_num_per_path=2',
+        'disable_balance=false',
+        'disable_disk_balance=true',
+        'tablet_checker_interval_ms=2000',
+        'schedule_batch_size=1000',
+    ]
+    options.beConfigs += [
+        'report_disk_state_interval_seconds=2',
+        'report_tablet_interval_seconds=3',
+    ]
+    // Initial 3 BEs: each has 1 SSD + 1 HDD
+    options.beDisks = ['SSD=1', 'HDD=1']
+    options.beNum = 3
+
+    docker(options) {
+        // Step 1: Create table explicitly with SSD medium
+        def table = 'tbl_ssd_balance'
+        sql "DROP TABLE IF EXISTS ${table} FORCE"
+        sql """
+            CREATE TABLE ${table} (
+                k1 INT,
+                k2 VARCHAR(100),
+                v1 INT
+            )
+            DISTRIBUTED BY HASH(k1) BUCKETS 10
+            PROPERTIES (
+                'replication_num' = '1',
+                'storage_medium' = 'SSD'
+            )
+        """
+
+        // Verify partition medium is SSD
+        def partitions = sql_return_maparray "SHOW PARTITIONS FROM ${table}"
+        assertTrue(partitions.size() > 0)
+        partitions.each {
+            assertEquals('SSD', it.StorageMedium)
+        }
+        log.info("Table created with SSD medium, partitions: 
${partitions.size()}")
+
+        // Step 2: Insert data to distribute tablets across existing BEs
+        for (int i = 0; i < 100; i++) {
+            sql "INSERT INTO ${table} VALUES (${i}, 'value_${i}', ${i * 10})"
+        }
+
+        def count = sql "SELECT COUNT(*) FROM ${table}"
+        assertEquals(100, count[0][0] as int)
+
+        // Record tablet distribution before expansion
+        def tabletsBefore = sql_return_maparray "SHOW TABLETS FROM ${table}"
+        log.info("Tablets before expansion: ${tabletsBefore.size()}")
+        def beIdsBefore = tabletsBefore.collect { it.BackendId }.unique()
+        log.info("Tablets on BEs: ${beIdsBefore}")
+
+        // Let scheduler settle
+        sleep(10000)
+
+        // Step 3: Add a new BE with HDD only (different disk config from 
initial BEs)
+        log.info("Adding new BE with HDD-only disks...")
+        def newBeIndices = cluster.addBackend(1, ['HDD=1'])
+        log.info("New BE added with indices: ${newBeIndices}")
+
+        // Wait for new BE heartbeat and disk report
+        sleep(8000)
+
+        // Verify all backends
+        def backends = sql_return_maparray "SHOW BACKENDS"
+        log.info("Total backends after expansion: ${backends.size()}")
+        assertEquals(4, backends.size())
+
+        // Find the new BE
+        def newBeId = null
+        for (def be : backends) {
+            if (!(be.BackendId in beIdsBefore.collect { it as String })) {
+                newBeId = be.BackendId
+                break
+            }
+        }
+        assertNotNull(newBeId, "Should find new BE")
+        log.info("New BE id: ${newBeId}")
+
+        // Verify new BE has only HDD
+        def newBeDisks = sql_return_maparray "SHOW PROC '/backends/${newBeId}'"
+        log.info("New BE disks: ${newBeDisks}")
+        def hasSSD = newBeDisks.any { it.StorageMedium == 'SSD' }
+        def hasHDD = newBeDisks.any { it.StorageMedium == 'HDD' }
+        assertTrue(hasHDD, "New BE should have HDD disk")
+        assertFalse(hasSSD, "New BE should NOT have SSD disk")
+
+        // Step 4: Wait for PartitionRebalancer to attempt balance scheduling
+        // The bug: algorithm generates moves targeting the HDD-only BE for 
SSD tablets
+        log.info("Waiting for PartitionRebalancer to run (60s)...")
+        sleep(60000)
+
+        // Step 5: Check balance history for the bug signature
+        def schedHistory = sql_return_maparray "SHOW PROC 
'/cluster_balance/history_tablets'"
+        def failedWithEmptySlot = schedHistory.findAll {
+            it.ErrMsg != null && it.ErrMsg.contains('paths has no available 
balance slot: []')
+        }
+
+        log.info("Total history entries: ${schedHistory.size()}")
+        log.info("Entries with 'empty slot' error: 
${failedWithEmptySlot.size()}")
+
+        if (failedWithEmptySlot.size() > 0) {
+            log.warn("BUG REPRODUCED (OPENSOURCE-192)! " +
+                     "Found ${failedWithEmptySlot.size()} balance tasks " +
+                     "failed with 'paths has no available balance slot: []'")
+            failedWithEmptySlot.take(5).each { task ->
+                log.warn("  tablet=${task.TabletId}, dest=${task.DestBe}, 
err=${task.ErrMsg}")
+            }
+            // This assertion will fail when the bug is present, and pass 
after fix
+            fail("BUG: PartitionRebalancer generated invalid moves to HDD-only 
BE for SSD tablets")
+        } else {
+            log.info("No 'empty slot' failures. Bug not triggered or already 
fixed.")
+        }
+
+        // Step 6: Check that no tablets moved to the new BE
+        // (since it has no SSD, SSD tablets should NOT be relocated there)
+        def tabletsAfter = sql_return_maparray "SHOW TABLETS FROM ${table}"
+        def tabletsOnNewBe = tabletsAfter.findAll { it.BackendId == newBeId }
+        log.info("Tablets on new HDD-only BE: ${tabletsOnNewBe.size()}")
+        assertEquals(0, tabletsOnNewBe.size())
+
+        // Step 7: Verify data integrity
+        def countAfter = sql "SELECT COUNT(*) FROM ${table}"
+        assertEquals(100, countAfter[0][0] as int)
+
+        // Cleanup
+        sql "DROP TABLE IF EXISTS ${table} FORCE"
+    }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to