This is an automated email from the ASF dual-hosted git repository.
dlmarion pushed a commit to branch 2.1
in repository https://gitbox.apache.org/repos/asf/accumulo.git
The following commit(s) were added to refs/heads/2.1 by this push:
new df053cc63e Manager balancer fixes (#5070)
df053cc63e is described below
commit df053cc63e6e998c1ca264d5078cbd2337759356
Author: Dave Marion <[email protected]>
AuthorDate: Tue Dec 3 12:50:26 2024 -0500
Manager balancer fixes (#5070)
Modified the Manager balancer code so that the tablet server status
for the ROOT and METADATA DataLevels is recalculated on each repeat
pass through the balancing loop, to account for any change in the
available tablet servers, and so that any migrations the balancer
emits for tablets outside of the current DataLevel are ignored.
---
.../java/org/apache/accumulo/manager/Manager.java | 38 ++++++++++++++++++----
1 file changed, 31 insertions(+), 7 deletions(-)
diff --git
a/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java
b/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java
index 9aca12d7ab..44800d5833 100644
--- a/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java
+++ b/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java
@@ -1045,7 +1045,7 @@ public class Manager extends AbstractServer
}
// Create a view of the tserver status such that it only contains the
tables
// for this level in the tableMap.
- final SortedMap<TServerInstance,TabletServerStatus>
tserverStatusForLevel =
+ SortedMap<TServerInstance,TabletServerStatus> tserverStatusForLevel =
createTServerStatusView(dl, tserverStatus);
// Construct the Thrift variant of the map above for the BalancerParams
final SortedMap<TabletServerId,TServerStatus>
tserverStatusForBalancerLevel =
@@ -1057,17 +1057,36 @@ public class Manager extends AbstractServer
int attemptNum = 0;
do {
log.debug("Balancing for tables at level {}, times-in-loop: {}", dl,
++attemptNum);
- params = BalanceParamsImpl.fromThrift(tserverStatusForBalancerLevel,
- tserverStatusForLevel, partitionedMigrations.get(dl));
+
+ SortedMap<TabletServerId,TServerStatus> statusForBalancerLevel =
+ tserverStatusForBalancerLevel;
+ if (attemptNum > 1 && (dl == DataLevel.ROOT || dl ==
DataLevel.METADATA)) {
+ // If we are still migrating then perform a re-check on the tablet
+ // servers to make sure none of them have failed.
+ Set<TServerInstance> currentServers =
tserverSet.getCurrentServers();
+ tserverStatus = gatherTableInformation(currentServers);
+ // Create a view of the tserver status such that it only contains
the tables
+ // for this level in the tableMap.
+ tserverStatusForLevel = createTServerStatusView(dl, tserverStatus);
+ final SortedMap<TabletServerId,TServerStatus>
tserverStatusForBalancerLevel2 =
+ new TreeMap<>();
+ tserverStatusForLevel.forEach((tsi, status) ->
tserverStatusForBalancerLevel2
+ .put(new TabletServerIdImpl(tsi),
TServerStatusImpl.fromThrift(status)));
+ statusForBalancerLevel = tserverStatusForBalancerLevel2;
+ }
+
+ params = BalanceParamsImpl.fromThrift(statusForBalancerLevel,
tserverStatusForLevel,
+ partitionedMigrations.get(dl));
wait = Math.max(tabletBalancer.balance(params), wait);
- migrationsOutForLevel = params.migrationsOut().size();
- for (TabletMigration m :
checkMigrationSanity(tserverStatusForBalancerLevel.keySet(),
- params.migrationsOut())) {
+ migrationsOutForLevel = 0;
+ for (TabletMigration m :
checkMigrationSanity(statusForBalancerLevel.keySet(),
+ params.migrationsOut(), dl)) {
final KeyExtent ke = KeyExtent.fromTabletId(m.getTablet());
if (migrations.containsKey(ke)) {
log.warn("balancer requested migration more than once, skipping
{}", m);
continue;
}
+ migrationsOutForLevel++;
migrations.put(ke,
TabletServerIdImpl.toThrift(m.getNewTabletServer()));
log.debug("migration {}", m);
}
@@ -1091,11 +1110,16 @@ public class Manager extends AbstractServer
}
private List<TabletMigration> checkMigrationSanity(Set<TabletServerId>
current,
- List<TabletMigration> migrations) {
+ List<TabletMigration> migrations, DataLevel level) {
return migrations.stream().filter(m -> {
boolean includeMigration = false;
if (m.getTablet() == null) {
log.error("Balancer gave back a null tablet {}", m);
+ } else if (DataLevel.of(m.getTablet().getTable()) != level) {
+ log.trace(
+ "Balancer wants to move a tablet ({}) outside of the current
processing level ({}), "
+ + "ignoring and should be processed at the correct level
({})",
+ m.getTablet(), level, DataLevel.of(m.getTablet().getTable()));
} else if (m.getNewTabletServer() == null) {
log.error("Balancer did not set the destination {}", m);
} else if (m.getOldTabletServer() == null) {