Skip to content

Commit df053cc

Browse files
authored
Manager balancer fixes (apache#5070)
Modified the Manager balancer code so that the tservers for the ROOT and METADATA DataLevels are recalculated on each loop to account for any change in available tablet servers, and so that any migrations the balancer emits for tablets outside of the current DataLevel are ignored.
1 parent 70e73bc commit df053cc

File tree

1 file changed

+31
-7
lines changed
  • server/manager/src/main/java/org/apache/accumulo/manager

1 file changed

+31
-7
lines changed

server/manager/src/main/java/org/apache/accumulo/manager/Manager.java

+31-7
Original file line numberDiff line numberDiff line change
@@ -1045,7 +1045,7 @@ private long balanceTablets() {
10451045
}
10461046
// Create a view of the tserver status such that it only contains the tables
10471047
// for this level in the tableMap.
1048-
final SortedMap<TServerInstance,TabletServerStatus> tserverStatusForLevel =
1048+
SortedMap<TServerInstance,TabletServerStatus> tserverStatusForLevel =
10491049
createTServerStatusView(dl, tserverStatus);
10501050
// Construct the Thrift variant of the map above for the BalancerParams
10511051
final SortedMap<TabletServerId,TServerStatus> tserverStatusForBalancerLevel =
@@ -1057,17 +1057,36 @@ private long balanceTablets() {
10571057
int attemptNum = 0;
10581058
do {
10591059
log.debug("Balancing for tables at level {}, times-in-loop: {}", dl, ++attemptNum);
1060-
params = BalanceParamsImpl.fromThrift(tserverStatusForBalancerLevel,
1061-
tserverStatusForLevel, partitionedMigrations.get(dl));
1060+
1061+
SortedMap<TabletServerId,TServerStatus> statusForBalancerLevel =
1062+
tserverStatusForBalancerLevel;
1063+
if (attemptNum > 1 && (dl == DataLevel.ROOT || dl == DataLevel.METADATA)) {
1064+
// If we are still migrating then perform a re-check on the tablet
1065+
// servers to make sure none of them have failed.
1066+
Set<TServerInstance> currentServers = tserverSet.getCurrentServers();
1067+
tserverStatus = gatherTableInformation(currentServers);
1068+
// Create a view of the tserver status such that it only contains the tables
1069+
// for this level in the tableMap.
1070+
tserverStatusForLevel = createTServerStatusView(dl, tserverStatus);
1071+
final SortedMap<TabletServerId,TServerStatus> tserverStatusForBalancerLevel2 =
1072+
new TreeMap<>();
1073+
tserverStatusForLevel.forEach((tsi, status) -> tserverStatusForBalancerLevel2
1074+
.put(new TabletServerIdImpl(tsi), TServerStatusImpl.fromThrift(status)));
1075+
statusForBalancerLevel = tserverStatusForBalancerLevel2;
1076+
}
1077+
1078+
params = BalanceParamsImpl.fromThrift(statusForBalancerLevel, tserverStatusForLevel,
1079+
partitionedMigrations.get(dl));
10621080
wait = Math.max(tabletBalancer.balance(params), wait);
1063-
migrationsOutForLevel = params.migrationsOut().size();
1064-
for (TabletMigration m : checkMigrationSanity(tserverStatusForBalancerLevel.keySet(),
1065-
params.migrationsOut())) {
1081+
migrationsOutForLevel = 0;
1082+
for (TabletMigration m : checkMigrationSanity(statusForBalancerLevel.keySet(),
1083+
params.migrationsOut(), dl)) {
10661084
final KeyExtent ke = KeyExtent.fromTabletId(m.getTablet());
10671085
if (migrations.containsKey(ke)) {
10681086
log.warn("balancer requested migration more than once, skipping {}", m);
10691087
continue;
10701088
}
1089+
migrationsOutForLevel++;
10711090
migrations.put(ke, TabletServerIdImpl.toThrift(m.getNewTabletServer()));
10721091
log.debug("migration {}", m);
10731092
}
@@ -1091,11 +1110,16 @@ private long balanceTablets() {
10911110
}
10921111

10931112
private List<TabletMigration> checkMigrationSanity(Set<TabletServerId> current,
1094-
List<TabletMigration> migrations) {
1113+
List<TabletMigration> migrations, DataLevel level) {
10951114
return migrations.stream().filter(m -> {
10961115
boolean includeMigration = false;
10971116
if (m.getTablet() == null) {
10981117
log.error("Balancer gave back a null tablet {}", m);
1118+
} else if (DataLevel.of(m.getTablet().getTable()) != level) {
1119+
log.trace(
1120+
"Balancer wants to move a tablet ({}) outside of the current processing level ({}), "
1121+
+ "ignoring and should be processed at the correct level ({})",
1122+
m.getTablet(), level, DataLevel.of(m.getTablet().getTable()));
10991123
} else if (m.getNewTabletServer() == null) {
11001124
log.error("Balancer did not set the destination {}", m);
11011125
} else if (m.getOldTabletServer() == null) {

0 commit comments

Comments
 (0)