Skip to content

Commit ce7705c

Browse files
dlmarion and ctubbsii
authored
Track state of upgrade to eliminate re-running upgrade code (#5357)
During an upgrade, the UpgradeCoordinator will run Upgraders for each version between the current version of the stored data and the current version of the installed software. The UpgradeCoordinator runs all of the necessary Upgraders on ZooKeeper, then runs all of them on the root table, and then finally runs all of them on the metadata table. This means that when upgrading through multiple versions, the version of the data stored in ZooKeeper could be multiple versions ahead of the root and metadata tables until the Upgraders are run on those tables. When a failure occurs during the upgrade, the system is left in a partially upgraded state. The user needs to find and fix any issues before trying to start the system again. The next time the Manager starts, it will attempt to do the upgrade from the very beginning. An Upgrader implementation for one version needs to be coded to handle the upgrade process running again, so it needs to take into account, for example, the fact that the ZooKeeper data has already been modified and skip that step. However, an Upgrader for one version can't account for the changes in a future version. This change creates a temporary object stored in ZooKeeper that is created when an upgrade starts and deleted when the upgrade finishes. It keeps track of the versions of ZooKeeper, the root table, and the metadata table as the Upgraders are run against those objects, as well as the overall target version of the upgrade. The UpgradeCoordinator has been modified to use this object so that it does not re-run Upgraders on those objects when they have already run successfully. This change also fixes the EasyMock usage in the AccumuloTest unit test for upgrades, a problem observed while adding the context to the UpgradeCoordinator constructor, and moves the PreUpgradeValidation inside the UpgradeCoordinator (since they are related responsibilities). Closes #5347 Co-authored-by: Christopher Tubbs <ctubbsii@apache.org>
1 parent 1abcad4 commit ce7705c

File tree

9 files changed

+657
-29
lines changed

9 files changed

+657
-29
lines changed

core/src/main/java/org/apache/accumulo/core/Constants.java

+2
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,8 @@ public class Constants {
8484
public static final String ZHDFS_RESERVATIONS = "/hdfs_reservations";
8585
public static final String ZRECOVERY = "/recovery";
8686

87+
public static final String ZUPGRADE_PROGRESS = "/upgrade_progress";
88+
8789
/**
8890
* Base znode for storing secret keys that back delegation tokens
8991
*/

server/manager/src/main/java/org/apache/accumulo/manager/Manager.java

+9-6
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,6 @@
116116
import org.apache.accumulo.manager.recovery.RecoveryManager;
117117
import org.apache.accumulo.manager.state.TableCounts;
118118
import org.apache.accumulo.manager.tableOps.TraceRepo;
119-
import org.apache.accumulo.manager.upgrade.PreUpgradeValidation;
120119
import org.apache.accumulo.manager.upgrade.UpgradeCoordinator;
121120
import org.apache.accumulo.server.AbstractServer;
122121
import org.apache.accumulo.server.HighlyAvailableService;
@@ -305,21 +304,22 @@ synchronized void setManagerState(final ManagerState newState) {
305304
break;
306305
case HAVE_LOCK:
307306
if (isUpgrading()) {
308-
new PreUpgradeValidation().validate(getContext(), nextEvent);
309-
upgradeCoordinator.upgradeZookeeper(getContext(), nextEvent);
307+
upgradeCoordinator.preUpgradeValidation();
308+
upgradeCoordinator.startOrContinueUpgrade();
309+
upgradeCoordinator.upgradeZookeeper(nextEvent);
310310
}
311311
break;
312312
case NORMAL:
313313
if (isUpgrading()) {
314-
upgradeMetadataFuture = upgradeCoordinator.upgradeMetadata(getContext(), nextEvent);
314+
upgradeMetadataFuture = upgradeCoordinator.upgradeMetadata(nextEvent);
315315
}
316316
break;
317317
default:
318318
break;
319319
}
320320
}
321321

322-
private final UpgradeCoordinator upgradeCoordinator = new UpgradeCoordinator();
322+
private final UpgradeCoordinator upgradeCoordinator;
323323

324324
private Future<Void> upgradeMetadataFuture;
325325

@@ -424,6 +424,7 @@ public static void main(String[] args) throws Exception {
424424
protected Manager(ConfigOpts opts, String[] args) throws IOException {
425425
super("manager", opts, args);
426426
ServerContext context = super.getContext();
427+
upgradeCoordinator = new UpgradeCoordinator(context);
427428
balancerEnvironment = new BalancerEnvironmentImpl(context);
428429

429430
AccumuloConfiguration aconf = context.getConfiguration();
@@ -1277,6 +1278,9 @@ public void run() {
12771278
} catch (KeeperException | InterruptedException e) {
12781279
throw new IllegalStateException("Exception getting manager lock", e);
12791280
}
1281+
// Setting the Manager state to HAVE_LOCK has the side-effect of
1282+
// starting the upgrade process if necessary.
1283+
setManagerState(ManagerState.HAVE_LOCK);
12801284

12811285
MetricsInfo metricsInfo = getContext().getMetricsInfo();
12821286

@@ -1632,7 +1636,6 @@ private ServiceLockData getManagerLock(final ServiceLockPath zManagerLoc)
16321636
sleepUninterruptibly(TIME_TO_WAIT_BETWEEN_LOCK_CHECKS, MILLISECONDS);
16331637
}
16341638

1635-
setManagerState(ManagerState.HAVE_LOCK);
16361639
return sld;
16371640
}
16381641

server/manager/src/main/java/org/apache/accumulo/manager/upgrade/PreUpgradeValidation.java

+5-5
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525

2626
import org.apache.accumulo.core.zookeeper.ZooSession;
2727
import org.apache.accumulo.core.zookeeper.ZooSession.ZKUtil;
28-
import org.apache.accumulo.manager.EventCoordinator;
2928
import org.apache.accumulo.server.AccumuloDataVersion;
3029
import org.apache.accumulo.server.ServerContext;
3130
import org.apache.zookeeper.KeeperException;
@@ -48,10 +47,11 @@ public class PreUpgradeValidation {
4847

4948
private final static Logger log = LoggerFactory.getLogger(PreUpgradeValidation.class);
5049

51-
public void validate(final ServerContext context, final EventCoordinator eventCoordinator) {
52-
int cv = AccumuloDataVersion.getCurrentVersion(context);
53-
if (cv == AccumuloDataVersion.get()) {
54-
log.debug("already at current data version: {}, skipping validation", cv);
50+
public void validate(final ServerContext context) {
51+
int storedVersion = AccumuloDataVersion.getCurrentVersion(context);
52+
int currentVersion = AccumuloDataVersion.get();
53+
if (storedVersion == currentVersion) {
54+
log.debug("already at current data version: {}, skipping validation", currentVersion);
5555
return;
5656
}
5757
validateACLs(context);

server/manager/src/main/java/org/apache/accumulo/manager/upgrade/UpgradeCoordinator.java

+50-9
Original file line numberDiff line numberDiff line change
@@ -132,12 +132,30 @@ public boolean isParentLevelUpgraded(KeyExtent extent) {
132132
Collections.unmodifiableMap(new TreeMap<>(Map.of(ROOT_TABLET_META_CHANGES,
133133
new Upgrader10to11(), REMOVE_DEPRECATIONS_FOR_VERSION_3, new Upgrader11to12())));
134134

135+
private final ServerContext context;
136+
private final UpgradeProgressTracker progressTracker;
137+
private final PreUpgradeValidation preUpgradeValidator;
138+
135139
private volatile UpgradeStatus status;
136140

137-
public UpgradeCoordinator() {
141+
/**
 * Creates a coordinator bound to the supplied server context.
 *
 * <p>
 * The context is retained and also handed to a new {@link UpgradeProgressTracker}; a fresh
 * {@link PreUpgradeValidation} is created, and the coordinator begins in
 * {@link UpgradeStatus#INITIAL}.
 *
 * @param context the server context used by this coordinator and its progress tracker
 */
public UpgradeCoordinator(ServerContext context) {
  this.context = context;
  this.progressTracker = new UpgradeProgressTracker(context);
  this.preUpgradeValidator = new PreUpgradeValidation();
  this.status = UpgradeStatus.INITIAL;
}
140147

148+
public void preUpgradeValidation() {
149+
preUpgradeValidator.validate(context);
150+
}
151+
152+
public void startOrContinueUpgrade() {
153+
// The following check will fail if an upgrade is in progress
154+
// but the target version is not the current version of the
155+
// software.
156+
progressTracker.startOrContinueUpgrade();
157+
}
158+
141159
private void setStatus(UpgradeStatus status, EventCoordinator eventCoordinator) {
142160
UpgradeStatus oldStatus = this.status;
143161
this.status = status;
@@ -155,8 +173,7 @@ private void handleFailure(Exception e) {
155173
System.exit(1);
156174
}
157175

158-
public synchronized void upgradeZookeeper(ServerContext context,
159-
EventCoordinator eventCoordinator) {
176+
public synchronized void upgradeZookeeper(EventCoordinator eventCoordinator) {
160177

161178
Preconditions.checkState(status == UpgradeStatus.INITIAL,
162179
"Not currently in a suitable state to do zookeeper upgrade %s", status);
@@ -171,15 +188,24 @@ public synchronized void upgradeZookeeper(ServerContext context,
171188
}
172189

173190
if (currentVersion < AccumuloDataVersion.get()) {
174-
abortIfFateTransactions(context);
191+
abortIfFateTransactions();
192+
193+
final UpgradeProgress progress = progressTracker.getProgress();
175194

176195
for (int v = currentVersion; v < AccumuloDataVersion.get(); v++) {
196+
if (progress.getZooKeeperVersion() >= currentVersion) {
197+
log.info(
198+
"ZooKeeper has already been upgraded to version {}, moving on to next upgrader",
199+
currentVersion);
200+
continue;
201+
}
177202
log.info("Upgrading Zookeeper - current version {} as step towards target version {}", v,
178203
AccumuloDataVersion.get());
179204
var upgrader = upgraders.get(v);
180205
Objects.requireNonNull(upgrader,
181206
"upgrade ZooKeeper: failed to find upgrader for version " + currentVersion);
182207
upgrader.upgradeZookeeper(context);
208+
progressTracker.updateZooKeeperVersion(v);
183209
}
184210
}
185211

@@ -190,8 +216,7 @@ public synchronized void upgradeZookeeper(ServerContext context,
190216

191217
}
192218

193-
public synchronized Future<Void> upgradeMetadata(ServerContext context,
194-
EventCoordinator eventCoordinator) {
219+
public synchronized Future<Void> upgradeMetadata(EventCoordinator eventCoordinator) {
195220
if (status == UpgradeStatus.COMPLETE) {
196221
return CompletableFuture.completedFuture(null);
197222
}
@@ -205,35 +230,51 @@ public synchronized Future<Void> upgradeMetadata(ServerContext context,
205230
.numMaxThreads(Integer.MAX_VALUE).withTimeOut(60L, SECONDS)
206231
.withQueue(new SynchronousQueue<>()).build().submit(() -> {
207232
try {
233+
UpgradeProgress progress = progressTracker.getProgress();
208234
for (int v = currentVersion; v < AccumuloDataVersion.get(); v++) {
235+
if (progress.getRootVersion() >= currentVersion) {
236+
log.info(
237+
"Root table has already been upgraded to version {}, moving on to next upgrader",
238+
currentVersion);
239+
continue;
240+
}
209241
log.info("Upgrading Root - current version {} as step towards target version {}", v,
210242
AccumuloDataVersion.get());
211243
var upgrader = upgraders.get(v);
212244
Objects.requireNonNull(upgrader,
213245
"upgrade root: failed to find root upgrader for version " + currentVersion);
214246
upgraders.get(v).upgradeRoot(context);
247+
progressTracker.updateRootVersion(v);
215248
}
216249
setStatus(UpgradeStatus.UPGRADED_ROOT, eventCoordinator);
217250

218251
for (int v = currentVersion; v < AccumuloDataVersion.get(); v++) {
252+
if (progress.getMetadataVersion() >= currentVersion) {
253+
log.info(
254+
"Metadata table has already been upgraded to version {}, moving on to next upgrader",
255+
currentVersion);
256+
continue;
257+
}
219258
log.info(
220259
"Upgrading Metadata - current version {} as step towards target version {}", v,
221260
AccumuloDataVersion.get());
222261
var upgrader = upgraders.get(v);
223262
Objects.requireNonNull(upgrader,
224263
"upgrade metadata: failed to find upgrader for version " + currentVersion);
225264
upgraders.get(v).upgradeMetadata(context);
265+
progressTracker.updateMetadataVersion(v);
226266
}
227267
setStatus(UpgradeStatus.UPGRADED_METADATA, eventCoordinator);
228268

229269
log.info("Validating configuration properties.");
230-
validateProperties(context);
270+
validateProperties();
231271

232272
log.info("Updating persistent data version.");
233273
updateAccumuloVersion(context.getServerDirs(), context.getVolumeManager(),
234274
currentVersion);
235275
log.info("Upgrade complete");
236276
setStatus(UpgradeStatus.COMPLETE, eventCoordinator);
277+
progressTracker.upgradeComplete();
237278
} catch (Exception e) {
238279
handleFailure(e);
239280
}
@@ -244,7 +285,7 @@ public synchronized Future<Void> upgradeMetadata(ServerContext context,
244285
}
245286
}
246287

247-
private void validateProperties(ServerContext context) {
288+
private void validateProperties() {
248289
ConfigCheckUtil.validate(context.getSiteConfiguration(), "site configuration");
249290
ConfigCheckUtil.validate(context.getConfiguration(), "system configuration");
250291
try {
@@ -311,7 +352,7 @@ public UpgradeStatus getStatus() {
311352
*/
312353
@SuppressFBWarnings(value = "DM_EXIT",
313354
justification = "Want to immediately stop all manager threads on upgrade error")
314-
private void abortIfFateTransactions(ServerContext context) {
355+
private void abortIfFateTransactions() {
315356
try {
316357
final ReadOnlyTStore<UpgradeCoordinator> fate =
317358
new ZooStore<>(context.getZooKeeperRoot() + Constants.ZFATE, context.getZooSession());
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* https://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.accumulo.manager.upgrade;
20+
21+
import static java.nio.charset.StandardCharsets.UTF_8;
22+
import static org.apache.accumulo.core.util.LazySingletons.GSON;
23+
24+
/**
25+
* Track upgrade progress for each component. The version stored is the most recent version for
26+
* which an upgrade has been completed.
27+
*/
28+
public class UpgradeProgress {
29+
30+
private int zooKeeperVersion;
31+
private int rootVersion;
32+
private int metadataVersion;
33+
private int upgradeTargetVersion;
34+
35+
public UpgradeProgress() {}
36+
37+
public UpgradeProgress(int currentVersion, int targetVersion) {
38+
zooKeeperVersion = currentVersion;
39+
rootVersion = currentVersion;
40+
metadataVersion = currentVersion;
41+
upgradeTargetVersion = targetVersion;
42+
}
43+
44+
public void setZooKeeperVersion(int version) {
45+
zooKeeperVersion = version;
46+
}
47+
48+
public int getZooKeeperVersion() {
49+
return zooKeeperVersion;
50+
}
51+
52+
public void setRootVersion(int version) {
53+
rootVersion = version;
54+
}
55+
56+
public int getRootVersion() {
57+
return rootVersion;
58+
}
59+
60+
public void setMetadataVersion(int version) {
61+
metadataVersion = version;
62+
}
63+
64+
public int getMetadataVersion() {
65+
return metadataVersion;
66+
}
67+
68+
public int getUpgradeTargetVersion() {
69+
return upgradeTargetVersion;
70+
}
71+
72+
public byte[] toJsonBytes() {
73+
return GSON.get().toJson(this).getBytes(UTF_8);
74+
}
75+
76+
public static UpgradeProgress fromJsonBytes(byte[] jsonData) {
77+
return GSON.get().fromJson(new String(jsonData, UTF_8), UpgradeProgress.class);
78+
}
79+
80+
}

0 commit comments

Comments
 (0)