Skip to content

Commit f8fb872

Browse files
Enable graceful shutdown of server processes (apache#5193)
Added RPC to the server processes to initiate a graceful shutdown. A new admin command has been created for a user to send this command to a server process. When the server process receives the command to shutdown gracefully it will attempt an orderly shutdown. Additional requests to shutdown gracefully will be ignored. Compactors will finish the major compaction that they are currently working on, then will exit. Scan Servers will return a busy signal to all clients for new scan requests, and will shutdown when all in-progress scans have closed. Tablet Servers will signal the Manager that they are shutting down, which should prevent assignment of tablets to that server, then they will unload all tablets and shut down. The Monitor, Manager, GarbageCollector, and CompactionCoordinator will shut down also. The last step in the shutdown process for all servers is to remove their lock in ZooKeeper. --------- Co-authored-by: Dom G. <domgarguilo@apache.org>
1 parent c3df72e commit f8fb872

File tree

40 files changed

+3145
-410
lines changed

40 files changed

+3145
-410
lines changed

core/src/main/java/org/apache/accumulo/core/fate/zookeeper/ServiceLockSupport.java

+20-10
Original file line numberDiff line numberDiff line change
@@ -41,16 +41,23 @@ public static class HAServiceLockWatcher implements AccumuloLockWatcher {
4141
private static final Logger LOG = LoggerFactory.getLogger(HAServiceLockWatcher.class);
4242

4343
private final String serviceName;
44+
private final Supplier<Boolean> shutdownComplete;
4445
private volatile boolean acquiredLock = false;
4546
private volatile boolean failedToAcquireLock = false;
4647

47-
public HAServiceLockWatcher(String serviceName) {
48+
public HAServiceLockWatcher(String serviceName, Supplier<Boolean> shutdownComplete) {
4849
this.serviceName = serviceName;
50+
this.shutdownComplete = shutdownComplete;
4951
}
5052

5153
@Override
5254
public void lostLock(LockLossReason reason) {
53-
Halt.halt(serviceName + " lock in zookeeper lost (reason = " + reason + "), exiting!", -1);
55+
if (shutdownComplete.get()) {
56+
LOG.warn("{} lost lock (reason = {}), not halting because shutdown is complete.",
57+
serviceName, reason);
58+
} else {
59+
Halt.halt(serviceName + " lock in zookeeper lost (reason = " + reason + "), exiting!", -1);
60+
}
5461
}
5562

5663
@Override
@@ -122,24 +129,27 @@ public static class ServiceLockWatcher implements LockWatcher {
122129
private static final Logger LOG = LoggerFactory.getLogger(ServiceLockWatcher.class);
123130

124131
private final String serviceName;
125-
private final Supplier<Boolean> shuttingDown;
132+
private final Supplier<Boolean> shutdownComplete;
126133
private final Consumer<String> lostLockAction;
127134

128-
public ServiceLockWatcher(String serviceName, Supplier<Boolean> shuttingDown,
135+
public ServiceLockWatcher(String serviceName, Supplier<Boolean> shutdownComplete,
129136
Consumer<String> lostLockAction) {
130137
this.serviceName = serviceName;
131-
this.shuttingDown = shuttingDown;
138+
this.shutdownComplete = shutdownComplete;
132139
this.lostLockAction = lostLockAction;
133140
}
134141

135142
@Override
136143
public void lostLock(final LockLossReason reason) {
137-
Halt.halt(1, () -> {
138-
if (!shuttingDown.get()) {
144+
if (shutdownComplete.get()) {
145+
LOG.warn("{} lost lock (reason = {}), not halting because shutdown is complete.",
146+
serviceName, reason);
147+
} else {
148+
Halt.halt(1, () -> {
139149
LOG.error("{} lost lock (reason = {}), exiting.", serviceName, reason);
140-
}
141-
lostLockAction.accept(serviceName);
142-
});
150+
lostLockAction.accept(serviceName);
151+
});
152+
}
143153
}
144154

145155
@Override
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* https://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.accumulo.core.rpc.clients;
20+
21+
import java.io.UncheckedIOException;
22+
import java.net.UnknownHostException;
23+
24+
import org.apache.accumulo.core.clientImpl.ClientContext;
25+
import org.apache.accumulo.core.process.thrift.ServerProcessService.Client;
26+
import org.apache.accumulo.core.rpc.ThriftUtil;
27+
import org.apache.accumulo.core.util.HostAndPort;
28+
import org.apache.thrift.transport.TTransportException;
29+
import org.slf4j.Logger;
30+
31+
public class ServerProcessServiceThriftClient extends ThriftClientTypes<Client> {
32+
33+
protected ServerProcessServiceThriftClient(String serviceName) {
34+
super(serviceName, new Client.Factory());
35+
}
36+
37+
public Client getServerProcessConnection(ClientContext context, Logger log, String hostname,
38+
int port) {
39+
HostAndPort serverProcess = HostAndPort.fromParts(hostname, port);
40+
try {
41+
// Manager requests can take a long time: don't ever time out
42+
return ThriftUtil.getClientNoTimeout(this, serverProcess, context);
43+
} catch (TTransportException tte) {
44+
Throwable cause = tte.getCause();
45+
if (cause instanceof UnknownHostException) {
46+
// do not expect to recover from this
47+
throw new UncheckedIOException((UnknownHostException) cause);
48+
}
49+
log.debug("Failed to connect to process at " + serverProcess + ", will retry... ", tte);
50+
return null;
51+
}
52+
53+
}
54+
}

core/src/main/java/org/apache/accumulo/core/rpc/clients/ThriftClientTypes.java

+3
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ public abstract class ThriftClientTypes<C extends TServiceClient> {
5858
public static final TabletScanClientServiceThriftClient TABLET_SCAN =
5959
new TabletScanClientServiceThriftClient("scan");
6060

61+
public static final ServerProcessServiceThriftClient SERVER_PROCESS =
62+
new ServerProcessServiceThriftClient("process");
63+
6164
/**
6265
* execute method with supplied client returning object of type R
6366
*

core/src/main/java/org/apache/accumulo/core/util/threads/ThreadPoolNames.java

+1
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ public enum ThreadPoolNames {
5959
TSERVER_COMPACTION_MINOR_POOL("accumulo.pool.tserver.compaction.minor"),
6060
TSERVER_MIGRATIONS_POOL("accumulo.pool.tserver.migrations"),
6161
TSERVER_MINOR_COMPACTOR_POOL("accumulo.pool.tserver.minor.compactor"),
62+
TSERVER_SHUTDOWN_UNLOAD_TABLET_POOL("accumulo.pool.tserver.shutdown.tablet.unload"),
6263
TSERVER_SUMMARY_FILE_RETRIEVER_POOL("accumulo.pool.tserver.summary.file.retriever.pool"),
6364
TSERVER_SUMMARY_PARTITION_POOL("accumulo.pool.tserver.summary.partition"),
6465
TSERVER_SUMMARY_REMOTE_POOL("accumulo.pool.tserver.summary.remote"),

core/src/main/scripts/generate-thrift.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
[[ -z $REQUIRED_THRIFT_VERSION ]] && REQUIRED_THRIFT_VERSION='0.17.0'
3333
[[ -z $INCLUDED_MODULES ]] && INCLUDED_MODULES=()
3434
[[ -z $BASE_OUTPUT_PACKAGE ]] && BASE_OUTPUT_PACKAGE='org.apache.accumulo.core'
35-
[[ -z $PACKAGES_TO_GENERATE ]] && PACKAGES_TO_GENERATE=(gc master manager tabletserver securityImpl clientImpl dataImpl replication trace compaction)
35+
[[ -z $PACKAGES_TO_GENERATE ]] && PACKAGES_TO_GENERATE=(process gc master manager tabletserver securityImpl clientImpl dataImpl replication trace compaction)
3636
[[ -z $BUILD_DIR ]] && BUILD_DIR='target'
3737
[[ -z $LANGUAGES_TO_GENERATE ]] && LANGUAGES_TO_GENERATE=(java)
3838
[[ -z $FINAL_DIR ]] && FINAL_DIR='src/main'

core/src/main/spotbugs/exclude-filter.xml

+1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
<Package name="org.apache.accumulo.core.gc.thrift" />
3333
<Package name="org.apache.accumulo.core.manager.thrift" />
3434
<Package name="org.apache.accumulo.core.master.thrift" />
35+
<Package name="org.apache.accumulo.core.process.thrift" />
3536
<Package name="org.apache.accumulo.core.replication.thrift" />
3637
<Package name="org.apache.accumulo.core.securityImpl.thrift" />
3738
<Package name="org.apache.accumulo.core.tabletserver.thrift" />

0 commit comments

Comments
 (0)