Skip to content

Commit 80b8fbc

Browse files
authored
Added calls to ZooZap when shutting down groups of servers (apache#5321)
Modified accumulo-cluster to wait for all spawned ssh commands to finish and then call ZooZap when shutting down all Compactor, ScanServer, and TabletServer processes for a resource group. Modified ZooZap so that the code is consistent for Compactors, ScanServers, and TabletServers such that for those server types it performs a recursive delete at the resource group in the ZK path. Closes apache#5178
1 parent eed112b commit 80b8fbc

File tree

2 files changed

+53
-34
lines changed

2 files changed

+53
-34
lines changed

assemble/bin/accumulo-cluster

+36-9
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,17 @@ function parse_config() {
360360

361361
}
362362

363+
# Blocks until no process listed by `ps -ef` has a line containing both "ssh"
# and "accumulo-service", polling every two seconds and reporting how many
# are still outstanding. Callers use this to wait for remotely executed
# service commands to finish before cleaning up entries in ZooKeeper.
#
# Always returns 0 once nothing is left to wait on.
function ssh_wait() {
  # Keep the counter out of the script's global scope.
  local count
  while true; do
    # grep -c prints 0 but exits 1 when nothing matches; "|| true" keeps that
    # expected outcome from surfacing as a failure status.
    #shellcheck disable=SC2009
    count=$(ps -ef | grep ssh | grep -c "accumulo-service" || true)
    if ((count <= 0)); then
      break
    fi
    echo "waiting on $count ssh commands to complete"
    sleep 2
  done
  return 0
}
373+
363374
function execute_command() {
364375
control_cmd=$1
365376
host=$2
@@ -431,12 +442,9 @@ function control_services() {
431442
for group in $tserver_groups; do
432443
G="TSERVER_HOSTS_$group"
433444
for tserver in ${!G}; do
445+
debug "Stopping tservers on $addr via admin command"
434446
if echo "$tserver" | grep -q "$addr"; then
435-
if ! isDebug; then
436-
"$accumulo_cmd" admin stop "$addr"
437-
else
438-
debug "Stopping tservers on $addr via admin command"
439-
fi
447+
debugOrRun "$accumulo_cmd" admin stop "$addr"
440448
fi
441449
done
442450
done
@@ -471,6 +479,13 @@ function control_services() {
471479
fi
472480
fi
473481
done
482+
if [[ $ARG_LOCAL == 0 && ($operation == "stop" || $operation == "kill") ]]; then
483+
# If the prior commands were executed via ssh, then we need to wait for them
484+
# to complete before zapping the nodes in ZooKeeper
485+
ssh_wait
486+
echo "Cleaning tablet server entries from zookeeper for resource group $group"
487+
debugOrRun "$accumulo_cmd" org.apache.accumulo.server.util.ZooZap -verbose -tservers -group "$group"
488+
fi
474489
echo "done"
475490
done
476491
fi
@@ -520,6 +535,13 @@ function control_services() {
520535
execute_command "$operation" "$sserver" sserver "$group" "-o" "sserver.group=$group"
521536
fi
522537
done
538+
if [[ $ARG_LOCAL == 0 && ($operation == "stop" || $operation == "kill") ]]; then
539+
# If the prior commands were executed via ssh, then we need to wait for them
540+
# to complete before zapping the nodes in ZooKeeper
541+
ssh_wait
542+
echo "Cleaning scan server entries from zookeeper for resource group $group"
543+
debugOrRun "$accumulo_cmd" org.apache.accumulo.server.util.ZooZap -verbose -sservers -group "$group"
544+
fi
523545
done
524546
fi
525547

@@ -538,14 +560,19 @@ function control_services() {
538560
execute_command "$operation" "$compactor" compactor "$group" "-o" "compactor.group=$group"
539561
fi
540562
done
563+
if [[ $ARG_LOCAL == 0 && ($operation == "stop" || $operation == "kill") ]]; then
564+
# If the prior commands were executed via ssh, then we need to wait for them
565+
# to complete before zapping the nodes in ZooKeeper
566+
ssh_wait
567+
echo "Cleaning compactor entries from zookeeper for resource group $group"
568+
debugOrRun "$accumulo_cmd" org.apache.accumulo.server.util.ZooZap -verbose -compactors -group "$group"
569+
fi
541570
done
542571
fi
543572

544573
if [[ $ARG_LOCAL == 0 && $ARG_ALL == 1 && ($operation == "stop" || $operation == "kill") ]]; then
545-
if ! isDebug; then
546-
echo "Cleaning all server entries in ZooKeeper"
547-
"$accumulo_cmd" org.apache.accumulo.server.util.ZooZap -manager -tservers -compactors -sservers
548-
fi
574+
debug "Cleaning all server entries in ZooKeeper"
575+
debugOrRun "$accumulo_cmd" org.apache.accumulo.server.util.ZooZap -verbose -manager -tservers -compactors -sservers
549576
fi
550577

551578
}

server/base/src/main/java/org/apache/accumulo/server/util/ZooZap.java

+17-25
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
import org.apache.accumulo.core.conf.SiteConfiguration;
2828
import org.apache.accumulo.core.fate.zookeeper.ZooReaderWriter;
2929
import org.apache.accumulo.core.fate.zookeeper.ZooUtil.NodeMissingPolicy;
30-
import org.apache.accumulo.core.lock.ServiceLock;
3130
import org.apache.accumulo.core.lock.ServiceLockPaths.AddressSelector;
3231
import org.apache.accumulo.core.lock.ServiceLockPaths.ResourceGroupPredicate;
3332
import org.apache.accumulo.core.lock.ServiceLockPaths.ServiceLockPath;
@@ -103,7 +102,7 @@ public void zap(SiteConfiguration siteConf, String... args) {
103102
Opts opts = new Opts();
104103
opts.parseArgs(keyword(), args);
105104

106-
if (!opts.zapManager && !opts.zapTservers) {
105+
if (!opts.zapManager && !opts.zapTservers && !opts.zapCompactors && !opts.zapScanServers) {
107106
new JCommander(opts).usage();
108107
return;
109108
}
@@ -136,21 +135,12 @@ public void zap(SiteConfiguration siteConf, String... args) {
136135
try {
137136
Set<ServiceLockPath> tserverLockPaths =
138137
context.getServerPaths().getTabletServer(rgp, AddressSelector.all(), false);
139-
for (ServiceLockPath tserverPath : tserverLockPaths) {
140-
141-
message("Deleting " + tserverPath + " from zookeeper", opts);
142-
143-
if (opts.zapManager) {
144-
zrw.recursiveDelete(tserverPath.toString(), NodeMissingPolicy.SKIP);
145-
} else {
146-
if (!zrw.getChildren(tserverPath.toString()).isEmpty()) {
147-
try {
148-
ServiceLock.deleteLock(zrw, tserverPath);
149-
} catch (RuntimeException e) {
150-
message("Did not delete " + tserverPath, opts);
151-
}
152-
}
153-
}
138+
Set<String> tserverResourceGroupPaths = new HashSet<>();
139+
tserverLockPaths.forEach(p -> tserverResourceGroupPaths
140+
.add(p.toString().substring(0, p.toString().lastIndexOf('/'))));
141+
for (String group : tserverResourceGroupPaths) {
142+
message("Deleting tserver " + group + " from zookeeper", opts);
143+
zrw.recursiveDelete(group.toString(), NodeMissingPolicy.SKIP);
154144
}
155145
} catch (KeeperException | InterruptedException e) {
156146
log.error("{}", e.getMessage(), e);
@@ -165,7 +155,7 @@ public void zap(SiteConfiguration siteConf, String... args) {
165155
.add(p.toString().substring(0, p.toString().lastIndexOf('/'))));
166156
try {
167157
for (String group : compactorResourceGroupPaths) {
168-
message("Deleting " + group + " from zookeeper", opts);
158+
message("Deleting compactor " + group + " from zookeeper", opts);
169159
zrw.recursiveDelete(group, NodeMissingPolicy.SKIP);
170160
}
171161
} catch (KeeperException | InterruptedException e) {
@@ -175,14 +165,16 @@ public void zap(SiteConfiguration siteConf, String... args) {
175165
}
176166

177167
if (opts.zapScanServers) {
168+
Set<ServiceLockPath> sserverLockPaths =
169+
context.getServerPaths().getScanServer(rgp, AddressSelector.all(), false);
170+
Set<String> sserverResourceGroupPaths = new HashSet<>();
171+
sserverLockPaths.forEach(p -> sserverResourceGroupPaths
172+
.add(p.toString().substring(0, p.toString().lastIndexOf('/'))));
173+
178174
try {
179-
Set<ServiceLockPath> sserverLockPaths =
180-
context.getServerPaths().getScanServer(rgp, AddressSelector.all(), false);
181-
for (ServiceLockPath sserverPath : sserverLockPaths) {
182-
message("Deleting " + sserverPath + " from zookeeper", opts);
183-
if (!zrw.getChildren(sserverPath.toString()).isEmpty()) {
184-
ServiceLock.deleteLock(zrw, sserverPath);
185-
}
175+
for (String group : sserverResourceGroupPaths) {
176+
message("Deleting sserver " + group + " from zookeeper", opts);
177+
zrw.recursiveDelete(group, NodeMissingPolicy.SKIP);
186178
}
187179
} catch (KeeperException | InterruptedException e) {
188180
log.error("{}", e.getMessage(), e);

0 commit comments

Comments
 (0)