Skip to content

Commit 0d38d14

Browse files
authored
Fix AwarenessAttributeDecommissionIT.testConcurrentDecommissionAction (opensearch-project#14372)
The problem is that this test would decommission one of six nodes. The teardown logic of the test would attempt to assert on the health of the cluster by randomly selecting a node and requesting the cluster health. If this random check happened to select the node that was decommissioned, then the test would fail. The fix is to recommission the node at the end of the test. Also, the "recommission node and assert cluster health" logic was used in multiple places and could be refactored out to a helper method.

Resolves opensearch-project#14290
Resolves opensearch-project#12197

Signed-off-by: Andrew Ross <andrross@amazon.com>
1 parent 7650e64 commit 0d38d14

File tree

1 file changed

+25
-49
lines changed

1 file changed

+25
-49
lines changed

server/src/internalClusterTest/java/org/opensearch/cluster/coordination/AwarenessAttributeDecommissionIT.java

+25-49
Original file line numberDiff line numberDiff line change
@@ -539,18 +539,7 @@ private void assertNodesRemovedAfterZoneDecommission(boolean originalClusterMana
539539
assertEquals(originalClusterManager, currentClusterManager);
540540
}
541541

542-
// Will wait for all events to complete
543-
client(activeNode).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get();
544-
545-
// Recommissioning the zone back to gracefully succeed the test once above tests succeeds
546-
DeleteDecommissionStateResponse deleteDecommissionStateResponse = client(currentClusterManager).execute(
547-
DeleteDecommissionStateAction.INSTANCE,
548-
new DeleteDecommissionStateRequest()
549-
).get();
550-
assertTrue(deleteDecommissionStateResponse.isAcknowledged());
551-
552-
// will wait for cluster to stabilise with a timeout of 2 min as by then all nodes should have joined the cluster
553-
ensureStableCluster(15, TimeValue.timeValueMinutes(2));
542+
deleteDecommissionStateAndWaitForStableCluster(currentClusterManager, 15);
554543
}
555544

556545
public void testDecommissionFailedWhenDifferentAttributeAlreadyDecommissioned() throws Exception {
@@ -617,18 +606,7 @@ public void testDecommissionFailedWhenDifferentAttributeAlreadyDecommissioned()
617606
)
618607
);
619608

620-
// Will wait for all events to complete
621-
client(node_in_c).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get();
622-
623-
// Recommissioning the zone back to gracefully succeed the test once above tests succeeds
624-
DeleteDecommissionStateResponse deleteDecommissionStateResponse = client(node_in_c).execute(
625-
DeleteDecommissionStateAction.INSTANCE,
626-
new DeleteDecommissionStateRequest()
627-
).get();
628-
assertTrue(deleteDecommissionStateResponse.isAcknowledged());
629-
630-
// will wait for cluster to stabilise with a timeout of 2 min as by then all nodes should have joined the cluster
631-
ensureStableCluster(6, TimeValue.timeValueMinutes(2));
609+
deleteDecommissionStateAndWaitForStableCluster(node_in_c, 6);
632610
}
633611

634612
public void testDecommissionStatusUpdatePublishedToAllNodes() throws ExecutionException, InterruptedException {
@@ -748,20 +726,7 @@ public void testDecommissionStatusUpdatePublishedToAllNodes() throws ExecutionEx
748726
);
749727
logger.info("--> Verified the decommissioned node has in_progress state.");
750728

751-
// Will wait for all events to complete
752-
client(activeNode).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get();
753-
logger.info("--> Got LANGUID event");
754-
// Recommissioning the zone back to gracefully succeed the test once above tests succeeds
755-
DeleteDecommissionStateResponse deleteDecommissionStateResponse = client(activeNode).execute(
756-
DeleteDecommissionStateAction.INSTANCE,
757-
new DeleteDecommissionStateRequest()
758-
).get();
759-
assertTrue(deleteDecommissionStateResponse.isAcknowledged());
760-
logger.info("--> Deleting decommission done.");
761-
762-
// will wait for cluster to stabilise with a timeout of 2 min (findPeerInterval for decommissioned nodes)
763-
// as by then all nodes should have joined the cluster
764-
ensureStableCluster(6, TimeValue.timeValueSeconds(121));
729+
deleteDecommissionStateAndWaitForStableCluster(activeNode, 6);
765730
}
766731

767732
public void testDecommissionFailedWhenAttributeNotWeighedAway() throws Exception {
@@ -983,15 +948,7 @@ public void testDecommissionAcknowledgedIfWeightsNotSetForNonRoutingNode() throw
983948
assertEquals(clusterState.nodes().getDataNodes().size(), 3);
984949
assertEquals(clusterState.nodes().getClusterManagerNodes().size(), 2);
985950

986-
// Recommissioning the zone back to gracefully succeed the test once above tests succeeds
987-
DeleteDecommissionStateResponse deleteDecommissionStateResponse = client(dataNodes.get(0)).execute(
988-
DeleteDecommissionStateAction.INSTANCE,
989-
new DeleteDecommissionStateRequest()
990-
).get();
991-
assertTrue(deleteDecommissionStateResponse.isAcknowledged());
992-
993-
// will wait for cluster to stabilise with a timeout of 2 min as by then all nodes should have joined the cluster
994-
ensureStableCluster(6, TimeValue.timeValueMinutes(2));
951+
deleteDecommissionStateAndWaitForStableCluster(dataNodes.get(0), 6);
995952
}
996953

997954
public void testConcurrentDecommissionAction() throws Exception {
@@ -1019,7 +976,7 @@ public void testConcurrentDecommissionAction() throws Exception {
1019976
.build()
1020977
);
1021978
logger.info("--> start 3 data nodes on zones 'a' & 'b' & 'c'");
1022-
internalCluster().startNodes(
979+
final String bZoneDataNode = internalCluster().startNodes(
1023980
Settings.builder()
1024981
.put(commonSettings)
1025982
.put("node.attr.zone", "a")
@@ -1035,7 +992,7 @@ public void testConcurrentDecommissionAction() throws Exception {
1035992
.put("node.attr.zone", "c")
1036993
.put(onlyRole(commonSettings, DiscoveryNodeRole.DATA_ROLE))
1037994
.build()
1038-
);
995+
).get(1);
1039996

1040997
ensureStableCluster(6);
1041998
ClusterHealthResponse health = client().admin()
@@ -1100,6 +1057,25 @@ public void testConcurrentDecommissionAction() throws Exception {
11001057
assertEquals(concurrentRuns, numRequestAcknowledged.get() + numRequestUnAcknowledged.get() + numRequestFailed.get());
11011058
assertEquals(concurrentRuns - 1, numRequestFailed.get());
11021059
assertEquals(1, numRequestAcknowledged.get() + numRequestUnAcknowledged.get());
1060+
1061+
deleteDecommissionStateAndWaitForStableCluster(bZoneDataNode, 6);
1062+
}
1063+
1064+
private void deleteDecommissionStateAndWaitForStableCluster(String activeNodeName, int expectedClusterSize) throws ExecutionException,
1065+
InterruptedException {
1066+
client(activeNodeName).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get();
1067+
1068+
// Recommission the zone so that the test finishes gracefully once the checks above have succeeded
1069+
DeleteDecommissionStateResponse deleteDecommissionStateResponse = client(activeNodeName).execute(
1070+
DeleteDecommissionStateAction.INSTANCE,
1071+
new DeleteDecommissionStateRequest()
1072+
).get();
1073+
assertTrue(deleteDecommissionStateResponse.isAcknowledged());
1074+
logger.info("--> Deleting decommission done.");
1075+
1076+
// will wait for cluster to stabilise with a timeout of 2 min (findPeerInterval for decommissioned nodes)
1077+
// as by then all nodes should have joined the cluster
1078+
ensureStableCluster(expectedClusterSize, TimeValue.timeValueSeconds(121));
11031079
}
11041080

11051081
private static class WaitForFailedDecommissionState implements ClusterStateObserver.Listener {

0 commit comments

Comments
 (0)