Fix AwarenessAttributeDecommissionIT.testConcurrentDecommissionAction (opensearch-project#14372)

andrross · web-flow · commit 0d38d1498bc6 · 2024-06-15T12:17:38.000-04:00
The problem was that this test decommissions one of six nodes. The teardown logic of the test then attempts to assert on the health of the cluster by randomly selecting a node and requesting the cluster health from it. If this random check happened to select the node that was decommissioned, the test would fail. The fix is to recommission the node at the end of the test. In addition, the "recommission node and assert cluster health" logic was duplicated in multiple places, so it has been refactored out into a helper method. Resolves opensearch-project#14290 Resolves opensearch-project#12197 Signed-off-by: Andrew Ross <andrross@amazon.com>
diff --git a/server/src/internalClusterTest/java/org/opensearch/cluster/coordination/AwarenessAttributeDecommissionIT.java b/server/src/internalClusterTest/java/org/opensearch/cluster/coordination/AwarenessAttributeDecommissionIT.java
@@ -539,18 +539,7 @@ private void assertNodesRemovedAfterZoneDecommission(boolean originalClusterMana
             assertEquals(originalClusterManager, currentClusterManager);
         }
 
-        // Will wait for all events to complete
-        client(activeNode).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get();
-
-        // Recommissioning the zone back to gracefully succeed the test once above tests succeeds
-        DeleteDecommissionStateResponse deleteDecommissionStateResponse = client(currentClusterManager).execute(
-            DeleteDecommissionStateAction.INSTANCE,
-            new DeleteDecommissionStateRequest()
-        ).get();
-        assertTrue(deleteDecommissionStateResponse.isAcknowledged());
-
-        // will wait for cluster to stabilise with a timeout of 2 min as by then all nodes should have joined the cluster
-        ensureStableCluster(15, TimeValue.timeValueMinutes(2));
+        deleteDecommissionStateAndWaitForStableCluster(currentClusterManager, 15);
     }
 
     public void testDecommissionFailedWhenDifferentAttributeAlreadyDecommissioned() throws Exception {
@@ -617,18 +606,7 @@ public void testDecommissionFailedWhenDifferentAttributeAlreadyDecommissioned()
             )
         );
 
-        // Will wait for all events to complete
-        client(node_in_c).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get();
-
-        // Recommissioning the zone back to gracefully succeed the test once above tests succeeds
-        DeleteDecommissionStateResponse deleteDecommissionStateResponse = client(node_in_c).execute(
-            DeleteDecommissionStateAction.INSTANCE,
-            new DeleteDecommissionStateRequest()
-        ).get();
-        assertTrue(deleteDecommissionStateResponse.isAcknowledged());
-
-        // will wait for cluster to stabilise with a timeout of 2 min as by then all nodes should have joined the cluster
-        ensureStableCluster(6, TimeValue.timeValueMinutes(2));
+        deleteDecommissionStateAndWaitForStableCluster(node_in_c, 6);
     }
 
     public void testDecommissionStatusUpdatePublishedToAllNodes() throws ExecutionException, InterruptedException {
@@ -748,20 +726,7 @@ public void testDecommissionStatusUpdatePublishedToAllNodes() throws ExecutionEx
         );
         logger.info("--> Verified the decommissioned node has in_progress state.");
 
-        // Will wait for all events to complete
-        client(activeNode).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get();
-        logger.info("--> Got LANGUID event");
-        // Recommissioning the zone back to gracefully succeed the test once above tests succeeds
-        DeleteDecommissionStateResponse deleteDecommissionStateResponse = client(activeNode).execute(
-            DeleteDecommissionStateAction.INSTANCE,
-            new DeleteDecommissionStateRequest()
-        ).get();
-        assertTrue(deleteDecommissionStateResponse.isAcknowledged());
-        logger.info("--> Deleting decommission done.");
-
-        // will wait for cluster to stabilise with a timeout of 2 min (findPeerInterval for decommissioned nodes)
-        // as by then all nodes should have joined the cluster
-        ensureStableCluster(6, TimeValue.timeValueSeconds(121));
+        deleteDecommissionStateAndWaitForStableCluster(activeNode, 6);
     }
 
     public void testDecommissionFailedWhenAttributeNotWeighedAway() throws Exception {
@@ -983,15 +948,7 @@ public void testDecommissionAcknowledgedIfWeightsNotSetForNonRoutingNode() throw
         assertEquals(clusterState.nodes().getDataNodes().size(), 3);
         assertEquals(clusterState.nodes().getClusterManagerNodes().size(), 2);
 
-        // Recommissioning the zone back to gracefully succeed the test once above tests succeeds
-        DeleteDecommissionStateResponse deleteDecommissionStateResponse = client(dataNodes.get(0)).execute(
-            DeleteDecommissionStateAction.INSTANCE,
-            new DeleteDecommissionStateRequest()
-        ).get();
-        assertTrue(deleteDecommissionStateResponse.isAcknowledged());
-
-        // will wait for cluster to stabilise with a timeout of 2 min as by then all nodes should have joined the cluster
-        ensureStableCluster(6, TimeValue.timeValueMinutes(2));
+        deleteDecommissionStateAndWaitForStableCluster(dataNodes.get(0), 6);
     }
 
     public void testConcurrentDecommissionAction() throws Exception {
@@ -1019,7 +976,7 @@ public void testConcurrentDecommissionAction() throws Exception {
                 .build()
         );
         logger.info("--> start 3 data nodes on zones 'a' & 'b' & 'c'");
-        internalCluster().startNodes(
+        final String bZoneDataNode = internalCluster().startNodes(
             Settings.builder()
                 .put(commonSettings)
                 .put("node.attr.zone", "a")
@@ -1035,7 +992,7 @@ public void testConcurrentDecommissionAction() throws Exception {
                 .put("node.attr.zone", "c")
                 .put(onlyRole(commonSettings, DiscoveryNodeRole.DATA_ROLE))
                 .build()
-        );
+        ).get(1);
 
         ensureStableCluster(6);
         ClusterHealthResponse health = client().admin()
@@ -1100,6 +1057,25 @@ public void testConcurrentDecommissionAction() throws Exception {
         assertEquals(concurrentRuns, numRequestAcknowledged.get() + numRequestUnAcknowledged.get() + numRequestFailed.get());
         assertEquals(concurrentRuns - 1, numRequestFailed.get());
         assertEquals(1, numRequestAcknowledged.get() + numRequestUnAcknowledged.get());
+
+        deleteDecommissionStateAndWaitForStableCluster(bZoneDataNode, 6);
+    }
+
+    private void deleteDecommissionStateAndWaitForStableCluster(String activeNodeName, int expectedClusterSize) throws ExecutionException,
+        InterruptedException {
+        client(activeNodeName).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get(); // wait for all events to complete
+
+        // Recommission the zone by deleting the decommission state so that the calling test can finish gracefully
+        DeleteDecommissionStateResponse deleteDecommissionStateResponse = client(activeNodeName).execute(
+            DeleteDecommissionStateAction.INSTANCE,
+            new DeleteDecommissionStateRequest()
+        ).get();
+        assertTrue(deleteDecommissionStateResponse.isAcknowledged());
+        logger.info("--> Deleting decommission done.");
+
+        // Wait for the cluster to stabilise; the timeout is just over 2 min (findPeerInterval for decommissioned nodes),
+        // as by then all nodes should have rejoined the cluster
+        ensureStableCluster(expectedClusterSize, TimeValue.timeValueSeconds(121));
     }
 
     private static class WaitForFailedDecommissionState implements ClusterStateObserver.Listener {