@@ -57,6 +57,7 @@
 import org.opensearch.cluster.routing.ShardRouting;
 import org.opensearch.cluster.routing.ShardRoutingState;
 import org.opensearch.cluster.routing.UnassignedInfo;
+import org.opensearch.cluster.routing.allocation.AllocateUnassignedDecision;
 import org.opensearch.cluster.routing.allocation.AllocationDecision;
 import org.opensearch.cluster.routing.allocation.ExistingShardsAllocator;
 import org.opensearch.cluster.service.ClusterService;
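Note for reviewers: the newly imported `AllocateUnassignedDecision` is the object the new tests pull out of the allocation-explain response. A minimal sketch of how the two accessors exercised in this change fit together (illustrative helper only, not part of the PR; both accessors appear verbatim in the tests below):

```java
import org.opensearch.cluster.routing.allocation.AllocateUnassignedDecision;
import org.opensearch.cluster.routing.allocation.AllocationDecision;

// Illustrative helper only: condenses the two accessors the new tests assert on.
final class ExplainSummary {
    static String summarize(AllocateUnassignedDecision aud) {
        AllocationDecision decision = aud.getAllocationDecision(); // e.g. YES, NO, AWAITING_INFO
        String explanation = aud.getExplanation();                 // human-readable reason string
        return decision + ": " + explanation;
    }
}
```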
@@ -797,11 +798,26 @@ public void testBatchModeEnabledWithoutTimeout() throws Exception {
         );
         assertTrue(ExistingShardsAllocator.EXISTING_SHARDS_ALLOCATOR_BATCH_MODE.get(internalCluster().clusterService().getSettings()));
         assertEquals(1, gatewayAllocator.getNumberOfStartedShardBatches());
-        assertEquals(1, gatewayAllocator.getNumberOfStoreShardBatches());
+        // The replica shard would be marked ineligible since there are no data nodes.
+        // It would then be removed from its batch, and empty batches get deleted, so we would have 0 replica batches.
+        assertEquals(0, gatewayAllocator.getNumberOfStoreShardBatches());
 
-        // Now start both data nodes and ensure batch mode is working
-        logger.info("--> restarting the stopped nodes");
+        // Now start one data node
+        logger.info("--> restarting the first stopped node");
         internalCluster().startDataOnlyNode(Settings.builder().put("node.name", dataOnlyNodes.get(0)).put(node0DataPathSettings).build());
+        ensureStableCluster(2);
+        ensureYellow("test");
+        assertEquals(0, gatewayAllocator.getNumberOfStartedShardBatches());
+        assertEquals(0, gatewayAllocator.getNumberOfStoreShardBatches());
+        assertEquals(0, gatewayAllocator.getNumberOfInFlightFetches());
+
+        // Call reroute and assert on the reroute response
+        logger.info("--> calling reroute while cluster is yellow");
+        clusterRerouteResponse = client().admin().cluster().prepareReroute().setRetryFailed(true).get();
+        assertTrue(clusterRerouteResponse.isAcknowledged());
+
+        // Now start the last data node and ensure batch mode is working and the cluster goes green
+        logger.info("--> restarting the second stopped node");
         internalCluster().startDataOnlyNode(Settings.builder().put("node.name", dataOnlyNodes.get(1)).put(node1DataPathSettings).build());
         ensureStableCluster(3);
         ensureGreen("test");
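The 0-batch assertions above follow from the behaviour described in the new comments: with no data nodes, every replica shard is ineligible, gets dropped from its batch, and a batch that empties out is deleted. A minimal, self-contained sketch of that pruning behaviour (hypothetical illustration only, not the actual gateway allocator implementation):

```java
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

// Hypothetical sketch: remove ineligible shards from each batch and delete empty batches.
final class BatchPruningSketch {
    static Map<String, Set<String>> pruneIneligible(Map<String, Set<String>> batches, Set<String> eligibleShardIds) {
        Map<String, Set<String>> surviving = new HashMap<>();
        for (Map.Entry<String, Set<String>> batch : batches.entrySet()) {
            Set<String> kept = new HashSet<>(batch.getValue());
            kept.retainAll(eligibleShardIds);        // ineligible shards leave the batch
            if (kept.isEmpty() == false) {
                surviving.put(batch.getKey(), kept); // empty batches are deleted, not kept
            }
        }
        return surviving; // with zero data nodes every store batch empties out -> 0 batches
    }
}
```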
@@ -842,11 +858,26 @@ public void testBatchModeEnabledWithSufficientTimeoutAndClusterGreen() throws Ex
         );
         assertTrue(ExistingShardsAllocator.EXISTING_SHARDS_ALLOCATOR_BATCH_MODE.get(internalCluster().clusterService().getSettings()));
         assertEquals(1, gatewayAllocator.getNumberOfStartedShardBatches());
-        assertEquals(1, gatewayAllocator.getNumberOfStoreShardBatches());
+        // The replica shard would be marked ineligible since there are no data nodes.
+        // It would then be removed from its batch, and empty batches get deleted, so we would have 0 replica batches.
+        assertEquals(0, gatewayAllocator.getNumberOfStoreShardBatches());
 
-        // Now start both data nodes and ensure batch mode is working
-        logger.info("--> restarting the stopped nodes");
+        // Now start one data node and ensure batch mode is working
+        logger.info("--> restarting the first stopped node");
         internalCluster().startDataOnlyNode(Settings.builder().put("node.name", dataOnlyNodes.get(0)).put(node0DataPathSettings).build());
+        ensureStableCluster(2);
+        ensureYellow("test");
+        assertEquals(0, gatewayAllocator.getNumberOfStartedShardBatches());
+        assertEquals(0, gatewayAllocator.getNumberOfStoreShardBatches());
+        assertEquals(0, gatewayAllocator.getNumberOfInFlightFetches());
+
+        // Call reroute and assert on the reroute response
+        logger.info("--> calling reroute while cluster is yellow");
+        clusterRerouteResponse = client().admin().cluster().prepareReroute().setRetryFailed(true).get();
+        assertTrue(clusterRerouteResponse.isAcknowledged());
+
+        // Now start the last data node and ensure batch mode is working and the cluster goes green
+        logger.info("--> restarting the second stopped node");
         internalCluster().startDataOnlyNode(Settings.builder().put("node.name", dataOnlyNodes.get(1)).put(node1DataPathSettings).build());
         ensureStableCluster(3);
         ensureGreen("test");
@@ -907,7 +938,9 @@ public void testBatchModeEnabledWithInSufficientTimeoutButClusterGreen() throws
 
         assertTrue(ExistingShardsAllocator.EXISTING_SHARDS_ALLOCATOR_BATCH_MODE.get(internalCluster().clusterService().getSettings()));
         assertEquals(10, gatewayAllocator.getNumberOfStartedShardBatches());
-        assertEquals(10, gatewayAllocator.getNumberOfStoreShardBatches());
+        // All replica shards would be marked ineligible since there are no data nodes.
+        // They would then be removed from their batches, and empty batches get deleted, so we would have 0 replica batches.
+        assertEquals(0, gatewayAllocator.getNumberOfStoreShardBatches());
         health = client(internalCluster().getClusterManagerName()).admin().cluster().health(Requests.clusterHealthRequest()).actionGet();
         assertFalse(health.isTimedOut());
         assertEquals(RED, health.getStatus());
@@ -1051,6 +1084,18 @@ public void testMultipleReplicaShardAssignmentWithDelayedAllocationAndDifferentN
         ensureGreen("test");
     }
 
+    public void testAllocationExplainReturnsNoWhenExtraReplicaShardInNonBatchMode() throws Exception {
+        // Non-batch mode - validates that we don't return AWAITING_INFO from the allocation explain API when the
+        // deciders are returning NO
+        this.allocationExplainReturnsNoWhenExtraReplicaShard(false);
+    }
+
+    public void testAllocationExplainReturnsNoWhenExtraReplicaShardInBatchMode() throws Exception {
+        // Batch mode - validates that we don't return AWAITING_INFO from the allocation explain API when the
+        // deciders are returning NO
+        this.allocationExplainReturnsNoWhenExtraReplicaShard(true);
+    }
+
     public void testNBatchesCreationAndAssignment() throws Exception {
         // we will reduce batch size to 5 to make sure we have enough batches to test assignment
         // Total number of primary shards = 50 (50 indices*1)
@@ -1104,7 +1149,9 @@ public void testNBatchesCreationAndAssignment() throws Exception {
         );
         assertTrue(ExistingShardsAllocator.EXISTING_SHARDS_ALLOCATOR_BATCH_MODE.get(internalCluster().clusterService().getSettings()));
         assertEquals(10, gatewayAllocator.getNumberOfStartedShardBatches());
-        assertEquals(10, gatewayAllocator.getNumberOfStoreShardBatches());
+        // All replica shards would be marked ineligible since there are no data nodes.
+        // They would then be removed from their batches, and empty batches get deleted, so we would have 0 replica batches.
+        assertEquals(0, gatewayAllocator.getNumberOfStoreShardBatches());
         health = client(internalCluster().getClusterManagerName()).admin().cluster().health(Requests.clusterHealthRequest()).actionGet();
         assertFalse(health.isTimedOut());
         assertEquals(RED, health.getStatus());
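As a sanity check on the `10` asserted above: the expected number of started-shard (primary) batches follows from the figures in the test's opening comments (50 primary shards, batch size reduced to 5). Illustrative arithmetic only:

```java
// Illustrative arithmetic (not OpenSearch code): 50 primaries grouped into batches of 5.
int totalPrimaryShards = 50; // 50 indices * 1 primary each, per the test's comment
int batchSize = 5;           // the reduced batch size the test configures
int expectedStartedShardBatches = (totalPrimaryShards + batchSize - 1) / batchSize; // ceil(50/5) = 10
```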
@@ -1193,7 +1240,9 @@ public void testCulpritShardInBatch() throws Exception {
         );
         assertTrue(ExistingShardsAllocator.EXISTING_SHARDS_ALLOCATOR_BATCH_MODE.get(internalCluster().clusterService().getSettings()));
         assertEquals(1, gatewayAllocator.getNumberOfStartedShardBatches());
-        assertEquals(1, gatewayAllocator.getNumberOfStoreShardBatches());
+        // The replica shard would be marked ineligible since there are no data nodes.
+        // It would then be removed from its batch, and empty batches get deleted, so we would have 0 replica batches.
+        assertEquals(0, gatewayAllocator.getNumberOfStoreShardBatches());
         assertTrue(clusterRerouteResponse.isAcknowledged());
         health = client(internalCluster().getClusterManagerName()).admin().cluster().health(Requests.clusterHealthRequest()).actionGet();
         assertFalse(health.isTimedOut());
@@ -1511,4 +1560,97 @@ private List<String> findNodesWithShard(final boolean primary) {
         Collections.shuffle(requiredStartedShards, random());
         return requiredStartedShards.stream().map(shard -> state.nodes().get(shard.currentNodeId()).getName()).collect(Collectors.toList());
     }
+
+    private void allocationExplainReturnsNoWhenExtraReplicaShard(boolean batchModeEnabled) throws Exception {
+        internalCluster().startClusterManagerOnlyNodes(
+            1,
+            Settings.builder().put(ExistingShardsAllocator.EXISTING_SHARDS_ALLOCATOR_BATCH_MODE.getKey(), batchModeEnabled).build()
+        );
+        internalCluster().startDataOnlyNodes(5);
+        createIndex(
+            "test",
+            Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1).put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 4).build()
+        );
+        ensureGreen("test");
+        ensureStableCluster(6);
+
+        // Stop one of the nodes to make the cluster yellow
+        // We cannot directly create an index with replica count = data node count, because then this whole flow would
+        // get skipped due to INDEX_CREATED
+        List<String> nodesWithReplicaShards = findNodesWithShard(false);
+        Settings replicaNodeDataPathSettings = internalCluster().dataPathSettings(nodesWithReplicaShards.get(0));
+        internalCluster().stopRandomNode(InternalTestCluster.nameFilter(nodesWithReplicaShards.get(0)));
+
+        ensureStableCluster(5);
+        ensureYellow("test");
+
+        logger.info("--> calling allocation explain API");
+        // The shard should have decision NO because there is no valid node for the extra replica to go to
+        AllocateUnassignedDecision aud = client().admin()
+            .cluster()
+            .prepareAllocationExplain()
+            .setIndex("test")
+            .setShard(0)
+            .setPrimary(false)
+            .get()
+            .getExplanation()
+            .getShardAllocationDecision()
+            .getAllocateDecision();
+
+        assertEquals(AllocationDecision.NO, aud.getAllocationDecision());
+        assertEquals("cannot allocate because allocation is not permitted to any of the nodes", aud.getExplanation());
+
+        // Now create a new index with too many replicas and try again
+        createIndex(
+            "test2",
+            Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1).put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 5).build()
+        );
+
+        ensureYellowAndNoInitializingShards("test2");
+
+        logger.info("--> calling allocation explain API again");
+        // The shard should have decision NO because there are 6 shard copies (1 primary + 5 replicas) and only 4 data nodes
+        aud = client().admin()
+            .cluster()
+            .prepareAllocationExplain()
+            .setIndex("test2")
+            .setShard(0)
+            .setPrimary(false)
+            .get()
+            .getExplanation()
+            .getShardAllocationDecision()
+            .getAllocateDecision();
+
+        assertEquals(AllocationDecision.NO, aud.getAllocationDecision());
+        assertEquals("cannot allocate because allocation is not permitted to any of the nodes", aud.getExplanation());
+
+        logger.info("--> restarting the stopped node");
+        internalCluster().startDataOnlyNode(
+            Settings.builder().put("node.name", nodesWithReplicaShards.get(0)).put(replicaNodeDataPathSettings).build()
+        );
+
+        ensureStableCluster(6);
+        ensureGreen("test");
+
+        logger.info("--> calling allocation explain API a third time");
+        // The shard should still have decision NO because there are 6 shard copies and only 5 data nodes
+        aud = client().admin()
+            .cluster()
+            .prepareAllocationExplain()
+            .setIndex("test2")
+            .setShard(0)
+            .setPrimary(false)
+            .get()
+            .getExplanation()
+            .getShardAllocationDecision()
+            .getAllocateDecision();
+
+        assertEquals(AllocationDecision.NO, aud.getAllocationDecision());
+        assertEquals("cannot allocate because allocation is not permitted to any of the nodes", aud.getExplanation());
+
+        internalCluster().startDataOnlyNodes(1);
+
+        ensureStableCluster(7);
+        ensureGreen("test2");
+    }
 }
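All three NO decisions in the helper reduce to the same capacity argument: a copy of a shard can only be placed on a data node that does not already hold a copy of that shard (presumably the same-shard decider; the test only asserts the aggregated explanation string). A minimal sketch of that arithmetic, using the scenario values from the helper:

```java
// Illustrative arithmetic only (not OpenSearch code).
final class ExtraReplicaPlacementSketch {
    // An unassigned copy has a valid target iff some data node holds no copy yet.
    static boolean extraCopyHasAValidNode(int assignedCopies, int dataNodes) {
        return dataNodes > assignedCopies;
    }

    public static void main(String[] args) {
        System.out.println(extraCopyHasAValidNode(4, 4)); // "test" after a node stops -> false (NO)
        System.out.println(extraCopyHasAValidNode(4, 4)); // "test2" on 4 data nodes   -> false (NO)
        System.out.println(extraCopyHasAValidNode(5, 5)); // "test2" on 5 data nodes   -> false (NO)
        System.out.println(extraCopyHasAValidNode(5, 6)); // "test2" on 6 data nodes   -> true (goes green)
    }
}
```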