Skip to content

Commit 5c19809

Browse files
authored
Add setting to ignore throttling nodes for allocation of unassigned remote primaries (opensearch-project#14991)
Signed-off-by: Gaurav Bafna <gbbafna@amazon.com>
1 parent eb306d2 commit 5c19809

File tree

8 files changed

+233
-20
lines changed

8 files changed

+233
-20
lines changed

CHANGELOG.md

+2
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
77
### Added
88
- Fix for hasInitiatedFetching to fix allocation explain and manual reroute APIs ([#14972](https://github.com/opensearch-project/OpenSearch/pull/14972))
99
- [Workload Management] Add queryGroupId to Task ([#14708](https://github.com/opensearch-project/OpenSearch/pull/14708))
10+
- Add setting to ignore throttling nodes for allocation of unassigned primaries in remote restore ([#14991](https://github.com/opensearch-project/OpenSearch/pull/14991))
1011
- Add basic aggregation support for derived fields ([#14618](https://github.com/opensearch-project/OpenSearch/pull/14618))
1112

1213
### Dependencies
@@ -23,6 +24,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
2324
### Removed
2425

2526
### Fixed
27+
- Fix constraint bug which allows more primary shards than average primary shards per index ([#14908](https://github.com/opensearch-project/OpenSearch/pull/14908))
2628
- Fix missing value of FieldSort for unsigned_long ([#14963](https://github.com/opensearch-project/OpenSearch/pull/14963))
2729

2830
### Security

server/src/main/java/org/opensearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java

+20-3
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,13 @@ public class BalancedShardsAllocator implements ShardsAllocator {
154154
Property.NodeScope
155155
);
156156

157+
/**
* Dynamic, node-scoped boolean setting (default {@code true}) stored under
* {@code cluster.routing.allocation.remote_primary.ignore_throttle}.
* When enabled, the balancer does not treat a node whose allocation decision is THROTTLE as a
* candidate for an unassigned primary whose recovery source type is REMOTE_STORE.
*/
public static final Setting<Boolean> IGNORE_THROTTLE_FOR_REMOTE_RESTORE = Setting.boolSetting(
158+
"cluster.routing.allocation.remote_primary.ignore_throttle",
159+
true,
160+
Property.Dynamic,
161+
Property.NodeScope
162+
);
163+
157164
public static final Setting<Float> PRIMARY_SHARD_REBALANCE_BUFFER = Setting.floatSetting(
158165
"cluster.routing.allocation.rebalance.primary.buffer",
159166
0.10f,
@@ -173,6 +180,8 @@ public class BalancedShardsAllocator implements ShardsAllocator {
173180
private volatile WeightFunction weightFunction;
174181
private volatile float threshold;
175182

183+
private volatile boolean ignoreThrottleInRestore;
184+
176185
/**
* Convenience constructor that derives a {@link ClusterSettings} instance from the given settings
* and the built-in cluster settings, then delegates to the two-argument constructor.
*
* @param settings settings from which the allocator reads its initial values
*/
public BalancedShardsAllocator(Settings settings) {
177186
this(settings, new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS));
178187
}
@@ -182,6 +191,7 @@ public BalancedShardsAllocator(Settings settings, ClusterSettings clusterSetting
182191
setShardBalanceFactor(SHARD_BALANCE_FACTOR_SETTING.get(settings));
183192
setIndexBalanceFactor(INDEX_BALANCE_FACTOR_SETTING.get(settings));
184193
setPreferPrimaryShardRebalanceBuffer(PRIMARY_SHARD_REBALANCE_BUFFER.get(settings));
194+
setIgnoreThrottleInRestore(IGNORE_THROTTLE_FOR_REMOTE_RESTORE.get(settings));
185195
updateWeightFunction();
186196
setThreshold(THRESHOLD_SETTING.get(settings));
187197
setPreferPrimaryShardBalance(PREFER_PRIMARY_SHARD_BALANCE.get(settings));
@@ -195,6 +205,7 @@ public BalancedShardsAllocator(Settings settings, ClusterSettings clusterSetting
195205
clusterSettings.addSettingsUpdateConsumer(PRIMARY_SHARD_REBALANCE_BUFFER, this::updatePreferPrimaryShardBalanceBuffer);
196206
clusterSettings.addSettingsUpdateConsumer(PREFER_PRIMARY_SHARD_REBALANCE, this::setPreferPrimaryShardRebalance);
197207
clusterSettings.addSettingsUpdateConsumer(THRESHOLD_SETTING, this::setThreshold);
208+
clusterSettings.addSettingsUpdateConsumer(IGNORE_THROTTLE_FOR_REMOTE_RESTORE, this::setIgnoreThrottleInRestore);
198209
}
199210

200211
/**
@@ -205,6 +216,10 @@ private void setMovePrimaryFirst(boolean movePrimaryFirst) {
205216
setShardMovementStrategy(this.shardMovementStrategy);
206217
}
207218

219+
/**
* Stores the current value of {@link #IGNORE_THROTTLE_FOR_REMOTE_RESTORE}.
* Called once from the constructor and registered as the settings-update consumer for that setting.
*
* @param ignoreThrottleInRestore new value of the flag handed to the shards balancer
*/
private void setIgnoreThrottleInRestore(boolean ignoreThrottleInRestore) {
220+
this.ignoreThrottleInRestore = ignoreThrottleInRestore;
221+
}
222+
208223
/**
209224
* Sets the correct Shard movement strategy to use.
210225
* If users are still using deprecated setting `move_primary_first`, we want behavior to remain unchanged.
@@ -282,7 +297,8 @@ public void allocate(RoutingAllocation allocation) {
282297
weightFunction,
283298
threshold,
284299
preferPrimaryShardBalance,
285-
preferPrimaryShardRebalance
300+
preferPrimaryShardRebalance,
301+
ignoreThrottleInRestore
286302
);
287303
localShardsBalancer.allocateUnassigned();
288304
localShardsBalancer.moveShards();
@@ -304,7 +320,8 @@ public ShardAllocationDecision decideShardAllocation(final ShardRouting shard, f
304320
weightFunction,
305321
threshold,
306322
preferPrimaryShardBalance,
307-
preferPrimaryShardRebalance
323+
preferPrimaryShardRebalance,
324+
ignoreThrottleInRestore
308325
);
309326
AllocateUnassignedDecision allocateUnassignedDecision = AllocateUnassignedDecision.NOT_TAKEN;
310327
MoveDecision moveDecision = MoveDecision.NOT_TAKEN;
@@ -558,7 +575,7 @@ public Balancer(
558575
float threshold,
559576
boolean preferPrimaryBalance
560577
) {
561-
super(logger, allocation, shardMovementStrategy, weight, threshold, preferPrimaryBalance, false);
578+
super(logger, allocation, shardMovementStrategy, weight, threshold, preferPrimaryBalance, false, false);
562579
}
563580
}
564581

server/src/main/java/org/opensearch/cluster/routing/allocation/allocator/LocalShardsBalancer.java

+15-2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import org.apache.lucene.util.IntroSorter;
1414
import org.opensearch.cluster.metadata.IndexMetadata;
1515
import org.opensearch.cluster.metadata.Metadata;
16+
import org.opensearch.cluster.routing.RecoverySource;
1617
import org.opensearch.cluster.routing.RoutingNode;
1718
import org.opensearch.cluster.routing.RoutingNodes;
1819
import org.opensearch.cluster.routing.RoutingPool;
@@ -60,6 +61,8 @@ public class LocalShardsBalancer extends ShardsBalancer {
6061

6162
private final boolean preferPrimaryBalance;
6263
private final boolean preferPrimaryRebalance;
64+
65+
private final boolean ignoreThrottleInRestore;
6366
private final BalancedShardsAllocator.WeightFunction weight;
6467

6568
private final float threshold;
@@ -77,7 +80,8 @@ public LocalShardsBalancer(
7780
BalancedShardsAllocator.WeightFunction weight,
7881
float threshold,
7982
boolean preferPrimaryBalance,
80-
boolean preferPrimaryRebalance
83+
boolean preferPrimaryRebalance,
84+
boolean ignoreThrottleInRestore
8185
) {
8286
this.logger = logger;
8387
this.allocation = allocation;
@@ -94,6 +98,7 @@ public LocalShardsBalancer(
9498
this.preferPrimaryBalance = preferPrimaryBalance;
9599
this.preferPrimaryRebalance = preferPrimaryRebalance;
96100
this.shardMovementStrategy = shardMovementStrategy;
101+
this.ignoreThrottleInRestore = ignoreThrottleInRestore;
97102
}
98103

99104
/**
@@ -918,7 +923,15 @@ AllocateUnassignedDecision decideAllocateUnassigned(final ShardRouting shard) {
918923
nodeExplanationMap.put(node.getNodeId(), new NodeAllocationResult(node.getRoutingNode().node(), currentDecision, 0));
919924
nodeWeights.add(Tuple.tuple(node.getNodeId(), currentWeight));
920925
}
921-
if (currentDecision.type() == Decision.Type.YES || currentDecision.type() == Decision.Type.THROTTLE) {
926+
927+
// For primaries recovering from REMOTE_STORE, treat THROTTLE like NO so that a throttling node is skipped and recovery is faster.
928+
// The side effect of this is an increase in relocations after these allocations.
929+
boolean considerThrottleAsNo = ignoreThrottleInRestore
930+
&& shard.recoverySource().getType() == RecoverySource.Type.REMOTE_STORE
931+
&& shard.primary();
932+
933+
if (currentDecision.type() == Decision.Type.YES
934+
|| (currentDecision.type() == Decision.Type.THROTTLE && considerThrottleAsNo == false)) {
922935
final boolean updateMinNode;
923936
if (currentWeight == minWeight) {
924937
/* we have an equal weight tie breaking:

server/src/main/java/org/opensearch/common/settings/ClusterSettings.java

+1
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,7 @@ public void apply(Settings value, Settings current, Settings previous) {
268268
BalancedShardsAllocator.SHARD_MOVE_PRIMARY_FIRST_SETTING,
269269
BalancedShardsAllocator.SHARD_MOVEMENT_STRATEGY_SETTING,
270270
BalancedShardsAllocator.THRESHOLD_SETTING,
271+
BalancedShardsAllocator.IGNORE_THROTTLE_FOR_REMOTE_RESTORE,
271272
BreakerSettings.CIRCUIT_BREAKER_LIMIT_SETTING,
272273
BreakerSettings.CIRCUIT_BREAKER_OVERHEAD_SETTING,
273274
BreakerSettings.CIRCUIT_BREAKER_TYPE,

server/src/test/java/org/opensearch/cluster/routing/allocation/BalancedSingleShardTests.java

-15
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@
3333
package org.opensearch.cluster.routing.allocation;
3434

3535
import org.opensearch.action.support.replication.ClusterStateCreationUtils;
36-
import org.opensearch.cluster.ClusterInfo;
3736
import org.opensearch.cluster.ClusterState;
3837
import org.opensearch.cluster.OpenSearchAllocationTestCase;
3938
import org.opensearch.cluster.node.DiscoveryNode;
@@ -50,7 +49,6 @@
5049
import org.opensearch.cluster.routing.allocation.decider.Decision.Type;
5150
import org.opensearch.common.collect.Tuple;
5251
import org.opensearch.common.settings.Settings;
53-
import org.opensearch.snapshots.SnapshotShardSizeInfo;
5452

5553
import java.util.Arrays;
5654
import java.util.Collections;
@@ -398,19 +396,6 @@ public Decision canRebalance(ShardRouting shardRouting, RoutingAllocation alloca
398396
return Tuple.tuple(clusterState, rebalanceDecision);
399397
}
400398

401-
private RoutingAllocation newRoutingAllocation(AllocationDeciders deciders, ClusterState state) {
402-
RoutingAllocation allocation = new RoutingAllocation(
403-
deciders,
404-
new RoutingNodes(state, false),
405-
state,
406-
ClusterInfo.EMPTY,
407-
SnapshotShardSizeInfo.EMPTY,
408-
System.nanoTime()
409-
);
410-
allocation.debugDecision(true);
411-
return allocation;
412-
}
413-
414399
private void assertAssignedNodeRemainsSame(
415400
BalancedShardsAllocator allocator,
416401
RoutingAllocation routingAllocation,
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
/*
2+
* SPDX-License-Identifier: Apache-2.0
3+
*
4+
* The OpenSearch Contributors require contributions made to
5+
* this file be licensed under the Apache-2.0 license or a
6+
* compatible open source license.
7+
*/
8+
9+
package org.opensearch.cluster.routing.allocation;
10+
11+
import org.opensearch.Version;
12+
import org.opensearch.action.support.replication.ClusterStateCreationUtils;
13+
import org.opensearch.cluster.ClusterState;
14+
import org.opensearch.cluster.OpenSearchAllocationTestCase;
15+
import org.opensearch.cluster.metadata.IndexMetadata;
16+
import org.opensearch.cluster.metadata.Metadata;
17+
import org.opensearch.cluster.node.DiscoveryNode;
18+
import org.opensearch.cluster.node.DiscoveryNodes;
19+
import org.opensearch.cluster.routing.AllocationId;
20+
import org.opensearch.cluster.routing.IndexRoutingTable;
21+
import org.opensearch.cluster.routing.IndexShardRoutingTable;
22+
import org.opensearch.cluster.routing.RoutingNode;
23+
import org.opensearch.cluster.routing.RoutingTable;
24+
import org.opensearch.cluster.routing.ShardRouting;
25+
import org.opensearch.cluster.routing.ShardRoutingState;
26+
import org.opensearch.cluster.routing.TestShardRouting;
27+
import org.opensearch.cluster.routing.UnassignedInfo;
28+
import org.opensearch.cluster.routing.allocation.allocator.BalancedShardsAllocator;
29+
import org.opensearch.cluster.routing.allocation.decider.AllocationDecider;
30+
import org.opensearch.cluster.routing.allocation.decider.AllocationDeciders;
31+
import org.opensearch.cluster.routing.allocation.decider.Decision;
32+
import org.opensearch.common.settings.Settings;
33+
import org.opensearch.core.index.shard.ShardId;
34+
35+
import java.util.Arrays;
36+
import java.util.HashSet;
37+
import java.util.List;
38+
import java.util.Set;
39+
import java.util.stream.Collectors;
40+
41+
import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_CREATION_DATE;
42+
import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_REPLICAS;
43+
import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_SHARDS;
44+
import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_VERSION_CREATED;
45+
import static org.opensearch.cluster.routing.allocation.allocator.BalancedShardsAllocator.IGNORE_THROTTLE_FOR_REMOTE_RESTORE;
46+
47+
public class DecideAllocateUnassignedTests extends OpenSearchAllocationTestCase {
48+
/**
* With {@code IGNORE_THROTTLE_FOR_REMOTE_RESTORE} set to {@code true}, an unassigned remote-restore
* primary must be allocated to a node that answers YES instead of waiting on a node that answers THROTTLE.
*/
public void testAllocateUnassignedRemoteRestore_IgnoreThrottle() {
49+
final String[] indices = { "idx1" };
50+
// Start from a cluster state built for one index ("idx1") with one primary shard and (presumably) a
51+
// single node, "node_0", so that the existing primary lives on that node; then add a second node, "node_1".
52+
//
53+
// Next add index "idx2", whose only primary is UNASSIGNED and was created as a remote-restore shard routing.
54+
// The decider throttles on node_1, so with ignore_throttle = true the shard must be assigned to the older node.
55+
ClusterState clusterState = ClusterStateCreationUtils.state(1, indices, 1);
56+
clusterState = addNodesToClusterState(clusterState, 1);
57+
clusterState = addRestoringIndexToClusterState(clusterState, "idx2");
58+
List<AllocationDecider> allocationDeciders = getAllocationDecidersThrottleOnNode1();
59+
RoutingAllocation routingAllocation = newRoutingAllocation(new AllocationDeciders(allocationDeciders), clusterState);
60+
// Enable the setting, run the allocator, and verify that idx2's primary is INITIALIZING on node_0.
61+
Settings build = Settings.builder().put(IGNORE_THROTTLE_FOR_REMOTE_RESTORE.getKey(), true).build();
62+
BalancedShardsAllocator allocator = new BalancedShardsAllocator(build);
63+
allocator.allocate(routingAllocation);
64+
assertEquals(routingAllocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).get(0).currentNodeId(), "node_0");
65+
assertEquals(routingAllocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).get(0).getIndexName(), "idx2");
66+
assertFalse(routingAllocation.routingNodes().hasUnassignedPrimaries());
67+
}
68+
69+
/**
* With {@code IGNORE_THROTTLE_FOR_REMOTE_RESTORE} set to {@code false}, the test expects the
* unassigned remote-restore primary to stay unassigned when the new node answers THROTTLE.
*/
public void testAllocateUnassignedRemoteRestore() {
70+
final String[] indices = { "idx1" };
71+
// Start from a cluster state built for one index ("idx1") with one primary shard and (presumably) a
72+
// single node, "node_0", so that the existing primary lives on that node; then add a second node, "node_1".
73+
//
74+
// Next add index "idx2", whose only primary is UNASSIGNED and was created as a remote-restore shard routing.
75+
// The decider throttles on node_1; with ignore_throttle = false the shard is expected to remain unassigned.
76+
ClusterState clusterState = ClusterStateCreationUtils.state(1, indices, 1);
77+
clusterState = addNodesToClusterState(clusterState, 1);
78+
clusterState = addRestoringIndexToClusterState(clusterState, "idx2");
79+
List<AllocationDecider> allocationDeciders = getAllocationDecidersThrottleOnNode1();
80+
RoutingAllocation routingAllocation = newRoutingAllocation(new AllocationDeciders(allocationDeciders), clusterState);
81+
// Disable the setting, run the allocator, and verify that nothing is INITIALIZING and a primary is still unassigned.
82+
Settings build = Settings.builder().put(IGNORE_THROTTLE_FOR_REMOTE_RESTORE.getKey(), false).build();
83+
BalancedShardsAllocator allocator = new BalancedShardsAllocator(build);
84+
allocator.allocate(routingAllocation);
85+
assertEquals(routingAllocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), 0);
86+
assertTrue(routingAllocation.routingNodes().hasUnassignedPrimaries());
87+
}
88+
89+
/**
* Builds a one-element list holding a decider whose {@code canAllocate} returns
* {@link Decision#THROTTLE} for the node with id {@code "node_1"} and {@link Decision#YES}
* for every other node.
*
* @return list containing only that decider
*/
private static List<AllocationDecider> getAllocationDecidersThrottleOnNode1() {
90+
// Allocation Deciders to throttle on `node_1`
91+
final Set<String> throttleNodes = new HashSet<>();
92+
throttleNodes.add("node_1");
93+
AllocationDecider allocationDecider = new AllocationDecider() {
94+
@Override
95+
public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
96+
// Only the target node id matters; the shard and the allocation are not inspected.
97+
if (throttleNodes.contains(node.nodeId())) {
98+
return Decision.THROTTLE;
99+
}
100+
return Decision.YES;
101+
}
102+
};
103+
List<AllocationDecider> allocationDeciders = Arrays.asList(allocationDecider);
104+
return allocationDeciders;
105+
}
105+
106+
/**
* Returns a copy of the given cluster state with exactly one additional node
* (despite the plural method name), created via {@code newNode("node_" + nodeId)}.
*
* @param clusterState state whose node set is extended
* @param nodeId       numeric suffix used to form the new node's name
* @return new cluster state containing the extra node
*/
private ClusterState addNodesToClusterState(ClusterState clusterState, int nodeId) {
107+
DiscoveryNodes.Builder nodesBuilder = DiscoveryNodes.builder(clusterState.nodes());
108+
DiscoveryNode discoveryNode = newNode("node_" + nodeId);
109+
nodesBuilder.add(discoveryNode);
110+
return ClusterState.builder(clusterState).nodes(nodesBuilder).build();
111+
}
112+
113+
/**
* Returns a copy of the given cluster state extended with a new index that has one shard and no
* replicas. The single primary is added as UNASSIGNED with reason EXISTING_INDEX_RESTORED, using
* {@code TestShardRouting.newShardRoutingRemoteRestore}.
*
* @param clusterState state to extend
* @param index        name of the index to add
* @return new cluster state containing the index metadata and its routing table
*/
private ClusterState addRestoringIndexToClusterState(ClusterState clusterState, String index) {
114+
// Randomized primary term, always at least 1.
final int primaryTerm = 1 + randomInt(200);
115+
final ShardId shardId = new ShardId(index, "_na_", 0);
116+
117+
// Index metadata: one shard, zero replicas, created on the current version.
IndexMetadata indexMetadata = IndexMetadata.builder(index)
118+
.settings(
119+
Settings.builder()
120+
.put(SETTING_VERSION_CREATED, Version.CURRENT)
121+
.put(SETTING_NUMBER_OF_SHARDS, 1)
122+
.put(SETTING_NUMBER_OF_REPLICAS, 0)
123+
.put(SETTING_CREATION_DATE, System.currentTimeMillis())
124+
)
125+
.primaryTerm(0, primaryTerm)
126+
.build();
127+
128+
// Routing table for shard 0 holding a single unassigned primary.
IndexShardRoutingTable.Builder indexShardRoutingBuilder = new IndexShardRoutingTable.Builder(shardId);
129+
UnassignedInfo unassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.EXISTING_INDEX_RESTORED, null);
130+
indexShardRoutingBuilder.addShard(
131+
TestShardRouting.newShardRoutingRemoteRestore(index, shardId, null, null, true, ShardRoutingState.UNASSIGNED, unassignedInfo)
132+
);
133+
final IndexShardRoutingTable indexShardRoutingTable = indexShardRoutingBuilder.build();
134+
135+
// In-sync allocation ids are taken from the active shards; the only shard added above is UNASSIGNED,
// so this set is presumably empty -- TODO confirm.
IndexMetadata.Builder indexMetadataBuilder = new IndexMetadata.Builder(indexMetadata);
136+
indexMetadataBuilder.putInSyncAllocationIds(
137+
0,
138+
indexShardRoutingTable.activeShards()
139+
.stream()
140+
.map(ShardRouting::allocationId)
141+
.map(AllocationId::getId)
142+
.collect(Collectors.toSet())
143+
);
144+
// Merge the new index metadata and routing table into a copy of the incoming state.
ClusterState.Builder state = ClusterState.builder(clusterState);
145+
state.metadata(Metadata.builder(clusterState.metadata()).put(indexMetadataBuilder.build(), false).generateClusterUuidIfNeeded());
146+
state.routingTable(
147+
RoutingTable.builder(clusterState.routingTable())
148+
.add(IndexRoutingTable.builder(indexMetadata.getIndex()).addIndexShard(indexShardRoutingTable))
149+
.build()
150+
);
151+
return state.build();
152+
}
153+
154+
}

test/framework/src/main/java/org/opensearch/cluster/OpenSearchAllocationTestCase.java

+15
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
import org.opensearch.cluster.node.DiscoveryNodeRole;
3838
import org.opensearch.cluster.routing.RecoverySource;
3939
import org.opensearch.cluster.routing.RoutingNode;
40+
import org.opensearch.cluster.routing.RoutingNodes;
4041
import org.opensearch.cluster.routing.ShardRouting;
4142
import org.opensearch.cluster.routing.UnassignedInfo;
4243
import org.opensearch.cluster.routing.allocation.AllocationService;
@@ -287,6 +288,19 @@ public static ClusterState startShardsAndReroute(
287288
return allocationService.reroute(allocationService.applyStartedShards(clusterState, initializingShards), "reroute after starting");
288289
}
289290

291+
/**
* Creates a {@link RoutingAllocation} for the given deciders and cluster state, using a new
* {@link RoutingNodes} built from that state, empty cluster info, empty snapshot shard sizes and
* the current {@code System.nanoTime()}. Debug decisions are switched on before returning.
*
* @param deciders allocation deciders to consult
* @param state    cluster state the allocation operates on
* @return the configured routing allocation
*/
protected RoutingAllocation newRoutingAllocation(AllocationDeciders deciders, ClusterState state) {
292+
RoutingAllocation allocation = new RoutingAllocation(
293+
deciders,
294+
// NOTE(review): second argument is presumably the read-only flag -- confirm against RoutingNodes.
new RoutingNodes(state, false),
295+
state,
296+
ClusterInfo.EMPTY,
297+
SnapshotShardSizeInfo.EMPTY,
298+
System.nanoTime()
299+
);
300+
allocation.debugDecision(true);
301+
return allocation;
302+
}
303+
290304
public static class TestAllocateDecision extends AllocationDecider {
291305

292306
private final Decision decision;
@@ -465,5 +479,6 @@ public void allocateUnassigned(
465479
unassignedAllocationHandler.removeAndIgnore(UnassignedInfo.AllocationStatus.DELAYED_ALLOCATION, allocation.changes());
466480
}
467481
}
482+
468483
}
469484
}

0 commit comments

Comments
 (0)