Skip to content

Commit 11babe7

Browse files
authored
Decouple throttling limits for new and old indices. (opensearch-project#778)
* Decouple throttling limits for new and old indices. Signed-off-by: Itiyama <itiyamas@amazon.com> * Precommit fixes. Signed-off-by: Itiyama <itiyamas@amazon.com> * Review comments. Signed-off-by: Itiyama <itiyamas@amazon.com> * Review comments and test fix. Signed-off-by: Itiyama <itiyamas@amazon.com> * Checkstyle fixes. Signed-off-by: Itiyama <itiyamas@amazon.com> * Review comments. Signed-off-by: Itiyama <itiyamas@amazon.com>
1 parent 3cd4e7f commit 11babe7

17 files changed

+548
-50
lines changed

server/src/main/java/org/opensearch/cluster/routing/RoutingNodes.java

+54-12
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,8 @@ public class RoutingNodes implements Iterable<RoutingNode> {
9797

9898
private final Map<String, ObjectIntHashMap<String>> nodesPerAttributeNames = new HashMap<>();
9999
private final Map<String, Recoveries> recoveriesPerNode = new HashMap<>();
100-
private final Map<String, Recoveries> primaryRecoveriesPerNode = new HashMap<>();
100+
private final Map<String, Recoveries> initialReplicaRecoveries = new HashMap<>();
101+
private final Map<String, Recoveries> initialPrimaryRecoveries = new HashMap<>();
101102

102103
public RoutingNodes(ClusterState clusterState) {
103104
this(clusterState, true);
@@ -187,46 +188,69 @@ private void updateRecoveryCounts(final ShardRouting routing, final boolean incr
187188
// Primary shard routing, excluding the relocating primaries.
188189
if(routing.primary() && (primary == null || primary == routing)) {
189190
assert routing.relocatingNodeId() == null: "Routing must be a non relocating primary";
190-
Recoveries.getOrAdd(primaryRecoveriesPerNode, routing.currentNodeId()).addIncoming(howMany);
191+
Recoveries.getOrAdd(initialPrimaryRecoveries, routing.currentNodeId()).addIncoming(howMany);
191192
return;
192193
}
193194

194-
Recoveries.getOrAdd(recoveriesPerNode, routing.currentNodeId()).addIncoming(howMany);
195+
Recoveries.getOrAdd(getRecoveries(routing), routing.currentNodeId()).addIncoming(howMany);
195196

196197
if (routing.recoverySource().getType() == RecoverySource.Type.PEER) {
197198
// add/remove corresponding outgoing recovery on node with primary shard
198199
if (primary == null) {
199200
throw new IllegalStateException("shard is peer recovering but primary is unassigned");
200201
}
201-
Recoveries.getOrAdd(recoveriesPerNode, primary.currentNodeId()).addOutgoing(howMany);
202+
203+
Recoveries.getOrAdd(getRecoveries(routing), primary.currentNodeId()).addOutgoing(howMany);
202204

203205
if (increment == false && routing.primary() && routing.relocatingNodeId() != null) {
204206
// primary is done relocating, move non-primary recoveries from old primary to new primary
205-
int numRecoveringReplicas = 0;
206207
for (ShardRouting assigned : assignedShards(routing.shardId())) {
207208
if (assigned.primary() == false && assigned.initializing() &&
208209
assigned.recoverySource().getType() == RecoverySource.Type.PEER) {
209-
numRecoveringReplicas++;
210+
Map<String, Recoveries> recoveriesToUpdate = getRecoveries(assigned);
211+
Recoveries.getOrAdd(recoveriesToUpdate, routing.relocatingNodeId()).addOutgoing(-1);
212+
Recoveries.getOrAdd(recoveriesToUpdate, routing.currentNodeId()).addOutgoing(1);
210213
}
211214
}
212-
recoveriesPerNode.get(routing.relocatingNodeId()).addOutgoing(-numRecoveringReplicas);
213-
recoveriesPerNode.get(routing.currentNodeId()).addOutgoing(numRecoveringReplicas);
215+
214216
}
215217
}
216218
}
217219

220+
private Map<String, Recoveries> getRecoveries(ShardRouting routing) {
221+
if(routing.unassignedReasonIndexCreated() && !routing.primary()) {
222+
return initialReplicaRecoveries;
223+
} else {
224+
return recoveriesPerNode;
225+
}
226+
}
227+
218228
public int getIncomingRecoveries(String nodeId) {
219229
return recoveriesPerNode.getOrDefault(nodeId, Recoveries.EMPTY).getIncoming();
220230
}
221231

222232
public int getInitialPrimariesIncomingRecoveries(String nodeId) {
223-
return primaryRecoveriesPerNode.getOrDefault(nodeId, Recoveries.EMPTY).getIncoming();
233+
return initialPrimaryRecoveries.getOrDefault(nodeId, Recoveries.EMPTY).getIncoming();
224234
}
225235

226236
public int getOutgoingRecoveries(String nodeId) {
227237
return recoveriesPerNode.getOrDefault(nodeId, Recoveries.EMPTY).getOutgoing();
228238
}
229239

240+
/**
241+
* Recoveries started on node as a result of new index creation.
242+
*/
243+
public int getInitialIncomingRecoveries(String nodeId) {
244+
return initialReplicaRecoveries.getOrDefault(nodeId, Recoveries.EMPTY).getIncoming();
245+
}
246+
247+
/**
248+
* Recoveries started from node as a result of new index creation.
249+
*/
250+
public int getInitialOutgoingRecoveries(String nodeId) {
251+
return initialReplicaRecoveries.getOrDefault(nodeId, Recoveries.EMPTY).getOutgoing();
252+
}
253+
230254
@Nullable
231255
private ShardRouting findAssignedPrimaryIfPeerRecovery(ShardRouting routing) {
232256
ShardRouting primary = null;
@@ -1106,10 +1130,10 @@ public static boolean assertShardStats(RoutingNodes routingNodes) {
11061130
}
11071131
}
11081132

1109-
assertRecoveriesPerNode(routingNodes, routingNodes.recoveriesPerNode, true,
1110-
x -> !isNonRelocatingPrimary(x));
1111-
assertRecoveriesPerNode(routingNodes, routingNodes.primaryRecoveriesPerNode, false,
1133+
assertRecoveriesPerNode(routingNodes, routingNodes.initialPrimaryRecoveries, false,
11121134
x -> isNonRelocatingPrimary(x));
1135+
assertRecoveriesPerNode(routingNodes, Recoveries.unionRecoveries(routingNodes.recoveriesPerNode,
1136+
routingNodes.initialReplicaRecoveries), true, x -> !isNonRelocatingPrimary(x));
11131137

11141138
assert unassignedPrimaryCount == routingNodes.unassignedShards.getNumPrimaries() :
11151139
"Unassigned primaries is [" + unassignedPrimaryCount + "] but RoutingNodes returned unassigned primaries [" +
@@ -1237,5 +1261,23 @@ public static Recoveries getOrAdd(Map<String, Recoveries> map, String key) {
12371261
}
12381262
return recoveries;
12391263
}
1264+
1265+
// used only for tests
1266+
static Map<String, Recoveries> unionRecoveries(Map<String, Recoveries> first, Map<String, Recoveries> second) {
1267+
Map<String, Recoveries> recoveries = new HashMap<>();
1268+
addRecoveries(recoveries, first);
1269+
addRecoveries(recoveries, second);
1270+
return recoveries;
1271+
}
1272+
1273+
private static void addRecoveries(Map<String, Recoveries> existingRecoveries,
1274+
Map<String, Recoveries> newRecoveries) {
1275+
for (String node : newRecoveries.keySet()) {
1276+
Recoveries r2 = newRecoveries.get(node);
1277+
Recoveries r1 = Recoveries.getOrAdd(existingRecoveries, node);
1278+
r1.addIncoming(r2.incoming);
1279+
r1.addOutgoing(r2.outgoing);
1280+
}
1281+
}
12401282
}
12411283
}

server/src/main/java/org/opensearch/cluster/routing/ShardRouting.java

+7
Original file line numberDiff line numberDiff line change
@@ -683,4 +683,11 @@ public long getExpectedShardSize() {
683683
public RecoverySource recoverySource() {
684684
return recoverySource;
685685
}
686+
687+
public boolean unassignedReasonIndexCreated() {
688+
if (unassignedInfo != null) {
689+
return unassignedInfo.getReason() == UnassignedInfo.Reason.INDEX_CREATED;
690+
}
691+
return false;
692+
}
686693
}

server/src/main/java/org/opensearch/cluster/routing/allocation/decider/ThrottlingAllocationDecider.java

+87-33
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@
4444
import org.opensearch.common.settings.Setting.Property;
4545
import org.opensearch.common.settings.Settings;
4646

47+
import java.util.Locale;
48+
import java.util.function.BiFunction;
49+
4750
import static org.opensearch.cluster.routing.allocation.decider.Decision.THROTTLE;
4851
import static org.opensearch.cluster.routing.allocation.decider.Decision.YES;
4952

@@ -71,7 +74,7 @@ public class ThrottlingAllocationDecider extends AllocationDecider {
7174
private static final Logger logger = LogManager.getLogger(ThrottlingAllocationDecider.class);
7275

7376
public static final int DEFAULT_CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES = 2;
74-
public static final int DEFAULT_CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES = 4;
77+
public static final int DEFAULT_CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_RECOVERIES = 4;
7578
public static final String NAME = "throttling";
7679
public static final Setting<Integer> CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING =
7780
new Setting<>("cluster.routing.allocation.node_concurrent_recoveries",
@@ -80,7 +83,11 @@ public class ThrottlingAllocationDecider extends AllocationDecider {
8083
Property.Dynamic, Property.NodeScope);
8184
public static final Setting<Integer> CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING =
8285
Setting.intSetting("cluster.routing.allocation.node_initial_primaries_recoveries",
83-
DEFAULT_CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES, 0,
86+
DEFAULT_CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_RECOVERIES, 0,
87+
Property.Dynamic, Property.NodeScope);
88+
public static final Setting<Integer> CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING =
89+
Setting.intSetting("cluster.routing.allocation.node_initial_replicas_recoveries",
90+
DEFAULT_CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_RECOVERIES, 0,
8491
Property.Dynamic, Property.NodeScope);
8592
public static final Setting<Integer> CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING = new Setting<>(
8693
"cluster.routing.allocation.node_concurrent_incoming_recoveries",
@@ -99,9 +106,11 @@ public class ThrottlingAllocationDecider extends AllocationDecider {
99106
private volatile int primariesInitialRecoveries;
100107
private volatile int concurrentIncomingRecoveries;
101108
private volatile int concurrentOutgoingRecoveries;
109+
private volatile int replicasInitialRecoveries;
102110

103111
public ThrottlingAllocationDecider(Settings settings, ClusterSettings clusterSettings) {
104-
this.primariesInitialRecoveries = CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.get(settings);
112+
primariesInitialRecoveries = CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.get(settings);
113+
replicasInitialRecoveries = CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.get(settings);
105114
concurrentIncomingRecoveries = CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING.get(settings);
106115
concurrentOutgoingRecoveries = CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_OUTGOING_RECOVERIES_SETTING.get(settings);
107116

@@ -111,10 +120,12 @@ public ThrottlingAllocationDecider(Settings settings, ClusterSettings clusterSet
111120
this::setConcurrentIncomingRecoverries);
112121
clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_OUTGOING_RECOVERIES_SETTING,
113122
this::setConcurrentOutgoingRecoverries);
123+
clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING,
124+
this::setReplicasInitialRecoveries);
114125

115126
logger.debug("using node_concurrent_outgoing_recoveries [{}], node_concurrent_incoming_recoveries [{}], " +
116-
"node_initial_primaries_recoveries [{}]",
117-
concurrentOutgoingRecoveries, concurrentIncomingRecoveries, primariesInitialRecoveries);
127+
"node_initial_primaries_recoveries [{}], node_initial_replicas_recoveries [{}]",
128+
concurrentOutgoingRecoveries, concurrentIncomingRecoveries, primariesInitialRecoveries, replicasInitialRecoveries);
118129
}
119130

120131
private void setConcurrentIncomingRecoverries(int concurrentIncomingRecoveries) {
@@ -128,6 +139,10 @@ private void setPrimariesInitialRecoveries(int primariesInitialRecoveries) {
128139
this.primariesInitialRecoveries = primariesInitialRecoveries;
129140
}
130141

142+
private void setReplicasInitialRecoveries(int replicasInitialRecoveries) {
143+
this.replicasInitialRecoveries = replicasInitialRecoveries;
144+
}
145+
131146
@Override
132147
public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
133148
if (shardRouting.primary() && shardRouting.unassigned()) {
@@ -150,36 +165,75 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing
150165
// Peer recovery
151166
assert initializingShard(shardRouting, node.nodeId()).recoverySource().getType() == RecoverySource.Type.PEER;
152167

153-
// Allocating a shard to this node will increase the incoming recoveries
154-
int currentInRecoveries = allocation.routingNodes().getIncomingRecoveries(node.nodeId());
155-
if (currentInRecoveries >= concurrentIncomingRecoveries) {
168+
if(shardRouting.unassignedReasonIndexCreated()) {
169+
return allocateInitialShardCopies(shardRouting, node, allocation);
170+
} else {
171+
return allocateNonInitialShardCopies(shardRouting, node, allocation);
172+
}
173+
}
174+
}
175+
176+
private Decision allocateInitialShardCopies(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
177+
int currentInRecoveries = allocation.routingNodes().getInitialIncomingRecoveries(node.nodeId());
178+
assert shardRouting.unassignedReasonIndexCreated() && !shardRouting.primary();
179+
180+
return allocateShardCopies(shardRouting, allocation, currentInRecoveries, replicasInitialRecoveries,
181+
(x,y) -> getInitialPrimaryNodeOutgoingRecoveries(x,y), replicasInitialRecoveries,
182+
String.format(Locale.ROOT, "[%s=%d]", CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(),
183+
replicasInitialRecoveries),
184+
String.format(Locale.ROOT, "[%s=%d]", CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(),
185+
replicasInitialRecoveries));
186+
}
187+
188+
private Decision allocateNonInitialShardCopies(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
189+
190+
assert !shardRouting.unassignedReasonIndexCreated();
191+
int currentInRecoveries = allocation.routingNodes().getIncomingRecoveries(node.nodeId());
192+
193+
return allocateShardCopies(shardRouting, allocation, currentInRecoveries, concurrentIncomingRecoveries,
194+
(x,y) -> getPrimaryNodeOutgoingRecoveries(x,y), concurrentOutgoingRecoveries,
195+
String.format(Locale.ROOT, "[%s=%d] (can also be set via [%s])",
196+
CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING.getKey(),
197+
concurrentIncomingRecoveries, CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey()),
198+
String.format(Locale.ROOT, "[%s=%d] (can also be set via [%s])",
199+
CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_OUTGOING_RECOVERIES_SETTING.getKey(),
200+
concurrentOutgoingRecoveries, CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey()));
201+
}
202+
203+
private Integer getPrimaryNodeOutgoingRecoveries(ShardRouting shardRouting, RoutingAllocation allocation) {
204+
ShardRouting primaryShard = allocation.routingNodes().activePrimary(shardRouting.shardId());
205+
return allocation.routingNodes().getOutgoingRecoveries(primaryShard.currentNodeId());
206+
}
207+
208+
private Integer getInitialPrimaryNodeOutgoingRecoveries(ShardRouting shardRouting, RoutingAllocation allocation) {
209+
ShardRouting primaryShard = allocation.routingNodes().activePrimary(shardRouting.shardId());
210+
return allocation.routingNodes().getInitialOutgoingRecoveries(primaryShard.currentNodeId());
211+
}
212+
213+
private Decision allocateShardCopies(ShardRouting shardRouting, RoutingAllocation allocation, int currentInRecoveries,
214+
int inRecoveriesLimit, BiFunction<ShardRouting, RoutingAllocation,
215+
Integer> primaryNodeOutRecoveriesFunc, int outRecoveriesLimit,
216+
String incomingRecoveriesSettingMsg, String outGoingRecoveriesSettingMsg) {
217+
// Allocating a shard to this node will increase the incoming recoveries
218+
if (currentInRecoveries >= inRecoveriesLimit) {
219+
return allocation.decision(THROTTLE, NAME,
220+
"reached the limit of incoming shard recoveries [%d], cluster setting %s",
221+
currentInRecoveries, incomingRecoveriesSettingMsg);
222+
} else {
223+
// search for corresponding recovery source (= primary shard) and check number of outgoing recoveries on that node
224+
ShardRouting primaryShard = allocation.routingNodes().activePrimary(shardRouting.shardId());
225+
if (primaryShard == null) {
226+
return allocation.decision(Decision.NO, NAME, "primary shard for this replica is not yet active");
227+
}
228+
int primaryNodeOutRecoveries = primaryNodeOutRecoveriesFunc.apply(shardRouting, allocation);
229+
if (primaryNodeOutRecoveries >= outRecoveriesLimit) {
156230
return allocation.decision(THROTTLE, NAME,
157-
"reached the limit of incoming shard recoveries [%d], cluster setting [%s=%d] (can also be set via [%s])",
158-
currentInRecoveries, CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING.getKey(),
159-
concurrentIncomingRecoveries,
160-
CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey());
231+
"reached the limit of outgoing shard recoveries [%d] on the node [%s] which holds the primary, " +
232+
"cluster setting %s", primaryNodeOutRecoveries, primaryShard.currentNodeId(),
233+
outGoingRecoveriesSettingMsg);
161234
} else {
162-
// search for corresponding recovery source (= primary shard) and check number of outgoing recoveries on that node
163-
ShardRouting primaryShard = allocation.routingNodes().activePrimary(shardRouting.shardId());
164-
if (primaryShard == null) {
165-
return allocation.decision(Decision.NO, NAME, "primary shard for this replica is not yet active");
166-
}
167-
int primaryNodeOutRecoveries = allocation.routingNodes().getOutgoingRecoveries(primaryShard.currentNodeId());
168-
if (primaryNodeOutRecoveries >= concurrentOutgoingRecoveries) {
169-
return allocation.decision(THROTTLE, NAME,
170-
"reached the limit of outgoing shard recoveries [%d] on the node [%s] which holds the primary, " +
171-
"cluster setting [%s=%d] (can also be set via [%s])",
172-
primaryNodeOutRecoveries, primaryShard.currentNodeId(),
173-
CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_OUTGOING_RECOVERIES_SETTING.getKey(),
174-
concurrentOutgoingRecoveries,
175-
CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey());
176-
} else {
177-
return allocation.decision(YES, NAME, "below shard recovery limit of outgoing: [%d < %d] incoming: [%d < %d]",
178-
primaryNodeOutRecoveries,
179-
concurrentOutgoingRecoveries,
180-
currentInRecoveries,
181-
concurrentIncomingRecoveries);
182-
}
235+
return allocation.decision(YES, NAME, "below shard recovery limit of outgoing: [%d < %d] incoming: [%d < %d]",
236+
primaryNodeOutRecoveries, outRecoveriesLimit, currentInRecoveries, inRecoveriesLimit);
183237
}
184238
}
185239
}

server/src/main/java/org/opensearch/common/settings/ClusterSettings.java

+1
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,7 @@ public void apply(Settings value, Settings current, Settings previous) {
256256
RecoverySettings.INDICES_RECOVERY_MAX_CONCURRENT_FILE_CHUNKS_SETTING,
257257
RecoverySettings.INDICES_RECOVERY_MAX_CONCURRENT_OPERATIONS_SETTING,
258258
ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING,
259+
ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING,
259260
ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING,
260261
ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_OUTGOING_RECOVERIES_SETTING,
261262
ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING,

0 commit comments

Comments
 (0)