Skip to content

Commit b67cdf4

Browse files
authored
Added support for search replica to return segrep stats (opensearch-project#16678)
* Added implementation for the stats calculation for search and regular replica in shards
  Signed-off-by: Vinay Krishna Pudyodu <vinkrish.neo@gmail.com>
* Updated changelog
  Signed-off-by: Vinay Krishna Pudyodu <vinkrish.neo@gmail.com>
* Added unit tests for TransportSegmentReplicationStatsAction
  Signed-off-by: Vinay Krishna Pudyodu <vinkrish.neo@gmail.com>
* fixed java style after running precommit locally
  Signed-off-by: Vinay Krishna Pudyodu <vinkrish.neo@gmail.com>
* refined the test cases
  Signed-off-by: Vinay Krishna Pudyodu <vinkrish.neo@gmail.com>
* fixed style issues
  Signed-off-by: Vinay Krishna Pudyodu <vinkrish.neo@gmail.com>
* Made changes in the bytes to download calculation based on comments
  Signed-off-by: Vinay Krishna Pudyodu <vinkrish.neo@gmail.com>
* added addReplicaStats method to SegmentReplicationPerGroupStats
  Signed-off-by: Vinay Krishna Pudyodu <vinkrish.neo@gmail.com>
* fixed style issues
  Signed-off-by: Vinay Krishna Pudyodu <vinkrish.neo@gmail.com>
* Fixed issue with immutable set
  Signed-off-by: Vinay Krishna Pudyodu <vinkrish.neo@gmail.com>
* Fixed PR comments and moved the integration tests to separate module
  Signed-off-by: Vinay Krishna Pudyodu <vinkrish.neo@gmail.com>
* Fixed failing integ tests
  Signed-off-by: Vinay Krishna Pudyodu <vinkrish.neo@gmail.com>
* Fixed failing integ test
  Signed-off-by: Vinay Krishna Pudyodu <vinkrish.neo@gmail.com>
* fixed some comments for PR
  Signed-off-by: Vinay Krishna Pudyodu <vinkrish.neo@gmail.com>
* fixed failing tests
  Signed-off-by: Vinay Krishna Pudyodu <vinkrish.neo@gmail.com>
---------
Signed-off-by: Vinay Krishna Pudyodu <vinkrish.neo@gmail.com>
1 parent 2b402ec commit b67cdf4

File tree

4 files changed

+744
-18
lines changed

4 files changed

+744
-18
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
2222
- Add new configuration setting `synonym_analyzer`, to the `synonym` and `synonym_graph` filters, enabling the specification of a custom analyzer for reading the synonym file ([#16488](https://github.com/opensearch-project/OpenSearch/pull/16488)).
2323
- Add stats for remote publication failure and move download failure stats to remote methods ([#16682](https://github.com/opensearch-project/OpenSearch/pull/16682/))
2424
- Added a precaution to handle extreme date values during sorting to prevent `arithmetic_exception: long overflow` ([#16812](https://github.com/opensearch-project/OpenSearch/pull/16812)).
25+
- Add search replica stats to segment replication stats API ([#16678](https://github.com/opensearch-project/OpenSearch/pull/16678))
2526

2627
### Dependencies
2728
- Bump `com.google.cloud:google-cloud-core-http` from 2.23.0 to 2.47.0 ([#16504](https://github.com/opensearch-project/OpenSearch/pull/16504))

server/src/internalClusterTest/java/org/opensearch/indices/replication/SearchReplicaReplicationIT.java

+49
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,20 @@
88

99
package org.opensearch.indices.replication;
1010

11+
import org.opensearch.action.admin.indices.replication.SegmentReplicationStatsResponse;
1112
import org.opensearch.cluster.metadata.IndexMetadata;
1213
import org.opensearch.common.settings.Settings;
1314
import org.opensearch.common.util.FeatureFlags;
15+
import org.opensearch.index.SegmentReplicationPerGroupStats;
16+
import org.opensearch.index.SegmentReplicationShardStats;
17+
import org.opensearch.indices.replication.common.ReplicationType;
1418
import org.opensearch.test.OpenSearchIntegTestCase;
1519
import org.junit.After;
1620
import org.junit.Before;
1721

1822
import java.nio.file.Path;
23+
import java.util.List;
24+
import java.util.Set;
1925

2026
@OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.TEST, numDataNodes = 0)
2127
public class SearchReplicaReplicationIT extends SegmentReplicationBaseIT {
@@ -82,4 +88,47 @@ public void testReplication() throws Exception {
8288
waitForSearchableDocs(docCount, primary, replica);
8389
}
8490

91+
public void testSegmentReplicationStatsResponseWithSearchReplica() throws Exception {
92+
internalCluster().startClusterManagerOnlyNode();
93+
final List<String> nodes = internalCluster().startDataOnlyNodes(2);
94+
createIndex(
95+
INDEX_NAME,
96+
Settings.builder()
97+
.put("number_of_shards", 1)
98+
.put("number_of_replicas", 0)
99+
.put("number_of_search_only_replicas", 1)
100+
.put(IndexMetadata.SETTING_REPLICATION_TYPE, ReplicationType.SEGMENT)
101+
.build()
102+
);
103+
ensureGreen(INDEX_NAME);
104+
105+
final int docCount = 5;
106+
for (int i = 0; i < docCount; i++) {
107+
client().prepareIndex(INDEX_NAME).setId(Integer.toString(i)).setSource("field", "value" + i).execute().get();
108+
}
109+
refresh(INDEX_NAME);
110+
waitForSearchableDocs(docCount, nodes);
111+
112+
SegmentReplicationStatsResponse segmentReplicationStatsResponse = dataNodeClient().admin()
113+
.indices()
114+
.prepareSegmentReplicationStats(INDEX_NAME)
115+
.setDetailed(true)
116+
.execute()
117+
.actionGet();
118+
119+
// Verify the number of indices
120+
assertEquals(1, segmentReplicationStatsResponse.getReplicationStats().size());
121+
// Verify total shards
122+
assertEquals(2, segmentReplicationStatsResponse.getTotalShards());
123+
// Verify the number of primary shards
124+
assertEquals(1, segmentReplicationStatsResponse.getReplicationStats().get(INDEX_NAME).size());
125+
126+
SegmentReplicationPerGroupStats perGroupStats = segmentReplicationStatsResponse.getReplicationStats().get(INDEX_NAME).get(0);
127+
Set<SegmentReplicationShardStats> replicaStats = perGroupStats.getReplicaStats();
128+
// Verify the number of replica stats
129+
assertEquals(1, replicaStats.size());
130+
for (SegmentReplicationShardStats replicaStat : replicaStats) {
131+
assertNotNull(replicaStat.getCurrentReplicationState());
132+
}
133+
}
85134
}

server/src/main/java/org/opensearch/action/admin/indices/replication/TransportSegmentReplicationStatsAction.java

+99-18
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
import org.opensearch.core.action.support.DefaultShardOperationFailedException;
2222
import org.opensearch.core.common.io.stream.StreamInput;
2323
import org.opensearch.core.index.shard.ShardId;
24-
import org.opensearch.index.IndexService;
2524
import org.opensearch.index.SegmentReplicationPerGroupStats;
2625
import org.opensearch.index.SegmentReplicationPressureService;
2726
import org.opensearch.index.SegmentReplicationShardStats;
@@ -38,7 +37,9 @@
3837
import java.util.HashMap;
3938
import java.util.List;
4039
import java.util.Map;
40+
import java.util.Set;
4141
import java.util.stream.Collectors;
42+
import java.util.stream.Stream;
4243

4344
/**
4445
* Transport action for shard segment replication operation. This transport action does not actually
@@ -96,11 +97,11 @@ protected SegmentReplicationStatsResponse newResponse(
9697
) {
9798
String[] shards = request.shards();
9899
final List<Integer> shardsToFetch = Arrays.stream(shards).map(Integer::valueOf).collect(Collectors.toList());
99-
100100
// organize replica responses by allocationId.
101101
final Map<String, SegmentReplicationState> replicaStats = new HashMap<>();
102102
// map of index name to list of replication group stats.
103103
final Map<String, List<SegmentReplicationPerGroupStats>> primaryStats = new HashMap<>();
104+
104105
for (SegmentReplicationShardStatsResponse response : responses) {
105106
if (response != null) {
106107
if (response.getReplicaStats() != null) {
@@ -109,6 +110,7 @@ protected SegmentReplicationStatsResponse newResponse(
109110
replicaStats.putIfAbsent(shardRouting.allocationId().getId(), response.getReplicaStats());
110111
}
111112
}
113+
112114
if (response.getPrimaryStats() != null) {
113115
final ShardId shardId = response.getPrimaryStats().getShardId();
114116
if (shardsToFetch.isEmpty() || shardsToFetch.contains(shardId.getId())) {
@@ -126,15 +128,20 @@ protected SegmentReplicationStatsResponse newResponse(
126128
}
127129
}
128130
}
129-
// combine the replica stats to the shard stat entry in each group.
130-
for (Map.Entry<String, List<SegmentReplicationPerGroupStats>> entry : primaryStats.entrySet()) {
131-
for (SegmentReplicationPerGroupStats group : entry.getValue()) {
132-
for (SegmentReplicationShardStats replicaStat : group.getReplicaStats()) {
133-
replicaStat.setCurrentReplicationState(replicaStats.getOrDefault(replicaStat.getAllocationId(), null));
134-
}
135-
}
136-
}
137-
return new SegmentReplicationStatsResponse(totalShards, successfulShards, failedShards, primaryStats, shardFailures);
131+
132+
Map<String, List<SegmentReplicationPerGroupStats>> replicationStats = primaryStats.entrySet()
133+
.stream()
134+
.collect(
135+
Collectors.toMap(
136+
Map.Entry::getKey,
137+
entry -> entry.getValue()
138+
.stream()
139+
.map(groupStats -> updateGroupStats(groupStats, replicaStats))
140+
.collect(Collectors.toList())
141+
)
142+
);
143+
144+
return new SegmentReplicationStatsResponse(totalShards, successfulShards, failedShards, replicationStats, shardFailures);
138145
}
139146

140147
@Override
@@ -144,9 +151,8 @@ protected SegmentReplicationStatsRequest readRequestFrom(StreamInput in) throws
144151

145152
@Override
146153
protected SegmentReplicationShardStatsResponse shardOperation(SegmentReplicationStatsRequest request, ShardRouting shardRouting) {
147-
IndexService indexService = indicesService.indexServiceSafe(shardRouting.shardId().getIndex());
148-
IndexShard indexShard = indexService.getShard(shardRouting.shardId().id());
149154
ShardId shardId = shardRouting.shardId();
155+
IndexShard indexShard = indicesService.indexServiceSafe(shardId.getIndex()).getShard(shardId.id());
150156

151157
if (indexShard.indexSettings().isSegRepEnabledOrRemoteNode() == false) {
152158
return null;
@@ -156,11 +162,7 @@ protected SegmentReplicationShardStatsResponse shardOperation(SegmentReplication
156162
return new SegmentReplicationShardStatsResponse(pressureService.getStatsForShard(indexShard));
157163
}
158164

159-
// return information about only on-going segment replication events.
160-
if (request.activeOnly()) {
161-
return new SegmentReplicationShardStatsResponse(targetService.getOngoingEventSegmentReplicationState(shardId));
162-
}
163-
return new SegmentReplicationShardStatsResponse(targetService.getSegmentReplicationState(shardId));
165+
return new SegmentReplicationShardStatsResponse(getSegmentReplicationState(shardId, request.activeOnly()));
164166
}
165167

166168
@Override
@@ -181,4 +183,83 @@ protected ClusterBlockException checkRequestBlock(
181183
) {
182184
return state.blocks().indicesBlockedException(ClusterBlockLevel.METADATA_READ, concreteIndices);
183185
}
186+
187+
private SegmentReplicationPerGroupStats updateGroupStats(
188+
SegmentReplicationPerGroupStats groupStats,
189+
Map<String, SegmentReplicationState> replicaStats
190+
) {
191+
// Update the SegmentReplicationState for each of the replicas
192+
Set<SegmentReplicationShardStats> updatedReplicaStats = groupStats.getReplicaStats()
193+
.stream()
194+
.peek(replicaStat -> replicaStat.setCurrentReplicationState(replicaStats.getOrDefault(replicaStat.getAllocationId(), null)))
195+
.collect(Collectors.toSet());
196+
197+
// Compute search replica stats
198+
Set<SegmentReplicationShardStats> searchReplicaStats = computeSearchReplicaStats(groupStats.getShardId(), replicaStats);
199+
200+
// Combine ReplicaStats and SearchReplicaStats
201+
Set<SegmentReplicationShardStats> combinedStats = Stream.concat(updatedReplicaStats.stream(), searchReplicaStats.stream())
202+
.collect(Collectors.toSet());
203+
204+
return new SegmentReplicationPerGroupStats(groupStats.getShardId(), combinedStats, groupStats.getRejectedRequestCount());
205+
}
206+
207+
private Set<SegmentReplicationShardStats> computeSearchReplicaStats(
208+
ShardId shardId,
209+
Map<String, SegmentReplicationState> replicaStats
210+
) {
211+
return replicaStats.values()
212+
.stream()
213+
.filter(segmentReplicationState -> segmentReplicationState.getShardRouting().shardId().equals(shardId))
214+
.filter(segmentReplicationState -> segmentReplicationState.getShardRouting().isSearchOnly())
215+
.map(segmentReplicationState -> {
216+
ShardRouting shardRouting = segmentReplicationState.getShardRouting();
217+
SegmentReplicationShardStats segmentReplicationStats = computeSegmentReplicationShardStats(shardRouting);
218+
segmentReplicationStats.setCurrentReplicationState(segmentReplicationState);
219+
return segmentReplicationStats;
220+
})
221+
.collect(Collectors.toSet());
222+
}
223+
224+
SegmentReplicationShardStats computeSegmentReplicationShardStats(ShardRouting shardRouting) {
225+
ShardId shardId = shardRouting.shardId();
226+
SegmentReplicationState completedSegmentReplicationState = targetService.getlatestCompletedEventSegmentReplicationState(shardId);
227+
SegmentReplicationState ongoingSegmentReplicationState = targetService.getOngoingEventSegmentReplicationState(shardId);
228+
229+
return new SegmentReplicationShardStats(
230+
shardRouting.allocationId().getId(),
231+
0,
232+
calculateBytesRemainingToReplicate(ongoingSegmentReplicationState),
233+
0,
234+
getCurrentReplicationLag(ongoingSegmentReplicationState),
235+
getLastCompletedReplicationLag(completedSegmentReplicationState)
236+
);
237+
}
238+
239+
private SegmentReplicationState getSegmentReplicationState(ShardId shardId, boolean isActiveOnly) {
240+
if (isActiveOnly) {
241+
return targetService.getOngoingEventSegmentReplicationState(shardId);
242+
} else {
243+
return targetService.getSegmentReplicationState(shardId);
244+
}
245+
}
246+
247+
private long calculateBytesRemainingToReplicate(SegmentReplicationState ongoingSegmentReplicationState) {
248+
if (ongoingSegmentReplicationState == null) {
249+
return 0;
250+
}
251+
return ongoingSegmentReplicationState.getIndex()
252+
.fileDetails()
253+
.stream()
254+
.mapToLong(index -> index.length() - index.recovered())
255+
.sum();
256+
}
257+
258+
private long getCurrentReplicationLag(SegmentReplicationState ongoingSegmentReplicationState) {
259+
return ongoingSegmentReplicationState != null ? ongoingSegmentReplicationState.getTimer().time() : 0;
260+
}
261+
262+
private long getLastCompletedReplicationLag(SegmentReplicationState completedSegmentReplicationState) {
263+
return completedSegmentReplicationState != null ? completedSegmentReplicationState.getTimer().time() : 0;
264+
}
184265
}

0 commit comments

Comments
 (0)