Skip to content

Commit c0f63b5

Browse files
author
Jay Deng
committed
Add metrics for repository interactions to RemoteIndexBuildStrategy
1 parent 5873add commit c0f63b5

File tree

4 files changed

+252
-15
lines changed

4 files changed

+252
-15
lines changed

src/main/java/org/opensearch/knn/index/codec/nativeindex/remote/RemoteIndexBuildStrategy.java

+86-8
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import org.opensearch.knn.index.KNNSettings;
1515
import org.opensearch.knn.index.codec.nativeindex.NativeIndexBuildStrategy;
1616
import org.opensearch.knn.index.codec.nativeindex.model.BuildIndexParams;
17+
import org.opensearch.knn.index.vectorvalues.KNNVectorValues;
1718
import org.opensearch.repositories.RepositoriesService;
1819
import org.opensearch.repositories.Repository;
1920
import org.opensearch.repositories.RepositoryMissingException;
@@ -25,6 +26,19 @@
2526
import static org.opensearch.knn.index.KNNSettings.KNN_INDEX_REMOTE_VECTOR_BUILD_SETTING;
2627
import static org.opensearch.knn.index.KNNSettings.KNN_INDEX_REMOTE_VECTOR_BUILD_THRESHOLD_SETTING;
2728
import static org.opensearch.knn.index.KNNSettings.KNN_REMOTE_VECTOR_REPO_SETTING;
29+
import static org.opensearch.knn.index.codec.util.KNNCodecUtil.initializeVectorValues;
30+
import static org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue.BUILD_REQUEST_FAILURE_COUNT;
31+
import static org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue.BUILD_REQUEST_SUCCESS_COUNT;
32+
import static org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue.READ_FAILURE_COUNT;
33+
import static org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue.READ_SUCCESS_COUNT;
34+
import static org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue.READ_TIME;
35+
import static org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue.REMOTE_INDEX_BUILD_CURRENT_OPERATIONS;
36+
import static org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue.REMOTE_INDEX_BUILD_CURRENT_SIZE;
37+
import static org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue.REMOTE_INDEX_BUILD_TIME;
38+
import static org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue.WAITING_TIME;
39+
import static org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue.WRITE_FAILURE_COUNT;
40+
import static org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue.WRITE_SUCCESS_COUNT;
41+
import static org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue.WRITE_TIME;
2842

2943
/**
3044
* This class orchestrates building vector indices. It handles uploading data to a repository, submitting a remote
@@ -108,11 +122,17 @@ public static boolean shouldBuildIndexRemotely(IndexSettings indexSettings, long
108122
*/
109123
@Override
110124
public void buildAndWriteIndex(BuildIndexParams indexInfo) throws IOException {
125+
StopWatch remoteBuildTimeStopwatch = new StopWatch();
126+
KNNVectorValues<?> knnVectorValues = indexInfo.getKnnVectorValuesSupplier().get();
127+
initializeVectorValues(knnVectorValues);
128+
startRemoteIndexBuildStats((long) indexInfo.getTotalLiveDocs() * knnVectorValues.bytesPerVector(), remoteBuildTimeStopwatch);
111129
StopWatch stopWatch;
112130
long time_in_millis;
131+
VectorRepositoryAccessor vectorRepositoryAccessor = new DefaultVectorRepositoryAccessor(getRepository(), indexSettings);
132+
133+
// 1. Write required data to repository
134+
stopWatch = new StopWatch().start();
113135
try {
114-
VectorRepositoryAccessor vectorRepositoryAccessor = new DefaultVectorRepositoryAccessor(getRepository(), indexSettings);
115-
stopWatch = new StopWatch().start();
116136
// We create a new time based UUID per file in order to avoid conflicts across shards. It is also very difficult to get the
117137
// shard id in this context.
118138
String blobName = UUIDs.base64UUID() + "_" + indexInfo.getFieldName() + "_" + indexInfo.getSegmentWriteState().segmentInfo.name;
@@ -123,27 +143,61 @@ public void buildAndWriteIndex(BuildIndexParams indexInfo) throws IOException {
123143
indexInfo.getKnnVectorValuesSupplier()
124144
);
125145
time_in_millis = stopWatch.stop().totalTime().millis();
146+
WRITE_SUCCESS_COUNT.increment();
147+
WRITE_TIME.incrementBy(time_in_millis);
126148
log.debug("Repository write took {} ms for vector field [{}]", time_in_millis, indexInfo.getFieldName());
149+
} catch (Exception e) {
150+
time_in_millis = stopWatch.stop().totalTime().millis();
151+
WRITE_FAILURE_COUNT.increment();
152+
log.error("Repository write failed after {} ms for vector field [{}]", time_in_millis, indexInfo.getFieldName(), e);
153+
handleFailure(indexInfo, knnVectorValues.bytesPerVector(), remoteBuildTimeStopwatch);
154+
return;
155+
}
127156

128-
stopWatch = new StopWatch().start();
157+
// 2. Triggers index build
158+
stopWatch = new StopWatch().start();
159+
try {
129160
submitVectorBuild();
130161
time_in_millis = stopWatch.stop().totalTime().millis();
162+
BUILD_REQUEST_SUCCESS_COUNT.increment();
131163
log.debug("Submit vector build took {} ms for vector field [{}]", time_in_millis, indexInfo.getFieldName());
164+
} catch (Exception e) {
165+
BUILD_REQUEST_FAILURE_COUNT.increment();
166+
log.error("Submit vector failed after {} ms for vector field [{}]", time_in_millis, indexInfo.getFieldName(), e);
167+
handleFailure(indexInfo, knnVectorValues.bytesPerVector(), remoteBuildTimeStopwatch);
168+
return;
169+
}
132170

133-
stopWatch = new StopWatch().start();
171+
// 3. Awaits on vector build to complete
172+
stopWatch = new StopWatch().start();
173+
try {
134174
awaitVectorBuild();
135175
time_in_millis = stopWatch.stop().totalTime().millis();
176+
WAITING_TIME.incrementBy(time_in_millis);
136177
log.debug("Await vector build took {} ms for vector field [{}]", time_in_millis, indexInfo.getFieldName());
178+
} catch (Exception e) {
179+
log.debug("Await vector build failed after {} ms for vector field [{}]", time_in_millis, indexInfo.getFieldName());
180+
handleFailure(indexInfo, knnVectorValues.bytesPerVector(), remoteBuildTimeStopwatch);
181+
return;
182+
}
137183

138-
stopWatch = new StopWatch().start();
184+
// 4. Downloads index file and writes to indexOutput
185+
stopWatch = new StopWatch().start();
186+
try {
139187
vectorRepositoryAccessor.readFromRepository();
140188
time_in_millis = stopWatch.stop().totalTime().millis();
189+
READ_SUCCESS_COUNT.increment();
190+
READ_TIME.incrementBy(time_in_millis);
141191
log.debug("Repository read took {} ms for vector field [{}]", time_in_millis, indexInfo.getFieldName());
142192
} catch (Exception e) {
143-
// TODO: This needs more robust failure handling
144-
log.warn("Failed to build index remotely", e);
145-
fallbackStrategy.buildAndWriteIndex(indexInfo);
193+
time_in_millis = stopWatch.stop().totalTime().millis();
194+
READ_FAILURE_COUNT.increment();
195+
log.error("Repository read failed after {} ms for vector field [{}]", time_in_millis, indexInfo.getFieldName(), e);
196+
handleFailure(indexInfo, knnVectorValues.bytesPerVector(), remoteBuildTimeStopwatch);
197+
return;
146198
}
199+
200+
endRemoteIndexBuildStats((long) indexInfo.getTotalLiveDocs() * knnVectorValues.bytesPerVector(), stopWatch);
147201
}
148202

149203
/**
@@ -178,4 +232,28 @@ private void submitVectorBuild() {
178232
private void awaitVectorBuild() {
179233
throw new NotImplementedException();
180234
}
235+
236+
private void startRemoteIndexBuildStats(long size, StopWatch stopWatch) {
237+
stopWatch.start();
238+
REMOTE_INDEX_BUILD_CURRENT_OPERATIONS.increment();
239+
REMOTE_INDEX_BUILD_CURRENT_SIZE.incrementBy(size);
240+
}
241+
242+
private void endRemoteIndexBuildStats(long size, StopWatch stopWatch) {
243+
long time_in_millis = stopWatch.stop().totalTime().millis();
244+
REMOTE_INDEX_BUILD_CURRENT_OPERATIONS.decrement();
245+
REMOTE_INDEX_BUILD_CURRENT_SIZE.incrementBy(size);
246+
REMOTE_INDEX_BUILD_TIME.incrementBy(time_in_millis);
247+
}
248+
249+
/**
250+
* Helper method to collect remote index build metrics on failure and invoke fallback strategy
251+
* @param indexParams
252+
* @param bytesPerVector
253+
* @throws IOException
254+
*/
255+
private void handleFailure(BuildIndexParams indexParams, long bytesPerVector, StopWatch stopWatch) throws IOException {
256+
endRemoteIndexBuildStats(indexParams.getTotalLiveDocs() * bytesPerVector, stopWatch);
257+
fallbackStrategy.buildAndWriteIndex(indexParams);
258+
}
181259
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.knn.plugin.stats;
7+
8+
import lombok.Getter;
9+
10+
import java.util.concurrent.atomic.LongAdder;
11+
12+
public enum KNNRemoteIndexBuildValue {
13+
14+
// Repository Accumulating Stats
15+
WRITE_SUCCESS_COUNT("write_success_count"),
16+
WRITE_FAILURE_COUNT("write_failure_count"),
17+
WRITE_TIME("write_time_in_millis"),
18+
READ_SUCCESS_COUNT("read_success_count"),
19+
READ_FAILURE_COUNT("read_failure_count"),
20+
READ_TIME("read_time_in_millis"),
21+
22+
// Remote Index Build PIT Stats
23+
REMOTE_INDEX_BUILD_CURRENT_OPERATIONS("remote_index_build_current_operations"),
24+
REMOTE_INDEX_BUILD_CURRENT_SIZE("remote_index_build_current_size"),
25+
REMOTE_INDEX_BUILD_TIME("remote_index_build_time_in_millis"),
26+
27+
// Client Stats
28+
BUILD_REQUEST_SUCCESS_COUNT("build_request_success_count"),
29+
BUILD_REQUEST_FAILURE_COUNT("build_request_failure_count"),
30+
STATUS_REQUEST_SUCCESS_COUNT("status_request_success_count"),
31+
STATUS_REQUEST_FAILURE_COUNT("status_request_failure_count"),
32+
INDEX_BUILD_SUCCESS_COUNT("index_build_success_count"),
33+
INDEX_BUILD_FAILURE_COUNT("index_build_failure_count"),
34+
WAITING_TIME("waiting_time_in_ms");
35+
36+
@Getter
37+
private final String name;
38+
private final LongAdder value;
39+
40+
/**
41+
* Constructor
42+
*
43+
* @param name name of the graph value
44+
*/
45+
KNNRemoteIndexBuildValue(String name) {
46+
this.name = name;
47+
this.value = new LongAdder();
48+
}
49+
50+
/**
51+
* Get the graph value
52+
*
53+
* @return value
54+
*/
55+
public Long getValue() {
56+
return value.longValue();
57+
}
58+
59+
/**
60+
* Increment the graph value
61+
*/
62+
public void increment() {
63+
value.increment();
64+
}
65+
66+
/**
67+
* Decrement the graph value
68+
*/
69+
public void decrement() {
70+
value.decrement();
71+
}
72+
73+
/**
74+
* Increment the graph value by a specified amount
75+
*
76+
* @param delta The amount to increment
77+
*/
78+
public void incrementBy(long delta) {
79+
value.add(delta);
80+
}
81+
82+
/**
83+
* Decrement the graph value by a specified amount
84+
*
85+
* @param delta The amount to decrement
86+
*/
87+
public void decrementBy(long delta) {
88+
value.add(delta * -1);
89+
}
90+
}

src/main/java/org/opensearch/knn/plugin/stats/KNNStats.java

+72-7
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@
88
import com.google.common.cache.CacheStats;
99
import com.google.common.collect.ImmutableMap;
1010
import org.opensearch.knn.common.KNNConstants;
11-
import org.opensearch.knn.index.memory.NativeMemoryCacheManager;
11+
import org.opensearch.knn.common.featureflags.KNNFeatureFlags;
1212
import org.opensearch.knn.index.engine.KNNEngine;
13+
import org.opensearch.knn.index.memory.NativeMemoryCacheManager;
1314
import org.opensearch.knn.indices.ModelCache;
1415
import org.opensearch.knn.indices.ModelDao;
1516
import org.opensearch.knn.plugin.stats.suppliers.EventOccurredWithinThresholdSupplier;
@@ -80,12 +81,15 @@ private Map<String, KNNStat<?>> getClusterOrNodeStats(Boolean getClusterStats) {
8081

8182
private Map<String, KNNStat<?>> buildStatsMap() {
8283
ImmutableMap.Builder<String, KNNStat<?>> builder = ImmutableMap.<String, KNNStat<?>>builder();
83-
addQueryStats(builder);
84-
addNativeMemoryStats(builder);
85-
addEngineStats(builder);
86-
addScriptStats(builder);
87-
addModelStats(builder);
88-
addGraphStats(builder);
84+
// addQueryStats(builder);
85+
// addNativeMemoryStats(builder);
86+
// addEngineStats(builder);
87+
// addScriptStats(builder);
88+
// addModelStats(builder);
89+
// addGraphStats(builder);
90+
// if (KNNFeatureFlags.isKNNRemoteVectorBuildEnabled()) {
91+
addRemoteIndexBuildStats(builder);
92+
// }
8993
return builder.build();
9094
}
9195

@@ -218,4 +222,65 @@ private Map<String, Map<String, Object>> createGraphStatsMap() {
218222
graphStatsMap.put(StatNames.REFRESH.getName(), refreshMap);
219223
return graphStatsMap;
220224
}
225+
226+
private void addRemoteIndexBuildStats(ImmutableMap.Builder<String, KNNStat<?>> builder) {
227+
builder.put(StatNames.REMOTE_VECTOR_INDEX_BUILD_STATS.getName(), new KNNStat<>(false, this::createRemoteIndexStatsMap));
228+
}
229+
230+
private Map<String, Map<String, Object>> createRemoteIndexStatsMap() {
231+
Map<String, Object> clientStatsMap = new HashMap<>();
232+
clientStatsMap.put(
233+
KNNRemoteIndexBuildValue.BUILD_REQUEST_SUCCESS_COUNT.getName(),
234+
KNNRemoteIndexBuildValue.BUILD_REQUEST_SUCCESS_COUNT.getValue()
235+
);
236+
clientStatsMap.put(
237+
KNNRemoteIndexBuildValue.BUILD_REQUEST_FAILURE_COUNT.getName(),
238+
KNNRemoteIndexBuildValue.BUILD_REQUEST_FAILURE_COUNT.getValue()
239+
);
240+
clientStatsMap.put(
241+
KNNRemoteIndexBuildValue.STATUS_REQUEST_SUCCESS_COUNT.getName(),
242+
KNNRemoteIndexBuildValue.STATUS_REQUEST_SUCCESS_COUNT.getValue()
243+
);
244+
clientStatsMap.put(
245+
KNNRemoteIndexBuildValue.STATUS_REQUEST_FAILURE_COUNT.getName(),
246+
KNNRemoteIndexBuildValue.STATUS_REQUEST_FAILURE_COUNT.getValue()
247+
);
248+
clientStatsMap.put(
249+
KNNRemoteIndexBuildValue.INDEX_BUILD_SUCCESS_COUNT.getName(),
250+
KNNRemoteIndexBuildValue.INDEX_BUILD_SUCCESS_COUNT.getValue()
251+
);
252+
clientStatsMap.put(
253+
KNNRemoteIndexBuildValue.INDEX_BUILD_FAILURE_COUNT.getName(),
254+
KNNRemoteIndexBuildValue.INDEX_BUILD_FAILURE_COUNT.getValue()
255+
);
256+
clientStatsMap.put(KNNRemoteIndexBuildValue.WAITING_TIME.getName(), KNNRemoteIndexBuildValue.WAITING_TIME.getValue());
257+
258+
Map<String, Object> repoStatsMap = new HashMap<>();
259+
repoStatsMap.put(KNNRemoteIndexBuildValue.WRITE_SUCCESS_COUNT.getName(), KNNRemoteIndexBuildValue.WRITE_SUCCESS_COUNT.getValue());
260+
repoStatsMap.put(KNNRemoteIndexBuildValue.WRITE_FAILURE_COUNT.getName(), KNNRemoteIndexBuildValue.WRITE_FAILURE_COUNT.getValue());
261+
repoStatsMap.put(KNNRemoteIndexBuildValue.WRITE_TIME.getName(), KNNRemoteIndexBuildValue.WRITE_TIME.getValue());
262+
repoStatsMap.put(KNNRemoteIndexBuildValue.READ_SUCCESS_COUNT.getName(), KNNRemoteIndexBuildValue.READ_SUCCESS_COUNT.getValue());
263+
repoStatsMap.put(KNNRemoteIndexBuildValue.READ_FAILURE_COUNT.getName(), KNNRemoteIndexBuildValue.READ_FAILURE_COUNT.getValue());
264+
repoStatsMap.put(KNNRemoteIndexBuildValue.READ_TIME.getName(), KNNRemoteIndexBuildValue.READ_TIME.getValue());
265+
266+
Map<String, Object> buildStatsMap = new HashMap<>();
267+
buildStatsMap.put(
268+
KNNRemoteIndexBuildValue.REMOTE_INDEX_BUILD_CURRENT_OPERATIONS.getName(),
269+
KNNRemoteIndexBuildValue.REMOTE_INDEX_BUILD_CURRENT_OPERATIONS.getValue()
270+
);
271+
buildStatsMap.put(
272+
KNNRemoteIndexBuildValue.REMOTE_INDEX_BUILD_CURRENT_SIZE.getName(),
273+
KNNRemoteIndexBuildValue.REMOTE_INDEX_BUILD_CURRENT_SIZE.getValue()
274+
);
275+
buildStatsMap.put(
276+
KNNRemoteIndexBuildValue.REMOTE_INDEX_BUILD_TIME.getName(),
277+
KNNRemoteIndexBuildValue.REMOTE_INDEX_BUILD_TIME.getValue()
278+
);
279+
280+
Map<String, Map<String, Object>> remoteIndexBuildStatsMap = new HashMap<>();
281+
remoteIndexBuildStatsMap.put(StatNames.BUILD_STATS.getName(), buildStatsMap);
282+
remoteIndexBuildStatsMap.put(StatNames.CLIENT_STATS.getName(), clientStatsMap);
283+
remoteIndexBuildStatsMap.put(StatNames.REPOSITORY_STATS.getName(), repoStatsMap);
284+
return remoteIndexBuildStatsMap;
285+
}
221286
}

src/main/java/org/opensearch/knn/plugin/stats/StatNames.java

+4
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ public enum StatNames {
4545
GRAPH_STATS("graph_stats"),
4646
REFRESH("refresh"),
4747
MERGE("merge"),
48+
REMOTE_VECTOR_INDEX_BUILD_STATS("remote_vector_index_build_stats"),
49+
CLIENT_STATS("client_stats"),
50+
REPOSITORY_STATS("repository_stats"),
51+
BUILD_STATS("build_stats"),
4852
MIN_SCORE_QUERY_REQUESTS(KNNCounter.MIN_SCORE_QUERY_REQUESTS.getName()),
4953
MIN_SCORE_QUERY_WITH_FILTER_REQUESTS(KNNCounter.MIN_SCORE_QUERY_WITH_FILTER_REQUESTS.getName()),
5054
MAX_DISTANCE_QUERY_REQUESTS(KNNCounter.MAX_DISTANCE_QUERY_REQUESTS.getName()),

0 commit comments

Comments
 (0)