Skip to content

Commit 10674b2

Browse files
jed326Jay Deng
authored and
Jay Deng
committed
Add metrics for repository interactions to RemoteIndexBuildStrategy
Signed-off-by: Jay Deng <jayd0104@gmail.com>
1 parent 5873add commit 10674b2

File tree

6 files changed

+265
-10
lines changed

6 files changed

+265
-10
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
88
### Features
99
* [Remote Vector Index Build] Introduce Remote Native Index Build feature flag, settings, and initial skeleton [#2525](https://github.com/opensearch-project/k-NN/pull/2525)
1010
* [Remote Vector Index Build] Implement vector data upload and vector data size threshold setting [#2550](https://github.com/opensearch-project/k-NN/pull/2550)
11+
* [Remote Vector Index Build] Add metrics for repository interactions to RemoteIndexBuildStrategy [#2566](https://github.com/opensearch-project/k-NN/pull/2566)
1112
### Enhancements
1213
* Introduce node level circuit breakers for k-NN [#2509](https://github.com/opensearch-project/k-NN/pull/2509)
1314
### Bug Fixes

src/main/java/org/opensearch/knn/index/codec/nativeindex/remote/RemoteIndexBuildStrategy.java

+89-8
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import org.opensearch.knn.index.KNNSettings;
1515
import org.opensearch.knn.index.codec.nativeindex.NativeIndexBuildStrategy;
1616
import org.opensearch.knn.index.codec.nativeindex.model.BuildIndexParams;
17+
import org.opensearch.knn.index.vectorvalues.KNNVectorValues;
1718
import org.opensearch.repositories.RepositoriesService;
1819
import org.opensearch.repositories.Repository;
1920
import org.opensearch.repositories.RepositoryMissingException;
@@ -25,6 +26,19 @@
2526
import static org.opensearch.knn.index.KNNSettings.KNN_INDEX_REMOTE_VECTOR_BUILD_SETTING;
2627
import static org.opensearch.knn.index.KNNSettings.KNN_INDEX_REMOTE_VECTOR_BUILD_THRESHOLD_SETTING;
2728
import static org.opensearch.knn.index.KNNSettings.KNN_REMOTE_VECTOR_REPO_SETTING;
29+
import static org.opensearch.knn.index.codec.util.KNNCodecUtil.initializeVectorValues;
30+
import static org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue.BUILD_REQUEST_FAILURE_COUNT;
31+
import static org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue.BUILD_REQUEST_SUCCESS_COUNT;
32+
import static org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue.READ_FAILURE_COUNT;
33+
import static org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue.READ_SUCCESS_COUNT;
34+
import static org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue.READ_TIME;
35+
import static org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue.REMOTE_INDEX_BUILD_CURRENT_OPERATIONS;
36+
import static org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue.REMOTE_INDEX_BUILD_CURRENT_SIZE;
37+
import static org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue.REMOTE_INDEX_BUILD_TIME;
38+
import static org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue.WAITING_TIME;
39+
import static org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue.WRITE_FAILURE_COUNT;
40+
import static org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue.WRITE_SUCCESS_COUNT;
41+
import static org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue.WRITE_TIME;
2842

2943
/**
3044
* This class orchestrates building vector indices. It handles uploading data to a repository, submitting a remote
@@ -110,9 +124,17 @@ public static boolean shouldBuildIndexRemotely(IndexSettings indexSettings, long
110124
public void buildAndWriteIndex(BuildIndexParams indexInfo) throws IOException {
111125
StopWatch stopWatch;
112126
long time_in_millis;
127+
final VectorRepositoryAccessor vectorRepositoryAccessor;
128+
129+
StopWatch remoteBuildTimeStopwatch = new StopWatch();
130+
KNNVectorValues<?> knnVectorValues = indexInfo.getKnnVectorValuesSupplier().get();
131+
initializeVectorValues(knnVectorValues);
132+
startRemoteIndexBuildStats((long) indexInfo.getTotalLiveDocs() * knnVectorValues.bytesPerVector(), remoteBuildTimeStopwatch);
133+
134+
// 1. Write required data to repository
135+
stopWatch = new StopWatch().start();
113136
try {
114-
VectorRepositoryAccessor vectorRepositoryAccessor = new DefaultVectorRepositoryAccessor(getRepository(), indexSettings);
115-
stopWatch = new StopWatch().start();
137+
vectorRepositoryAccessor = new DefaultVectorRepositoryAccessor(getRepository(), indexSettings);
116138
// We create a new time based UUID per file in order to avoid conflicts across shards. It is also very difficult to get the
117139
// shard id in this context.
118140
String blobName = UUIDs.base64UUID() + "_" + indexInfo.getFieldName() + "_" + indexInfo.getSegmentWriteState().segmentInfo.name;
@@ -123,27 +145,62 @@ public void buildAndWriteIndex(BuildIndexParams indexInfo) throws IOException {
123145
indexInfo.getKnnVectorValuesSupplier()
124146
);
125147
time_in_millis = stopWatch.stop().totalTime().millis();
148+
WRITE_SUCCESS_COUNT.increment();
149+
WRITE_TIME.incrementBy(time_in_millis);
126150
log.debug("Repository write took {} ms for vector field [{}]", time_in_millis, indexInfo.getFieldName());
151+
} catch (Exception e) {
152+
time_in_millis = stopWatch.stop().totalTime().millis();
153+
WRITE_FAILURE_COUNT.increment();
154+
log.error("Repository write failed after {} ms for vector field [{}]", time_in_millis, indexInfo.getFieldName(), e);
155+
handleFailure(indexInfo, knnVectorValues.bytesPerVector(), remoteBuildTimeStopwatch);
156+
return;
157+
}
127158

128-
stopWatch = new StopWatch().start();
159+
// 2. Triggers index build
160+
stopWatch = new StopWatch().start();
161+
try {
129162
submitVectorBuild();
130163
time_in_millis = stopWatch.stop().totalTime().millis();
164+
BUILD_REQUEST_SUCCESS_COUNT.increment();
131165
log.debug("Submit vector build took {} ms for vector field [{}]", time_in_millis, indexInfo.getFieldName());
166+
} catch (Exception e) {
167+
BUILD_REQUEST_FAILURE_COUNT.increment();
168+
log.error("Submit vector failed after {} ms for vector field [{}]", time_in_millis, indexInfo.getFieldName(), e);
169+
handleFailure(indexInfo, knnVectorValues.bytesPerVector(), remoteBuildTimeStopwatch);
170+
return;
171+
}
132172

133-
stopWatch = new StopWatch().start();
173+
// 3. Awaits on vector build to complete
174+
stopWatch = new StopWatch().start();
175+
try {
134176
awaitVectorBuild();
135177
time_in_millis = stopWatch.stop().totalTime().millis();
178+
WAITING_TIME.incrementBy(time_in_millis);
136179
log.debug("Await vector build took {} ms for vector field [{}]", time_in_millis, indexInfo.getFieldName());
180+
} catch (Exception e) {
181+
log.debug("Await vector build failed after {} ms for vector field [{}]", time_in_millis, indexInfo.getFieldName());
182+
handleFailure(indexInfo, knnVectorValues.bytesPerVector(), remoteBuildTimeStopwatch);
183+
return;
184+
}
137185

138-
stopWatch = new StopWatch().start();
186+
// 4. Downloads index file and writes to indexOutput
187+
stopWatch = new StopWatch().start();
188+
try {
189+
assert vectorRepositoryAccessor != null;
139190
vectorRepositoryAccessor.readFromRepository();
140191
time_in_millis = stopWatch.stop().totalTime().millis();
192+
READ_SUCCESS_COUNT.increment();
193+
READ_TIME.incrementBy(time_in_millis);
141194
log.debug("Repository read took {} ms for vector field [{}]", time_in_millis, indexInfo.getFieldName());
142195
} catch (Exception e) {
143-
// TODO: This needs more robust failure handling
144-
log.warn("Failed to build index remotely", e);
145-
fallbackStrategy.buildAndWriteIndex(indexInfo);
196+
time_in_millis = stopWatch.stop().totalTime().millis();
197+
READ_FAILURE_COUNT.increment();
198+
log.error("Repository read failed after {} ms for vector field [{}]", time_in_millis, indexInfo.getFieldName(), e);
199+
handleFailure(indexInfo, knnVectorValues.bytesPerVector(), remoteBuildTimeStopwatch);
200+
return;
146201
}
202+
203+
// Close out the overall build timer that startRemoteIndexBuildStats started at the top of this method.
// The per-step stopWatch must not be used here: it was already stopped after the repository read and
// only covers that single step.
endRemoteIndexBuildStats((long) indexInfo.getTotalLiveDocs() * knnVectorValues.bytesPerVector(), remoteBuildTimeStopwatch);
147204
}
148205

149206
/**
@@ -178,4 +235,28 @@ private void submitVectorBuild() {
178235
private void awaitVectorBuild() {
179236
throw new NotImplementedException();
180237
}
238+
239+
private void startRemoteIndexBuildStats(long size, StopWatch stopWatch) {
240+
stopWatch.start();
241+
REMOTE_INDEX_BUILD_CURRENT_OPERATIONS.increment();
242+
REMOTE_INDEX_BUILD_CURRENT_SIZE.incrementBy(size);
243+
}
244+
245+
private void endRemoteIndexBuildStats(long size, StopWatch stopWatch) {
246+
long time_in_millis = stopWatch.stop().totalTime().millis();
247+
REMOTE_INDEX_BUILD_CURRENT_OPERATIONS.decrement();
248+
REMOTE_INDEX_BUILD_CURRENT_SIZE.decrementBy(size);
249+
REMOTE_INDEX_BUILD_TIME.incrementBy(time_in_millis);
250+
}
251+
252+
/**
253+
* Helper method to collect remote index build metrics on failure and invoke fallback strategy
254+
* @param indexParams
255+
* @param bytesPerVector
256+
* @throws IOException
257+
*/
258+
private void handleFailure(BuildIndexParams indexParams, long bytesPerVector, StopWatch stopWatch) throws IOException {
259+
endRemoteIndexBuildStats(indexParams.getTotalLiveDocs() * bytesPerVector, stopWatch);
260+
fallbackStrategy.buildAndWriteIndex(indexParams);
261+
}
181262
}

src/main/java/org/opensearch/knn/plugin/stats/KNNRemoteIndexBuildValue.java

+90
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.knn.plugin.stats;
7+
8+
import lombok.Getter;
9+
10+
import java.util.concurrent.atomic.LongAdder;
11+
12+
public enum KNNRemoteIndexBuildValue {
13+
14+
// Repository Accumulating Stats
15+
WRITE_SUCCESS_COUNT("write_success_count"),
16+
WRITE_FAILURE_COUNT("write_failure_count"),
17+
WRITE_TIME("successful_write_time_in_millis"),
18+
READ_SUCCESS_COUNT("read_success_count"),
19+
READ_FAILURE_COUNT("read_failure_count"),
20+
READ_TIME("successful_read_time_in_millis"),
21+
22+
// Remote Index Build Stats
23+
REMOTE_INDEX_BUILD_CURRENT_OPERATIONS("remote_index_build_current_operations"),
24+
REMOTE_INDEX_BUILD_CURRENT_SIZE("remote_index_build_current_size"),
25+
REMOTE_INDEX_BUILD_TIME("remote_index_build_time_in_millis"),
26+
27+
// Client Stats
28+
BUILD_REQUEST_SUCCESS_COUNT("build_request_success_count"),
29+
BUILD_REQUEST_FAILURE_COUNT("build_request_failure_count"),
30+
STATUS_REQUEST_SUCCESS_COUNT("status_request_success_count"),
31+
STATUS_REQUEST_FAILURE_COUNT("status_request_failure_count"),
32+
INDEX_BUILD_SUCCESS_COUNT("index_build_success_count"),
33+
INDEX_BUILD_FAILURE_COUNT("index_build_failure_count"),
34+
WAITING_TIME("waiting_time_in_ms");
35+
36+
@Getter
37+
private final String name;
38+
private final LongAdder value;
39+
40+
/**
41+
* Constructor
42+
*
43+
* @param name name of the graph value
44+
*/
45+
KNNRemoteIndexBuildValue(String name) {
46+
this.name = name;
47+
this.value = new LongAdder();
48+
}
49+
50+
/**
51+
* Get the graph value
52+
*
53+
* @return value
54+
*/
55+
public Long getValue() {
56+
return value.longValue();
57+
}
58+
59+
/**
60+
* Increment the graph value
61+
*/
62+
public void increment() {
63+
value.increment();
64+
}
65+
66+
/**
67+
* Decrement the graph value
68+
*/
69+
public void decrement() {
70+
value.decrement();
71+
}
72+
73+
/**
74+
* Increment the graph value by a specified amount
75+
*
76+
* @param delta The amount to increment
77+
*/
78+
public void incrementBy(long delta) {
79+
value.add(delta);
80+
}
81+
82+
/**
83+
* Decrement the graph value by a specified amount
84+
*
85+
* @param delta The amount to decrement
86+
*/
87+
public void decrementBy(long delta) {
88+
value.add(delta * -1);
89+
}
90+
}

src/main/java/org/opensearch/knn/plugin/stats/KNNStats.java

+73-2
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@
88
import com.google.common.cache.CacheStats;
99
import com.google.common.collect.ImmutableMap;
1010
import org.opensearch.knn.common.KNNConstants;
11-
import org.opensearch.knn.index.memory.NativeMemoryCacheManager;
11+
import org.opensearch.knn.common.featureflags.KNNFeatureFlags;
1212
import org.opensearch.knn.index.engine.KNNEngine;
13+
import org.opensearch.knn.index.memory.NativeMemoryCacheManager;
1314
import org.opensearch.knn.indices.ModelCache;
1415
import org.opensearch.knn.indices.ModelDao;
1516
import org.opensearch.knn.plugin.stats.suppliers.EventOccurredWithinThresholdSupplier;
@@ -24,6 +25,7 @@
2425
import java.time.temporal.ChronoUnit;
2526
import java.util.HashMap;
2627
import java.util.Map;
28+
import java.util.Objects;
2729
import java.util.function.Supplier;
2830

2931
/**
@@ -71,8 +73,15 @@ private Map<String, KNNStat<?>> getClusterOrNodeStats(Boolean getClusterStats) {
7173
Map<String, KNNStat<?>> statsMap = new HashMap<>();
7274

7375
for (Map.Entry<String, KNNStat<?>> entry : knnStats.entrySet()) {
76+
// knnStats is initialized at node bootup, so we need to do feature flag enforcement when retrieving the stats instead
7477
if (entry.getValue().isClusterLevel() == getClusterStats) {
75-
statsMap.put(entry.getKey(), entry.getValue());
78+
if (Objects.equals(entry.getKey(), StatNames.REMOTE_VECTOR_INDEX_BUILD_STATS.getName())) {
79+
if (KNNFeatureFlags.isKNNRemoteVectorBuildEnabled()) {
80+
statsMap.put(entry.getKey(), entry.getValue());
81+
}
82+
} else {
83+
statsMap.put(entry.getKey(), entry.getValue());
84+
}
7685
}
7786
}
7887
return statsMap;
@@ -86,6 +95,7 @@ private Map<String, KNNStat<?>> buildStatsMap() {
8695
addScriptStats(builder);
8796
addModelStats(builder);
8897
addGraphStats(builder);
98+
addRemoteIndexBuildStats(builder);
8999
return builder.build();
90100
}
91101

@@ -218,4 +228,65 @@ private Map<String, Map<String, Object>> createGraphStatsMap() {
218228
graphStatsMap.put(StatNames.REFRESH.getName(), refreshMap);
219229
return graphStatsMap;
220230
}
231+
232+
private void addRemoteIndexBuildStats(ImmutableMap.Builder<String, KNNStat<?>> builder) {
233+
builder.put(StatNames.REMOTE_VECTOR_INDEX_BUILD_STATS.getName(), new KNNStat<>(false, this::createRemoteIndexStatsMap));
234+
}
235+
236+
private Map<String, Map<String, Object>> createRemoteIndexStatsMap() {
237+
Map<String, Object> clientStatsMap = new HashMap<>();
238+
clientStatsMap.put(
239+
KNNRemoteIndexBuildValue.BUILD_REQUEST_SUCCESS_COUNT.getName(),
240+
KNNRemoteIndexBuildValue.BUILD_REQUEST_SUCCESS_COUNT.getValue()
241+
);
242+
clientStatsMap.put(
243+
KNNRemoteIndexBuildValue.BUILD_REQUEST_FAILURE_COUNT.getName(),
244+
KNNRemoteIndexBuildValue.BUILD_REQUEST_FAILURE_COUNT.getValue()
245+
);
246+
clientStatsMap.put(
247+
KNNRemoteIndexBuildValue.STATUS_REQUEST_SUCCESS_COUNT.getName(),
248+
KNNRemoteIndexBuildValue.STATUS_REQUEST_SUCCESS_COUNT.getValue()
249+
);
250+
clientStatsMap.put(
251+
KNNRemoteIndexBuildValue.STATUS_REQUEST_FAILURE_COUNT.getName(),
252+
KNNRemoteIndexBuildValue.STATUS_REQUEST_FAILURE_COUNT.getValue()
253+
);
254+
clientStatsMap.put(
255+
KNNRemoteIndexBuildValue.INDEX_BUILD_SUCCESS_COUNT.getName(),
256+
KNNRemoteIndexBuildValue.INDEX_BUILD_SUCCESS_COUNT.getValue()
257+
);
258+
clientStatsMap.put(
259+
KNNRemoteIndexBuildValue.INDEX_BUILD_FAILURE_COUNT.getName(),
260+
KNNRemoteIndexBuildValue.INDEX_BUILD_FAILURE_COUNT.getValue()
261+
);
262+
clientStatsMap.put(KNNRemoteIndexBuildValue.WAITING_TIME.getName(), KNNRemoteIndexBuildValue.WAITING_TIME.getValue());
263+
264+
Map<String, Object> repoStatsMap = new HashMap<>();
265+
repoStatsMap.put(KNNRemoteIndexBuildValue.WRITE_SUCCESS_COUNT.getName(), KNNRemoteIndexBuildValue.WRITE_SUCCESS_COUNT.getValue());
266+
repoStatsMap.put(KNNRemoteIndexBuildValue.WRITE_FAILURE_COUNT.getName(), KNNRemoteIndexBuildValue.WRITE_FAILURE_COUNT.getValue());
267+
repoStatsMap.put(KNNRemoteIndexBuildValue.WRITE_TIME.getName(), KNNRemoteIndexBuildValue.WRITE_TIME.getValue());
268+
repoStatsMap.put(KNNRemoteIndexBuildValue.READ_SUCCESS_COUNT.getName(), KNNRemoteIndexBuildValue.READ_SUCCESS_COUNT.getValue());
269+
repoStatsMap.put(KNNRemoteIndexBuildValue.READ_FAILURE_COUNT.getName(), KNNRemoteIndexBuildValue.READ_FAILURE_COUNT.getValue());
270+
repoStatsMap.put(KNNRemoteIndexBuildValue.READ_TIME.getName(), KNNRemoteIndexBuildValue.READ_TIME.getValue());
271+
272+
Map<String, Object> buildStatsMap = new HashMap<>();
273+
buildStatsMap.put(
274+
KNNRemoteIndexBuildValue.REMOTE_INDEX_BUILD_CURRENT_OPERATIONS.getName(),
275+
KNNRemoteIndexBuildValue.REMOTE_INDEX_BUILD_CURRENT_OPERATIONS.getValue()
276+
);
277+
buildStatsMap.put(
278+
KNNRemoteIndexBuildValue.REMOTE_INDEX_BUILD_CURRENT_SIZE.getName(),
279+
KNNRemoteIndexBuildValue.REMOTE_INDEX_BUILD_CURRENT_SIZE.getValue()
280+
);
281+
buildStatsMap.put(
282+
KNNRemoteIndexBuildValue.REMOTE_INDEX_BUILD_TIME.getName(),
283+
KNNRemoteIndexBuildValue.REMOTE_INDEX_BUILD_TIME.getValue()
284+
);
285+
286+
Map<String, Map<String, Object>> remoteIndexBuildStatsMap = new HashMap<>();
287+
remoteIndexBuildStatsMap.put(StatNames.BUILD_STATS.getName(), buildStatsMap);
288+
remoteIndexBuildStatsMap.put(StatNames.CLIENT_STATS.getName(), clientStatsMap);
289+
remoteIndexBuildStatsMap.put(StatNames.REPOSITORY_STATS.getName(), repoStatsMap);
290+
return remoteIndexBuildStatsMap;
291+
}
221292
}

src/main/java/org/opensearch/knn/plugin/stats/StatNames.java

+4
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ public enum StatNames {
4545
GRAPH_STATS("graph_stats"),
4646
REFRESH("refresh"),
4747
MERGE("merge"),
48+
REMOTE_VECTOR_INDEX_BUILD_STATS("remote_vector_index_build_stats"),
49+
CLIENT_STATS("client_stats"),
50+
REPOSITORY_STATS("repository_stats"),
51+
BUILD_STATS("build_stats"),
4852
MIN_SCORE_QUERY_REQUESTS(KNNCounter.MIN_SCORE_QUERY_REQUESTS.getName()),
4953
MIN_SCORE_QUERY_WITH_FILTER_REQUESTS(KNNCounter.MIN_SCORE_QUERY_WITH_FILTER_REQUESTS.getName()),
5054
MAX_DISTANCE_QUERY_REQUESTS(KNNCounter.MAX_DISTANCE_QUERY_REQUESTS.getName()),

src/test/java/org/opensearch/knn/index/codec/nativeindex/remote/RemoteIndexBuildStrategyTests.java

+8
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import org.opensearch.core.index.Index;
1212
import org.opensearch.index.IndexSettings;
1313
import org.opensearch.knn.index.KNNSettings;
14+
import org.opensearch.knn.plugin.stats.KNNRemoteIndexBuildValue;
1415
import org.opensearch.repositories.RepositoriesService;
1516
import org.opensearch.repositories.RepositoryMissingException;
1617

@@ -41,6 +42,13 @@ public void testRemoteIndexBuildStrategyFallback() throws IOException {
4142
);
4243
objectUnderTest.buildAndWriteIndex(buildIndexParams);
4344
assertTrue(fallback.get());
45+
assertEquals(0L, (long) KNNRemoteIndexBuildValue.WRITE_SUCCESS_COUNT.getValue());
46+
assertEquals(1L, (long) KNNRemoteIndexBuildValue.WRITE_FAILURE_COUNT.getValue()); // Repository is first accessed during write
47+
assertEquals(0L, (long) KNNRemoteIndexBuildValue.WRITE_TIME.getValue());
48+
assertEquals(0L, (long) KNNRemoteIndexBuildValue.READ_SUCCESS_COUNT.getValue());
49+
assertEquals(0L, (long) KNNRemoteIndexBuildValue.READ_FAILURE_COUNT.getValue());
50+
assertEquals(0L, (long) KNNRemoteIndexBuildValue.READ_TIME.getValue());
51+
assertTrue(KNNRemoteIndexBuildValue.REMOTE_INDEX_BUILD_TIME.getValue() > 0L);
4452
}
4553

4654
public void testShouldBuildIndexRemotely() {

0 commit comments

Comments
 (0)