Skip to content

Commit 25981af

Browse files
jed326Jay Deng
authored and
Jay Deng
committed
Introduce RemoteIndexBuilder skeleton
Signed-off-by: Jay Deng <jayd0104@gmail.com>
1 parent 349a715 commit 25981af

21 files changed

+2460
-71
lines changed

src/main/java/org/opensearch/knn/common/featureflags/KNNFeatureFlags.java

+16-1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ public class KNNFeatureFlags {
2626

2727
// Feature flags
2828
private static final String KNN_FORCE_EVICT_CACHE_ENABLED = "knn.feature.cache.force_evict.enabled";
29+
private static final String KNN_REMOTE_VECTOR_BUILD = "knn.feature.remote_index_build.enabled";
2930

3031
@VisibleForTesting
3132
public static final Setting<Boolean> KNN_FORCE_EVICT_CACHE_ENABLED_SETTING = Setting.boolSetting(
@@ -35,8 +36,15 @@ public class KNNFeatureFlags {
3536
Dynamic
3637
);
3738

39+
public static final Setting<Boolean> KNN_REMOTE_VECTOR_BUILD_SETTING = Setting.boolSetting(
40+
KNN_REMOTE_VECTOR_BUILD,
41+
false,
42+
NodeScope,
43+
Dynamic
44+
);
45+
3846
public static List<Setting<?>> getFeatureFlags() {
39-
return ImmutableList.of(KNN_FORCE_EVICT_CACHE_ENABLED_SETTING);
47+
return ImmutableList.of(KNN_FORCE_EVICT_CACHE_ENABLED_SETTING, KNN_REMOTE_VECTOR_BUILD_SETTING);
4048
}
4149

4250
/**
@@ -46,4 +54,11 @@ public static List<Setting<?>> getFeatureFlags() {
4654
public static boolean isForceEvictCacheEnabled() {
4755
return Booleans.parseBoolean(KNNSettings.state().getSettingValue(KNN_FORCE_EVICT_CACHE_ENABLED).toString(), false);
4856
}
57+
58+
/**
59+
* @return true if remote vector index build feature flag is enabled
60+
*/
61+
public static boolean isKNNRemoteVectorBuildEnabled() {
62+
return Booleans.parseBooleanStrict(KNNSettings.state().getSettingValue(KNN_REMOTE_VECTOR_BUILD).toString(), false);
63+
}
4964
}

src/main/java/org/opensearch/knn/index/KNNSettings.java

+23-2
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,9 @@
4141

4242
import static java.util.stream.Collectors.toUnmodifiableMap;
4343
import static org.opensearch.common.settings.Setting.Property.Dynamic;
44+
import static org.opensearch.common.settings.Setting.Property.Final;
4445
import static org.opensearch.common.settings.Setting.Property.IndexScope;
4546
import static org.opensearch.common.settings.Setting.Property.NodeScope;
46-
import static org.opensearch.common.settings.Setting.Property.Final;
4747
import static org.opensearch.common.settings.Setting.Property.UnmodifiableOnRestore;
4848
import static org.opensearch.common.unit.MemorySizeValue.parseBytesSizeValueOrHeapRatio;
4949
import static org.opensearch.core.common.unit.ByteSizeValue.parseBytesSizeValue;
@@ -94,6 +94,8 @@ public class KNNSettings {
9494
public static final String KNN_FAISS_AVX512_SPR_DISABLED = "knn.faiss.avx512_spr.disabled";
9595
public static final String KNN_DISK_VECTOR_SHARD_LEVEL_RESCORING_DISABLED = "index.knn.disk.vector.shard_level_rescoring_disabled";
9696
public static final String KNN_DERIVED_SOURCE_ENABLED = "index.knn.derived_source.enabled";
97+
public static final String KNN_INDEX_REMOTE_VECTOR_BUILD = "index.knn.remote_index_build.enabled";
98+
public static final String KNN_REMOTE_VECTOR_REPO = "knn.remote_index_build.vector_repo";
9799

98100
/**
99101
* Default setting values
@@ -371,6 +373,15 @@ public class KNNSettings {
371373
NodeScope
372374
);
373375

376+
public static final Setting<Boolean> KNN_INDEX_REMOTE_VECTOR_BUILD_SETTING = Setting.boolSetting(
377+
KNN_INDEX_REMOTE_VECTOR_BUILD,
378+
false,
379+
Dynamic,
380+
IndexScope
381+
);
382+
383+
public static final Setting<String> KNN_REMOTE_VECTOR_REPO_SETTING = Setting.simpleString(KNN_REMOTE_VECTOR_REPO, Dynamic, NodeScope);
384+
374385
/**
375386
* Dynamic settings
376387
*/
@@ -525,6 +536,14 @@ private Setting<?> getSetting(String key) {
525536
return KNN_DERIVED_SOURCE_ENABLED_SETTING;
526537
}
527538

539+
if (KNN_INDEX_REMOTE_VECTOR_BUILD.equals(key)) {
540+
return KNN_INDEX_REMOTE_VECTOR_BUILD_SETTING;
541+
}
542+
543+
if (KNN_REMOTE_VECTOR_REPO.equals(key)) {
544+
return KNN_REMOTE_VECTOR_REPO_SETTING;
545+
}
546+
528547
throw new IllegalArgumentException("Cannot find setting by key [" + key + "]");
529548
}
530549

@@ -550,7 +569,9 @@ public List<Setting<?>> getSettings() {
550569
QUANTIZATION_STATE_CACHE_SIZE_LIMIT_SETTING,
551570
QUANTIZATION_STATE_CACHE_EXPIRY_TIME_MINUTES_SETTING,
552571
KNN_DISK_VECTOR_SHARD_LEVEL_RESCORING_DISABLED_SETTING,
553-
KNN_DERIVED_SOURCE_ENABLED_SETTING
572+
KNN_DERIVED_SOURCE_ENABLED_SETTING,
573+
KNN_INDEX_REMOTE_VECTOR_BUILD_SETTING,
574+
KNN_REMOTE_VECTOR_REPO_SETTING
554575
);
555576
return Stream.concat(settings.stream(), Stream.concat(getFeatureFlags().stream(), dynamicCacheSettings.values().stream()))
556577
.collect(Collectors.toList());

src/main/java/org/opensearch/knn/index/codec/BasePerFieldKnnVectorsFormat.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ public KnnVectorsFormat getKnnVectorsFormatForField(final String field) {
135135
return nativeEngineVectorsFormat();
136136
}
137137

138-
private NativeEngines990KnnVectorsFormat nativeEngineVectorsFormat() {
138+
protected KnnVectorsFormat nativeEngineVectorsFormat() {
139139
// mapperService is already checked for null or valid instance type at caller, hence we don't need
140140
// addition isPresent check here.
141141
int approximateThreshold = getApproximateThresholdValue();
@@ -145,7 +145,7 @@ private NativeEngines990KnnVectorsFormat nativeEngineVectorsFormat() {
145145
);
146146
}
147147

148-
private int getApproximateThresholdValue() {
148+
protected int getApproximateThresholdValue() {
149149
// This is private method and mapperService is already checked for null or valid instance type before this call
150150
// at caller, hence we don't need additional isPresent check here.
151151
final IndexSettings indexSettings = mapperService.get().getIndexSettings();
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.knn.index.codec.KNN10010Codec;
7+
8+
import org.apache.lucene.codecs.KnnVectorsFormat;
9+
import org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil;
10+
import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat;
11+
import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
12+
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
13+
import org.opensearch.common.Nullable;
14+
import org.opensearch.common.collect.Tuple;
15+
import org.opensearch.index.mapper.MapperService;
16+
import org.opensearch.knn.index.KNNSettings;
17+
import org.opensearch.knn.index.SpaceType;
18+
import org.opensearch.knn.index.codec.BasePerFieldKnnVectorsFormat;
19+
import org.opensearch.knn.index.codec.KNN9120Codec.KNN9120HnswBinaryVectorsFormat;
20+
import org.opensearch.knn.index.engine.KNNEngine;
21+
import org.opensearch.knn.index.remote.RemoteIndexBuilder;
22+
23+
import java.util.Optional;
24+
import java.util.concurrent.ExecutorService;
25+
import java.util.concurrent.Executors;
26+
27+
/**
28+
* Class provides per field format implementation for Lucene Knn vector type
29+
*/
30+
public class KNN10010PerFieldKnnVectorsFormat extends BasePerFieldKnnVectorsFormat {
31+
private static final Tuple<Integer, ExecutorService> DEFAULT_MERGE_THREAD_COUNT_AND_EXECUTOR_SERVICE = Tuple.tuple(1, null);
32+
@Nullable
33+
private RemoteIndexBuilder remoteIndexBuilder;
34+
35+
public KNN10010PerFieldKnnVectorsFormat(final Optional<MapperService> mapperService, final RemoteIndexBuilder remoteIndexBuilder) {
36+
super(
37+
mapperService,
38+
Lucene99HnswVectorsFormat.DEFAULT_MAX_CONN,
39+
Lucene99HnswVectorsFormat.DEFAULT_BEAM_WIDTH,
40+
Lucene99HnswVectorsFormat::new,
41+
knnVectorsFormatParams -> {
42+
final Tuple<Integer, ExecutorService> mergeThreadCountAndExecutorService = getMergeThreadCountAndExecutorService();
43+
// There is an assumption here that hamming space will only be used for binary vectors. This will need to be fixed if that
44+
// changes in the future.
45+
if (knnVectorsFormatParams.getSpaceType() == SpaceType.HAMMING) {
46+
return new KNN9120HnswBinaryVectorsFormat(
47+
knnVectorsFormatParams.getMaxConnections(),
48+
knnVectorsFormatParams.getBeamWidth(),
49+
// number of merge threads
50+
mergeThreadCountAndExecutorService.v1(),
51+
// executor service
52+
mergeThreadCountAndExecutorService.v2()
53+
);
54+
} else {
55+
return new Lucene99HnswVectorsFormat(
56+
knnVectorsFormatParams.getMaxConnections(),
57+
knnVectorsFormatParams.getBeamWidth(),
58+
// number of merge threads
59+
mergeThreadCountAndExecutorService.v1(),
60+
// executor service
61+
mergeThreadCountAndExecutorService.v2()
62+
);
63+
}
64+
},
65+
knnScalarQuantizedVectorsFormatParams -> {
66+
final Tuple<Integer, ExecutorService> mergeThreadCountAndExecutorService = getMergeThreadCountAndExecutorService();
67+
return new Lucene99HnswScalarQuantizedVectorsFormat(
68+
knnScalarQuantizedVectorsFormatParams.getMaxConnections(),
69+
knnScalarQuantizedVectorsFormatParams.getBeamWidth(),
70+
// Number of merge threads
71+
mergeThreadCountAndExecutorService.v1(),
72+
knnScalarQuantizedVectorsFormatParams.getBits(),
73+
knnScalarQuantizedVectorsFormatParams.isCompressFlag(),
74+
knnScalarQuantizedVectorsFormatParams.getConfidenceInterval(),
75+
// Executor service
76+
mergeThreadCountAndExecutorService.v2()
77+
);
78+
}
79+
);
80+
this.remoteIndexBuilder = remoteIndexBuilder;
81+
}
82+
83+
public KNN10010PerFieldKnnVectorsFormat(final Optional<MapperService> mapperService) {
84+
this(mapperService, null);
85+
}
86+
87+
/**
88+
* This method returns the maximum dimension allowed from KNNEngine for Lucene codec
89+
*
90+
* @param fieldName Name of the field, ignored
91+
* @return Maximum constant dimension set by KNNEngine
92+
*/
93+
@Override
94+
public int getMaxDimensions(String fieldName) {
95+
return KNNEngine.getMaxDimensionByEngine(KNNEngine.LUCENE);
96+
}
97+
98+
private static Tuple<Integer, ExecutorService> getMergeThreadCountAndExecutorService() {
99+
// To ensure that only once we are fetching the settings per segment, we are fetching the num threads once while
100+
// creating the executors
101+
int mergeThreadCount = KNNSettings.getIndexThreadQty();
102+
// We need to return null whenever the merge threads are <=1, as lucene assumes that if number of threads are 1
103+
// then we should be giving a null value of the executor
104+
if (mergeThreadCount <= 1) {
105+
return DEFAULT_MERGE_THREAD_COUNT_AND_EXECUTOR_SERVICE;
106+
} else {
107+
return Tuple.tuple(mergeThreadCount, Executors.newFixedThreadPool(mergeThreadCount));
108+
}
109+
}
110+
111+
@Override
112+
protected KnnVectorsFormat nativeEngineVectorsFormat() {
113+
int approximateThreshold = getApproximateThresholdValue();
114+
return new NativeEngines10010KnnVectorsFormat(
115+
new Lucene99FlatVectorsFormat(FlatVectorScorerUtil.getLucene99FlatVectorsScorer()),
116+
approximateThreshold,
117+
remoteIndexBuilder
118+
);
119+
}
120+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
/*
2+
* SPDX-License-Identifier: Apache-2.0
3+
*
4+
* The OpenSearch Contributors require contributions made to
5+
* this file be licensed under the Apache-2.0 license or a
6+
* compatible open source license.
7+
*
8+
* Modifications Copyright OpenSearch Contributors. See
9+
* GitHub history for details.
10+
*/
11+
12+
package org.opensearch.knn.index.codec.KNN10010Codec;
13+
14+
import org.apache.lucene.codecs.KnnVectorsFormat;
15+
import org.apache.lucene.codecs.KnnVectorsReader;
16+
import org.apache.lucene.codecs.KnnVectorsWriter;
17+
import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer;
18+
import org.apache.lucene.codecs.hnsw.FlatVectorsFormat;
19+
import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat;
20+
import org.apache.lucene.index.SegmentReadState;
21+
import org.apache.lucene.index.SegmentWriteState;
22+
import org.opensearch.common.Nullable;
23+
import org.opensearch.knn.index.KNNSettings;
24+
import org.opensearch.knn.index.codec.KNN990Codec.NativeEngines990KnnVectorsReader;
25+
import org.opensearch.knn.index.engine.KNNEngine;
26+
import org.opensearch.knn.index.remote.RemoteIndexBuilder;
27+
28+
import java.io.IOException;
29+
30+
/**
31+
* This is a Vector format that will be used for Native engines like Faiss and Nmslib for reading and writing vector
32+
* related data structures.
33+
*/
34+
public class NativeEngines10010KnnVectorsFormat extends KnnVectorsFormat {
35+
/** The format for storing, reading, merging vectors on disk */
36+
private static FlatVectorsFormat flatVectorsFormat;
37+
private static final String FORMAT_NAME = "NativeEngines10010KnnVectorsFormat";
38+
private static int approximateThreshold;
39+
@Nullable
40+
private final RemoteIndexBuilder remoteIndexBuilder;
41+
42+
// For Testing Only
43+
public NativeEngines10010KnnVectorsFormat() {
44+
this(new Lucene99FlatVectorsFormat(new DefaultFlatVectorScorer()));
45+
}
46+
47+
// For Testing Only
48+
public NativeEngines10010KnnVectorsFormat(int approximateThreshold) {
49+
this(new Lucene99FlatVectorsFormat(new DefaultFlatVectorScorer()), approximateThreshold, null);
50+
}
51+
52+
// For Testing Only
53+
public NativeEngines10010KnnVectorsFormat(final FlatVectorsFormat flatVectorsFormat) {
54+
this(flatVectorsFormat, KNNSettings.INDEX_KNN_ADVANCED_APPROXIMATE_THRESHOLD_DEFAULT_VALUE, null);
55+
}
56+
57+
public NativeEngines10010KnnVectorsFormat(
58+
final FlatVectorsFormat flatVectorsFormat,
59+
int approximateThreshold,
60+
RemoteIndexBuilder remoteIndexBuilder
61+
) {
62+
super(FORMAT_NAME);
63+
NativeEngines10010KnnVectorsFormat.flatVectorsFormat = flatVectorsFormat;
64+
NativeEngines10010KnnVectorsFormat.approximateThreshold = approximateThreshold;
65+
this.remoteIndexBuilder = remoteIndexBuilder;
66+
}
67+
68+
/**
69+
* Returns a {@link org.apache.lucene.codecs.KnnVectorsWriter} to write the vectors to the index.
70+
*
71+
* @param state {@link org.apache.lucene.index.SegmentWriteState}
72+
*/
73+
@Override
74+
public KnnVectorsWriter fieldsWriter(final SegmentWriteState state) throws IOException {
75+
return new NativeEngines10010KnnVectorsWriter(
76+
state,
77+
flatVectorsFormat.fieldsWriter(state),
78+
approximateThreshold,
79+
remoteIndexBuilder
80+
);
81+
}
82+
83+
/**
84+
* Returns a {@link org.apache.lucene.codecs.KnnVectorsReader} to read the vectors from the index.
85+
*
86+
* @param state {@link org.apache.lucene.index.SegmentReadState}
87+
*/
88+
@Override
89+
public KnnVectorsReader fieldsReader(final SegmentReadState state) throws IOException {
90+
return new NativeEngines990KnnVectorsReader(state, flatVectorsFormat.fieldsReader(state));
91+
}
92+
93+
/**
94+
* @param s
95+
* @return
96+
*/
97+
@Override
98+
public int getMaxDimensions(String s) {
99+
return KNNEngine.getMaxDimensionByEngine(KNNEngine.LUCENE);
100+
}
101+
102+
@Override
103+
public String toString() {
104+
return "NativeEngines99KnnVectorsFormat(name="
105+
+ this.getClass().getSimpleName()
106+
+ ", flatVectorsFormat="
107+
+ flatVectorsFormat
108+
+ ", approximateThreshold="
109+
+ approximateThreshold
110+
+ ")";
111+
}
112+
}

0 commit comments

Comments
 (0)