
Commit 2532d77

Support Vector Search Nested benchmark (#584)
Signed-off-by: Finn Roblin <finnrobl@amazon.com>
1 parent f4ab3ab commit 2532d77

5 files changed: +491 -29 lines changed


osbenchmark/utils/dataset.py

+3
@@ -26,6 +26,7 @@ class Context(Enum):
     NEIGHBORS = 3
     MAX_DISTANCE_NEIGHBORS = 4
     MIN_SCORE_NEIGHBORS = 5
+    PARENTS = 6


 class DataSet(ABC):
@@ -143,6 +144,8 @@ def parse_context(context: Context) -> str:
         if context == Context.QUERY:
            return "test"

+        if context == Context.PARENTS:
+            return "parents"  # used in nested benchmarks to get the parent document id associated with each vector.
         if context == Context.MAX_DISTANCE_NEIGHBORS:
             return "max_distance_neighbors"
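The new PARENTS context maps to a "parents" group in the data set file, holding one parent document id per vector. As a minimal sketch (not part of the commit; the file name refers to the test data set added below, and the "train" group name for the index context is an assumption), the group can be inspected with h5py:

import h5py

with h5py.File("small-nested-works.hdf5", "r") as f:
    parents = f["parents"][:]  # one parent document id per vector
    train = f["train"][:]      # index vectors ("train" group name assumed)
    assert len(parents) == len(train)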

osbenchmark/workload/params.py

+164 -28
@@ -33,7 +33,7 @@
 import time
 from abc import ABC, abstractmethod
 from enum import Enum
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Optional, Tuple

 import numpy as np
@@ -880,10 +880,12 @@ class VectorDataSetPartitionParamSource(ParamSource):
     offset: Offset into the data set to start at. Relevant when there are
         multiple partitions
     """
+    NESTED_FIELD_SEPARATOR = "."

     def __init__(self, workload, params, context: Context, **kwargs):
         super().__init__(workload, params, **kwargs)
         self.field_name: str = parse_string_parameter("field", params)
+        self.is_nested = self.NESTED_FIELD_SEPARATOR in self.field_name  # in base class because used for both bulk ingest and queries.
         self.context = context
         self.data_set_format = parse_string_parameter("data_set_format", params)
         self.data_set_path = parse_string_parameter("data_set_path", params, "")
@@ -979,6 +981,18 @@ def partition(self, partition_index, total_partitions):
         partition_x.current = partition_x.offset
         return partition_x

+    def get_split_fields(self) -> Tuple[str, str]:
+        fields_as_array = self.field_name.split(self.NESTED_FIELD_SEPARATOR)
+
+        # TODO: Add support for multiple levels of nesting if a future benchmark requires it.
+        if len(fields_as_array) != 2:
+            raise ValueError(
+                f"Field name {self.field_name} is not a nested field name. Currently we support only 1 level of nesting."
+            )
+        return fields_as_array[0], fields_as_array[1]
+
     @abstractmethod
     def params(self):
         """
@@ -1219,12 +1233,24 @@ def _build_vector_search_query_body(self, vector, efficient_filter=None) -> dict:
             query.update({
                 "filter": efficient_filter,
             })
-        return {
+
+        knn_search_query = {
             "knn": {
                 self.field_name: query,
             },
         }

+        if self.is_nested:
+            outer_field_name, _ = self.get_split_fields()
+            return {
+                "nested": {
+                    "path": outer_field_name,
+                    "query": knn_search_query
+                }
+            }
+
+        return knn_search_query
+

 class BulkVectorsFromDataSetParamSource(VectorDataSetPartitionParamSource):
     """ Create bulk index requests from a data set of vectors.
@@ -1241,13 +1267,74 @@ class BulkVectorsFromDataSetParamSource(VectorDataSetPartitionParamSource):
     def __init__(self, workload, params, **kwargs):
         super().__init__(workload, params, Context.INDEX, **kwargs)
         self.bulk_size: int = parse_int_parameter("bulk_size", params)
-        self.retries: int = parse_int_parameter("retries", params,
-                                                self.DEFAULT_RETRIES)
+        self.retries: int = parse_int_parameter("retries", params, self.DEFAULT_RETRIES)
         self.index_name: str = parse_string_parameter("index", params)
         self.id_field_name: str = parse_string_parameter(
-            self.PARAMS_NAME_ID_FIELD_NAME, params, self.DEFAULT_ID_FIELD_NAME)
+            self.PARAMS_NAME_ID_FIELD_NAME, params, self.DEFAULT_ID_FIELD_NAME
+        )

-    def bulk_transform(self, partition: np.ndarray, action) -> List[Dict[str, Any]]:
+        self.action_buffer = None
+        self.num_nested_vectors = 10
+
+        self.parent_data_set_path = parse_string_parameter(
+            "parents_data_set_path", params, self.data_set_path
+        )
+
+        self.parent_data_set_format = self.data_set_format
+
+        self.parent_data_set_corpus = self.data_set_corpus
+
+        self.logger = logging.getLogger(__name__)
+
+    def partition(self, partition_index, total_partitions):
+        partition = super().partition(partition_index, total_partitions)
+        if self.parent_data_set_corpus and not self.parent_data_set_path:
+            parent_data_set_path = self._get_corpora_file_paths(
+                self.parent_data_set_corpus, self.parent_data_set_format
+            )
+            self._validate_data_set_corpus(parent_data_set_path)
+            self.parent_data_set_path = parent_data_set_path[0]
+        if not self.parent_data_set_path:
+            self.parent_data_set_path = self.data_set_path
+        # add parent data set instance to partition
+        if self.is_nested:
+            partition.parent_data_set = get_data_set(
+                self.parent_data_set_format, self.parent_data_set_path, Context.PARENTS
+            )
+            partition.parent_data_set.seek(partition.offset)
+
+        return partition
+
+    def bulk_transform_non_nested(self, partition: np.ndarray, action) -> List[Dict[str, Any]]:
+        """
+        Create bulk ingest actions for data with a non-nested field.
+        """
+        actions = []
+
+        _ = [
+            actions.extend([action(self.id_field_name, i + self.current), None])
+            for i in range(len(partition))
+        ]
+        bulk_contents = []
+
+        add_id_field_to_body = self.id_field_name != self.DEFAULT_ID_FIELD_NAME
+        for vec, identifier in zip(
+            partition.tolist(), range(self.current, self.current + len(partition))
+        ):
+            row = {self.field_name: vec}
+            if add_id_field_to_body:
+                row.update({self.id_field_name: identifier})
+            bulk_contents.append(row)
+
+        actions[1::2] = bulk_contents
+
+        self.logger.info("Actions: %s", actions)
+        return actions
+
+    def bulk_transform(
+        self, partition: np.ndarray, action, parents_ids: Optional[np.ndarray]
+    ) -> List[Dict[str, Any]]:
         """Partitions and transforms a list of vectors into OpenSearch's bulk
         injection format.
         Args:
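The slice assignment actions[1::2] = bulk_contents in bulk_transform_non_nested fills the None placeholders, yielding the usual alternating action/document pairs. A sketch of the output for two vectors with the default _id field (index and field names are placeholders):

[
    {"index": {"_index": "target_index", "_id": 0}},  # action line
    {"target_field": [0.1, 0.2]},                     # document line
    {"index": {"_index": "target_index", "_id": 1}},
    {"target_field": [0.3, 0.4]},
]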
@@ -1257,19 +1344,63 @@ def bulk_transform(self, partition: np.ndarray, action) -> List[Dict[str, Any]]:
         Returns:
             An array of transformed vectors in bulk format.
         """
+
+        if not self.is_nested:
+            return self.bulk_transform_non_nested(partition, action)
+
         actions = []
-        _ = [
-            actions.extend([action(self.id_field_name, i + self.current), None])
-            for i in range(len(partition))
-        ]
-        bulk_contents = []
+
+        outer_field_name, inner_field_name = self.get_split_fields()
+
         add_id_field_to_body = self.id_field_name != self.DEFAULT_ID_FIELD_NAME
-        for vec, identifier in zip(partition.tolist(), range(self.current, self.current + len(partition))):
-            row = {self.field_name: vec}
+
+        if self.action_buffer is None:
+            first_index_of_parent_ids = 0
+            self.action_buffer = {outer_field_name: []}
+            self.action_parent_id = parents_ids[first_index_of_parent_ids]
             if add_id_field_to_body:
-                row.update({self.id_field_name: identifier})
-            bulk_contents.append(row)
-        actions[1::2] = bulk_contents
+                self.action_buffer.update({self.id_field_name: self.action_parent_id})
+
+        part_list = partition.tolist()
+        for i in range(len(partition)):
+            nested = {inner_field_name: part_list[i]}
+            current_parent_id = parents_ids[i]
+
+            if self.action_parent_id == current_parent_id:
+                self.action_buffer[outer_field_name].append(nested)
+            else:
+                # flush action buffer
+                actions.extend(
+                    [
+                        action(self.id_field_name, self.action_parent_id),
+                        self.action_buffer,
+                    ]
+                )
+
+                self.current += len(self.action_buffer[outer_field_name])
+
+                self.action_buffer = {outer_field_name: []}
+                if add_id_field_to_body:
+                    self.action_buffer.update({self.id_field_name: current_parent_id})
+
+                self.action_buffer[outer_field_name].append(nested)
+                self.action_parent_id = current_parent_id
+
+        max_position = self.offset + self.num_vectors
+        if (
+            self.current + len(self.action_buffer[outer_field_name]) + self.bulk_size
+            >= max_position
+        ):
+            # final flush of remaining vectors in the last partition (for the last client)
+            self.current += len(self.action_buffer[outer_field_name])
+            actions.extend(
+                [action(self.id_field_name, self.action_parent_id), self.action_buffer]
+            )
+
         return actions

     def params(self):
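In the nested path, consecutive vectors that share a parent id are accumulated in action_buffer and emitted as one document whose outer field holds a list of inner-field objects; the buffer is flushed when the parent id changes, with a final flush covering the tail of the last partition. A sketch of the output for parent ids [0, 0, 1], the hypothetical field nested_field.nested_vector, and the default _id field:

[
    {"index": {"_index": "target_index", "_id": 0}},    # index name is a placeholder
    {"nested_field": [{"nested_vector": [0.1, 0.2]},
                      {"nested_vector": [0.3, 0.4]}]},  # two vectors grouped under parent 0
    {"index": {"_index": "target_index", "_id": 1}},
    {"nested_field": [{"nested_vector": [0.5, 0.6]}]},
]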
@@ -1281,29 +1412,34 @@ def params(self):

         def action(id_field_name, doc_id):
             # support only index operation
-            bulk_action = 'index'
-            metadata = {
-                '_index': self.index_name
-            }
+            bulk_action = "index"
+            metadata = {"_index": self.index_name}
             # Add id field to metadata only if it is _id
             if id_field_name == self.DEFAULT_ID_FIELD_NAME:
                 metadata.update({id_field_name: doc_id})
             return {bulk_action: metadata}

         remaining_vectors_in_partition = self.num_vectors + self.offset - self.current
-        # update bulk size if number of vectors to read is less than actual bulk size
+
         bulk_size = min(self.bulk_size, remaining_vectors_in_partition)
+
         partition = self.data_set.read(bulk_size)
-        body = self.bulk_transform(partition, action)
+
+        if self.is_nested:
+            parent_ids = self.parent_data_set.read(bulk_size)
+        else:
+            parent_ids = None
+
+        body = self.bulk_transform(partition, action, parent_ids)
         size = len(body) // 2
-        self.current += size
+
+        if not self.is_nested:
+            # in the nested case, we may have an irregular number of vectors ingested,
+            # so self.current is updated inside bulk_transform when self.is_nested.
+            self.current += size
         self.percent_completed = self.current / self.total

-        return {
-            "body": body,
-            "retries": self.retries,
-            "size": size
-        }
+        return {"body": body, "retries": self.retries, "size": size}


 def get_target(workload, params):
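Putting the ingest side together, the parameters consumed above might be configured like this (a sketch; all values are placeholders, and parents_data_set_path is optional, falling back to data_set_path):

params = {
    "field": "nested_field.nested_vector",  # "." marks one level of nesting
    "index": "target_index",
    "bulk_size": 100,
    "data_set_format": "hdf5",
    "data_set_path": "/path/to/vectors.hdf5",
    "parents_data_set_path": "/path/to/vectors.hdf5",  # optional; defaults to data_set_path
}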

small-nested-works.hdf5

133 KB
Binary file not shown.

tests/utils/dataset_helper.py

+35
@@ -193,6 +193,12 @@ def _build_data_set(self, context: DataSetBuildContext):
         # file with distance.
         context.vectors.tofile(f)

+def create_parent_ids(num_vectors: int, group_size: int = 10) -> np.ndarray:
+    num_ids = (num_vectors + group_size - 1) // group_size  # calculate total number of different IDs needed
+    ids = np.arange(1, num_ids + 1)  # create an array of IDs starting from 1
+    parent_ids = np.repeat(ids, group_size)[:num_vectors]  # repeat each ID 'group_size' times and trim to 'num_vectors'
+    return parent_ids
+

 def create_random_2d_array(num_vectors: int, dimension: int) -> np.ndarray:
     rng = np.random.default_rng()
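With the default group size of 10, the helper assigns each block of 10 consecutive vectors to one parent id. For example:

create_parent_ids(num_vectors=25, group_size=10)
# -> array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
#           2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
#           3, 3, 3, 3, 3])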
@@ -239,6 +245,35 @@ def create_data_set(
     return data_set_path


+def create_parent_data_set(
+    num_vectors: int,
+    dimension: int,
+    extension: str,
+    data_set_context: Context,
+    data_set_dir,
+    file_path: str = None
+) -> str:
+    if file_path:
+        data_set_path = file_path
+    else:
+        file_name_base = ''.join(random.choice(string.ascii_letters)
+                                 for _ in range(DEFAULT_RANDOM_STRING_LENGTH))
+        data_set_file_name = "{}.{}".format(file_name_base, extension)
+        data_set_path = os.path.join(data_set_dir, data_set_file_name)
+    context = DataSetBuildContext(
+        data_set_context,
+        create_parent_ids(num_vectors),
+        data_set_path)
+
+    if extension == HDF5DataSet.FORMAT_NAME:
+        HDF5Builder().add_data_set_build_context(context).build()
+    else:
+        BigANNVectorBuilder().add_data_set_build_context(context).build()
+
+    return data_set_path
+
+
 def create_ground_truth(
     num_queries: int,
     k: int,
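A sketch of how a test could build a matching parent-id data set with the helper above (directory and sizes are placeholders; dimension is accepted for parity with create_data_set but unused when generating parent ids):

parent_path = create_parent_data_set(
    num_vectors=100,
    dimension=2,                       # unused for parent ids
    extension=HDF5DataSet.FORMAT_NAME,
    data_set_context=Context.PARENTS,
    data_set_dir="/tmp/osb-test-data",  # placeholder directory
)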
