Commit
Added create_vectors_dataset and upload_index tasks in core package
Signed-off-by: Rohan Chitale <rchital@amazon.com>
Showing 26 changed files with 1,447 additions and 4 deletions.
@@ -0,0 +1 @@
__pycache__/
@@ -0,0 +1,36 @@
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.

"""
Expose public exceptions & warnings
"""


class BlobError(Exception):
    """Generic error raised when a blob is downloaded from or uploaded to the Object Store"""

    def __init__(self, message: str):
        super().__init__(message)


class UnsupportedObjectStoreTypeError(ValueError):
    """Error raised when creating an Object Store object of an unsupported type"""

    pass


class VectorsDatasetError(Exception):
    """Generic error raised when converting a buffer into a Vector Dataset"""

    def __init__(self, message: str):
        super().__init__(message)


class UnsupportedVectorsDataTypeError(ValueError):
    """Error raised when creating a Vector Dataset because of an unsupported data type"""

    pass
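
For context, a hypothetical caller (not part of this commit) would catch these domain-specific exceptions rather than bare Exception:

# Hypothetical sketch, not part of the commit: handling the exceptions above.
# parse_fn stands in for any buffer-to-dataset callable, e.g. VectorsDataset.parse.
from core.common.exceptions import UnsupportedVectorsDataTypeError, VectorsDatasetError

def safe_parse(parse_fn, *args):
    try:
        return parse_fn(*args)
    except UnsupportedVectorsDataTypeError as e:
        print(f"Unsupported data type: {e}")
    except VectorsDatasetError as e:
        print(f"Failed to build dataset: {e}")
    return None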
@@ -0,0 +1,6 @@
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
remote_vector_index_builder/core/common/models/index_build_parameters.py
151 changes: 151 additions & 0 deletions
@@ -0,0 +1,151 @@
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
from enum import Enum
from typing import Annotated

from core.object_store.types import ObjectStoreType
from pydantic import BaseModel, ConfigDict, Field

# Type annotation for vector file paths that must end with .knnvec
VectorPathRegex = Annotated[str, Field(pattern=".+\\.knnvec$")]


class DataType(str, Enum):
    """Supported data types for vector values.

    Attributes:
        FLOAT32: 32-bit floating point values
        FLOAT16: 16-bit floating point values
        BYTE: 8-bit integer values
        BINARY: Binary data format
    """

    FLOAT32 = "fp32"
    FLOAT16 = "fp16"
    BYTE = "byte"
    BINARY = "binary"


class SpaceType(str, Enum):
    """Distance method used for measuring vector similarities.

    Attributes:
        L2: Euclidean distance
        COSINESIMIL: Cosine similarity
        L1: Manhattan distance
        LINF: Chebyshev distance
        INNERPRODUCT: Dot product similarity
        HAMMING: Hamming distance for binary vectors
    """

    L2 = "l2"
    COSINESIMIL = "cosinesimil"
    L1 = "l1"
    LINF = "linf"
    INNERPRODUCT = "innerproduct"
    HAMMING = "hamming"


class Algorithm(str, Enum):
    """Supported algorithms for vector indexing.

    Attributes:
        HNSW: Hierarchical Navigable Small World graph
    """

    HNSW = "hnsw"


class Engine(str, Enum):
    """Available vector search engines.

    Attributes:
        FAISS: Facebook AI Similarity Search
    """

    FAISS = "faiss"


class AlgorithmParameters(BaseModel):
    """Configuration parameters for the HNSW algorithm.

    Attributes:
        ef_construction (int): Size of the dynamic candidate list used while
            constructing the HNSW graph. Higher values yield better index quality
            but slower construction. Defaults to 100.
        m (int): Number of bi-directional links created for every new element
            during construction. Higher values improve search speed at the cost
            of memory consumption. Defaults to 16.

    Note:
        The class is configured to allow extra attributes using the ConfigDict class.
    """

    ef_construction: int = 100
    m: int = 16
    model_config = ConfigDict(extra="allow")


class IndexParameters(BaseModel):
    """Configuration parameters for vector index construction.

    This class defines the core index configuration, including the algorithm type,
    distance metric, and algorithm-specific parameters.

    Attributes:
        algorithm (Algorithm): The vector indexing algorithm to use.
            Defaults to HNSW (Hierarchical Navigable Small World).
        space_type (SpaceType): The distance metric to use for vector comparisons.
            Defaults to L2 (Euclidean distance).
        algorithm_parameters (AlgorithmParameters): Specific parameters for the chosen
            algorithm. Defaults to the standard HNSW parameters (ef_construction=100, m=16).
    """

    algorithm: Algorithm = Algorithm.HNSW
    space_type: SpaceType = SpaceType.L2
    algorithm_parameters: AlgorithmParameters = Field(
        default_factory=AlgorithmParameters
    )


class IndexBuildParameters(BaseModel):
    """Parameters required for building a vector index.

    This class encapsulates all parameters necessary for constructing a vector index,
    including data source information, vector specifications, and index configuration.

    Attributes:
        repository_type (ObjectStoreType): The type of repository where the vector
            data is stored. Defaults to S3.
        container_name (str): Name of the container (e.g., S3 bucket) containing the vector data.
        vector_path (VectorPathRegex): Path to the vector data file. Must end with the .knnvec extension.
        doc_id_path (str): Path to the document IDs corresponding to the vectors.
        tenant_id (str): Optional identifier for multi-tenant scenarios. Defaults to an empty string.
        dimension (int): The dimensionality of the vectors to be indexed. Must be greater than 0.
        doc_count (int): Total number of documents/vectors to be indexed. Must be greater than 0.
        data_type (DataType): The numerical format of the vector data.
            Defaults to FLOAT32.
        engine (Engine): The vector search engine to use for indexing.
            Defaults to FAISS.
        index_parameters (IndexParameters): Configuration for the index structure
            and algorithm. Defaults to the standard HNSW configuration.

    Note:
        The class is configured to forbid extra attributes using the ConfigDict class,
        ensuring strict parameter validation.
    """

    repository_type: ObjectStoreType = ObjectStoreType.S3
    container_name: str
    vector_path: VectorPathRegex
    doc_id_path: str
    tenant_id: str = ""
    dimension: int = Field(gt=0)
    doc_count: int = Field(gt=0)
    data_type: DataType = DataType.FLOAT32
    engine: Engine = Engine.FAISS
    index_parameters: IndexParameters = Field(default_factory=IndexParameters)
    model_config = ConfigDict(extra="forbid")
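
As a quick usage sketch (hypothetical, not part of the commit; the bucket and path names below are illustrative), a build request validates against these models like so:

# Hypothetical usage sketch: validating a build request payload.
# All names in the payload are illustrative, not taken from the commit.
from core.common.models.index_build_parameters import IndexBuildParameters

payload = {
    "container_name": "my-vector-bucket",        # illustrative S3 bucket
    "vector_path": "batches/batch-0001.knnvec",  # must match the .knnvec pattern
    "doc_id_path": "batches/batch-0001.docids",  # illustrative doc-id blob path
    "dimension": 768,
    "doc_count": 100_000,
}

# Omitted fields fall back to defaults: s3 repository, fp32 vectors, the faiss
# engine, and HNSW parameters (ef_construction=100, m=16).
params = IndexBuildParameters(**payload)
print(params.index_parameters.algorithm)  # Algorithm.HNSW

Because model_config forbids extra attributes, a misspelled key raises pydantic.ValidationError instead of being silently ignored.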
remote_vector_index_builder/core/common/models/vectors_dataset.py
115 changes: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.

from dataclasses import dataclass
from io import BytesIO

import numpy as np
from core.common.exceptions import UnsupportedVectorsDataTypeError, VectorsDatasetError
from core.common.models.index_build_parameters import DataType


@dataclass
class VectorsDataset:
    """A class for handling vector datasets and their associated document IDs.

    This class provides functionality to parse, validate, and store vector data along
    with the corresponding document IDs. It supports multiple data types, including
    FLOAT32, FLOAT16, BYTE, and BINARY formats.

    Attributes:
        vectors (numpy.ndarray): The array of vectors, where each row represents a vector.
        doc_ids (numpy.ndarray): Array of document IDs corresponding to the vectors.
    """

    vectors: np.ndarray
    doc_ids: np.ndarray

    def free_vectors_space(self):
        """Free up memory by deleting the vectors and document IDs arrays."""
        del self.vectors
        del self.doc_ids

    @staticmethod
    def get_numpy_dtype(dtype: DataType):
        """Convert a DataType enum value to a numpy dtype string.

        Args:
            dtype (DataType): The data type enum value to convert.

        Returns:
            str: The corresponding numpy dtype string.

        Raises:
            UnsupportedVectorsDataTypeError: If the provided data type is not supported.
        """
        if dtype == DataType.FLOAT32:
            return "<f4"
        elif dtype == DataType.FLOAT16:
            return "<f2"
        elif dtype == DataType.BYTE:
            return "<i1"
        elif dtype == DataType.BINARY:
            return "<i1"
        else:
            raise UnsupportedVectorsDataTypeError(f"Unsupported data type: {dtype}")

    @staticmethod
    def check_dimensions(vectors, expected_length):
        """Validate that the vector array has the expected length.

        Args:
            vectors: Array-like object to check.
            expected_length (int): The expected length of the vectors array.

        Raises:
            VectorsDatasetError: If the vectors length doesn't match the expected length.
        """
        if len(vectors) != expected_length:
            raise VectorsDatasetError(
                f"Expected {expected_length} vectors, but got {len(vectors)}"
            )

    @staticmethod
    def parse(
        vectors: BytesIO,
        doc_ids: BytesIO,
        dimension: int,
        doc_count: int,
        vector_dtype: DataType,
    ):
        """Parse binary vector data and document IDs into numpy arrays.

        This method reads binary data for vectors and document IDs, validates their
        dimensions, and creates a new VectorsDataset instance.

        Args:
            vectors (BytesIO): Binary stream containing vector data.
            doc_ids (BytesIO): Binary stream containing document IDs.
            dimension (int): The dimensionality of each vector.
            doc_count (int): Expected number of vectors/documents.
            vector_dtype (DataType): The data type of the vector values.

        Returns:
            VectorsDataset: A new instance containing the parsed vectors and document IDs.

        Raises:
            VectorsDatasetError: If there are any errors during parsing or validation.
        """
        try:
            np_doc_ids = np.frombuffer(doc_ids.read(), dtype="<i4")
            VectorsDataset.check_dimensions(np_doc_ids, doc_count)

            np_vectors = np.frombuffer(
                vectors.read(), dtype=VectorsDataset.get_numpy_dtype(vector_dtype)
            )
            VectorsDataset.check_dimensions(np_vectors, doc_count * dimension)
            np_vectors = np_vectors.reshape(doc_count, dimension)

        except (ValueError, TypeError, MemoryError, RuntimeError) as e:
            raise VectorsDatasetError(f"Error parsing vectors: {e}") from e
        return VectorsDataset(np_vectors, np_doc_ids)
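
The parse flow can be exercised with synthetic in-memory buffers; the sketch below is hypothetical (random data stands in for the blobs that the create_vectors_dataset task would download from the object store):

# Hypothetical sketch: round-tripping synthetic buffers through VectorsDataset.parse.
from io import BytesIO

import numpy as np
from core.common.models.index_build_parameters import DataType
from core.common.models.vectors_dataset import VectorsDataset

dimension, doc_count = 4, 3

# Little-endian fp32 vectors and int32 doc IDs, matching the dtypes parse() expects.
raw_vectors = np.random.rand(doc_count, dimension).astype("<f4").tobytes()
raw_doc_ids = np.arange(doc_count, dtype="<i4").tobytes()

dataset = VectorsDataset.parse(
    vectors=BytesIO(raw_vectors),
    doc_ids=BytesIO(raw_doc_ids),
    dimension=dimension,
    doc_count=doc_count,
    vector_dtype=DataType.FLOAT32,
)
print(dataset.vectors.shape)  # (3, 4)
dataset.free_vectors_space()  # release the arrays once they are no longer needed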
@@ -0,0 +1,6 @@
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.