Added create_vectors_dataset and upload_index tasks in core package (#13)

Signed-off-by: Rohan Chitale <rchital@amazon.com>
rchitale7 authored Feb 28, 2025
1 parent 5ae3ed5 commit 01ac863
Showing 26 changed files with 1,458 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
__pycache__/
@@ -4,7 +4,3 @@
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.


def test_placeholder():
    assert 1 + 1 == 2
36 changes: 36 additions & 0 deletions remote_vector_index_builder/core/common/exceptions.py
@@ -0,0 +1,36 @@
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.

"""
Expose public exceptions & warnings
"""


class BlobError(Exception):
    """Generic error raised when a blob is downloaded from or uploaded to the Object Store"""

    def __init__(self, message: str):
        super().__init__(message)


class UnsupportedObjectStoreTypeError(ValueError):
    """Error raised when creating an Object Store object"""

    pass


class VectorsDatasetError(Exception):
    """Generic error raised when converting a buffer into a Vectors Dataset"""

    def __init__(self, message: str):
        super().__init__(message)


class UnsupportedVectorsDataTypeError(ValueError):
    """Error raised when creating a Vectors Dataset because of an unsupported data type"""

    pass
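
For context, a brief sketch of how these exception types are meant to be used by callers; the registry and helper below are hypothetical and not part of this commit:

# Hypothetical illustration -- SUPPORTED_STORES and validate_store_type are not part of this change.
from core.common.exceptions import UnsupportedObjectStoreTypeError

SUPPORTED_STORES = {"s3"}  # placeholder registry for this sketch


def validate_store_type(repository_type: str) -> None:
    # Reject repository types that the object store factory cannot handle.
    if repository_type not in SUPPORTED_STORES:
        raise UnsupportedObjectStoreTypeError(
            f"Unsupported object store type: {repository_type}"
        )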
6 changes: 6 additions & 0 deletions remote_vector_index_builder/core/common/models/__init__.py
@@ -0,0 +1,6 @@
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
151 changes: 151 additions & 0 deletions remote_vector_index_builder/core/common/models/index_build_parameters.py
@@ -0,0 +1,151 @@
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
from enum import Enum
from typing import Annotated

from core.object_store.types import ObjectStoreType
from pydantic import BaseModel, ConfigDict, Field

# Type annotation for vector file paths that must end with .knnvec
VectorPathRegex = Annotated[str, Field(pattern=".+\\.knnvec$")]


class DataType(str, Enum):
    """Supported data types for vector values.
    Attributes:
        FLOAT32: 32-bit floating point values
        FLOAT16: 16-bit floating point values
        BYTE: 8-bit integer values
        BINARY: Binary data format
    """

    FLOAT32 = "fp32"
    FLOAT16 = "fp16"
    BYTE = "byte"
    BINARY = "binary"


class SpaceType(str, Enum):
    """Distance method used for measuring vector similarities.
    Attributes:
        L2: Euclidean distance
        COSINESIMIL: Cosine similarity
        L1: Manhattan distance
        LINF: Chebyshev distance
        INNERPRODUCT: Dot product similarity
        HAMMING: Hamming distance for binary vectors
    """

    L2 = "l2"
    COSINESIMIL = "cosinesimil"
    L1 = "l1"
    LINF = "linf"
    INNERPRODUCT = "innerproduct"
    HAMMING = "hamming"


class Algorithm(str, Enum):
    """Supported algorithms for vector indexing.
    Attributes:
        HNSW: Hierarchical Navigable Small World graph
    """

    HNSW = "hnsw"


class Engine(str, Enum):
    """Available vector search engines.
    Attributes:
        FAISS: Facebook AI Similarity Search
    """

    FAISS = "faiss"


class AlgorithmParameters(BaseModel):
    """Configuration parameters for the HNSW algorithm.
    Attributes:
        ef_construction (int): Size of the dynamic candidate list for constructing
            the HNSW graph. Higher values lead to better quality but slower
            index construction. Defaults to 100.
        m (int): Number of bi-directional links created for every new element
            during construction. Higher values lead to better search speed but
            more memory consumption. Defaults to 16.
    Note:
        The class is configured to allow extra attributes using the ConfigDict class.
    """

    ef_construction: int = 100
    m: int = 16
    model_config = ConfigDict(extra="allow")


class IndexParameters(BaseModel):
    """Configuration parameters for vector index construction.
    This class defines the core index configuration including the algorithm type,
    distance metric, and algorithm-specific parameters.
    Attributes:
        algorithm (Algorithm): The vector indexing algorithm to use.
            Defaults to HNSW (Hierarchical Navigable Small World).
        space_type (SpaceType): The distance metric to use for vector comparisons.
            Defaults to L2 (Euclidean distance).
        algorithm_parameters (AlgorithmParameters): Specific parameters for the chosen
            algorithm. Defaults to standard HNSW parameters (ef_construction=100, m=16).
    """

    algorithm: Algorithm = Algorithm.HNSW
    space_type: SpaceType = SpaceType.L2
    algorithm_parameters: AlgorithmParameters = Field(
        default_factory=AlgorithmParameters
    )


class IndexBuildParameters(BaseModel):
    """Parameters required for building a vector index.
    This class encapsulates all necessary parameters for constructing a vector index,
    including data source information, vector specifications, and index configuration.
    Attributes:
        repository_type (ObjectStoreType): The type of repository where the vector data is stored.
            Defaults to s3.
        container_name (str): Name of the container (e.g., S3 bucket) containing the vector data.
        vector_path (VectorPathRegex): Path to the vector data file. Must end with the .knnvec extension.
        doc_id_path (str): Path to the document IDs corresponding to the vectors.
        tenant_id (str): Optional identifier for multi-tenant scenarios. Defaults to an empty string.
        dimension (int): The dimensionality of the vectors to be indexed. Must be greater than 0.
        doc_count (int): Total number of documents/vectors to be indexed. Must be greater than 0.
        data_type (DataType): The numerical format of the vector data.
            Defaults to FLOAT32.
        engine (Engine): The vector search engine to use for indexing.
            Defaults to FAISS.
        index_parameters (IndexParameters): Configuration for the index structure
            and algorithm. Defaults to the standard HNSW configuration.
    Note:
        The class is configured to forbid extra attributes using the ConfigDict class,
        ensuring strict parameter validation.
    """

    repository_type: ObjectStoreType = ObjectStoreType.S3
    container_name: str
    vector_path: VectorPathRegex
    doc_id_path: str
    tenant_id: str = ""
    dimension: int = Field(gt=0)
    doc_count: int = Field(gt=0)
    data_type: DataType = DataType.FLOAT32
    engine: Engine = Engine.FAISS
    index_parameters: IndexParameters = Field(default_factory=IndexParameters)
    model_config = ConfigDict(extra="forbid")
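
For reference, a minimal usage sketch of the new model; the bucket and path values below are placeholders, not part of this change:

# Hypothetical usage sketch -- all values are illustrative.
from core.common.models.index_build_parameters import IndexBuildParameters

params = IndexBuildParameters(
    container_name="my-vector-bucket",       # e.g. an S3 bucket name
    vector_path="vectors/batch-001.knnvec",  # must match the .knnvec pattern
    doc_id_path="vectors/batch-001.docids",
    dimension=768,
    doc_count=10_000,
)
# Unset fields fall back to their defaults: repository_type=s3, engine=faiss,
# data_type=fp32, and an HNSW index with ef_construction=100 and m=16.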
119 changes: 119 additions & 0 deletions remote_vector_index_builder/core/common/models/vectors_dataset.py
@@ -0,0 +1,119 @@
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.

from dataclasses import dataclass
from io import BytesIO

import numpy as np
from core.common.exceptions import UnsupportedVectorsDataTypeError, VectorsDatasetError
from core.common.models.index_build_parameters import DataType


@dataclass
class VectorsDataset:
    """A class for handling vector datasets and their associated document IDs.
    This class provides functionality to parse, validate, and store vector data along with
    their corresponding document IDs. It supports multiple data types including FLOAT32,
    FLOAT16, BYTE, and BINARY formats.
    Attributes:
        vectors (numpy.ndarray): The array of vectors, where each row represents a vector.
        doc_ids (numpy.ndarray): Array of document IDs corresponding to the vectors.
    """

    vectors: np.ndarray
    doc_ids: np.ndarray

    def free_vectors_space(self):
        """Free up memory by deleting the vectors and document IDs arrays."""
        del self.vectors
        del self.doc_ids

    @staticmethod
    def get_numpy_dtype(dtype: DataType):
        """Convert DataType enum to numpy dtype string.
        Args:
            dtype (DataType): The data type enum value to convert.
        Returns:
            str: The corresponding numpy dtype string.
        Raises:
            UnsupportedVectorsDataTypeError: If the provided data type is not supported.
        """
        if dtype == DataType.FLOAT32:
            return "<f4"
        elif dtype == DataType.FLOAT16:
            return "<f2"
        elif dtype == DataType.BYTE:
            return "<i1"
        elif dtype == DataType.BINARY:
            return "<i1"
        else:
            raise UnsupportedVectorsDataTypeError(f"Unsupported data type: {dtype}")

    @staticmethod
    def check_dimensions(vectors, expected_length):
        """Validate that the vector array has the expected length.
        Args:
            vectors: Array-like object to check.
            expected_length (int): The expected length of the vectors array.
        Raises:
            VectorsDatasetError: If the vectors length doesn't match the expected length.
        """
        if len(vectors) != expected_length:
            raise VectorsDatasetError(
                f"Expected {expected_length} vectors, but got {len(vectors)}"
            )

    @staticmethod
    def parse(
        vectors: BytesIO,
        doc_ids: BytesIO,
        dimension: int,
        doc_count: int,
        vector_dtype: DataType,
    ):
        """Parse binary vector data and document IDs into numpy arrays.
        This method reads binary data for vectors and document IDs, validates their
        dimensions, and creates a new VectorsDataset instance.
        Args:
            vectors (BytesIO): Binary stream containing vector data.
            doc_ids (BytesIO): Binary stream containing document IDs.
            dimension (int): The dimensionality of each vector.
            doc_count (int): Expected number of vectors/documents.
            vector_dtype (DataType): The data type of the vector values.
        Returns:
            VectorsDataset: A new instance containing the parsed vectors and document IDs.
        Raises:
            VectorsDatasetError: If there are any errors during parsing or validation.
        """
        try:
            # Create a view into the buffer, to prevent additional allocation of memory
            vector_view = vectors.getbuffer()
            np_vectors = np.frombuffer(
                vector_view, dtype=VectorsDataset.get_numpy_dtype(vector_dtype)
            )
            VectorsDataset.check_dimensions(np_vectors, doc_count * dimension)
            np_vectors = np_vectors.reshape(doc_count, dimension)

            # Do the same for doc ids
            doc_id_view = doc_ids.getbuffer()
            np_doc_ids = np.frombuffer(doc_id_view, dtype="<i4")
            VectorsDataset.check_dimensions(np_doc_ids, doc_count)

        except (ValueError, TypeError, MemoryError, RuntimeError) as e:
            raise VectorsDatasetError(f"Error parsing vectors: {e}") from e
        return VectorsDataset(np_vectors, np_doc_ids)
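
A minimal sketch of how parse might be exercised with small in-memory buffers; the arrays below are illustrative only and not part of this commit:

# Hypothetical usage sketch with tiny in-memory buffers.
from io import BytesIO
import numpy as np
from core.common.models.index_build_parameters import DataType
from core.common.models.vectors_dataset import VectorsDataset

doc_count, dimension = 3, 4
# Serialize vectors as little-endian float32 and doc ids as little-endian int32,
# matching the dtypes that parse expects for FLOAT32 data.
vectors_buf = BytesIO(np.random.rand(doc_count, dimension).astype("<f4").tobytes())
doc_ids_buf = BytesIO(np.arange(doc_count, dtype="<i4").tobytes())

dataset = VectorsDataset.parse(
    vectors_buf, doc_ids_buf, dimension, doc_count, DataType.FLOAT32
)
assert dataset.vectors.shape == (doc_count, dimension)
dataset.free_vectors_space()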
11 changes: 11 additions & 0 deletions remote_vector_index_builder/core/main.py
@@ -4,3 +4,14 @@
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.

# TODO: Call each task from tasks.py in sequence, main()
# Add this file as the entry point for the Dockerfile


def main():
    pass


if __name__ == "__main__":
    main()
6 changes: 6 additions & 0 deletions remote_vector_index_builder/core/object_store/__init__.py
@@ -0,0 +1,6 @@
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.