diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c18dd8d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+__pycache__/
diff --git a/test_remote_vector_index_builder/test_placeholder.py b/remote_vector_index_builder/core/common/__init__.py
similarity index 82%
rename from test_remote_vector_index_builder/test_placeholder.py
rename to remote_vector_index_builder/core/common/__init__.py
index 2a99f7f..fe22b86 100644
--- a/test_remote_vector_index_builder/test_placeholder.py
+++ b/remote_vector_index_builder/core/common/__init__.py
@@ -4,7 +4,3 @@
 # The OpenSearch Contributors require contributions made to
 # this file be licensed under the Apache-2.0 license or a
 # compatible open source license.
-
-
-def test_placeholder():
-    assert 1 + 1 == 2
diff --git a/remote_vector_index_builder/core/common/exceptions.py b/remote_vector_index_builder/core/common/exceptions.py
new file mode 100644
index 0000000..0e5cf5c
--- /dev/null
+++ b/remote_vector_index_builder/core/common/exceptions.py
@@ -0,0 +1,36 @@
+# Copyright OpenSearch Contributors
+# SPDX-License-Identifier: Apache-2.0
+#
+# The OpenSearch Contributors require contributions made to
+# this file be licensed under the Apache-2.0 license or a
+# compatible open source license.
+
+"""
+Expose public exceptions & warnings
+"""
+
+
+class BlobError(Exception):
+    """Generic error raised when a blob is downloaded from or uploaded to the Object Store"""
+
+    def __init__(self, message: str):
+        super().__init__(message)
+
+
+class UnsupportedObjectStoreTypeError(ValueError):
+    """Error raised when creating an Object Store object with an unsupported repository type"""
+
+    pass
+
+
+class VectorsDatasetError(Exception):
+    """Generic error raised when converting a buffer into a Vector Dataset"""
+
+    def __init__(self, message: str):
+        super().__init__(message)
+
+
+class UnsupportedVectorsDataTypeError(ValueError):
+    """Error raised when creating a Vector Dataset because of an unsupported data type"""
+
+    pass
diff --git a/remote_vector_index_builder/core/common/models/__init__.py b/remote_vector_index_builder/core/common/models/__init__.py
new file mode 100644
index 0000000..fe22b86
--- /dev/null
+++ b/remote_vector_index_builder/core/common/models/__init__.py
@@ -0,0 +1,6 @@
+# Copyright OpenSearch Contributors
+# SPDX-License-Identifier: Apache-2.0
+#
+# The OpenSearch Contributors require contributions made to
+# this file be licensed under the Apache-2.0 license or a
+# compatible open source license.
diff --git a/remote_vector_index_builder/core/common/models/index_build_parameters.py b/remote_vector_index_builder/core/common/models/index_build_parameters.py
new file mode 100644
index 0000000..e09f433
--- /dev/null
+++ b/remote_vector_index_builder/core/common/models/index_build_parameters.py
@@ -0,0 +1,151 @@
+# Copyright OpenSearch Contributors
+# SPDX-License-Identifier: Apache-2.0
+#
+# The OpenSearch Contributors require contributions made to
+# this file be licensed under the Apache-2.0 license or a
+# compatible open source license.
+from enum import Enum
+from typing import Annotated
+
+from core.object_store.types import ObjectStoreType
+from pydantic import BaseModel, ConfigDict, Field
+
+# Type annotation for vector file paths that must end with .knnvec
+VectorPathRegex = Annotated[str, Field(pattern=".+\\.knnvec$")]
+
+
+class DataType(str, Enum):
+    """Supported data types for vector values.
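+
+    Because the enum derives from str, members compare equal to their wire
+    strings and can be constructed from them; a quick doctest-style sketch:
+
+        >>> DataType("fp32") is DataType.FLOAT32
+        True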
+
+    Attributes:
+        FLOAT32: 32-bit floating point values
+        FLOAT16: 16-bit floating point values
+        BYTE: 8-bit integer values
+        BINARY: Binary data format
+    """
+
+    FLOAT32 = "fp32"
+    FLOAT16 = "fp16"
+    BYTE = "byte"
+    BINARY = "binary"
+
+
+class SpaceType(str, Enum):
+    """Distance method used for measuring vector similarities.
+
+    Attributes:
+        L2: Euclidean distance
+        COSINESIMIL: Cosine similarity
+        L1: Manhattan distance
+        LINF: Chebyshev distance
+        INNERPRODUCT: Dot product similarity
+        HAMMING: Hamming distance for binary vectors
+    """
+
+    L2 = "l2"
+    COSINESIMIL = "cosinesimil"
+    L1 = "l1"
+    LINF = "linf"
+    INNERPRODUCT = "innerproduct"
+    HAMMING = "hamming"
+
+
+class Algorithm(str, Enum):
+    """Supported algorithms for vector indexing.
+
+    Attributes:
+        HNSW: Hierarchical Navigable Small World graph
+    """
+
+    HNSW = "hnsw"
+
+
+class Engine(str, Enum):
+    """Available vector search engines.
+
+    Attributes:
+        FAISS: Facebook AI Similarity Search
+    """
+
+    FAISS = "faiss"
+
+
+class AlgorithmParameters(BaseModel):
+    """Configuration parameters for the HNSW algorithm.
+
+    Attributes:
+        ef_construction (int): Size of the dynamic candidate list for constructing
+            the HNSW graph. Higher values lead to better quality but slower
+            index construction. Defaults to 100.
+        m (int): Number of bi-directional links created for every new element
+            during construction. Higher values lead to better search speed but
+            more memory consumption. Defaults to 16.
+
+    Note:
+        The class is configured to allow extra attributes using the ConfigDict class.
+    """
+
+    ef_construction: int = 100
+    m: int = 16
+    model_config = ConfigDict(extra="allow")
+
+
+class IndexParameters(BaseModel):
+    """Configuration parameters for vector index construction.
+
+    This class defines the core index configuration including the algorithm type,
+    distance metric, and algorithm-specific parameters.
+
+    Attributes:
+        algorithm (Algorithm): The vector indexing algorithm to use.
+            Defaults to HNSW (Hierarchical Navigable Small World).
+        space_type (SpaceType): The distance metric to use for vector comparisons.
+            Defaults to L2 (Euclidean distance).
+        algorithm_parameters (AlgorithmParameters): Specific parameters for the chosen
+            algorithm. Defaults to standard HNSW parameters (ef_construction=100, m=16).
+    """
+
+    algorithm: Algorithm = Algorithm.HNSW
+    space_type: SpaceType = SpaceType.L2
+    algorithm_parameters: AlgorithmParameters = Field(
+        default_factory=AlgorithmParameters
+    )
+
+
+class IndexBuildParameters(BaseModel):
+    """Parameters required for building a vector index.
+
+    This class encapsulates all necessary parameters for constructing a vector index,
+    including data source information, vector specifications, and index configuration.
+
+    Attributes:
+        repository_type (ObjectStoreType): The type of repository where the vector data
+            is stored. Defaults to s3.
+        container_name (str): Name of the container (e.g., S3 bucket) containing the vector data.
+        vector_path (VectorPathRegex): Path to the vector data file. Must end with the .knnvec extension.
+        doc_id_path (str): Path to the document IDs corresponding to the vectors.
+        tenant_id (str): Optional identifier for multi-tenant scenarios. Defaults to an empty string.
+        dimension (int): The dimensionality of the vectors to be indexed. Must be greater than 0.
+        doc_count (int): Total number of documents/vectors to be indexed. Must be greater than 0.
+        data_type (DataType): The numerical format of the vector data.
+            Defaults to FLOAT32.
+        engine (Engine): The vector search engine to use for indexing.
+            Defaults to FAISS.
+        index_parameters (IndexParameters): Configuration for the index structure
+            and algorithm. Defaults to the standard HNSW configuration.
+
+    Note:
+        The class is configured to forbid extra attributes using the ConfigDict class,
+        ensuring strict parameter validation.
+    """
+
+    repository_type: ObjectStoreType = ObjectStoreType.S3
+    container_name: str
+    vector_path: VectorPathRegex
+    doc_id_path: str
+    tenant_id: str = ""
+    dimension: int = Field(gt=0)
+    doc_count: int = Field(gt=0)
+    data_type: DataType = DataType.FLOAT32
+    engine: Engine = Engine.FAISS
+    index_parameters: IndexParameters = Field(default_factory=IndexParameters)
+    model_config = ConfigDict(extra="forbid")
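A quick sketch of how these models validate input (the bucket and paths are
illustrative; extra="forbid" rejects unknown fields, and vector_path must
match the .knnvec pattern):

    from pydantic import ValidationError

    params = IndexBuildParameters(
        container_name="my-vector-bucket",        # hypothetical bucket
        vector_path="builds/abc123/vecs.knnvec",  # pattern-checked
        doc_id_path="builds/abc123/doc_ids.bin",
        dimension=768,
        doc_count=1_000_000,
    )
    assert params.index_parameters.algorithm_parameters.ef_construction == 100

    try:
        IndexBuildParameters(container_name="b", vector_path="vecs.txt",
                             doc_id_path="d", dimension=1, doc_count=1)
    except ValidationError:
        pass  # "vecs.txt" fails the .knnvec pattern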
diff --git a/remote_vector_index_builder/core/common/models/vectors_dataset.py b/remote_vector_index_builder/core/common/models/vectors_dataset.py
new file mode 100644
index 0000000..cc73d42
--- /dev/null
+++ b/remote_vector_index_builder/core/common/models/vectors_dataset.py
@@ -0,0 +1,119 @@
+# Copyright OpenSearch Contributors
+# SPDX-License-Identifier: Apache-2.0
+#
+# The OpenSearch Contributors require contributions made to
+# this file be licensed under the Apache-2.0 license or a
+# compatible open source license.
+
+from dataclasses import dataclass
+from io import BytesIO
+
+import numpy as np
+from core.common.exceptions import UnsupportedVectorsDataTypeError, VectorsDatasetError
+from core.common.models.index_build_parameters import DataType
+
+
+@dataclass
+class VectorsDataset:
+    """A class for handling vector datasets and their associated document IDs.
+
+    This class provides functionality to parse, validate, and store vector data along with
+    their corresponding document IDs. It supports multiple data types including FLOAT32,
+    FLOAT16, BYTE, and BINARY formats.
+
+    Attributes:
+        vectors (numpy.ndarray): The array of vectors, where each row represents a vector.
+        doc_ids (numpy.ndarray): Array of document IDs corresponding to the vectors.
+    """
+
+    vectors: np.ndarray
+    doc_ids: np.ndarray
+
+    def free_vectors_space(self):
+        """Free up memory by deleting the vectors and document IDs arrays."""
+        del self.vectors
+        del self.doc_ids
+
+    @staticmethod
+    def get_numpy_dtype(dtype: DataType):
+        """Convert DataType enum to numpy dtype string.
+
+        Args:
+            dtype (DataType): The data type enum value to convert.
+
+        Returns:
+            str: The corresponding numpy dtype string.
+
+        Raises:
+            UnsupportedVectorsDataTypeError: If the provided data type is not supported.
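+
+        Example:
+            A doctest-style sketch, assuming the little-endian numpy type
+            codes used in the method body (e.g. "<f4" for 32-bit floats):
+
+                >>> VectorsDataset.get_numpy_dtype(DataType.FLOAT32)
+                '<f4'
+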
+ """ + if dtype == DataType.FLOAT32: + return " None: + """ + Downloads the blob from the remote_store_path, to a buffer in memory + + Args: + remote_store_path (str): The path/key to the remote object to be downloaded + bytes_buffer (BytesIO): A bytes buffer where the downloaded data will be stored + + Returns: + None + + Note: + - The bytes_buffer should be properly initialized before passing to this method + - Caller is also responsible for cleaning up the bytes buffer + - Implementations should handle any necessary authentication and error handling + """ + pass + + @abstractmethod + def write_blob(self, local_file_path: str, remote_store_path: str) -> None: + """ + Uploads the blob at local_file_path to the remote_store_path + + Args: + local_file_path (str): Path to the local file that needs to be uploaded + remote_store_path (str): The path/key where the file should be stored in remote storage + + Returns: + None + + Note: + - Implementations should handle any necessary authentication and error handling + - The local file must exist and be readable + - The remote path should be valid for the specific storage implementation + """ + pass diff --git a/remote_vector_index_builder/core/object_store/object_store_factory.py b/remote_vector_index_builder/core/object_store/object_store_factory.py new file mode 100644 index 0000000..69bb44b --- /dev/null +++ b/remote_vector_index_builder/core/object_store/object_store_factory.py @@ -0,0 +1,56 @@ +# Copyright OpenSearch Contributors +# SPDX-License-Identifier: Apache-2.0 +# +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. + +from typing import Any, Dict + +from core.common.exceptions import UnsupportedObjectStoreTypeError +from core.common.models.index_build_parameters import IndexBuildParameters +from core.object_store.object_store import ObjectStore +from core.object_store.s3.s3_object_store import S3ObjectStore +from core.object_store.types import ObjectStoreType + + +class ObjectStoreFactory: + """ + A factory class for creating object store instances. + + This class provides a static method to create appropriate object store instances + based on the repository type specified in the index build parameters. It serves + as a central point for object store instance creation and helps maintain loose + coupling between different object store implementations. + """ + + @staticmethod + def create_object_store( + index_build_params: IndexBuildParameters, object_store_config: Dict[str, Any] + ) -> ObjectStore: + """ + Creates and returns an appropriate object store instance based on the repository type. + + Args: + index_build_params (IndexBuildParameters): Parameters for index building, including + the repository type that determines which object store implementation to use. + object_store_config (Dict[str, Any]): Configuration dictionary containing settings + specific to the object store implementation. + + Returns: + ObjectStore: An instance of the appropriate object store implementation. + + Raises: + UnsupportedObjectStoreTypeError: If the specified repository type is not supported. 
+
+        Example:
+            params = IndexBuildParameters(repository_type=ObjectStoreType.S3)
+            config = {"region": "us-west-2"}
+            store = ObjectStoreFactory.create_object_store(params, config)
+        """
+        if index_build_params.repository_type == ObjectStoreType.S3:
+            return S3ObjectStore(index_build_params, object_store_config)
+        else:
+            raise UnsupportedObjectStoreTypeError(
+                f"Unknown object store type: {index_build_params.repository_type}"
+            )
diff --git a/remote_vector_index_builder/core/object_store/s3/__init__.py b/remote_vector_index_builder/core/object_store/s3/__init__.py
new file mode 100644
index 0000000..fe22b86
--- /dev/null
+++ b/remote_vector_index_builder/core/object_store/s3/__init__.py
@@ -0,0 +1,6 @@
+# Copyright OpenSearch Contributors
+# SPDX-License-Identifier: Apache-2.0
+#
+# The OpenSearch Contributors require contributions made to
+# this file be licensed under the Apache-2.0 license or a
+# compatible open source license.
diff --git a/remote_vector_index_builder/core/object_store/s3/s3_object_store.py b/remote_vector_index_builder/core/object_store/s3/s3_object_store.py
new file mode 100644
index 0000000..3e5835a
--- /dev/null
+++ b/remote_vector_index_builder/core/object_store/s3/s3_object_store.py
@@ -0,0 +1,225 @@
+# Copyright OpenSearch Contributors
+# SPDX-License-Identifier: Apache-2.0
+#
+# The OpenSearch Contributors require contributions made to
+# this file be licensed under the Apache-2.0 license or a
+# compatible open source license.
+
+import logging
+import os
+import threading
+from functools import cache
+from io import BytesIO
+from typing import Any, Dict
+
+import boto3
+from boto3.s3.transfer import TransferConfig
+from botocore.config import Config
+from botocore.exceptions import ClientError
+from core.common.exceptions import BlobError
+from core.common.models.index_build_parameters import IndexBuildParameters
+from core.object_store.object_store import ObjectStore
+
+logger = logging.getLogger(__name__)
+
+
+@cache
+def get_boto3_client(region: str, retries: int) -> boto3.client:
+    """Create or retrieve a cached boto3 S3 client.
+
+    Args:
+        region (str): AWS region name for the S3 client
+        retries (int): Maximum number of retry attempts for failed requests
+
+    Returns:
+        boto3.client: Configured S3 client instance
+    """
+    config = Config(retries={"max_attempts": retries})
+    return boto3.client("s3", config=config, region_name=region)
+
+
+class S3ObjectStore(ObjectStore):
+    """S3 implementation of the ObjectStore interface for managing vector data files.
+
+    This class handles interactions with AWS S3, including file uploads and downloads,
+    with configurable retry logic and transfer settings for optimal performance.
+
+    Attributes:
+        DEFAULT_TRANSFER_CONFIG (dict): Default configuration for S3 file transfers,
+            including chunk sizes, concurrency, and retry settings
+
+    Args:
+        index_build_params (IndexBuildParameters): Parameters for the index building process
+        object_store_config (Dict[str, Any]): Configuration options for S3 interactions
+    """
+
+    DEFAULT_TRANSFER_CONFIG = {
+        "multipart_chunksize": 10 * 1024 * 1024,  # 10MB
+        "max_concurrency": (os.cpu_count() or 2)
+        // 2,  # os.cpu_count() can be None, according to mypy.
+        # If it is None, then default to 1 thread
+        "multipart_threshold": 10 * 1024 * 1024,  # 10MB
+        "use_threads": True,
+        "max_bandwidth": None,
+        "io_chunksize": 256 * 1024,  # 256KB
+        "num_download_attempts": 5,
+        "max_io_queue": 100,
+        "preferred_transfer_client": "auto",
+    }

+    def __init__(
+        self,
+        index_build_params: IndexBuildParameters,
+        object_store_config: Dict[str, Any],
+    ):
+        """Initialize the S3ObjectStore with the given parameters and configuration.
+
+        Args:
+            index_build_params (IndexBuildParameters): Contains bucket name and other
+                index building parameters
+            object_store_config (Dict[str, Any]): Configuration dictionary containing:
+                - retries (int): Maximum number of retry attempts (default: 3)
+                - region (str): AWS region name (default: 'us-west-2')
+                - transfer_config (Dict[str, Any]): S3 TransferConfig parameters
+                - debug (bool): Turns on debug mode (default: False)
+        """
+        self.bucket = index_build_params.container_name
+        self.max_retries = object_store_config.get("retries", 3)
+        self.region = object_store_config.get("region", "us-west-2")
+
+        self.s3_client = get_boto3_client(region=self.region, retries=self.max_retries)
+
+        transfer_config = object_store_config.get("transfer_config", {})
+        # Create transfer config with validated parameters
+        self.transfer_config = self._create_transfer_config(transfer_config)
+
+        self.debug = object_store_config.get("debug", False)
+
+        # Debug mode provides progress tracking on downloads and uploads
+        if self.debug:
+            self._read_progress = 0
+            self._read_progress_lock = threading.Lock()
+            self._write_progress = 0
+            self._write_progress_lock = threading.Lock()
+
+    def _create_transfer_config(self, custom_config: Dict[str, Any]) -> TransferConfig:
+        """
+        Creates a TransferConfig with custom parameters while maintaining defaults
+        for unspecified values.
+
+        Args:
+            custom_config: Dictionary of custom transfer configuration parameters
+
+        Returns:
+            TransferConfig: Configured transfer configuration object
+        """
+        # Start with default values
+        config_params = self.DEFAULT_TRANSFER_CONFIG.copy()
+
+        # Update with custom values, only if they are valid parameters
+        for key, value in custom_config.items():
+            if key in self.DEFAULT_TRANSFER_CONFIG:
+                config_params[key] = value
+            else:
+                logger.warning(f"Ignoring invalid transfer config parameter: {key}")
+
+        # Remove None values to let boto3 use its internal defaults
+        config_params = {k: v for k, v in config_params.items() if v is not None}
+
+        return TransferConfig(**config_params)
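+
+    # For example, a caller could tune multipart behavior through
+    # object_store_config (a sketch; the values are illustrative, and the
+    # unknown key is included only to show that it is ignored with a warning):
+    #
+    #   store = S3ObjectStore(index_build_params, {
+    #       "region": "us-west-2",
+    #       "transfer_config": {
+    #           "max_concurrency": 8,
+    #           "multipart_chunksize": 32 * 1024 * 1024,  # 32MB parts
+    #           "not_a_real_option": True,  # logged and ignored
+    #       },
+    #   })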
+
+    def read_blob(self, remote_store_path: str, bytes_buffer: BytesIO) -> None:
+        """
+        Downloads a blob from S3 to the provided bytes buffer, with retry logic.
+
+        Args:
+            remote_store_path (str): The S3 key (path) of the object to download
+            bytes_buffer (BytesIO): A bytes buffer to store the downloaded data
+
+        Returns:
+            None
+
+        Note:
+            - boto3 automatically handles retries for the exceptions given here:
+              https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html
+            - Resets the buffer position to 0 after a successful download
+            - Uses the configured TransferConfig for download parameters
+            - boto3 may perform the download in parallel multipart chunks,
+              based on the TransferConfig settings
+
+        Raises:
+            BlobError: If the download fails after all retry attempts or encounters a non-retryable error
+        """
+
+        callback_func = None
+
+        # Set up progress callback, if debug mode is on
+        if self.debug:
+            with self._read_progress_lock:
+                self._read_progress = 0
+
+            def callback(bytes_transferred):
+                with self._read_progress_lock:
+                    self._read_progress += bytes_transferred
+                    logger.info(f"Downloaded: {self._read_progress:,} bytes")
+
+            callback_func = callback
+
+        try:
+            self.s3_client.download_fileobj(
+                self.bucket,
+                remote_store_path,
+                bytes_buffer,
+                Config=self.transfer_config,
+                Callback=callback_func,
+            )
+            return
+        except ClientError as e:
+            raise BlobError(f"Error downloading file: {e}") from e
+
+    def write_blob(self, local_file_path: str, remote_store_path: str) -> None:
+        """
+        Uploads a local file to S3, with retry logic.
+
+        Args:
+            local_file_path (str): Path to the local file to be uploaded
+            remote_store_path (str): The S3 key (path) where the file will be stored
+
+        Returns:
+            None
+
+        Note:
+            - boto3 automatically handles retries for the exceptions given here:
+              https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html
+            - Uses the configured TransferConfig for upload parameters
+            - boto3 may perform the upload in parallel multipart chunks, based on the TransferConfig settings
+
+        Raises:
+            BlobError: If the upload fails after all retry attempts or encounters a non-retryable error
+        """
+
+        callback_func = None
+        if self.debug:
+            # Set up progress callback, if debug mode is on
+            with self._write_progress_lock:
+                self._write_progress = 0
+
+            def callback(bytes_amount):
+                with self._write_progress_lock:
+                    self._write_progress += bytes_amount
+                    logger.info(f"Uploaded: {self._write_progress:,} bytes")
+
+            callback_func = callback
+
+        try:
+            self.s3_client.upload_file(
+                local_file_path,
+                self.bucket,
+                remote_store_path,
+                Config=self.transfer_config,
+                Callback=callback_func,
+            )
+            return
+        except ClientError as e:
+            raise BlobError(f"Error uploading file: {e}") from e
diff --git a/remote_vector_index_builder/core/object_store/types.py b/remote_vector_index_builder/core/object_store/types.py
new file mode 100644
index 0000000..90a633e
--- /dev/null
+++ b/remote_vector_index_builder/core/object_store/types.py
@@ -0,0 +1,22 @@
+# Copyright OpenSearch Contributors
+# SPDX-License-Identifier: Apache-2.0
+#
+# The OpenSearch Contributors require contributions made to
+# this file be licensed under the Apache-2.0 license or a
+# compatible open source license.
+
+from enum import Enum
+
+
+class ObjectStoreType(str, Enum):
+    """
+    Enumeration of supported object store types.
+
+    This enum inherits from both str and Enum to provide string-based
+    enumeration values, allowing for easy serialization and comparison.
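+
+    For example, members compare equal to their plain string values:
+
+        >>> ObjectStoreType.S3 == "s3"
+        True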
+
+    Attributes:
+        S3 (str): Represents the Amazon S3 object storage service
+    """
+
+    S3 = "s3"
diff --git a/remote_vector_index_builder/core/tasks.py b/remote_vector_index_builder/core/tasks.py
new file mode 100644
index 0000000..7866baf
--- /dev/null
+++ b/remote_vector_index_builder/core/tasks.py
@@ -0,0 +1,130 @@
+# Copyright OpenSearch Contributors
+# SPDX-License-Identifier: Apache-2.0
+#
+# The OpenSearch Contributors require contributions made to
+# this file be licensed under the Apache-2.0 license or a
+# compatible open source license.
+
+"""
+core.tasks
+~~~~~~~~~~~~~~~~~
+
+This module contains the tasks necessary to build an index on GPUs.
+These tasks must be run in the following sequence, for a given build request:
+1. create_vectors_dataset
+2. build_index
+3. upload_index
+
+"""
+import logging
+from io import BytesIO
+from typing import Any, Dict
+
+from core.common.models.index_build_parameters import IndexBuildParameters
+from core.common.models.vectors_dataset import VectorsDataset
+from core.object_store.object_store_factory import ObjectStoreFactory
+
+logger = logging.getLogger(__name__)
+
+# TODO: Create build_index task
+
+
+def create_vectors_dataset(
+    index_build_params: IndexBuildParameters,
+    object_store_config: Dict[str, Any],
+    vector_bytes_buffer: BytesIO,
+    doc_id_bytes_buffer: BytesIO,
+) -> VectorsDataset:
+    """
+    Downloads vector and document ID data from object storage and creates a VectorsDataset.
+
+    This function performs the first step in the index building process by:
+    1. Creating an appropriate object store instance
+    2. Downloading vector data from the specified vector_path, into the vector_bytes_buffer
+    3. Downloading document IDs from the specified doc_id_path, into the doc_id_bytes_buffer
+    4. Combining them into a VectorsDataset object
+
+    Args:
+        index_build_params (IndexBuildParameters): Contains the configuration for the index build,
+            including:
+            - vector_path: Path to the vector data in object storage
+            - doc_id_path: Path to the document IDs in object storage
+            - repository_type: Type of object store to use
+        object_store_config (Dict[str, Any]): Configuration for the object store
+            containing connection details
+        vector_bytes_buffer: Buffer for storing vector binary data
+        doc_id_bytes_buffer: Buffer for storing doc id binary data
+
+    Returns:
+        VectorsDataset: A dataset object containing:
+            - The downloaded vectors in the specified format
+            - Associated document IDs for each vector
+
+    Note:
+        - Uses BytesIO buffers for memory-efficient data handling
+        - The caller is responsible for closing each buffer
+        - Before closing the buffers, the caller must call free_vectors_space on the
+          VectorsDataset object, to remove all references to the underlying data.
+        - Both vector and document ID files must exist in object storage
+        - The number of vectors must match the number of document IDs
+        - Memory usage scales with the size of the vector and document ID data
+
+    Raises:
+        BlobError: If there are issues accessing or reading from object storage
+        VectorsDatasetError: If there are issues parsing the vectors and/or doc IDs into a VectorsDataset
+        UnsupportedVectorsDataTypeError: If the index_build_params.data_type is not supported
+        UnsupportedObjectStoreTypeError: If the index_build_params.repository_type is not supported
+
+    """
+    object_store = ObjectStoreFactory.create_object_store(
+        index_build_params, object_store_config
+    )
+
+    object_store.read_blob(index_build_params.vector_path, vector_bytes_buffer)
+    object_store.read_blob(index_build_params.doc_id_path, doc_id_bytes_buffer)
+
+    return VectorsDataset.parse(
+        vector_bytes_buffer,
+        doc_id_bytes_buffer,
+        index_build_params.dimension,
+        index_build_params.doc_count,
+        index_build_params.data_type,
+    )
+
+
+def upload_index(
+    index_build_params: IndexBuildParameters,
+    object_store_config: Dict[str, Any],
+    index_local_path: str,
+) -> None:
+    """
+    Uploads a built index from a local path to the configured object store.
+
+    Args:
+        index_build_params (IndexBuildParameters): Parameters for the index build process,
+            containing the vector path which is used to determine the upload destination
+        object_store_config (Dict[str, Any]): Configuration dictionary for the object store
+            containing connection details
+        index_local_path (str): Local filesystem path where the built index is stored
+
+    Returns:
+        None
+
+    Note:
+        - Creates an object store instance based on the provided configuration
+        - Uses the vector_path from index_build_params to determine the upload destination
+        - The index_local_path must exist and be readable
+        - The function assumes index_build_params has been validated by Pydantic
+
+    Raises:
+        BlobError: If there are issues uploading to the object store
+        UnsupportedObjectStoreTypeError: If the index_build_params.repository_type is not supported
+    """
+    object_store = ObjectStoreFactory.create_object_store(
+        index_build_params, object_store_config
+    )
+
+    # vector_path is unique for each index build request, so we can simply append the local path
+    index_remote_path = index_build_params.vector_path + index_local_path
+
+    object_store.write_blob(index_local_path, index_remote_path)
diff --git a/remote_vector_index_builder/requirements.txt b/remote_vector_index_builder/requirements.txt
new file mode 100644
index 0000000..96c9f4f
--- /dev/null
+++ b/remote_vector_index_builder/requirements.txt
@@ -0,0 +1,3 @@
+pydantic>=2.7.0,<3.0.0
+boto3>=1.36,<2.0.0
+numpy>=1.26,<2.0.0
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index b3066ad..eebf4d4 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -16,3 +16,9 @@ show-source = True
 [black]
 max-line-length = 120
 target-version = 'py38'
+
+[mypy]
+ignore_missing_imports=True
+
+[tool:pytest]
+pythonpath = remote_vector_index_builder
\ No newline at end of file
diff --git a/test_remote_vector_index_builder/test_core/__init__.py b/test_remote_vector_index_builder/test_core/__init__.py
new file mode 100644
index 0000000..fe22b86
--- /dev/null
+++ b/test_remote_vector_index_builder/test_core/__init__.py
@@ -0,0 +1,6 @@
+# Copyright OpenSearch Contributors
+# SPDX-License-Identifier: Apache-2.0
+#
+# The OpenSearch Contributors require contributions made to
+# this file be licensed under the Apache-2.0 license or a
+# compatible open source license.
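Taken together, the new tasks compose like this (a minimal usage sketch; the
bucket, paths, counts, and local index path are illustrative, and the GPU
build_index step between download and upload is still a TODO):

    from io import BytesIO

    from core.common.models.index_build_parameters import IndexBuildParameters
    from core.tasks import create_vectors_dataset, upload_index

    params = IndexBuildParameters(
        container_name="my-vector-bucket",        # hypothetical bucket
        vector_path="builds/abc123/vecs.knnvec",  # must end with .knnvec
        doc_id_path="builds/abc123/doc_ids.bin",
        dimension=3,
        doc_count=2,
    )
    config = {"region": "us-west-2"}

    vec_buf, id_buf = BytesIO(), BytesIO()
    try:
        dataset = create_vectors_dataset(params, config, vec_buf, id_buf)
        # ... build_index would run here once implemented ...
        dataset.free_vectors_space()  # drop references before closing buffers
    finally:
        vec_buf.close()
        id_buf.close()

    upload_index(params, config, "/tmp/index.bin")  # hypothetical local path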
diff --git a/test_remote_vector_index_builder/test_core/common/__init__.py b/test_remote_vector_index_builder/test_core/common/__init__.py
new file mode 100644
index 0000000..fe22b86
--- /dev/null
+++ b/test_remote_vector_index_builder/test_core/common/__init__.py
@@ -0,0 +1,6 @@
+# Copyright OpenSearch Contributors
+# SPDX-License-Identifier: Apache-2.0
+#
+# The OpenSearch Contributors require contributions made to
+# this file be licensed under the Apache-2.0 license or a
+# compatible open source license.
diff --git a/test_remote_vector_index_builder/test_core/common/models/__init__.py b/test_remote_vector_index_builder/test_core/common/models/__init__.py
new file mode 100644
index 0000000..fe22b86
--- /dev/null
+++ b/test_remote_vector_index_builder/test_core/common/models/__init__.py
@@ -0,0 +1,6 @@
+# Copyright OpenSearch Contributors
+# SPDX-License-Identifier: Apache-2.0
+#
+# The OpenSearch Contributors require contributions made to
+# this file be licensed under the Apache-2.0 license or a
+# compatible open source license.
diff --git a/test_remote_vector_index_builder/test_core/common/models/test_vectors_dataset.py b/test_remote_vector_index_builder/test_core/common/models/test_vectors_dataset.py
new file mode 100644
index 0000000..06a8cbe
--- /dev/null
+++ b/test_remote_vector_index_builder/test_core/common/models/test_vectors_dataset.py
@@ -0,0 +1,167 @@
+# Copyright OpenSearch Contributors
+# SPDX-License-Identifier: Apache-2.0
+#
+# The OpenSearch Contributors require contributions made to
+# this file be licensed under the Apache-2.0 license or a
+# compatible open source license.
+from io import BytesIO
+from unittest.mock import patch
+
+import numpy as np
+import pytest
+from core.common.exceptions import UnsupportedVectorsDataTypeError, VectorsDatasetError
+from core.common.models.index_build_parameters import DataType
+from core.common.models.vectors_dataset import VectorsDataset
+
+
+@pytest.fixture
+def sample_vectors():
+    # Create sample float32 vectors (2 vectors of dimension 3)
+    return np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype="