Added create_vectors_dataset and upload_index tasks in core package (#13)

Signed-off-by: Rohan Chitale <rchital@amazon.com>
rchitale7 authored Feb 28, 2025
1 parent 5ae3ed5 commit 01ac863
Showing 26 changed files with 1,458 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
__pycache__/
@@ -4,7 +4,3 @@
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.


def test_placeholder():
    assert 1 + 1 == 2
36 changes: 36 additions & 0 deletions remote_vector_index_builder/core/common/exceptions.py
@@ -0,0 +1,36 @@
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.

"""
Expose public exceptions & warnings
"""


class BlobError(Exception):
    """Generic error raised when a blob is downloaded from or uploaded to the Object Store"""

    def __init__(self, message: str):
        super().__init__(message)


class UnsupportedObjectStoreTypeError(ValueError):
    """Error raised when creating an Object Store object"""

    pass


class VectorsDatasetError(Exception):
    """Generic error raised when converting a buffer into a Vectors Dataset"""

    def __init__(self, message: str):
        super().__init__(message)


class UnsupportedVectorsDataTypeError(ValueError):
    """Error raised when creating a Vectors Dataset because of an unsupported data type"""

    pass
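
For context, a brief sketch of how these exception types are meant to be used by callers; the registry and helper below are hypothetical and not part of this commit:

# Hypothetical illustration -- SUPPORTED_STORES and validate_store_type are not part of this change.
from core.common.exceptions import UnsupportedObjectStoreTypeError

SUPPORTED_STORES = {"s3"}  # placeholder registry for this sketch


def validate_store_type(repository_type: str) -> None:
    # Reject repository types that the object store factory cannot handle.
    if repository_type not in SUPPORTED_STORES:
        raise UnsupportedObjectStoreTypeError(
            f"Unsupported object store type: {repository_type}"
        )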
6 changes: 6 additions & 0 deletions remote_vector_index_builder/core/common/models/__init__.py
@@ -0,0 +1,6 @@
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
151 changes: 151 additions & 0 deletions remote_vector_index_builder/core/common/models/index_build_parameters.py
@@ -0,0 +1,151 @@
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
from enum import Enum
from typing import Annotated

from core.object_store.types import ObjectStoreType
from pydantic import BaseModel, ConfigDict, Field

# Type annotation for vector file paths that must end with .knnvec
VectorPathRegex = Annotated[str, Field(pattern=".+\\.knnvec$")]


class DataType(str, Enum):
    """Supported data types for vector values.
    Attributes:
        FLOAT32: 32-bit floating point values
        FLOAT16: 16-bit floating point values
        BYTE: 8-bit integer values
        BINARY: Binary data format
    """

    FLOAT32 = "fp32"
    FLOAT16 = "fp16"
    BYTE = "byte"
    BINARY = "binary"


class SpaceType(str, Enum):
    """Distance method used for measuring vector similarities.
    Attributes:
        L2: Euclidean distance
        COSINESIMIL: Cosine similarity
        L1: Manhattan distance
        LINF: Chebyshev distance
        INNERPRODUCT: Dot product similarity
        HAMMING: Hamming distance for binary vectors
    """

    L2 = "l2"
    COSINESIMIL = "cosinesimil"
    L1 = "l1"
    LINF = "linf"
    INNERPRODUCT = "innerproduct"
    HAMMING = "hamming"


class Algorithm(str, Enum):
    """Supported algorithms for vector indexing.
    Attributes:
        HNSW: Hierarchical Navigable Small World graph
    """

    HNSW = "hnsw"


class Engine(str, Enum):
    """Available vector search engines.
    Attributes:
        FAISS: Facebook AI Similarity Search
    """

    FAISS = "faiss"


class AlgorithmParameters(BaseModel):
    """Configuration parameters for the HNSW algorithm.
    Attributes:
        ef_construction (int): Size of the dynamic candidate list for constructing
            the HNSW graph. Higher values lead to better quality but slower
            index construction. Defaults to 100.
        m (int): Number of bi-directional links created for every new element
            during construction. Higher values lead to better search speed but
            more memory consumption. Defaults to 16.
    Note:
        The class is configured to allow extra attributes using the ConfigDict class.
    """

    ef_construction: int = 100
    m: int = 16
    model_config = ConfigDict(extra="allow")


class IndexParameters(BaseModel):
    """Configuration parameters for vector index construction.
    This class defines the core index configuration including the algorithm type,
    distance metric, and algorithm-specific parameters.
    Attributes:
        algorithm (Algorithm): The vector indexing algorithm to use.
            Defaults to HNSW (Hierarchical Navigable Small World).
        space_type (SpaceType): The distance metric to use for vector comparisons.
            Defaults to L2 (Euclidean distance).
        algorithm_parameters (AlgorithmParameters): Specific parameters for the chosen
            algorithm. Defaults to standard HNSW parameters (ef_construction=100, m=16).
    """

    algorithm: Algorithm = Algorithm.HNSW
    space_type: SpaceType = SpaceType.L2
    algorithm_parameters: AlgorithmParameters = Field(
        default_factory=AlgorithmParameters
    )


class IndexBuildParameters(BaseModel):
    """Parameters required for building a vector index.
    This class encapsulates all necessary parameters for constructing a vector index,
    including data source information, vector specifications, and index configuration.
    Attributes:
        repository_type (ObjectStoreType): The type of repository where the vector data is stored.
            Defaults to s3.
        container_name (str): Name of the container (e.g., S3 bucket) containing the vector data.
        vector_path (VectorPathRegex): Path to the vector data file. Must end with the .knnvec extension.
        doc_id_path (str): Path to the document IDs corresponding to the vectors.
        tenant_id (str): Optional identifier for multi-tenant scenarios. Defaults to an empty string.
        dimension (int): The dimensionality of the vectors to be indexed. Must be greater than 0.
        doc_count (int): Total number of documents/vectors to be indexed. Must be greater than 0.
        data_type (DataType): The numerical format of the vector data.
            Defaults to FLOAT32.
        engine (Engine): The vector search engine to use for indexing.
            Defaults to FAISS.
        index_parameters (IndexParameters): Configuration for the index structure
            and algorithm. Defaults to the standard HNSW configuration.
    Note:
        The class is configured to forbid extra attributes using the ConfigDict class,
        ensuring strict parameter validation.
    """

    repository_type: ObjectStoreType = ObjectStoreType.S3
    container_name: str
    vector_path: VectorPathRegex
    doc_id_path: str
    tenant_id: str = ""
    dimension: int = Field(gt=0)
    doc_count: int = Field(gt=0)
    data_type: DataType = DataType.FLOAT32
    engine: Engine = Engine.FAISS
    index_parameters: IndexParameters = Field(default_factory=IndexParameters)
    model_config = ConfigDict(extra="forbid")
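
For reference, a minimal usage sketch of the new model; the bucket and path values below are placeholders, not part of this change:

# Hypothetical usage sketch -- all values are illustrative.
from core.common.models.index_build_parameters import IndexBuildParameters

params = IndexBuildParameters(
    container_name="my-vector-bucket",       # e.g. an S3 bucket name
    vector_path="vectors/batch-001.knnvec",  # must match the .knnvec pattern
    doc_id_path="vectors/batch-001.docids",
    dimension=768,
    doc_count=10_000,
)
# Unset fields fall back to their defaults: repository_type=s3, engine=faiss,
# data_type=fp32, and an HNSW index with ef_construction=100 and m=16.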
119 changes: 119 additions & 0 deletions remote_vector_index_builder/core/common/models/vectors_dataset.py
@@ -0,0 +1,119 @@
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.

from dataclasses import dataclass
from io import BytesIO

import numpy as np
from core.common.exceptions import UnsupportedVectorsDataTypeError, VectorsDatasetError
from core.common.models.index_build_parameters import DataType


@dataclass
class VectorsDataset:
    """A class for handling vector datasets and their associated document IDs.
    This class provides functionality to parse, validate, and store vector data along with
    their corresponding document IDs. It supports multiple data types including FLOAT32,
    FLOAT16, BYTE, and BINARY formats.
    Attributes:
        vectors (numpy.ndarray): The array of vectors, where each row represents a vector.
        doc_ids (numpy.ndarray): Array of document IDs corresponding to the vectors.
    """

    vectors: np.ndarray
    doc_ids: np.ndarray

    def free_vectors_space(self):
        """Free up memory by deleting the vectors and document IDs arrays."""
        del self.vectors
        del self.doc_ids

    @staticmethod
    def get_numpy_dtype(dtype: DataType):
        """Convert DataType enum to numpy dtype string.
        Args:
            dtype (DataType): The data type enum value to convert.
        Returns:
            str: The corresponding numpy dtype string.
        Raises:
            UnsupportedVectorsDataTypeError: If the provided data type is not supported.
        """
        if dtype == DataType.FLOAT32:
            return "<f4"
        elif dtype == DataType.FLOAT16:
            return "<f2"
        elif dtype == DataType.BYTE:
            return "<i1"
        elif dtype == DataType.BINARY:
            return "<i1"
        else:
            raise UnsupportedVectorsDataTypeError(f"Unsupported data type: {dtype}")

    @staticmethod
    def check_dimensions(vectors, expected_length):
        """Validate that the vector array has the expected length.
        Args:
            vectors: Array-like object to check.
            expected_length (int): The expected length of the vectors array.
        Raises:
            VectorsDatasetError: If the vectors length doesn't match the expected length.
        """
        if len(vectors) != expected_length:
            raise VectorsDatasetError(
                f"Expected {expected_length} vectors, but got {len(vectors)}"
            )

    @staticmethod
    def parse(
        vectors: BytesIO,
        doc_ids: BytesIO,
        dimension: int,
        doc_count: int,
        vector_dtype: DataType,
    ):
        """Parse binary vector data and document IDs into numpy arrays.
        This method reads binary data for vectors and document IDs, validates their
        dimensions, and creates a new VectorsDataset instance.
        Args:
            vectors (BytesIO): Binary stream containing vector data.
            doc_ids (BytesIO): Binary stream containing document IDs.
            dimension (int): The dimensionality of each vector.
            doc_count (int): Expected number of vectors/documents.
            vector_dtype (DataType): The data type of the vector values.
        Returns:
            VectorsDataset: A new instance containing the parsed vectors and document IDs.
        Raises:
            VectorsDatasetError: If there are any errors during parsing or validation.
        """
        try:
            # Create a view into the buffer, to prevent additional allocation of memory
            vector_view = vectors.getbuffer()
            np_vectors = np.frombuffer(
                vector_view, dtype=VectorsDataset.get_numpy_dtype(vector_dtype)
            )
            VectorsDataset.check_dimensions(np_vectors, doc_count * dimension)
            np_vectors = np_vectors.reshape(doc_count, dimension)

            # Do the same for doc ids
            doc_id_view = doc_ids.getbuffer()
            np_doc_ids = np.frombuffer(doc_id_view, dtype="<i4")
            VectorsDataset.check_dimensions(np_doc_ids, doc_count)

        except (ValueError, TypeError, MemoryError, RuntimeError) as e:
            raise VectorsDatasetError(f"Error parsing vectors: {e}") from e
        return VectorsDataset(np_vectors, np_doc_ids)
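
A minimal sketch of how parse might be exercised with small in-memory buffers; the arrays below are illustrative only and not part of this commit:

# Hypothetical usage sketch with tiny in-memory buffers.
from io import BytesIO
import numpy as np
from core.common.models.index_build_parameters import DataType
from core.common.models.vectors_dataset import VectorsDataset

doc_count, dimension = 3, 4
# Serialize vectors as little-endian float32 and doc ids as little-endian int32,
# matching the dtypes that parse expects for FLOAT32 data.
vectors_buf = BytesIO(np.random.rand(doc_count, dimension).astype("<f4").tobytes())
doc_ids_buf = BytesIO(np.arange(doc_count, dtype="<i4").tobytes())

dataset = VectorsDataset.parse(
    vectors_buf, doc_ids_buf, dimension, doc_count, DataType.FLOAT32
)
assert dataset.vectors.shape == (doc_count, dimension)
dataset.free_vectors_space()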
11 changes: 11 additions & 0 deletions remote_vector_index_builder/core/main.py
@@ -4,3 +4,14 @@
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.

# TODO: Call each task from tasks.py in sequence, main()
# Add this file as the entry point for the Dockerfile


def main():
    pass


if __name__ == "__main__":
    main()
6 changes: 6 additions & 0 deletions remote_vector_index_builder/core/object_store/__init__.py
@@ -0,0 +1,6 @@
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.