Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create Dataclasses and Builder for GPU Index Build Config #16

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions remote_vector_index_builder/core/common/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,24 @@
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.

# Each public name is imported from the module that defines it, rather than
# through another module that merely happens to import it. This keeps the
# package consistent with ``index_builder/__init__.py`` and avoids relying on
# implicit re-exports (which strict type checkers reject).
from .index_build_parameters import SpaceType
from .index_builder.gpu_index_build_config import GPUIndexBuildConfig
from .index_builder.gpu_index_cagra_config import GPUIndexCagraConfig
from .index_builder.graph_build_algo import GraphBuildAlgo
from .index_builder.index_hnsw_cagra_config import IndexHNSWCagraConfig
from .index_builder.ivf_pq_build_cagra_config import IVFPQBuildCagraConfig
from .index_builder.ivf_pq_search_cagra_config import IVFPQSearchCagraConfig

# Names re-exported as the public API of this package.
__all__ = [
    "SpaceType",
    "GPUIndexCagraConfig",
    "IVFPQSearchCagraConfig",
    "IVFPQBuildCagraConfig",
    "IndexHNSWCagraConfig",
    "GPUIndexBuildConfig",
    "GraphBuildAlgo",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.

from .gpu_index_cagra_config import GPUIndexCagraConfig
from .ivf_pq_search_cagra_config import IVFPQSearchCagraConfig
from .ivf_pq_build_cagra_config import IVFPQBuildCagraConfig
from .index_hnsw_cagra_config import IndexHNSWCagraConfig
from .gpu_index_build_config import GPUIndexBuildConfig
from .graph_build_algo import GraphBuildAlgo

# Public API of the ``index_builder`` models sub-package: every configuration
# class imported above is re-exported under its own name.
__all__ = [
    "GPUIndexCagraConfig",
    "IVFPQSearchCagraConfig",
    "IVFPQBuildCagraConfig",
    "IndexHNSWCagraConfig",
    "GPUIndexBuildConfig",
    "GraphBuildAlgo",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.

from dataclasses import dataclass, field

from ..index_build_parameters import SpaceType

from .gpu_index_cagra_config import GPUIndexCagraConfig
from .index_hnsw_cagra_config import IndexHNSWCagraConfig


@dataclass
class GPUIndexBuildConfig:
    """Top-level settings bundle for a GPU index build.

    Groups the HNSW-CAGRA settings, the GPU CAGRA settings and the distance
    metric into one object. Every field has a default, so
    ``GPUIndexBuildConfig()`` yields a fully populated configuration.
    """

    # Settings for the HNSW-CAGRA index; a fresh default instance is created
    # per object so instances never share a mutable sub-config.
    index_hnsw_cagra_config: IndexHNSWCagraConfig = field(
        default_factory=IndexHNSWCagraConfig
    )

    # Settings for the GPU CAGRA index; also created fresh per object.
    gpu_index_cagra_config: GPUIndexCagraConfig = field(
        default_factory=GPUIndexCagraConfig
    )

    # Metric (space type) the GPU index is created with.
    metric: SpaceType = SpaceType.L2
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.

from dataclasses import dataclass, field

from .graph_build_algo import GraphBuildAlgo
from .ivf_pq_build_cagra_config import IVFPQBuildCagraConfig
from .ivf_pq_search_cagra_config import IVFPQSearchCagraConfig


@dataclass
class GPUIndexCagraConfig:
    """Settings for building a CAGRA graph index on the GPU.

    All fields have defaults; the nested IVF-PQ build/search settings are
    created fresh for every instance.
    """

    # Degree of the intermediate (input) graph that is then pruned.
    intermediate_graph_degree: int = 64

    # Degree of the final (output) graph.
    graph_degree: int = 32

    # ANN algorithm used to construct the kNN graph.
    graph_build_algo: GraphBuildAlgo = GraphBuildAlgo.IVF_PQ

    # NOTE(review): presumably controls whether the dataset is kept alongside
    # the index -- confirm against the upstream GPU CAGRA config docs.
    store_dataset: bool = False

    # GPU device on which the index is resident.
    device: int = 0

    # IVF-PQ parameters used at graph build time.
    ivf_pq_build_config: IVFPQBuildCagraConfig = field(
        default_factory=IVFPQBuildCagraConfig
    )

    # IVF-PQ parameters used at search time.
    ivf_pq_search_config: IVFPQSearchCagraConfig = field(
        default_factory=IVFPQSearchCagraConfig
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.

from enum import Enum


class GraphBuildAlgo(Enum):
    """Algorithms that can be selected to build the kNN graph.

    Each member's value is its own name as a string, so a member can be
    looked up from its textual form, e.g. ``GraphBuildAlgo("IVF_PQ")``.
    """

    IVF_PQ = "IVF_PQ"
    NN_DESCENT = "NN_DESCENT"
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.

from dataclasses import dataclass


@dataclass
class IndexHNSWCagraConfig:
    """Settings for the HNSW index that is populated from a CAGRA graph."""

    # Search-time expansion factor.
    ef_search: int = 256

    # Construction-time expansion factor.
    ef_construction: int = 40

    # When true, the index is immutable: the kNN graph from GpuIndexCagra is
    # copied into the base level of IndexHNSWCagra and no upper levels are
    # added. The HNSW index can then be searched, but vectors can no longer
    # be added to it.
    base_level_only: bool = True

    # When true, the internal storage (the ``Index`` variable) is deleted
    # when the destructor is called.
    own_fields: bool = True
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.

from dataclasses import dataclass


@dataclass
class IVFPQBuildCagraConfig:
    """IVF-PQ parameters used while building the CAGRA kNN graph."""

    # Number of inverted lists (clusters).
    # Hint: aim for roughly 1,000 to 10,000 vectors per cluster
    # (`n_rows / n_lists`).
    n_lists: int = 1000

    # Number of iterations spent searching for k-means centers during index
    # building.
    kmeans_n_iters: int = 10

    # Fraction of the data used during iterative k-means building.
    kmeans_trainset_fraction: float = 0.1

    # Bit length of a vector element after PQ compression.
    # Possible values: [4, 5, 6, 7, 8].
    # Hint: a smaller 'pq_bits' gives a smaller index and better search
    # performance, at the cost of lower recall.
    pq_bits: int = 8

    # Dimensionality of the vector after PQ compression. When zero, an
    # optimal value is selected using a heuristic.
    # `pq_dim * pq_bits` must be a multiple of 8 (wording restored from the
    # upstream IVF-PQ parameter documentation).
    # Hint: a smaller 'pq_dim' gives a smaller index and better search
    # performance, but lower recall. With 'pq_bits' == 8, 'pq_dim' may be any
    # number, though multiples of 8 are desirable for good performance. With
    # any other 'pq_bits', 'pq_dim' should be a multiple of 8. For good
    # performance 'pq_dim' should be a multiple of 32, and ideally it is
    # also a divisor of the dataset dimension.
    pq_dim: int = 16

    # By default the algorithm allocates more space than strictly necessary
    # for individual clusters, which amortizes allocation cost and reduces
    # data copies across repeated `extend` calls (extending the database).
    #
    # The alternative is conservative allocation: when enabled, the algorithm
    # always allocates the minimum memory needed for the given number of
    # records. Set to `true` to use as little GPU memory for the database as
    # possible.
    conservative_memory_allocation: bool = True
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.

from dataclasses import dataclass


@dataclass
class IVFPQSearchCagraConfig:
    """IVF-PQ parameters used at search time."""

    # How many clusters are searched.
    n_probes: int = 30
6 changes: 6 additions & 0 deletions remote_vector_index_builder/core/index_builder/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from typing import Any, Dict, Optional
from remote_vector_index_builder.core.common.models import (
IndexHNSWCagraConfig,
GPUIndexCagraConfig,
SpaceType,
IVFPQBuildCagraConfig,
IVFPQSearchCagraConfig,
GraphBuildAlgo,
GPUIndexBuildConfig,
)


class IndexConfigBuilder:
    """Fluent builder that assembles a ``GPUIndexBuildConfig`` from plain dicts.

    Each ``set_*`` method stores one part of the configuration and returns
    the builder so calls can be chained; ``build()`` fills any part that was
    never set with its default and returns the finished configuration.
    """

    def __init__(self):
        self._hnsw_config: Optional[IndexHNSWCagraConfig] = None
        self._gpu_config: Optional[GPUIndexCagraConfig] = None
        self._metric: SpaceType = SpaceType("l2")  # default metric

    def set_hnsw_config(self, params: Dict[str, Any]) -> "IndexConfigBuilder":
        """Set the HNSW-CAGRA settings from keyword-style parameters.

        Args:
            params: Mapping of ``IndexHNSWCagraConfig`` field names to values.
                An empty or ``None`` value selects the defaults.

        Returns:
            This builder, for chaining.

        Raises:
            TypeError: If ``params`` contains a key that is not a field of
                ``IndexHNSWCagraConfig``.
        """
        self._hnsw_config = (
            IndexHNSWCagraConfig(**params) if params else IndexHNSWCagraConfig()
        )
        return self

    def set_gpu_config(self, params: Dict[str, Any]) -> "IndexConfigBuilder":
        """Set the GPU CAGRA settings from keyword-style parameters.

        Three keys get special treatment; every other key is passed straight
        to ``GPUIndexCagraConfig``:

        * ``"ivf_pq_build_params"``: dict used to build ``IVFPQBuildCagraConfig``
        * ``"ivf_pq_search_params"``: dict used to build
          ``IVFPQSearchCagraConfig``
        * ``"graph_build_algo"``: value converted to a ``GraphBuildAlgo`` member

        The caller's dict is never modified.

        Args:
            params: GPU configuration parameters. An empty or ``None`` value
                selects the defaults.

        Returns:
            This builder, for chaining.

        Raises:
            TypeError: If an unknown field name is supplied to one of the
                configuration dataclasses.
            ValueError: If ``"graph_build_algo"`` is not a valid
                ``GraphBuildAlgo`` value.
        """
        if not params:
            self._gpu_config = GPUIndexCagraConfig()
            return self

        # Work on a shallow copy: popping from ``params`` directly would strip
        # these keys from the caller's dict, so reusing the same config for a
        # second build would silently fall back to the defaults.
        remaining = dict(params)

        ivf_pq_build_params = remaining.pop("ivf_pq_build_params", None)
        ivf_pq_build_config = (
            IVFPQBuildCagraConfig(**ivf_pq_build_params)
            if ivf_pq_build_params
            else IVFPQBuildCagraConfig()
        )

        ivf_pq_search_params = remaining.pop("ivf_pq_search_params", None)
        ivf_pq_search_config = (
            IVFPQSearchCagraConfig(**ivf_pq_search_params)
            if ivf_pq_search_params
            else IVFPQSearchCagraConfig()
        )

        graph_build_algo_param = remaining.pop("graph_build_algo", None)
        graph_build_algo = (
            GraphBuildAlgo(graph_build_algo_param)
            if graph_build_algo_param
            else GraphBuildAlgo.IVF_PQ
        )

        self._gpu_config = GPUIndexCagraConfig(
            **remaining,
            graph_build_algo=graph_build_algo,
            ivf_pq_build_config=ivf_pq_build_config,
            ivf_pq_search_config=ivf_pq_search_config,
        )
        return self

    def set_metric(self, metric: str) -> "IndexConfigBuilder":
        """Set the distance metric from its string value.

        Args:
            metric: Value passed to ``SpaceType(...)`` to obtain the metric.

        Returns:
            This builder, for chaining.

        Raises:
            ValueError: Presumably, if ``metric`` is not a valid ``SpaceType``
                value (assumes ``SpaceType`` is an ``Enum``; it is defined
                outside this module).
        """
        self._metric = SpaceType(metric)
        return self

    def build(self) -> GPUIndexBuildConfig:
        """Return the assembled configuration.

        Any part that was never set is populated with its default first (and
        remembered on the builder).
        """
        if self._hnsw_config is None:
            self._hnsw_config = IndexHNSWCagraConfig()
        if self._gpu_config is None:
            self._gpu_config = GPUIndexCagraConfig()

        return GPUIndexBuildConfig(
            index_hnsw_cagra_config=self._hnsw_config,
            gpu_index_cagra_config=self._gpu_config,
            metric=self._metric,
        )
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from typing import Dict, Any
from remote_vector_index_builder.core.common.models.index_builder.gpu_index_build_config import (
GPUIndexBuildConfig,
)
from remote_vector_index_builder.core.index_builder.index_config_builder import (
IndexConfigBuilder,
)


class IndexConfigDirector:
    """Drives an ``IndexConfigBuilder`` to produce a complete index config."""

    def __init__(self, builder: IndexConfigBuilder):
        self._builder = builder

    def construct_config(self, config_params: Dict[str, Any]) -> GPUIndexBuildConfig:
        """Build a ``GPUIndexBuildConfig`` from a nested parameter dict.

        Reads the ``"hnsw_config"``, ``"gpu_config"`` and ``"metric"`` keys of
        ``config_params`` (falling back to ``{}``, ``{}`` and ``"l2"`` when a
        key is absent), feeds them to the builder in that order and returns
        the built configuration.
        """
        hnsw_params = config_params.get("hnsw_config", {})
        gpu_params = config_params.get("gpu_config", {})
        metric_name = config_params.get("metric", "l2")

        # Each setter returns the builder to continue with, so keep following
        # the returned object exactly as a chained call would.
        step = self._builder.set_hnsw_config(hnsw_params)
        step = step.set_gpu_config(gpu_params)
        step = step.set_metric(metric_name)
        return step.build()