From a08d83f600e4f37584ed0965c18dd7a6ca0e88c6 Mon Sep 17 00:00:00 2001 From: alpayariyak Date: Tue, 2 Jul 2024 19:44:01 +0000 Subject: [PATCH 1/9] Allow any vLLM engine args as env vars, refactor --- src/config.py | 62 ---------------------------------------------- src/engine.py | 20 ++++++--------- src/engine_args.py | 58 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 74 deletions(-) delete mode 100644 src/config.py create mode 100644 src/engine_args.py diff --git a/src/config.py b/src/config.py deleted file mode 100644 index 67b9836..0000000 --- a/src/config.py +++ /dev/null @@ -1,62 +0,0 @@ -import os -import json -import logging -from dotenv import load_dotenv -from torch.cuda import device_count -from utils import get_int_bool_env - -class EngineConfig: - def __init__(self): - load_dotenv() - self.hf_home = os.getenv("HF_HOME") - # Check if /local_metadata.json exists - local_metadata = {} - if os.path.exists("/local_metadata.json"): - with open("/local_metadata.json", "r") as f: - local_metadata = json.load(f) - if local_metadata.get("model_name") is None: - raise ValueError("Model name is not found in /local_metadata.json, there was a problem when you baked the model in.") - logging.info("Using baked-in model") - os.environ["TRANSFORMERS_OFFLINE"] = "1" - os.environ["HF_HUB_OFFLINE"] = "1" - - self.model_name_or_path = local_metadata.get("model_name", os.getenv("MODEL_NAME")) - self.model_revision = local_metadata.get("revision", os.getenv("MODEL_REVISION")) - self.tokenizer_name_or_path = local_metadata.get("tokenizer_name", os.getenv("TOKENIZER_NAME")) or self.model_name_or_path - self.tokenizer_revision = local_metadata.get("tokenizer_revision", os.getenv("TOKENIZER_REVISION")) - self.quantization = local_metadata.get("quantization", os.getenv("QUANTIZATION")) - self.config = self._initialize_config() - def _initialize_config(self): - args = { - "model": self.model_name_or_path, - "revision": self.model_revision, - "download_dir": self.hf_home, - "quantization": self.quantization, - "load_format": os.getenv("LOAD_FORMAT", "auto"), - "dtype": os.getenv("DTYPE", "half" if self.quantization else "auto"), - "tokenizer": self.tokenizer_name_or_path, - "tokenizer_revision": self.tokenizer_revision, - "disable_log_stats": get_int_bool_env("DISABLE_LOG_STATS", True), - "disable_log_requests": get_int_bool_env("DISABLE_LOG_REQUESTS", True), - "trust_remote_code": get_int_bool_env("TRUST_REMOTE_CODE", False), - "gpu_memory_utilization": float(os.getenv("GPU_MEMORY_UTILIZATION", 0.95)), - "max_parallel_loading_workers": None if device_count() > 1 or not os.getenv("MAX_PARALLEL_LOADING_WORKERS") else int(os.getenv("MAX_PARALLEL_LOADING_WORKERS")), - "max_model_len": int(os.getenv("MAX_MODEL_LEN")) if os.getenv("MAX_MODEL_LEN") else None, - "tensor_parallel_size": device_count(), - "seed": int(os.getenv("SEED")) if os.getenv("SEED") else None, - "kv_cache_dtype": os.getenv("KV_CACHE_DTYPE"), - "block_size": int(os.getenv("BLOCK_SIZE")) if os.getenv("BLOCK_SIZE") else None, - "swap_space": int(os.getenv("SWAP_SPACE")) if os.getenv("SWAP_SPACE") else None, - "max_seq_len_to_capture": int(os.getenv("MAX_SEQ_LEN_TO_CAPTURE")) if os.getenv("MAX_SEQ_LEN_TO_CAPTURE") else None, - "disable_custom_all_reduce": get_int_bool_env("DISABLE_CUSTOM_ALL_REDUCE", False), - "enforce_eager": get_int_bool_env("ENFORCE_EAGER", False) - } - if args["kv_cache_dtype"] == "fp8_e5m2": - args["kv_cache_dtype"] = "fp8" - logging.warning("Using fp8_e5m2 is deprecated. 
Please use fp8 instead.") - if os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE"): - args["max_seq_len_to_capture"] = int(os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE")) - logging.warning("Using MAX_CONTEXT_LEN_TO_CAPTURE is deprecated. Please use MAX_SEQ_LEN_TO_CAPTURE instead.") - - - return {k: v for k, v in args.items() if v not in [None, ""]} diff --git a/src/engine.py b/src/engine.py index 1ac3f8d..e8fc606 100644 --- a/src/engine.py +++ b/src/engine.py @@ -7,7 +7,7 @@ from typing import AsyncGenerator import time -from vllm import AsyncLLMEngine, AsyncEngineArgs +from vllm import AsyncLLMEngine from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest, ErrorResponse @@ -15,13 +15,13 @@ from utils import DummyRequest, JobInput, BatchSize, create_error_response from constants import DEFAULT_MAX_CONCURRENCY, DEFAULT_BATCH_SIZE, DEFAULT_BATCH_SIZE_GROWTH_FACTOR, DEFAULT_MIN_BATCH_SIZE from tokenizer import TokenizerWrapper -from config import EngineConfig +from engine_args import get_engine_args class vLLMEngine: def __init__(self, engine = None): load_dotenv() # For local development - self.config = EngineConfig().config - self.tokenizer = TokenizerWrapper(self.config.get("tokenizer"), self.config.get("tokenizer_revision"), self.config.get("trust_remote_code")) + self.engine_args = get_engine_args() + self.tokenizer = TokenizerWrapper(self.tokenizer, self.engine_args.tokenizer_revision, self.engine_args.trust_remote_code) self.llm = self._initialize_llm() if engine is None else engine self.max_concurrency = int(os.getenv("MAX_CONCURRENCY", DEFAULT_MAX_CONCURRENCY)) self.default_batch_size = int(os.getenv("DEFAULT_BATCH_SIZE", DEFAULT_BATCH_SIZE)) @@ -102,7 +102,7 @@ async def _generate_vllm(self, llm_input, validated_sampling_params, batch_size, def _initialize_llm(self): try: start = time.time() - engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**self.config)) + engine = AsyncLLMEngine.from_engine_args(self.engine_args) end = time.time() logging.info(f"Initialized vLLM engine in {end - start:.2f}s") return engine @@ -111,15 +111,11 @@ def _initialize_llm(self): raise e -class OpenAIvLLMEngine: +class OpenAIvLLMEngine(vLLMEngine): def __init__(self, vllm_engine): - self.config = vllm_engine.config - self.llm = vllm_engine.llm - self.served_model_name = os.getenv("OPENAI_SERVED_MODEL_NAME_OVERRIDE") or self.config["model"] + super().__init__(vllm_engine) + self.served_model_name = os.getenv("OPENAI_SERVED_MODEL_NAME_OVERRIDE") or self.engine_args["model"] self.response_role = os.getenv("OPENAI_RESPONSE_ROLE") or "assistant" - self.tokenizer = vllm_engine.tokenizer - self.default_batch_size = vllm_engine.default_batch_size - self.batch_size_growth_factor, self.min_batch_size = vllm_engine.batch_size_growth_factor, vllm_engine.min_batch_size self._initialize_engines() self.raw_openai_output = bool(int(os.getenv("RAW_OPENAI_OUTPUT", 1))) diff --git a/src/engine_args.py b/src/engine_args.py new file mode 100644 index 0000000..ed4e6e3 --- /dev/null +++ b/src/engine_args.py @@ -0,0 +1,58 @@ +import os +import json +import logging +from torch.cuda import device_count +from vllm import AsyncEngineArgs + +env_to_args_map = { + "MODEL_NAME": "model", + "MODEL_REVISION": "revision", + "TOKENIZER_NAME": "tokenizer", + "TOKENIZER_REVISION": "tokenizer_revision", + "QUANTIZATION": "quantization" +} + +def get_local_args(): + if 
os.path.exists("/local_metadata.json"): + with open("/local_metadata.json", "r") as f: + local_metadata = json.load(f) + if local_metadata.get("model_name") is None: + raise ValueError("Model name is not found in /local_metadata.json, there was a problem when baking the model in.") + else: + local_args = {env_to_args_map[k.upper()]: v for k, v in local_metadata.items() if k in env_to_args_map} + os.environ["TRANSFORMERS_OFFLINE"] = "1" + os.environ["HF_HUB_OFFLINE"] = "1" + return local_args + +def get_engine_args(): + # Start with default args + args = { + "disable_log_stats": True, + "disable_log_requests": True, + "gpu_memory_utilization": float(os.getenv("GPU_MEMORY_UTILIZATION", 0.9)), + } + + # Get env args that match keys in AsyncEngineArgs + env_args = {k.lower(): v for k, v in dict(os.environ).items() if k.lower() in AsyncEngineArgs.__dataclass_fields__} + args.update(env_args) + + # Get local args if model is baked in and overwrite env args + local_args = get_local_args() + args.update(local_args) + + # Set tensor parallel size and max parallel loading workers if more than 1 GPU is available + num_gpus = device_count() + if num_gpus > 1: + args["tensor_parallel_size"] = num_gpus + args["max_parallel_loading_workers"] = None + if os.getenv("MAX_PARALLEL_LOADING_WORKERS"): + logging.warning("Overriding MAX_PARALLEL_LOADING_WORKERS with None because more than 1 GPU is available.") + + # Deprecated env args backwards compatibility + if args["kv_cache_dtype"] == "fp8_e5m2": + args["kv_cache_dtype"] = "fp8" + logging.warning("Using fp8_e5m2 is deprecated. Please use fp8 instead.") + if os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE"): + args["max_seq_len_to_capture"] = int(os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE")) + logging.warning("Using MAX_CONTEXT_LEN_TO_CAPTURE is deprecated. 
Please use MAX_SEQ_LEN_TO_CAPTURE instead.") + return AsyncEngineArgs(**args) From 5bd6f3a75e77457c3feef2e5b065e69685db7ad5 Mon Sep 17 00:00:00 2001 From: alpayariyak Date: Thu, 25 Jul 2024 19:40:32 +0000 Subject: [PATCH 2/9] 0.5.3, any vllm arg as env var, refactor and fixes, moving away from building separate image from vLLM fork --- Dockerfile | 20 ++-- docker-bake.hcl | 2 +- src/download_model.py | 113 ++++++++++++++++++---- src/engine.py | 29 ++++-- src/engine_args.py | 86 ++++++++++++----- src/tokenizer.py | 3 +- src/utils.py | 25 +++-- vllm-base-image/Dockerfile | 149 ------------------------------ vllm-base-image/README.md | 1 - vllm-base-image/vllm | 2 +- vllm-base-image/vllm-metadata.yml | 2 - 11 files changed, 209 insertions(+), 223 deletions(-) delete mode 100644 vllm-base-image/Dockerfile delete mode 100644 vllm-base-image/README.md delete mode 100644 vllm-base-image/vllm-metadata.yml diff --git a/Dockerfile b/Dockerfile index f48b832..426d771 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,16 +1,20 @@ -ARG WORKER_CUDA_VERSION=11.8.0 -ARG BASE_IMAGE_VERSION=1.0.0 -FROM runpod/worker-vllm:base-${BASE_IMAGE_VERSION}-cuda${WORKER_CUDA_VERSION} AS vllm-base +FROM nvidia/cuda:12.1.0-base-ubuntu22.04 RUN apt-get update -y \ && apt-get install -y python3-pip +RUN ldconfig /usr/local/cuda-12.1/compat/ + # Install Python dependencies COPY builder/requirements.txt /requirements.txt RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install --upgrade pip && \ python3 -m pip install --upgrade -r /requirements.txt +# Install vLLM (switching back to pip installs since issues that required building fork are fixed and space optimization is not as important since caching) and FlashInfer +RUN python3 -m pip install vllm==0.5.1 && \ + python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3 + # Setup for Option 2: Building the Image with the Model included ARG MODEL_NAME="" ARG TOKENIZER_NAME="" @@ -32,19 +36,15 @@ ENV MODEL_NAME=$MODEL_NAME \ ENV PYTHONPATH="/:/vllm-workspace" -COPY src/download_model.py /download_model.py + +COPY src /src RUN --mount=type=secret,id=HF_TOKEN,required=false \ if [ -f /run/secrets/HF_TOKEN ]; then \ export HF_TOKEN=$(cat /run/secrets/HF_TOKEN); \ fi && \ if [ -n "$MODEL_NAME" ]; then \ - python3 /download_model.py; \ + python3 /src/download_model.py; \ fi -# Add source files -COPY src /src -# Remove download_model.py -RUN rm /download_model.py - # Start the handler CMD ["python3", "/src/handler.py"] \ No newline at end of file diff --git a/docker-bake.hcl b/docker-bake.hcl index 1519a5f..2830d6f 100644 --- a/docker-bake.hcl +++ b/docker-bake.hcl @@ -7,7 +7,7 @@ variable "REPOSITORY" { } variable "BASE_IMAGE_VERSION" { - default = "1.0.0" + default = "1.1.0preview" } group "all" { diff --git a/src/download_model.py b/src/download_model.py index 918e183..107e1e5 100644 --- a/src/download_model.py +++ b/src/download_model.py @@ -1,27 +1,100 @@ import os -from huggingface_hub import snapshot_download import json +import logging +import glob +from shutil import rmtree +from huggingface_hub import snapshot_download +from utils import timer_decorator + +BASE_DIR = "/" +TOKENIZER_PATTERNS = [["*.json", "tokenizer*"]] +MODEL_PATTERNS = [["*.safetensors"], ["*.bin"], ["*.pt"]] + +def setup_env(): + if os.getenv("TESTING_DOWNLOAD") == "1": + BASE_DIR = "tmp" + os.makedirs(BASE_DIR, exist_ok=True) + os.environ.update({ + "HF_HOME": f"{BASE_DIR}/hf_cache", + "MODEL_NAME": "openchat/openchat-3.5-0106", + "HF_HUB_ENABLE_HF_TRANSFER": "1", + 
"TENSORIZE": "1", + "TENSORIZER_NUM_GPUS": "1", + "DTYPE": "auto" + }) + +@timer_decorator +def download(name, revision, type, cache_dir): + if type == "model": + pattern_sets = [model_pattern + TOKENIZER_PATTERNS[0] for model_pattern in MODEL_PATTERNS] + elif type == "tokenizer": + pattern_sets = TOKENIZER_PATTERNS + else: + raise ValueError(f"Invalid type: {type}") + try: + for pattern_set in pattern_sets: + path = snapshot_download(name, revision=revision, cache_dir=cache_dir, + allow_patterns=pattern_set) + for pattern in pattern_set: + if glob.glob(os.path.join(path, pattern)): + logging.info(f"Successfully downloaded {pattern} model files.") + return path + except ValueError: + raise ValueError(f"No patterns matching {pattern_sets} found for download.") + + +# @timer_decorator +# def tensorize_model(model_path): TODO: Add back once tensorizer is ready +# from vllm.engine.arg_utils import EngineArgs +# from vllm.model_executor.model_loader.tensorizer import TensorizerConfig, tensorize_vllm_model +# from torch.cuda import device_count + +# tensorizer_num_gpus = int(os.getenv("TENSORIZER_NUM_GPUS", "1")) +# if tensorizer_num_gpus > device_count(): +# raise ValueError(f"TENSORIZER_NUM_GPUS ({tensorizer_num_gpus}) exceeds available GPUs ({device_count()})") + +# dtype = os.getenv("DTYPE", "auto") +# serialized_dir = f"{BASE_DIR}/serialized_model" +# os.makedirs(serialized_dir, exist_ok=True) +# serialized_uri = f"{serialized_dir}/model{'-%03d' if tensorizer_num_gpus > 1 else ''}.tensors" + +# tensorize_vllm_model( +# EngineArgs(model=model_path, tensor_parallel_size=tensorizer_num_gpus, dtype=dtype), +# TensorizerConfig(tensorizer_uri=serialized_uri) +# ) +# logging.info("Successfully serialized model to %s", str(serialized_uri)) +# logging.info("Removing HF Model files after serialization") +# rmtree("/".join(model_path.split("/")[:-2])) +# return serialized_uri, tensorizer_num_gpus, dtype if __name__ == "__main__": - model_name = os.getenv("MODEL_NAME") - if not model_name: - raise ValueError("Must specify model name by adding --build-arg MODEL_NAME=") - revision = os.getenv("MODEL_REVISION") or None - snapshot_download(model_name, revision=revision, cache_dir=os.getenv("HF_HOME")) + setup_env() + cache_dir = os.getenv("HF_HOME") + model_name, model_revision = os.getenv("MODEL_NAME"), os.getenv("MODEL_REVISION") or None + tokenizer_name, tokenizer_revision = os.getenv("TOKENIZER_NAME") or model_name, os.getenv("TOKENIZER_REVISION") or model_revision + + model_path = download(model_name, model_revision, "model", cache_dir) + + metadata = { + "MODEL_NAME": model_path, + "MODEL_REVISION": os.getenv("MODEL_REVISION"), + "QUANTIZATION": os.getenv("QUANTIZATION"), + } - tokenizer_name = os.getenv("TOKENIZER_NAME") or None - tokenizer_revision = os.getenv("TOKENIZER_REVISION") or None - if tokenizer_name: - snapshot_download(tokenizer_name, revision=tokenizer_revision, cache_dir=os.getenv("HF_HOME")) + # if os.getenv("TENSORIZE") == "1": TODO: Add back once tensorizer is ready + # serialized_uri, tensorizer_num_gpus, dtype = tensorize_model(model_path) + # metadata.update({ + # "MODEL_NAME": serialized_uri, + # "TENSORIZER_URI": serialized_uri, + # "TENSOR_PARALLEL_SIZE": tensorizer_num_gpus, + # "DTYPE": dtype + # }) - # Create file with metadata of baked in model and/or tokenizer + tokenizer_path = download(tokenizer_name, tokenizer_revision, "tokenizer", cache_dir) + metadata.update({ + "TOKENIZER_NAME": tokenizer_path, + "TOKENIZER_REVISION": tokenizer_revision + }) - with 
open("/local_metadata.json", "w") as f: - json.dump({ - "model_name": model_name, - "revision": revision, - "tokenizer_name": tokenizer_name or model_name, - "tokenizer_revision": tokenizer_revision or revision, - "quantization": os.getenv("QUANTIZATION") - }, f) - + with open(f"{BASE_DIR}/local_model_args.json", "w") as f: + json.dump({k: v for k, v in metadata.items() if v not in (None, "")}, f) \ No newline at end of file diff --git a/src/engine.py b/src/engine.py index e8fc606..33b374b 100644 --- a/src/engine.py +++ b/src/engine.py @@ -1,9 +1,9 @@ import os import logging import json +import asyncio from dotenv import load_dotenv -from torch.cuda import device_count from typing import AsyncGenerator import time @@ -21,8 +21,11 @@ class vLLMEngine: def __init__(self, engine = None): load_dotenv() # For local development self.engine_args = get_engine_args() - self.tokenizer = TokenizerWrapper(self.tokenizer, self.engine_args.tokenizer_revision, self.engine_args.trust_remote_code) - self.llm = self._initialize_llm() if engine is None else engine + logging.info(f"Engine args: {self.engine_args}") + self.tokenizer = TokenizerWrapper(self.engine_args.tokenizer or self.engine_args.model, + self.engine_args.tokenizer_revision, + self.engine_args.trust_remote_code) + self.llm = self._initialize_llm() if engine is None else engine.llm self.max_concurrency = int(os.getenv("MAX_CONCURRENCY", DEFAULT_MAX_CONCURRENCY)) self.default_batch_size = int(os.getenv("DEFAULT_BATCH_SIZE", DEFAULT_BATCH_SIZE)) self.batch_size_growth_factor = int(os.getenv("BATCH_SIZE_GROWTH_FACTOR", DEFAULT_BATCH_SIZE_GROWTH_FACTOR)) @@ -114,17 +117,27 @@ def _initialize_llm(self): class OpenAIvLLMEngine(vLLMEngine): def __init__(self, vllm_engine): super().__init__(vllm_engine) - self.served_model_name = os.getenv("OPENAI_SERVED_MODEL_NAME_OVERRIDE") or self.engine_args["model"] + self.served_model_name = os.getenv("OPENAI_SERVED_MODEL_NAME_OVERRIDE") or self.engine_args.model self.response_role = os.getenv("OPENAI_RESPONSE_ROLE") or "assistant" - self._initialize_engines() + asyncio.run(self._initialize_engines()) self.raw_openai_output = bool(int(os.getenv("RAW_OPENAI_OUTPUT", 1))) - def _initialize_engines(self): + async def _initialize_engines(self): + self.model_config = await self.llm.get_model_config() + self.chat_engine = OpenAIServingChat( - self.llm, self.served_model_name, self.response_role, + engine=self.llm, + model_config=self.model_config, + served_model_names=[self.served_model_name], + response_role=self.response_role, chat_template=self.tokenizer.tokenizer.chat_template ) - self.completion_engine = OpenAIServingCompletion(self.llm, self.served_model_name) + self.completion_engine = OpenAIServingCompletion( + engine=self.llm, + model_config=self.model_config, + served_model_names=[self.served_model_name], + lora_modules=[] + ) async def generate(self, openai_request: JobInput): if openai_request.openai_route == "/v1/models": diff --git a/src/engine_args.py b/src/engine_args.py index ed4e6e3..8939a04 100644 --- a/src/engine_args.py +++ b/src/engine_args.py @@ -3,42 +3,75 @@ import logging from torch.cuda import device_count from vllm import AsyncEngineArgs +from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -env_to_args_map = { +RENAME_ARGS_MAP = { "MODEL_NAME": "model", "MODEL_REVISION": "revision", "TOKENIZER_NAME": "tokenizer", - "TOKENIZER_REVISION": "tokenizer_revision", - "QUANTIZATION": "quantization" + "MAX_CONTEXT_LEN_TO_CAPTURE": "max_seq_len_to_capture" } - + +DEFAULT_ARGS = 
{ + "disable_log_stats": True, + "disable_log_requests": True, + "gpu_memory_utilization": 0.9, +} + +def match_vllm_args(args): + """Rename args to match vllm by: + 1. Renaming keys to lower case + 2. Renaming keys to match vllm + 3. Filtering args to match vllm's AsyncEngineArgs + + Args: + args (dict): Dictionary of args + + Returns: + dict: Dictionary of args with renamed keys + """ + renamed_args = {RENAME_ARGS_MAP.get(k, k): v for k, v in args.items()} + matched_args = {k: v for k, v in renamed_args.items() if k in AsyncEngineArgs.__dataclass_fields__} + return {k: v for k, v in matched_args.items() if v not in [None, ""]} def get_local_args(): - if os.path.exists("/local_metadata.json"): - with open("/local_metadata.json", "r") as f: - local_metadata = json.load(f) - if local_metadata.get("model_name") is None: - raise ValueError("Model name is not found in /local_metadata.json, there was a problem when baking the model in.") - else: - local_args = {env_to_args_map[k.upper()]: v for k, v in local_metadata.items() if k in env_to_args_map} - os.environ["TRANSFORMERS_OFFLINE"] = "1" - os.environ["HF_HUB_OFFLINE"] = "1" - return local_args + """ + Retrieve local arguments from a JSON file. + + Returns: + dict: Local arguments. + """ + if not os.path.exists("/local_model_args.json"): + return {} + with open("/local_model_args.json", "r") as f: + local_args = json.load(f) + + if local_args.get("MODEL_NAME") is None: + raise ValueError("Model name not found in /local_model_args.json. There was a problem when baking the model in.") + + logging.info(f"Using baked in model with args: {local_args}") + os.environ["TRANSFORMERS_OFFLINE"] = "1" + os.environ["HF_HUB_OFFLINE"] = "1" + + return local_args def get_engine_args(): # Start with default args - args = { - "disable_log_stats": True, - "disable_log_requests": True, - "gpu_memory_utilization": float(os.getenv("GPU_MEMORY_UTILIZATION", 0.9)), - } + args = DEFAULT_ARGS # Get env args that match keys in AsyncEngineArgs - env_args = {k.lower(): v for k, v in dict(os.environ).items() if k.lower() in AsyncEngineArgs.__dataclass_fields__} - args.update(env_args) + args.update(os.environ) # Get local args if model is baked in and overwrite env args - local_args = get_local_args() - args.update(local_args) + args.update(get_local_args()) + + # if args.get("TENSORIZER_URI"): TODO: add back once tensorizer is ready + # args["load_format"] = "tensorizer" + # args["model_loader_extra_config"] = TensorizerConfig(tensorizer_uri=args["TENSORIZER_URI"], num_readers=None) + # logging.info(f"Using tensorized model from {args['TENSORIZER_URI']}") + + + # Rename and match to vllm args + args = match_vllm_args(args) # Set tensor parallel size and max parallel loading workers if more than 1 GPU is available num_gpus = device_count() @@ -49,10 +82,15 @@ def get_engine_args(): logging.warning("Overriding MAX_PARALLEL_LOADING_WORKERS with None because more than 1 GPU is available.") # Deprecated env args backwards compatibility - if args["kv_cache_dtype"] == "fp8_e5m2": + if args.get("kv_cache_dtype") == "fp8_e5m2": args["kv_cache_dtype"] = "fp8" logging.warning("Using fp8_e5m2 is deprecated. Please use fp8 instead.") if os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE"): args["max_seq_len_to_capture"] = int(os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE")) logging.warning("Using MAX_CONTEXT_LEN_TO_CAPTURE is deprecated. 
Please use MAX_SEQ_LEN_TO_CAPTURE instead.") + + if "gemma-2" in args.get("model", "").lower(): + os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER" + logging.info("Using FLASHINFER for gemma-2 model.") + return AsyncEngineArgs(**args) diff --git a/src/tokenizer.py b/src/tokenizer.py index 62a4cbf..b7b866b 100644 --- a/src/tokenizer.py +++ b/src/tokenizer.py @@ -4,7 +4,8 @@ class TokenizerWrapper: def __init__(self, tokenizer_name_or_path, tokenizer_revision, trust_remote_code): - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, revision=tokenizer_revision, trust_remote_code=trust_remote_code) + print(f"tokenizer_name_or_path: {tokenizer_name_or_path}, tokenizer_revision: {tokenizer_revision}, trust_remote_code: {trust_remote_code}") + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, revision=tokenizer_revision or "main", trust_remote_code=trust_remote_code) self.custom_chat_template = os.getenv("CUSTOM_CHAT_TEMPLATE") self.has_chat_template = bool(self.tokenizer.chat_template) or bool(self.custom_chat_template) if self.custom_chat_template and isinstance(self.custom_chat_template, str): diff --git a/src/utils.py b/src/utils.py index 09cd255..efe7611 100644 --- a/src/utils.py +++ b/src/utils.py @@ -1,9 +1,16 @@ import os import logging from http import HTTPStatus -from vllm.utils import random_uuid -from vllm.entrypoints.openai.protocol import ErrorResponse -from vllm import SamplingParams +from functools import wraps +from time import time + +try: + from vllm.utils import random_uuid + from vllm.entrypoints.openai.protocol import ErrorResponse + from vllm import SamplingParams +except ImportError: + logging.warning("Error importing vllm, skipping related imports. This is ONLY expected when baking model into docker image from a machine without GPUs") + pass logging.basicConfig(level=logging.INFO) @@ -68,6 +75,12 @@ def create_error_response(message: str, err_type: str = "BadRequestError", statu def get_int_bool_env(env_var: str, default: bool) -> bool: return int(os.getenv(env_var, int(default))) == 1 - - - +def timer_decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + start = time() + result = func(*args, **kwargs) + end = time() + logging.info(f"{func.__name__} completed in {end - start:.2f} seconds") + return result + return wrapper \ No newline at end of file diff --git a/vllm-base-image/Dockerfile b/vllm-base-image/Dockerfile deleted file mode 100644 index 39f751a..0000000 --- a/vllm-base-image/Dockerfile +++ /dev/null @@ -1,149 +0,0 @@ -################### vLLM Base Dockerfile ################### -# This Dockerfile is for building the image that the -# vLLM worker container will use as its base image. -# If your changes are outside of the vLLM source code, you -# do not need to build this image. 
-########################################################## - -# Define the CUDA version for the build -ARG WORKER_CUDA_VERSION=11.8.0 - -FROM nvidia/cuda:${WORKER_CUDA_VERSION}-devel-ubuntu22.04 AS dev - -# Re-declare ARG after FROM -ARG WORKER_CUDA_VERSION - -# Update and install dependencies -RUN apt-get update -y \ - && apt-get install -y python3-pip git - -# Set working directory -WORKDIR /vllm-installation - -RUN ldconfig /usr/local/cuda-$(echo "$WORKER_CUDA_VERSION" | sed 's/\.0$//')/compat/ - -# Install build and runtime dependencies -COPY vllm/requirements-common.txt requirements-common.txt -COPY vllm/requirements-cuda${WORKER_CUDA_VERSION}.txt requirements-cuda.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r requirements-cuda.txt - -# Install development dependencies -COPY vllm/requirements-dev.txt requirements-dev.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r requirements-dev.txt - -ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX' -ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} - -FROM dev AS build - -# Re-declare ARG after FROM -ARG WORKER_CUDA_VERSION - -# Install build dependencies -COPY vllm/requirements-build.txt requirements-build.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r requirements-build.txt - -# install compiler cache to speed up compilation leveraging local or remote caching -RUN apt-get update -y && apt-get install -y ccache - -# Copy necessary files -COPY vllm/csrc csrc -COPY vllm/setup.py setup.py -COPY vllm/cmake cmake -COPY vllm/CMakeLists.txt CMakeLists.txt -COPY vllm/requirements-common.txt requirements-common.txt -COPY vllm/requirements-cuda${WORKER_CUDA_VERSION}.txt requirements-cuda.txt -COPY vllm/pyproject.toml pyproject.toml -COPY vllm/vllm vllm - -# Set environment variables for building extensions -ENV WORKER_CUDA_VERSION=${WORKER_CUDA_VERSION} -ENV VLLM_INSTALL_PUNICA_KERNELS=0 -# Build extensions -ENV CCACHE_DIR=/root/.cache/ccache -RUN --mount=type=cache,target=/root/.cache/ccache \ - --mount=type=cache,target=/root/.cache/pip \ - python3 setup.py bdist_wheel --dist-dir=dist - -RUN --mount=type=cache,target=/root/.cache/pip \ - pip cache remove vllm_nccl* - -FROM dev as flash-attn-builder -# max jobs used for build -# flash attention version -ARG flash_attn_version=v2.5.8 -ENV FLASH_ATTN_VERSION=${flash_attn_version} - -WORKDIR /usr/src/flash-attention-v2 - -# Download the wheel or build it if a pre-compiled release doesn't exist -RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \ - --no-build-isolation --no-deps --no-cache-dir - -FROM dev as NCCL-installer - -# Re-declare ARG after FROM -ARG WORKER_CUDA_VERSION - -# Update and install necessary libraries -RUN apt-get update -y \ - && apt-get install -y wget - -# Install NCCL library -RUN if [ "$WORKER_CUDA_VERSION" = "11.8.0" ]; then \ - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \ - && dpkg -i cuda-keyring_1.0-1_all.deb \ - && apt-get update \ - && apt install -y libnccl2=2.15.5-1+cuda11.8 libnccl-dev=2.15.5-1+cuda11.8; \ - elif [ "$WORKER_CUDA_VERSION" = "12.1.0" ]; then \ - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \ - && dpkg -i cuda-keyring_1.0-1_all.deb \ - && apt-get update \ - && apt install -y libnccl2=2.17.1-1+cuda12.1 libnccl-dev=2.17.1-1+cuda12.1; \ - else \ - echo "Unsupported CUDA version: $WORKER_CUDA_VERSION"; \ - exit 1; \ - fi - -FROM 
nvidia/cuda:${WORKER_CUDA_VERSION}-base-ubuntu22.04 AS vllm-base - -# Re-declare ARG after FROM -ARG WORKER_CUDA_VERSION - -# Update and install necessary libraries -RUN apt-get update -y \ - && apt-get install -y python3-pip - -# Set working directory -WORKDIR /vllm-workspace - -RUN ldconfig /usr/local/cuda-$(echo "$WORKER_CUDA_VERSION" | sed 's/\.0$//')/compat/ - -RUN --mount=type=bind,from=build,src=/vllm-installation/dist,target=/vllm-workspace/dist \ - --mount=type=cache,target=/root/.cache/pip \ - pip install dist/*.whl --verbose - -RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \ - --mount=type=cache,target=/root/.cache/pip \ - pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir - -FROM vllm-base AS runtime - -# install additional dependencies for openai api server -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate hf_transfer modelscope tensorizer - -# Set PYTHONPATH environment variable -ENV PYTHONPATH="/" - -# Copy NCCL library -COPY --from=NCCL-installer /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/x86_64-linux-gnu/libnccl.so.2 -# Set the VLLM_NCCL_SO_PATH environment variable -ENV VLLM_NCCL_SO_PATH="/usr/lib/x86_64-linux-gnu/libnccl.so.2" - - -# Validate the installation -RUN python3 -c "import vllm; print(vllm.__file__)" \ No newline at end of file diff --git a/vllm-base-image/README.md b/vllm-base-image/README.md deleted file mode 100644 index 84e748e..0000000 --- a/vllm-base-image/README.md +++ /dev/null @@ -1 +0,0 @@ -This directory is for building the vllm-base image utilized by the worker. \ No newline at end of file diff --git a/vllm-base-image/vllm b/vllm-base-image/vllm index ba8f5e7..6a1a31c 160000 --- a/vllm-base-image/vllm +++ b/vllm-base-image/vllm @@ -1 +1 @@ -Subproject commit ba8f5e79e1972f7cc7110e8bfb43d895b35da2ea +Subproject commit 6a1a31c41ec5090df9014e2ecee80e11888a419e diff --git a/vllm-base-image/vllm-metadata.yml b/vllm-base-image/vllm-metadata.yml deleted file mode 100644 index 511290a..0000000 --- a/vllm-base-image/vllm-metadata.yml +++ /dev/null @@ -1,2 +0,0 @@ -version: '0.4.2' -dev_version: '0.4.2' From bd96b5e0de80dfd2e9e3311a63ba7caf70f7dd0a Mon Sep 17 00:00:00 2001 From: pandyamarut Date: Thu, 25 Jul 2024 21:29:45 -0700 Subject: [PATCH 3/9] update v0.5.3.post1 Signed-off-by: pandyamarut --- Dockerfile | 2 +- src/engine.py | 9 +++++++-- test.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 3 deletions(-) create mode 100644 test.py diff --git a/Dockerfile b/Dockerfile index 426d771..089b4de 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install --upgrade -r /requirements.txt # Install vLLM (switching back to pip installs since issues that required building fork are fixed and space optimization is not as important since caching) and FlashInfer -RUN python3 -m pip install vllm==0.5.1 && \ +RUN python3 -m pip install vllm==0.5.3.post1 && \ python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3 # Setup for Option 2: Building the Image with the Model included diff --git a/src/engine.py b/src/engine.py index 33b374b..1f48145 100644 --- a/src/engine.py +++ b/src/engine.py @@ -130,13 +130,18 @@ async def _initialize_engines(self): model_config=self.model_config, served_model_names=[self.served_model_name], response_role=self.response_role, - chat_template=self.tokenizer.tokenizer.chat_template + 
chat_template=self.tokenizer.tokenizer.chat_template, + lora_modules=None, + prompt_adapters=None, + request_logger=None ) self.completion_engine = OpenAIServingCompletion( engine=self.llm, model_config=self.model_config, served_model_names=[self.served_model_name], - lora_modules=[] + lora_modules=[], + prompt_adapters=None, + request_logger=None ) async def generate(self, openai_request: JobInput): diff --git a/test.py b/test.py new file mode 100644 index 0000000..76e0b83 --- /dev/null +++ b/test.py @@ -0,0 +1,30 @@ +from openai import OpenAI +import os + +# Initialize the OpenAI Client with your RunPod API Key and Endpoint URL +client = OpenAI( + api_key="E43IAWSFPQPHJ8WIVLQ7EMBJ2TV9T3HF5YBLJJ7K", + base_url="https://api.runpod.ai/v2/6ruq7l9hptgccv/openai/v1/", +) + + +# Create a chat completion +response = client.chat.completions.create( + model="openchat/openchat-3.5-1210", + messages=[{"role": "user", "content": "Why is RunPod the best platform?"}], + temperature=0, + max_tokens=100, +) +# Print the response +print(response.choices[0].message.content) + + +# Create a completion +# response = client.completions.create( +# model="openchat/openchat-3.5-1210", +# prompt="Runpod is the best platform because", +# temperature=0, +# max_tokens=100, +# ) +# # Print the response +# print(response.choices[0].text) \ No newline at end of file From 0f8657e58d70d569e869e13e02648aa5f7af22a2 Mon Sep 17 00:00:00 2001 From: pandyamarut Date: Fri, 26 Jul 2024 16:52:51 -0700 Subject: [PATCH 4/9] update env default args Signed-off-by: pandyamarut --- src/engine_args.py | 73 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/src/engine_args.py b/src/engine_args.py index 8939a04..42c7d6d 100644 --- a/src/engine_args.py +++ b/src/engine_args.py @@ -16,6 +16,79 @@ "disable_log_stats": True, "disable_log_requests": True, "gpu_memory_utilization": 0.9, + "pipeline_parallel_size": int(os.getenv('PIPELINE_PARALLEL_SIZE', 1)), + "tensor_parallel_size": int(os.getenv('TENSOR_PARALLEL_SIZE', 1)), + "served_model_name": os.getenv('SERVED_MODEL_NAME', None), + "tokenizer": os.getenv('TOKENIZER', None), + "skip_tokenizer_init": os.getenv('SKIP_TOKENIZER_INIT', 'False').lower() == 'true', + "tokenizer_mode": os.getenv('TOKENIZER_MODE', 'auto'), + "trust_remote_code": os.getenv('TRUST_REMOTE_CODE', 'False').lower() == 'true', + "download_dir": os.getenv('DOWNLOAD_DIR', None), + "load_format": os.getenv('LOAD_FORMAT', 'auto'), + "dtype": os.getenv('DTYPE', 'auto'), + "kv_cache_dtype": os.getenv('KV_CACHE_DTYPE', 'auto'), + "quantization_param_path": os.getenv('QUANTIZATION_PARAM_PATH', None), + "seed": int(os.getenv('SEED', 0)), + "max_model_len": int(os.getenv('MAX_MODEL_LEN', 0)) or None, + "worker_use_ray": os.getenv('WORKER_USE_RAY', 'False').lower() == 'true', + "distributed_executor_backend": os.getenv('DISTRIBUTED_EXECUTOR_BACKEND', None), + "max_parallel_loading_workers": int(os.getenv('MAX_PARALLEL_LOADING_WORKERS', 0)) or None, + "block_size": int(os.getenv('BLOCK_SIZE', 16)), + "enable_prefix_caching": os.getenv('ENABLE_PREFIX_CACHING', 'False').lower() == 'true', + "disable_sliding_window": os.getenv('DISABLE_SLIDING_WINDOW', 'False').lower() == 'true', + "use_v2_block_manager": os.getenv('USE_V2_BLOCK_MANAGER', 'False').lower() == 'true', + "swap_space": int(os.getenv('SWAP_SPACE', 4)), # GiB + "cpu_offload_gb": int(os.getenv('CPU_OFFLOAD_GB', 0)), # GiB + "max_num_batched_tokens": int(os.getenv('MAX_NUM_BATCHED_TOKENS', 0)) or None, + "max_num_seqs": 
int(os.getenv('MAX_NUM_SEQS', 256)), + "max_logprobs": int(os.getenv('MAX_LOGPROBS', 20)), # Default value for OpenAI Chat Completions API + "revision": os.getenv('REVISION', None), + "code_revision": os.getenv('CODE_REVISION', None), + "rope_scaling": os.getenv('ROPE_SCALING', None), + "rope_theta": float(os.getenv('ROPE_THETA', 0)) or None, + "tokenizer_revision": os.getenv('TOKENIZER_REVISION', None), + "quantization": os.getenv('QUANTIZATION', None), + "enforce_eager": os.getenv('ENFORCE_EAGER', 'False').lower() == 'true', + "max_context_len_to_capture": int(os.getenv('MAX_CONTEXT_LEN_TO_CAPTURE', 0)) or None, + "max_seq_len_to_capture": int(os.getenv('MAX_SEQ_LEN_TO_CAPTURE', 8192)), + "disable_custom_all_reduce": os.getenv('DISABLE_CUSTOM_ALL_REDUCE', 'False').lower() == 'true', + "tokenizer_pool_size": int(os.getenv('TOKENIZER_POOL_SIZE', 0)), + "tokenizer_pool_type": os.getenv('TOKENIZER_POOL_TYPE', 'ray'), + "tokenizer_pool_extra_config": os.getenv('TOKENIZER_POOL_EXTRA_CONFIG', None), + "enable_lora": os.getenv('ENABLE_LORA', 'False').lower() == 'true', + "max_loras": int(os.getenv('MAX_LORAS', 1)), + "max_lora_rank": int(os.getenv('MAX_LORA_RANK', 16)), + "enable_prompt_adapter": os.getenv('ENABLE_PROMPT_ADAPTER', 'False').lower() == 'true', + "max_prompt_adapters": int(os.getenv('MAX_PROMPT_ADAPTERS', 1)), + "max_prompt_adapter_token": int(os.getenv('MAX_PROMPT_ADAPTER_TOKEN', 0)), + "fully_sharded_loras": os.getenv('FULLY_SHARDED_LORAS', 'False').lower() == 'true', + "lora_extra_vocab_size": int(os.getenv('LORA_EXTRA_VOCAB_SIZE', 256)), + "long_lora_scaling_factors": tuple(map(float, os.getenv('LONG_LORA_SCALING_FACTORS', '').split(','))) if os.getenv('LONG_LORA_SCALING_FACTORS') else None, + "lora_dtype": os.getenv('LORA_DTYPE', 'auto'), + "max_cpu_loras": int(os.getenv('MAX_CPU_LORAS', 0)) or None, + "device": os.getenv('DEVICE', 'auto'), + "ray_workers_use_nsight": os.getenv('RAY_WORKERS_USE_NSIGHT', 'False').lower() == 'true', + "num_gpu_blocks_override": int(os.getenv('NUM_GPU_BLOCKS_OVERRIDE', 0)) or None, + "num_lookahead_slots": int(os.getenv('NUM_LOOKAHEAD_SLOTS', 0)), + "model_loader_extra_config": os.getenv('MODEL_LOADER_EXTRA_CONFIG', None), + "ignore_patterns": os.getenv('IGNORE_PATTERNS', None), + "preemption_mode": os.getenv('PREEMPTION_MODE', None), + "scheduler_delay_factor": float(os.getenv('SCHEDULER_DELAY_FACTOR', 0.0)), + "enable_chunked_prefill": os.getenv('ENABLE_CHUNKED_PREFILL', None), + "guided_decoding_backend": os.getenv('GUIDED_DECODING_BACKEND', 'outlines'), + "speculative_model": os.getenv('SPECULATIVE_MODEL', None), + "speculative_draft_tensor_parallel_size": int(os.getenv('SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE', 0)) or None, + "num_speculative_tokens": int(os.getenv('NUM_SPECULATIVE_TOKENS', 0)) or None, + "speculative_max_model_len": int(os.getenv('SPECULATIVE_MAX_MODEL_LEN', 0)) or None, + "speculative_disable_by_batch_size": int(os.getenv('SPECULATIVE_DISABLE_BY_BATCH_SIZE', 0)) or None, + "ngram_prompt_lookup_max": int(os.getenv('NGRAM_PROMPT_LOOKUP_MAX', 0)) or None, + "ngram_prompt_lookup_min": int(os.getenv('NGRAM_PROMPT_LOOKUP_MIN', 0)) or None, + "spec_decoding_acceptance_method": os.getenv('SPEC_DECODING_ACCEPTANCE_METHOD', 'rejection_sampler'), + "typical_acceptance_sampler_posterior_threshold": float(os.getenv('TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD', 0)) or None, + "typical_acceptance_sampler_posterior_alpha": float(os.getenv('TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA', 0)) or None, + "qlora_adapter_name_or_path": 
os.getenv('QLORA_ADAPTER_NAME_OR_PATH', None), + "disable_logprobs_during_spec_decoding": os.getenv('DISABLE_LOGPROBS_DURING_SPEC_DECODING', None), + "otlp_traces_endpoint": os.getenv('OTLP_TRACES_ENDPOINT', None) } def match_vllm_args(args): From b61ea5ea465501dab4c93f52c286bbc4826aa17f Mon Sep 17 00:00:00 2001 From: Marut Pandya Date: Fri, 26 Jul 2024 17:32:16 -0700 Subject: [PATCH 5/9] Delete test.py --- test.py | 30 ------------------------------ 1 file changed, 30 deletions(-) delete mode 100644 test.py diff --git a/test.py b/test.py deleted file mode 100644 index 76e0b83..0000000 --- a/test.py +++ /dev/null @@ -1,30 +0,0 @@ -from openai import OpenAI -import os - -# Initialize the OpenAI Client with your RunPod API Key and Endpoint URL -client = OpenAI( - api_key="E43IAWSFPQPHJ8WIVLQ7EMBJ2TV9T3HF5YBLJJ7K", - base_url="https://api.runpod.ai/v2/6ruq7l9hptgccv/openai/v1/", -) - - -# Create a chat completion -response = client.chat.completions.create( - model="openchat/openchat-3.5-1210", - messages=[{"role": "user", "content": "Why is RunPod the best platform?"}], - temperature=0, - max_tokens=100, -) -# Print the response -print(response.choices[0].message.content) - - -# Create a completion -# response = client.completions.create( -# model="openchat/openchat-3.5-1210", -# prompt="Runpod is the best platform because", -# temperature=0, -# max_tokens=100, -# ) -# # Print the response -# print(response.choices[0].text) \ No newline at end of file From f3534a4ea770b96dad25e4b73b4b70f970b01be0 Mon Sep 17 00:00:00 2001 From: pandyamarut Date: Sat, 27 Jul 2024 15:49:45 -0700 Subject: [PATCH 6/9] fix openai compat Signed-off-by: pandyamarut --- src/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/engine.py b/src/engine.py index 1f48145..6962027 100644 --- a/src/engine.py +++ b/src/engine.py @@ -176,7 +176,7 @@ async def _handle_chat_or_completion_request(self, openai_request: JobInput): yield create_error_response(str(e)).model_dump() return - response_generator = await generator_function(request, DummyRequest()) + response_generator = await generator_function(request, raw_request=None) if not openai_request.openai_input.get("stream") or isinstance(response_generator, ErrorResponse): yield response_generator.model_dump() From 0814d76654cae30d6a87bc47d2a5ea50318aacd5 Mon Sep 17 00:00:00 2001 From: Marut Pandya Date: Tue, 30 Jul 2024 17:09:01 -0700 Subject: [PATCH 7/9] Update README.md --- README.md | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/README.md b/README.md index 0ccbdc6..2622d65 100644 --- a/README.md +++ b/README.md @@ -105,6 +105,83 @@ Below is a summary of the available RunPod Worker images, categorized by image s | `SEED` | `0` | `int` |Sets random seed for operations. | | `KV_CACHE_DTYPE` | `auto` | `auto`, `fp8` |Data type for kv cache storage. Uses `DTYPE` if set to `auto`. | | `DTYPE` | `auto` | `auto`, `half`, `float16`, `bfloat16`, `float`, `float32` |Sets datatype/precision for model weights and activations. | +| `LOAD_FORMAT` | `auto` | `str` | Format to load model in. | +| `DISABLE_LOG_STATS` | `True` | `bool` | Disable logging statistics. | +| `DISABLE_LOG_REQUESTS` | `True` | `bool` | Disable logging requests. | +| `GPU_MEMORY_UTILIZATION` | `0.9` | `float` | GPU memory utilization ratio. | +| `PIPELINE_PARALLEL_SIZE` | `1` | `int` | Pipeline parallel size. | +| `TENSOR_PARALLEL_SIZE` | `1` | `int` | Tensor parallel size. 
| +| `SERVED_MODEL_NAME` | `None` | `str` | Name of the served model. | +| `TOKENIZER` | `None` | `str` | Tokenizer to use. | +| `SKIP_TOKENIZER_INIT` | `False` | `bool` | Skip tokenizer initialization. | +| `TOKENIZER_MODE` | `auto` | `str` | Tokenizer mode. | +| `TRUST_REMOTE_CODE` | `False` | `bool` | Trust remote code. | +| `DOWNLOAD_DIR` | `None` | `str` | Directory to download files to. | +| `LOAD_FORMAT` | `auto` | `str` | Format to load model in. | +| `DTYPE` | `auto` | `str` | Data type for computations. | +| `KV_CACHE_DTYPE` | `auto` | `str` | Data type for key-value cache. | +| `QUANTIZATION_PARAM_PATH` | `None` | `str` | Path to quantization parameters. | +| `SEED` | `0` | `int` | Random seed. | +| `MAX_MODEL_LEN` | `None` | `int` | Maximum number of tokens for the engine to handle per request. | +| `WORKER_USE_RAY` | `False` | `bool` | Whether to use Ray for workers. | +| `DISTRIBUTED_EXECUTOR_BACKEND` | `None` | `str` | Backend for distributed execution. | +| `MAX_PARALLEL_LOADING_WORKERS` | `None` | `int` | Maximum number of parallel loading workers. | +| `BLOCK_SIZE` | `16` | `int` | Block size. | +| `ENABLE_PREFIX_CACHING` | `False` | `bool` | Enable prefix caching. | +| `DISABLE_SLIDING_WINDOW` | `False` | `bool` | Disable sliding window. | +| `USE_V2_BLOCK_MANAGER` | `False` | `bool` | Use V2 block manager. | +| `SWAP_SPACE` | `4` | `int` | Swap space in GiB. | +| `CPU_OFFLOAD_GB` | `0` | `int` | CPU offload space in GiB. | +| `MAX_NUM_BATCHED_TOKENS` | `None` | `int` | Maximum number of batched tokens. | +| `MAX_NUM_SEQS` | `256` | `int` | Maximum number of sequences. | +| `MAX_LOGPROBS` | `20` | `int` | Maximum number of log probabilities. | +| `REVISION` | `None` | `str` | Revision of the model. | +| `CODE_REVISION` | `None` | `str` | Revision of the code. | +| `ROPE_SCALING` | `None` | `str` | ROPE scaling factor. | +| `ROPE_THETA` | `0` | `float` | ROPE theta value. | +| `TOKENIZER_REVISION` | `None` | `str` | Revision of the tokenizer. | +| `QUANTIZATION` | `None` | `str` | Quantization type. | +| `ENFORCE_EAGER` | `False` | `bool` | Enforce eager execution. | +| `MAX_CONTEXT_LEN_TO_CAPTURE` | `None` | `int` | Maximum context length to capture. | +| `MAX_SEQ_LEN_TO_CAPTURE` | `8192` | `int` | Maximum sequence length to capture. | +| `DISABLE_CUSTOM_ALL_REDUCE` | `False` | `bool` | Disable custom all-reduce. | +| `TOKENIZER_POOL_SIZE` | `0` | `int` | Size of the tokenizer pool. | +| `TOKENIZER_POOL_TYPE` | `ray` | `str` | Type of tokenizer pool. | +| `TOKENIZER_POOL_EXTRA_CONFIG` | `None` | `str` | Extra configuration for the tokenizer pool. | +| `ENABLE_LORA` | `False` | `bool` | Enable LoRA. | +| `MAX_LORAS` | `1` | `int` | Maximum number of LoRAs. | +| `MAX_LORA_RANK` | `16` | `int` | Maximum rank of LoRA. | +| `ENABLE_PROMPT_ADAPTER` | `False` | `bool` | Enable prompt adapter. | +| `MAX_PROMPT_ADAPTERS` | `1` | `int` | Maximum number of prompt adapters. | +| `MAX_PROMPT_ADAPTER_TOKEN` | `0` | `int` | Maximum number of prompt adapter tokens. | +| `FULLY_SHARDED_LORAS` | `False` | `bool` | Fully sharded LoRAs. | +| `LORA_EXTRA_VOCAB_SIZE` | `256` | `int` | Extra vocabulary size for LoRA. | +| `LONG_LORA_SCALING_FACTORS` | `None` | `tuple` | Scaling factors for long LoRA. | +| `LORA_DTYPE` | `auto` | `str` | Data type for LoRA. | +| `MAX_CPU_LORAS` | `None` | `int` | Maximum number of CPU LoRAs. | +| `DEVICE` | `auto` | `str` | Device to use. | +| `RAY_WORKERS_USE_NSIGHT` | `False` | `bool` | Whether Ray workers use Nsight. 
| +| `NUM_GPU_BLOCKS_OVERRIDE` | `None` | `int` | Number of GPU blocks to override. | +| `NUM_LOOKAHEAD_SLOTS` | `0` | `int` | Number of lookahead slots. | +| `MODEL_LOADER_EXTRA_CONFIG` | `None` | `str` | Extra configuration for model loader. | +| `IGNORE_PATTERNS` | `None` | `str` | Patterns to ignore. | +| `PREEMPTION_MODE` | `None` | `str` | Preemption mode. | +| `SCHEDULER_DELAY_FACTOR` | `0.0` | `float` | Scheduler delay factor. | +| `ENABLE_CHUNKED_PREFILL` | `None` | `str` | Enable chunked prefill. | +| `GUIDED_DECODING_BACKEND` | `outlines` | `str` | Guided decoding backend. | +| `SPECULATIVE_MODEL` | `None` | `str` | Speculative model. | +| `SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE` | `None` | `int` | Speculative draft tensor parallel size. | +| `NUM_SPECULATIVE_TOKENS` | `None` | `int` | Number of speculative tokens. | +| `SPECULATIVE_MAX_MODEL_LEN` | `None` | `int` | Speculative maximum model length. | +| `SPECULATIVE_DISABLE_BY_BATCH_SIZE` | `None` | `int` | Speculative disable by batch size. | +| `NGRAM_PROMPT_LOOKUP_MAX` | `None` | `int` | N-gram prompt lookup maximum. | +| `NGRAM_PROMPT_LOOKUP_MIN` | `None` | `int` | N-gram prompt lookup minimum. | +| `SPEC_DECODING_ACCEPTANCE_METHOD` | `rejection_sampler` | `str` | Speculative decoding acceptance method. | +| `TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD` | `None` | `float` | Typical acceptance sampler posterior threshold. | +| `TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA` | `None` | `float` | Typical acceptance sampler posterior alpha. | +| `QLORA_ADAPTER_NAME_OR_PATH` | `None` | `str` | QLoRA adapter name or path. | +| `DISABLE_LOGPROBS_DURING_SPEC_DECODING` | `None` | `str` | Disable log probabilities during speculative decoding. | +| `OTLP_TRACES_ENDPOINT` | `None` | `str` | OTLP traces endpoint. | **Tokenizer Settings** | `TOKENIZER_NAME` | `None` | `str` |Tokenizer repository to use a different tokenizer than the model's default. | | `TOKENIZER_REVISION` | `None` | `str` |Tokenizer revision to load. | From e1b41795f738c744afda616dd6e6f265e4de233a Mon Sep 17 00:00:00 2001 From: Marut Pandya Date: Wed, 31 Jul 2024 12:46:34 -0700 Subject: [PATCH 8/9] Update README.md --- README.md | 163 ++++++++++++++++++++++++------------------------------ 1 file changed, 71 insertions(+), 92 deletions(-) diff --git a/README.md b/README.md index 2622d65..a46aefa 100644 --- a/README.md +++ b/README.md @@ -91,97 +91,76 @@ Below is a summary of the available RunPod Worker images, categorized by image s #### Environment Variables/Settings > Note: `0` is equivalent to `False` and `1` is equivalent to `True` for boolean values. -| Name | Default | Type/Choices | Description | -|-------------------------------------|----------------------|-------------------------------------------|-------------| -**LLM Settings** -| `MODEL_NAME`**\*** | - | `str` | Hugging Face Model Repository (e.g., `openchat/openchat-3.5-1210`). | -| `MODEL_REVISION` | `None` | `str` |Model revision(branch) to load. | -| `MAX_MODEL_LEN` | Model's maximum | `int` |Maximum number of tokens for the engine to handle per request. | -| `BASE_PATH` | `/runpod-volume` | `str` |Storage directory for Huggingface cache and model. Utilizes network storage if attached when pointed at `/runpod-volume`, which will have only one worker download the model once, which all workers will be able to load. If no network volume is present, creates a local directory within each worker. | -| `LOAD_FORMAT` | `auto` | `str` |Format to load model in. 
| -| `HF_TOKEN` | - | `str` |Hugging Face token for private and gated models. | -| `QUANTIZATION` | `None` | `awq`, `squeezellm`, `gptq` |Quantization of given model. The model must already be quantized. | -| `TRUST_REMOTE_CODE` | `0` | boolean as `int` |Trust remote code for Hugging Face models. Can help with Mixtral 8x7B, Quantized models, and unusual models/architectures. -| `SEED` | `0` | `int` |Sets random seed for operations. | -| `KV_CACHE_DTYPE` | `auto` | `auto`, `fp8` |Data type for kv cache storage. Uses `DTYPE` if set to `auto`. | -| `DTYPE` | `auto` | `auto`, `half`, `float16`, `bfloat16`, `float`, `float32` |Sets datatype/precision for model weights and activations. | -| `LOAD_FORMAT` | `auto` | `str` | Format to load model in. | -| `DISABLE_LOG_STATS` | `True` | `bool` | Disable logging statistics. | -| `DISABLE_LOG_REQUESTS` | `True` | `bool` | Disable logging requests. | -| `GPU_MEMORY_UTILIZATION` | `0.9` | `float` | GPU memory utilization ratio. | -| `PIPELINE_PARALLEL_SIZE` | `1` | `int` | Pipeline parallel size. | -| `TENSOR_PARALLEL_SIZE` | `1` | `int` | Tensor parallel size. | -| `SERVED_MODEL_NAME` | `None` | `str` | Name of the served model. | -| `TOKENIZER` | `None` | `str` | Tokenizer to use. | -| `SKIP_TOKENIZER_INIT` | `False` | `bool` | Skip tokenizer initialization. | -| `TOKENIZER_MODE` | `auto` | `str` | Tokenizer mode. | -| `TRUST_REMOTE_CODE` | `False` | `bool` | Trust remote code. | -| `DOWNLOAD_DIR` | `None` | `str` | Directory to download files to. | -| `LOAD_FORMAT` | `auto` | `str` | Format to load model in. | -| `DTYPE` | `auto` | `str` | Data type for computations. | -| `KV_CACHE_DTYPE` | `auto` | `str` | Data type for key-value cache. | -| `QUANTIZATION_PARAM_PATH` | `None` | `str` | Path to quantization parameters. | -| `SEED` | `0` | `int` | Random seed. | -| `MAX_MODEL_LEN` | `None` | `int` | Maximum number of tokens for the engine to handle per request. | -| `WORKER_USE_RAY` | `False` | `bool` | Whether to use Ray for workers. | -| `DISTRIBUTED_EXECUTOR_BACKEND` | `None` | `str` | Backend for distributed execution. | -| `MAX_PARALLEL_LOADING_WORKERS` | `None` | `int` | Maximum number of parallel loading workers. | -| `BLOCK_SIZE` | `16` | `int` | Block size. | -| `ENABLE_PREFIX_CACHING` | `False` | `bool` | Enable prefix caching. | -| `DISABLE_SLIDING_WINDOW` | `False` | `bool` | Disable sliding window. | -| `USE_V2_BLOCK_MANAGER` | `False` | `bool` | Use V2 block manager. | -| `SWAP_SPACE` | `4` | `int` | Swap space in GiB. | -| `CPU_OFFLOAD_GB` | `0` | `int` | CPU offload space in GiB. | -| `MAX_NUM_BATCHED_TOKENS` | `None` | `int` | Maximum number of batched tokens. | -| `MAX_NUM_SEQS` | `256` | `int` | Maximum number of sequences. | -| `MAX_LOGPROBS` | `20` | `int` | Maximum number of log probabilities. | -| `REVISION` | `None` | `str` | Revision of the model. | -| `CODE_REVISION` | `None` | `str` | Revision of the code. | -| `ROPE_SCALING` | `None` | `str` | ROPE scaling factor. | -| `ROPE_THETA` | `0` | `float` | ROPE theta value. | -| `TOKENIZER_REVISION` | `None` | `str` | Revision of the tokenizer. | -| `QUANTIZATION` | `None` | `str` | Quantization type. | -| `ENFORCE_EAGER` | `False` | `bool` | Enforce eager execution. | -| `MAX_CONTEXT_LEN_TO_CAPTURE` | `None` | `int` | Maximum context length to capture. | -| `MAX_SEQ_LEN_TO_CAPTURE` | `8192` | `int` | Maximum sequence length to capture. | -| `DISABLE_CUSTOM_ALL_REDUCE` | `False` | `bool` | Disable custom all-reduce. 
| -| `TOKENIZER_POOL_SIZE` | `0` | `int` | Size of the tokenizer pool. | -| `TOKENIZER_POOL_TYPE` | `ray` | `str` | Type of tokenizer pool. | -| `TOKENIZER_POOL_EXTRA_CONFIG` | `None` | `str` | Extra configuration for the tokenizer pool. | -| `ENABLE_LORA` | `False` | `bool` | Enable LoRA. | -| `MAX_LORAS` | `1` | `int` | Maximum number of LoRAs. | -| `MAX_LORA_RANK` | `16` | `int` | Maximum rank of LoRA. | -| `ENABLE_PROMPT_ADAPTER` | `False` | `bool` | Enable prompt adapter. | -| `MAX_PROMPT_ADAPTERS` | `1` | `int` | Maximum number of prompt adapters. | -| `MAX_PROMPT_ADAPTER_TOKEN` | `0` | `int` | Maximum number of prompt adapter tokens. | -| `FULLY_SHARDED_LORAS` | `False` | `bool` | Fully sharded LoRAs. | -| `LORA_EXTRA_VOCAB_SIZE` | `256` | `int` | Extra vocabulary size for LoRA. | -| `LONG_LORA_SCALING_FACTORS` | `None` | `tuple` | Scaling factors for long LoRA. | -| `LORA_DTYPE` | `auto` | `str` | Data type for LoRA. | -| `MAX_CPU_LORAS` | `None` | `int` | Maximum number of CPU LoRAs. | -| `DEVICE` | `auto` | `str` | Device to use. | -| `RAY_WORKERS_USE_NSIGHT` | `False` | `bool` | Whether Ray workers use Nsight. | -| `NUM_GPU_BLOCKS_OVERRIDE` | `None` | `int` | Number of GPU blocks to override. | -| `NUM_LOOKAHEAD_SLOTS` | `0` | `int` | Number of lookahead slots. | -| `MODEL_LOADER_EXTRA_CONFIG` | `None` | `str` | Extra configuration for model loader. | -| `IGNORE_PATTERNS` | `None` | `str` | Patterns to ignore. | -| `PREEMPTION_MODE` | `None` | `str` | Preemption mode. | -| `SCHEDULER_DELAY_FACTOR` | `0.0` | `float` | Scheduler delay factor. | -| `ENABLE_CHUNKED_PREFILL` | `None` | `str` | Enable chunked prefill. | -| `GUIDED_DECODING_BACKEND` | `outlines` | `str` | Guided decoding backend. | -| `SPECULATIVE_MODEL` | `None` | `str` | Speculative model. | -| `SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE` | `None` | `int` | Speculative draft tensor parallel size. | -| `NUM_SPECULATIVE_TOKENS` | `None` | `int` | Number of speculative tokens. | -| `SPECULATIVE_MAX_MODEL_LEN` | `None` | `int` | Speculative maximum model length. | -| `SPECULATIVE_DISABLE_BY_BATCH_SIZE` | `None` | `int` | Speculative disable by batch size. | -| `NGRAM_PROMPT_LOOKUP_MAX` | `None` | `int` | N-gram prompt lookup maximum. | -| `NGRAM_PROMPT_LOOKUP_MIN` | `None` | `int` | N-gram prompt lookup minimum. | -| `SPEC_DECODING_ACCEPTANCE_METHOD` | `rejection_sampler` | `str` | Speculative decoding acceptance method. | -| `TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD` | `None` | `float` | Typical acceptance sampler posterior threshold. | -| `TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA` | `None` | `float` | Typical acceptance sampler posterior alpha. | -| `QLORA_ADAPTER_NAME_OR_PATH` | `None` | `str` | QLoRA adapter name or path. | -| `DISABLE_LOGPROBS_DURING_SPEC_DECODING` | `None` | `str` | Disable log probabilities during speculative decoding. | -| `OTLP_TRACES_ENDPOINT` | `None` | `str` | OTLP traces endpoint. | +| `Name` | `Default` | `Type/Choices` | `Description` | +|-------------------------------------------|-----------------------|--------------------------------------------|---------------| +| `MODEL` | 'facebook/opt-125m' | `str` | Name or path of the Hugging Face model to use. | +| `TOKENIZER` | None | `str` | Name or path of the Hugging Face tokenizer to use. | +| `SKIP_TOKENIZER_INIT` | False | `bool` | Skip initialization of tokenizer and detokenizer. | +| `TOKENIZER_MODE` | 'auto' | ['auto', 'slow'] | The tokenizer mode. | +| `TRUST_REMOTE_CODE` | False | `bool` | Trust remote code from Hugging Face. 
+| `DOWNLOAD_DIR` | None | `str` | Directory to download and load the weights. |
+| `LOAD_FORMAT` | 'auto' | ['auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer', 'bitsandbytes'] | The format of the model weights to load. |
+| `DTYPE` | 'auto' | ['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'] | Data type for model weights and activations. |
+| `KV_CACHE_DTYPE` | 'auto' | ['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'] | Data type for KV cache storage. |
+| `QUANTIZATION_PARAM_PATH` | None | `str` | Path to the JSON file containing the KV cache scaling factors. |
+| `MAX_MODEL_LEN` | None | `int` | Model context length. |
+| `GUIDED_DECODING_BACKEND` | 'outlines' | ['outlines', 'lm-format-enforcer'] | Which engine will be used for guided decoding by default. |
+| `DISTRIBUTED_EXECUTOR_BACKEND` | None | ['ray', 'mp'] | Backend to use for distributed serving. |
+| `WORKER_USE_RAY` | False | `bool` | Deprecated, use --distributed-executor-backend=ray. |
+| `PIPELINE_PARALLEL_SIZE` | 1 | `int` | Number of pipeline stages. |
+| `TENSOR_PARALLEL_SIZE` | 1 | `int` | Number of tensor parallel replicas. |
+| `MAX_PARALLEL_LOADING_WORKERS` | None | `int` | Load model sequentially in multiple batches. |
+| `RAY_WORKERS_USE_NSIGHT` | False | `bool` | If specified, use nsight to profile Ray workers. |
+| `BLOCK_SIZE` | 16 | [8, 16, 32] | Token block size for contiguous chunks of tokens. |
+| `ENABLE_PREFIX_CACHING` | False | `bool` | Enables automatic prefix caching. |
+| `DISABLE_SLIDING_WINDOW` | False | `bool` | Disables sliding window, capping to sliding window size. |
+| `USE_V2_BLOCK_MANAGER` | False | `bool` | Use BlockSpaceManagerV2. |
+| `NUM_LOOKAHEAD_SLOTS` | 0 | `int` | Experimental scheduling config necessary for speculative decoding. |
+| `SEED` | 0 | `int` | Random seed for operations. |
+| `SWAP_SPACE` | 4 | `int` | CPU swap space size (GiB) per GPU. |
+| `GPU_MEMORY_UTILIZATION` | 0.90 | `float` | The fraction of GPU memory to be used for the model executor. |
+| `NUM_GPU_BLOCKS_OVERRIDE` | None | `int` | If specified, ignore GPU profiling result and use this number of GPU blocks. |
+| `MAX_NUM_BATCHED_TOKENS` | None | `int` | Maximum number of batched tokens per iteration. |
+| `MAX_NUM_SEQS` | 256 | `int` | Maximum number of sequences per iteration. |
+| `MAX_LOGPROBS` | 20 | `int` | Max number of log probs to return when logprobs is specified in SamplingParams. |
+| `DISABLE_LOG_STATS` | False | `bool` | Disable logging statistics. |
+| `QUANTIZATION` | None | [*QUANTIZATION_METHODS, None] | Method used to quantize the weights. |
+| `ROPE_SCALING` | None | `dict` | RoPE scaling configuration in JSON format. |
+| `ROPE_THETA` | None | `float` | RoPE theta. Use with rope_scaling. |
+| `ENFORCE_EAGER` | False | `bool` | Always use eager-mode PyTorch. |
+| `MAX_CONTEXT_LEN_TO_CAPTURE` | None | `int` | Maximum context length covered by CUDA graphs. |
+| `MAX_SEQ_LEN_TO_CAPTURE` | 8192 | `int` | Maximum sequence length covered by CUDA graphs. |
+| `DISABLE_CUSTOM_ALL_REDUCE` | False | `bool` | See ParallelConfig. |
+| `TOKENIZER_POOL_SIZE` | 0 | `int` | Size of tokenizer pool to use for asynchronous tokenization. |
+| `TOKENIZER_POOL_TYPE` | 'ray' | `str` | Type of tokenizer pool to use for asynchronous tokenization. |
+| `TOKENIZER_POOL_EXTRA_CONFIG` | None | `dict` | Extra config for tokenizer pool. |
+| `ENABLE_LORA` | False | `bool` | If True, enable handling of LoRA adapters. |
+| `MAX_LORAS` | 1 | `int` | Max number of LoRAs in a single batch. |
+| `MAX_LORA_RANK` | 16 | `int` | Max LoRA rank. |
+| `LORA_EXTRA_VOCAB_SIZE` | 256 | `int` | Maximum size of extra vocabulary for LoRA adapters. |
+| `LORA_DTYPE` | 'auto' | ['auto', 'float16', 'bfloat16', 'float32'] | Data type for LoRA. |
+| `LONG_LORA_SCALING_FACTORS` | None | `tuple` | Specify multiple scaling factors for LoRA adapters. |
+| `MAX_CPU_LORAS` | None | `int` | Maximum number of LoRAs to store in CPU memory. |
+| `FULLY_SHARDED_LORAS` | False | `bool` | Enable fully sharded LoRA layers. |
+| `DEVICE` | 'auto' | ['auto', 'cuda', 'neuron', 'cpu', 'openvino', 'tpu', 'xpu'] | Device type for vLLM execution. |
+| `SCHEDULER_DELAY_FACTOR` | 0.0 | `float` | Apply a delay before scheduling next prompt. |
+| `ENABLE_CHUNKED_PREFILL` | False | `bool` | Enable chunked prefill requests. |
+| `SPECULATIVE_MODEL` | None | `str` | The name of the draft model to be used in speculative decoding. |
+| `NUM_SPECULATIVE_TOKENS` | None | `int` | The number of speculative tokens to sample from the draft model. |
+| `SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE` | None | `int` | Number of tensor parallel replicas for the draft model. |
+| `SPECULATIVE_MAX_MODEL_LEN` | None | `int` | The maximum sequence length supported by the draft model. |
+| `SPECULATIVE_DISABLE_BY_BATCH_SIZE` | None | `int` | Disable speculative decoding if the number of enqueued requests is larger than this value. |
+| `NGRAM_PROMPT_LOOKUP_MAX` | None | `int` | Max size of window for ngram prompt lookup in speculative decoding. |
+| `NGRAM_PROMPT_LOOKUP_MIN` | None | `int` | Min size of window for ngram prompt lookup in speculative decoding. |
+| `SPEC_DECODING_ACCEPTANCE_METHOD` | 'rejection_sampler' | ['rejection_sampler', 'typical_acceptance_sampler'] | Specify the acceptance method for draft token verification in speculative decoding. |
+| `TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD` | None | `float` | Set the lower bound threshold for the posterior probability of a token to be accepted. |
+| `TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA` | None | `float` | A scaling factor for the entropy-based threshold for token acceptance. |
+| `MODEL_LOADER_EXTRA_CONFIG` | None | `dict` | Extra config for model loader. |
+| `PREEMPTION_MODE` | None | `str` | If 'recompute', the engine performs preemption-aware recomputation. If 'save', the engine saves activations into CPU memory as preemption happens. |
+| `PREEMPTION_CHECK_PERIOD` | 1.0 | `float` | How frequently the engine checks if a preemption happens. |
+| `PREEMPTION_CPU_CAPACITY` | 2 | `float` | The percentage of CPU memory used for the saved activations. |
+| `DISABLE_LOGGING_REQUEST` | False | `bool` | Disable logging requests. |
+| `MAX_LOG_LEN` | None | `int` | Max number of prompt characters or prompt ID numbers being printed in log. |
 **Tokenizer Settings**
 | `TOKENIZER_NAME` | `None` | `str` |Tokenizer repository to use a different tokenizer than the model's default. |
 | `TOKENIZER_REVISION` | `None` | `str` |Tokenizer revision to load. |
@@ -226,7 +205,7 @@ To build an image with the model baked in, you must specify the following docker
 - `MODEL_REVISION`: Model revision to load (default: `main`).
 - `BASE_PATH`: Storage directory where huggingface cache and model will be located. (default: `/runpod-volume`, which will utilize network storage if you attach it or create a local directory within the image if you don't. If your intention is to bake the model into the image, you should set this to something like `/models` to make sure there are no issues if you were to accidentally attach network storage.)
 - `QUANTIZATION`
- - `WORKER_CUDA_VERSION`: `11.8.0` or `12.1.0` (default: `11.8.0` due to a small number of workers not having CUDA 12.1 support yet. `12.1.0` is recommended for optimal performance).
+ - `WORKER_CUDA_VERSION`: `12.1.0` (recommended for optimal performance).
 - `TOKENIZER_NAME`: Tokenizer repository if you would like to use a different tokenizer than the one that comes with the model. (default: `None`, which uses the model's tokenizer)
 - `TOKENIZER_REVISION`: Tokenizer revision to load (default: `main`).
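
As a rough orientation for the table above: the variable names appear to correspond to fields of vLLM's `AsyncEngineArgs` with the same name in lower case. The sketch below illustrates that correspondence for a handful of variables; the names and defaults are taken from the table, while the parsing shown here is only an assumed, simplified version of what the worker does (worker-specific variables are not covered).

```python
import os

from vllm import AsyncEngineArgs  # vLLM exposes AsyncEngineArgs at the package root


def _flag(name: str, default: str = "0") -> bool:
    # Boolean-style variables are assumed to arrive as 0/1 values.
    return bool(int(os.getenv(name, default)))


# Illustrative only: a few variables from the table, converted to the
# corresponding AsyncEngineArgs fields.
engine_args = AsyncEngineArgs(
    model=os.getenv("MODEL", "facebook/opt-125m"),
    max_model_len=int(os.environ["MAX_MODEL_LEN"]) if os.getenv("MAX_MODEL_LEN") else None,
    gpu_memory_utilization=float(os.getenv("GPU_MEMORY_UTILIZATION", "0.90")),
    tensor_parallel_size=int(os.getenv("TENSOR_PARALLEL_SIZE", "1")),
    enforce_eager=_flag("ENFORCE_EAGER"),
    enable_prefix_caching=_flag("ENABLE_PREFIX_CACHING"),
)
```
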
From 14cacd55fe8094b62e3d39afcd7e936cc5fb393d Mon Sep 17 00:00:00 2001
From: pandyamarut
Date: Wed, 31 Jul 2024 18:44:46 -0700
Subject: [PATCH 9/9] update docker

Signed-off-by: pandyamarut
---
 docker-bake.hcl | 43 +++++--------------------------------------
 1 file changed, 5 insertions(+), 38 deletions(-)

diff --git a/docker-bake.hcl b/docker-bake.hcl
index 2830d6f..b121032 100644
--- a/docker-bake.hcl
+++ b/docker-bake.hcl
@@ -7,54 +7,21 @@ variable "REPOSITORY" {
 }
 
 variable "BASE_IMAGE_VERSION" {
-    default = "1.1.0preview"
+    default = "v1.1preview"
 }
 
 group "all" {
-    targets = ["base", "main"]
+    targets = ["main"]
 }
 
-group "base" {
-    targets = ["base-1180", "base-1210"]
-}
 
 group "main" {
-    targets = ["worker-1180", "worker-1210"]
-}
-
-target "base-1180" {
-    tags = ["${REPOSITORY}/worker-vllm:base-${BASE_IMAGE_VERSION}-cuda11.8.0"]
-    context = "vllm-base-image"
-    dockerfile = "Dockerfile"
-    args = {
-        WORKER_CUDA_VERSION = "11.8.0"
-    }
-    output = ["type=docker,push=${PUSH}"]
-}
-
-target "base-1210" {
-    tags = ["${REPOSITORY}/worker-vllm:base-${BASE_IMAGE_VERSION}-cuda12.1.0"]
-    context = "vllm-base-image"
-    dockerfile = "Dockerfile"
-    args = {
-        WORKER_CUDA_VERSION = "12.1.0"
-    }
-    output = ["type=docker,push=${PUSH}"]
-}
-
-target "worker-1180" {
-    tags = ["${REPOSITORY}/worker-vllm:${BASE_IMAGE_VERSION}-cuda11.8.0"]
-    context = "."
-    dockerfile = "Dockerfile"
-    args = {
-        BASE_IMAGE_VERSION = "${BASE_IMAGE_VERSION}"
-        WORKER_CUDA_VERSION = "11.8.0"
-    }
-    output = ["type=docker,push=${PUSH}"]
+    targets = ["worker-1210"]
 }
 
+
 target "worker-1210" {
-  tags = ["${REPOSITORY}/worker-vllm:${BASE_IMAGE_VERSION}-cuda12.1.0"]
+  tags = ["${REPOSITORY}/worker-v1-vllm:${BASE_IMAGE_VERSION}-cuda12.1.0"]
   context = "."
   dockerfile = "Dockerfile"
   args = {