Rebase #141

Merged: 13 commits, Jan 3, 2025
Dockerfile (2 changes: 1 addition & 1 deletion)

@@ -12,7 +12,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install --upgrade -r /requirements.txt
 
 # Install vLLM (switching back to pip installs since issues that required building fork are fixed and space optimization is not as important since caching) and FlashInfer
-RUN python3 -m pip install vllm==0.6.4 && \
+RUN python3 -m pip install vllm==0.6.6.post1 && \
     python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3
 
 # Setup for Option 2: Building the Image with the Model included
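A quick way to verify the bump after building (a suggested sanity check, not part of this PR; assumes it is run inside the image built from the Dockerfile above):

```python
# Confirm the pinned vLLM version and that FlashInfer is importable
# from the cu121/torch2.3 wheel index installed above.
import vllm
import flashinfer  # noqa: F401

assert vllm.__version__ == "0.6.6.post1", f"unexpected vLLM version: {vllm.__version__}"
print(f"vLLM {vllm.__version__} with FlashInfer available")
```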
README.md (6 changes: 3 additions & 3 deletions)

@@ -18,9 +18,9 @@ Deploy OpenAI-Compatible Blazing-Fast LLM Endpoints powered by the [vLLM](https:
 ### 1. UI for Deploying vLLM Worker on RunPod console:
 ![Demo of Deploying vLLM Worker on RunPod console with new UI](media/ui_demo.gif)
 
-### 2. Worker vLLM `v1.6.0` with vLLM `0.6.3` now available under `stable` tags
+### 2. Worker vLLM `v1.8.0` with vLLM `0.6.6` now available under `stable` tags
 
-Update v1.6.0 is now available, use the image tag `runpod/worker-v1-vllm:v1.6.0stable-cuda12.1.0`.
+Update v1.8.0 is now available, use the image tag `runpod/worker-v1-vllm:v1.8.0stable-cuda12.1.0`.
 
 ### 3. OpenAI-Compatible [Embedding Worker](https://github.com/runpod-workers/worker-infinity-embedding) Released
 Deploy your own OpenAI-compatible Serverless Endpoint on RunPod with multiple embedding models and fast inference for RAG and more!

@@ -82,7 +82,7 @@ Below is a summary of the available RunPod Worker images, categorized by image s
 
 | CUDA Version | Stable Image Tag | Development Image Tag | Note |
 |--------------|-----------------------------------|-----------------------------------|----------------------------------------------------------------------|
-| 12.1.0 | `runpod/worker-v1-vllm:v1.6.0stable-cuda12.1.0` | `runpod/worker-v1-vllm:v1.6.0dev-cuda12.1.0` | When creating an Endpoint, select CUDA Version 12.3, 12.2 and 12.1 in the filter. |
+| 12.1.0 | `runpod/worker-v1-vllm:v1.8.0stable-cuda12.1.0` | `runpod/worker-v1-vllm:v1.8.0dev-cuda12.1.0` | When creating an Endpoint, select CUDA Version 12.3, 12.2 and 12.1 in the filter. |
src/engine.py (3 changes: 3 additions & 0 deletions)

@@ -146,8 +146,11 @@ async def _initialize_engines(self):
             base_model_paths=self.base_model_paths,
             response_role=self.response_role,
             chat_template=self.tokenizer.tokenizer.chat_template,
+            enable_auto_tools=os.getenv('ENABLE_AUTO_TOOL_CHOICE', 'false').lower() == 'true',
+            tool_parser=os.getenv('TOOL_CALL_PARSER', "") or None,
             lora_modules=lora_modules,
             prompt_adapters=None,
+            chat_template_content_format="auto",
             request_logger=None
         )
         self.completion_engine = OpenAIServingCompletion(
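For context on what the two new kwargs enable: once a worker is deployed with `ENABLE_AUTO_TOOL_CHOICE=true` and a parser such as `TOOL_CALL_PARSER=hermes`, an OpenAI-compatible tool-calling request should round-trip. A minimal client-side sketch (not part of this PR; the endpoint ID, API key, model name, and `get_weather` tool are placeholders):

```python
from openai import OpenAI

client = OpenAI(
    base_url="https://api.runpod.ai/v2/<ENDPOINT_ID>/openai/v1",  # placeholder endpoint ID
    api_key="<RUNPOD_API_KEY>",  # placeholder key
)

# Hypothetical tool definition, purely for illustration.
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

response = client.chat.completions.create(
    model="<MODEL_NAME>",  # placeholder: the model the worker serves
    messages=[{"role": "user", "content": "What's the weather in Oslo?"}],
    tools=tools,
    tool_choice="auto",  # only honored when auto tool choice is enabled on the worker
)
print(response.choices[0].message.tool_calls)
```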
worker-config.json (57 changes: 42 additions & 15 deletions)

@@ -1,7 +1,7 @@
 {
   "versions": {
-    "0.6.4": {
-      "imageName": "runpod/worker-v1-vllm:v1.6.0stable-cuda12.1.0",
+    "0.6.6": {
+      "imageName": "runpod/worker-v1-vllm:v1.8.0stable-cuda12.1.0",
       "minimumCudaVersion": "12.1",
       "categories": [
         {
@@ -24,7 +24,8 @@
             "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD",
             "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA",
             "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD",
-            "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST"
+            "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST",
+            "ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER"
           ]
         },
         {
@@ -60,8 +61,8 @@
         }
       ]
     },
-    "0.6.3": {
-      "imageName": "runpod/worker-v1-vllm:v1.6.0stable-cuda12.1.0",
+    "0.6.4": {
+      "imageName": "runpod/worker-v1-vllm:v1.7.0stable-cuda12.1.0",
       "minimumCudaVersion": "12.1",
       "categories": [
         {
@@ -84,7 +85,8 @@
             "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD",
             "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA",
             "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD",
-            "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST"
+            "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST",
+            "ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER"
           ]
         },
         {
@@ -120,8 +122,8 @@
         }
       ]
     },
-    "0.6.2": {
-      "imageName": "runpod/worker-v1-vllm:v1.5.0stable-cuda12.1.0",
+    "0.6.3": {
+      "imageName": "runpod/worker-v1-vllm:v1.6.0stable-cuda12.1.0",
       "minimumCudaVersion": "12.1",
       "categories": [
         {
@@ -132,7 +134,7 @@
             "MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND",
             "WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE",
             "TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING",
-            "DISABLE_SLIDING_WINDOW", "USE_V2_BLOCK_MANAGER", "NUM_LOOKAHEAD_SLOTS",
+            "DISABLE_SLIDING_WINDOW", "NUM_LOOKAHEAD_SLOTS",
             "SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS",
             "MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA",
             "TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG",
@@ -144,7 +146,8 @@
             "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD",
             "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA",
             "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD",
-            "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST"
+            "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST",
+            "ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER"
           ]
         },
         {
@@ -180,8 +183,8 @@
         }
       ]
     },
-    "0.6.1": {
-      "imageName": "runpod/worker-v1-vllm:v1.4.0stable-cuda12.1.0",
+    "0.6.2": {
+      "imageName": "runpod/worker-v1-vllm:v1.5.0stable-cuda12.1.0",
       "minimumCudaVersion": "12.1",
       "categories": [
         {
@@ -240,8 +243,8 @@
         }
       ]
     },
-    "0.5.5": {
-      "imageName": "runpod/worker-v1-vllm:v1.3.1stable-cuda12.1.0",
+    "0.6.1": {
+      "imageName": "runpod/worker-v1-vllm:v1.4.0stable-cuda12.1.0",
       "minimumCudaVersion": "12.1",
       "categories": [
         {
@@ -991,6 +994,30 @@
         "description": "Enables or disables vLLM request logging",
         "required": false,
         "type": "toggle"
-      }
+      },
+      "ENABLE_AUTO_TOOL_CHOICE": {
+        "env_var_name": "ENABLE_AUTO_TOOL_CHOICE",
+        "value": false,
+        "title": "Enable Auto Tool Choice",
+        "description": "Enables or disables automatic tool choice",
+        "required": false,
+        "type": "toggle"
+      },
+      "TOOL_CALL_PARSER": {
+        "env_var_name": "TOOL_CALL_PARSER",
+        "value": "",
+        "title": "Tool Call Parser",
+        "description": "Parser used to extract tool calls from the model output",
+        "required": false,
+        "type": "select",
+        "options": [
+          { "value": "", "label": "None" },
+          { "value": "hermes", "label": "Hermes" },
+          { "value": "mistral", "label": "Mistral" },
+          { "value": "llama3_json", "label": "Llama3 JSON" },
+          { "value": "pythonic", "label": "Pythonic" },
+          { "value": "internlm", "label": "InternLM" }
+        ]
+      }
     }
   }
 }
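How the two new settings are consumed, in a minimal sketch that mirrors the engine.py change above (variable names here are illustrative only):

```python
import os

# ENABLE_AUTO_TOOL_CHOICE is a toggle: only the exact string "true"
# (case-insensitive) enables auto tool choice.
enable_auto_tools = os.getenv("ENABLE_AUTO_TOOL_CHOICE", "false").lower() == "true"

# TOOL_CALL_PARSER's empty-string default (the "None" option in the
# worker-config select) collapses to None, i.e. no parser configured.
tool_parser = os.getenv("TOOL_CALL_PARSER", "") or None

# e.g. with ENABLE_AUTO_TOOL_CHOICE=true and TOOL_CALL_PARSER=mistral:
#   enable_auto_tools -> True, tool_parser -> "mistral"
```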