This repository has been archived by the owner on Feb 18, 2025. It is now read-only.

Commit

merge main
shen-shanshan committed Jan 13, 2025
2 parents 8a9cad2 + 319f283 commit d16a403
Showing 13 changed files with 44 additions and 54 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
@@ -21,7 +21,7 @@ RUN pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm_ascend//vllm/requirements-build.txt
# build vLLM with NPU backend
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="cpu" python3 -m pip install /workspace/vllm_ascend/vllm/
-# install vllm_ascend_plugin
+# install vllm_ascend
RUN python3 -m pip install /workspace/vllm_ascend/

CMD ["/bin/bash"]
2 changes: 1 addition & 1 deletion README.md
@@ -38,7 +38,7 @@ docker exec -it vllm bash

### 1. Prepare CANN env

-Before install vllm_ascend_plugin, you need to install the Ascend CANN Toolkit and Kernels. Please follow the [installation tutorial](https://ascend.github.io/docs/sources/ascend/quick_install.html#id1) or use the following commands for quick installation:
+Before install vllm_ascend, you need to install the Ascend CANN Toolkit and Kernels. Please follow the [installation tutorial](https://ascend.github.io/docs/sources/ascend/quick_install.html#id1) or use the following commands for quick installation:

```bash
# replace the url according to your CANN version and devices
8 changes: 4 additions & 4 deletions setup.py
@@ -27,13 +27,13 @@ def _read_requirements(filename: str) -> List[str]:
try:
requirements = _read_requirements("requirements.txt")
except ValueError:
print("Failed to read requirements.txt in vllm_ascend_plugin.")
print("Failed to read requirements.txt in vllm_ascend.")
return requirements


-setup(name='vllm_ascend_plugin',
+setup(name='vllm_ascend',
version='0.1',
-packages=['vllm_ascend_plugin'],
+packages=['vllm_ascend'],
install_requires=get_requirements(),
extras_require={
"tensorizer": ["tensorizer>=2.9.0"],
@@ -43,5 +43,5 @@ def _read_requirements(filename: str) -> List[str]:
},
entry_points={
'vllm.platform_plugins':
["ascend_plugin = vllm_ascend_plugin:register"]
["ascend_plugin = vllm_ascend:register"]
})
2 changes: 1 addition & 1 deletion vllm_ascend_plugin/__init__.py → vllm_ascend/__init__.py
@@ -1,3 +1,3 @@
def register():
"""Register the NPU platform."""
return "vllm_ascend_plugin.platform.NPUPlatform"
return "vllm_ascend.platform.NPUPlatform"
11 changes: 5 additions & 6 deletions vllm_ascend_plugin/attention.py → vllm_ascend/attention.py
@@ -5,9 +5,9 @@
import torch

try:
-import torch_npu # noqa: F401
+import torch_npu
except ImportError:
print("Failed to import torch_npu.")
print("Failed to import torch_npu")

from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
AttentionMetadata, AttentionType)
@@ -19,7 +19,7 @@
PagedAttentionMetadata)

if TYPE_CHECKING:
-from vllm_ascend_plugin.model_runner import ModelInputForNPUBuilder
+from vllm_ascend.model_runner import ModelInputForNPUBuilder

SHARE_MASK_TRIL_PREFIX_CACHE = None
SHARE_MASK_TRIL = None
@@ -375,16 +375,15 @@ def _add_seq_group(
# TODO(sang): Combine chunked prefill and prefix caching by
# only allowing multiple of block_size chunk size.
# NOTE: This only works for oooooooxxx style attention.
-block_table: List[int] = []
+block_table = []
prefix_cache_hit = any([
inter_data.prefix_cache_hit
for inter_data in self.input_builder.inter_data_list
])
if prefix_cache_hit:
# NOTE(woosuk): For flash-attn, the block table should
# include the entries for the incoming prefill tokens.
-if block_tables is not None:
-    block_table = block_tables[seq_id]
+block_table = block_tables[seq_id]
elif ((chunked_prefill_enabled or not is_prompt)
and block_tables is not None):
if curr_sliding_window_block == 0:
File renamed without changes.
File renamed without changes.
1 change: 1 addition & 0 deletions vllm_ascend/ops/__init__.py
@@ -0,0 +1 @@
import vllm_ascend.ops.layernorm # noqa
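
The new `ops/__init__.py` above registers the custom op purely through an import side effect: importing `vllm_ascend.ops` pulls in `vllm_ascend.ops.layernorm`, whose module-level code patches `RMSNorm` (see the next file). A small illustrative check, assuming both packages are installed — the assertion is not part of the commit:

```python
# Illustrative only: the import itself is what attaches the override.
import vllm_ascend.ops  # runs vllm_ascend/ops/__init__.py -> imports layernorm

from vllm.model_executor.layers.layernorm import RMSNorm
from vllm_ascend.ops import layernorm as npu_layernorm

# After the import, RMSNorm.forward_oot points at the NPU implementation.
assert RMSNorm.forward_oot is npu_layernorm.forward_oot
```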
23 changes: 23 additions & 0 deletions vllm_ascend/ops/layernorm.py
@@ -0,0 +1,23 @@
from typing import Optional, Tuple, Union

import torch

from vllm.model_executor.layers.layernorm import RMSNorm

def forward_oot(
self,
x: torch.Tensor,
residual: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
import torch_npu

if residual is not None:
x, _, residual = torch_npu.npu_add_rms_norm(
x, residual, self.weight, self.variance_epsilon)
return x, residual

x, residual = torch_npu.npu_rms_norm(x, self.weight,
self.variance_epsilon)
return x

RMSNorm.forward_oot = forward_oot
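
For readers without access to `torch_npu`, here is an unfused plain-PyTorch sketch of what the two fused kernels above are expected to compute, based on the standard RMSNorm definition (an assumption for illustration, not code from this commit): `npu_add_rms_norm` folds the residual addition into the normalization and returns the updated residual, while `npu_rms_norm` normalizes `x` alone.

```python
from typing import Optional, Tuple, Union

import torch

def rms_norm_reference(
    x: torch.Tensor,
    weight: torch.Tensor,
    eps: float,
    residual: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    """Unfused reference for the NPU kernels used in forward_oot (sketch)."""
    if residual is not None:
        x = x + residual                      # folded into npu_add_rms_norm
        residual = x                          # updated residual is returned too
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    x = x * torch.rsqrt(variance + eps) * weight
    if residual is not None:
        return x, residual
    return x
```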
14 changes: 7 additions & 7 deletions vllm_ascend_plugin/platform.py → vllm_ascend/platform.py
@@ -4,9 +4,9 @@
import torch

try:
-import torch_npu # noqa: F401
+import torch_npu
except ImportError:
print("Failed to import torch_npu.")
print("Failed to import torch_npu")

from vllm.config import VllmConfig
from vllm.platforms import Platform
@@ -32,7 +32,7 @@ class NPUPlatform(Platform):
_enum = "Ascend"
device_name: str = "npu"
device_type: str = "npu"
-torch_compile_backend: str = "npu"
+simple_compile_backend: str = "npu"
ray_device_key: str = "NPU"
visible_device_name: str = "ASCEND_RT"

@@ -72,19 +72,19 @@ def mem_get_info(cls) -> Tuple[int, int]:
@classmethod
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
# Register ops when setup.
-from vllm_ascend_plugin import ops # noqa: F401
+from vllm_ascend import ops

parallel_config = vllm_config.parallel_config
if parallel_config.worker_cls == "auto":
-parallel_config.worker_cls = "vllm_ascend_plugin.worker.NPUWorker"
+parallel_config.worker_cls = "vllm_ascend.worker.NPUWorker"
cache_config = vllm_config.cache_config
if cache_config and cache_config.block_size is None:
cache_config.block_size = 16

@classmethod
def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
kv_cache_dtype, block_size, use_v1):
return "vllm_ascend_plugin.attention.AscendAttentionBackend"
return "vllm_ascend.attention.AscendAttentionBackend"

@classmethod
def get_current_memory_usage(cls,
@@ -95,4 +95,4 @@ def get_current_memory_usage(cls,

@classmethod
def get_device_communicator_cls(cls) -> str:
return "vllm_ascend_plugin.communicator.NPUCommunicator"
return "vllm_ascend.communicator.NPUCommunicator"
2 changes: 1 addition & 1 deletion vllm_ascend_plugin/worker.py → vllm_ascend/worker.py
@@ -18,7 +18,7 @@
from vllm.worker.worker import Worker
from vllm.worker.worker_base import WorkerBase

-from vllm_ascend_plugin.model_runner import NPUModelRunner
+from vllm_ascend.model_runner import NPUModelRunner


class NPUWorker(Worker):
10 changes: 0 additions & 10 deletions vllm_ascend_plugin/ops/__init__.py

This file was deleted.

23 changes: 0 additions & 23 deletions vllm_ascend_plugin/ops/layernorm.py

This file was deleted.
