This repository has been archived by the owner on Feb 18, 2025. It is now read-only.

Commit

merge main
shen-shanshan committed Jan 13, 2025
2 parents 8a9cad2 + 319f283 commit d16a403
Showing 13 changed files with 44 additions and 54 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
@@ -21,7 +21,7 @@ RUN pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm_ascend//vllm/requirements-build.txt
# build vLLM with NPU backend
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="cpu" python3 -m pip install /workspace/vllm_ascend/vllm/
-# install vllm_ascend_plugin
+# install vllm_ascend
RUN python3 -m pip install /workspace/vllm_ascend/

CMD ["/bin/bash"]
2 changes: 1 addition & 1 deletion README.md
@@ -38,7 +38,7 @@ docker exec -it vllm bash

### 1. Prepare CANN env

-Before install vllm_ascend_plugin, you need to install the Ascend CANN Toolkit and Kernels. Please follow the [installation tutorial](https://ascend.github.io/docs/sources/ascend/quick_install.html#id1) or use the following commands for quick installation:
+Before install vllm_ascend, you need to install the Ascend CANN Toolkit and Kernels. Please follow the [installation tutorial](https://ascend.github.io/docs/sources/ascend/quick_install.html#id1) or use the following commands for quick installation:

```bash
# replace the url according to your CANN version and devices
8 changes: 4 additions & 4 deletions setup.py
@@ -27,13 +27,13 @@ def _read_requirements(filename: str) -> List[str]:
try:
requirements = _read_requirements("requirements.txt")
except ValueError:
print("Failed to read requirements.txt in vllm_ascend_plugin.")
print("Failed to read requirements.txt in vllm_ascend.")
return requirements


-setup(name='vllm_ascend_plugin',
+setup(name='vllm_ascend',
version='0.1',
-packages=['vllm_ascend_plugin'],
+packages=['vllm_ascend'],
install_requires=get_requirements(),
extras_require={
"tensorizer": ["tensorizer>=2.9.0"],
@@ -43,5 +43,5 @@ def _read_requirements(filename: str) -> List[str]:
},
entry_points={
'vllm.platform_plugins':
["ascend_plugin = vllm_ascend_plugin:register"]
["ascend_plugin = vllm_ascend:register"]
})
2 changes: 1 addition & 1 deletion vllm_ascend_plugin/__init__.py → vllm_ascend/__init__.py
@@ -1,3 +1,3 @@
def register():
"""Register the NPU platform."""
return "vllm_ascend_plugin.platform.NPUPlatform"
return "vllm_ascend.platform.NPUPlatform"
11 changes: 5 additions & 6 deletions vllm_ascend_plugin/attention.py → vllm_ascend/attention.py
@@ -5,9 +5,9 @@
import torch

try:
-import torch_npu # noqa: F401
+import torch_npu
except ImportError:
print("Failed to import torch_npu.")
print("Failed to import torch_npu")

from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
AttentionMetadata, AttentionType)
@@ -19,7 +19,7 @@
PagedAttentionMetadata)

if TYPE_CHECKING:
-from vllm_ascend_plugin.model_runner import ModelInputForNPUBuilder
+from vllm_ascend.model_runner import ModelInputForNPUBuilder

SHARE_MASK_TRIL_PREFIX_CACHE = None
SHARE_MASK_TRIL = None
@@ -375,16 +375,15 @@ def _add_seq_group(
# TODO(sang): Combine chunked prefill and prefix caching by
# only allowing multiple of block_size chunk size.
# NOTE: This only works for oooooooxxx style attention.
-block_table: List[int] = []
+block_table = []
prefix_cache_hit = any([
inter_data.prefix_cache_hit
for inter_data in self.input_builder.inter_data_list
])
if prefix_cache_hit:
# NOTE(woosuk): For flash-attn, the block table should
# include the entries for the incoming prefill tokens.
-if block_tables is not None:
-    block_table = block_tables[seq_id]
+block_table = block_tables[seq_id]
elif ((chunked_prefill_enabled or not is_prompt)
and block_tables is not None):
if curr_sliding_window_block == 0:
File renamed without changes.
File renamed without changes.
1 change: 1 addition & 0 deletions vllm_ascend/ops/__init__.py
@@ -0,0 +1 @@
import vllm_ascend.ops.layernorm # noqa
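
The new `ops/__init__.py` above registers the custom op purely through an import side effect: importing `vllm_ascend.ops` pulls in `vllm_ascend.ops.layernorm`, whose module-level code patches `RMSNorm` (see the next file). A small illustrative check, assuming both packages are installed — the assertion is not part of the commit:

```python
# Illustrative only: the import itself is what attaches the override.
import vllm_ascend.ops  # runs vllm_ascend/ops/__init__.py -> imports layernorm

from vllm.model_executor.layers.layernorm import RMSNorm
from vllm_ascend.ops import layernorm as npu_layernorm

# After the import, RMSNorm.forward_oot points at the NPU implementation.
assert RMSNorm.forward_oot is npu_layernorm.forward_oot
```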
23 changes: 23 additions & 0 deletions vllm_ascend/ops/layernorm.py
@@ -0,0 +1,23 @@
from typing import Optional, Tuple, Union

import torch

from vllm.model_executor.layers.layernorm import RMSNorm

def forward_oot(
self,
x: torch.Tensor,
residual: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
import torch_npu

if residual is not None:
x, _, residual = torch_npu.npu_add_rms_norm(
x, residual, self.weight, self.variance_epsilon)
return x, residual

x, residual = torch_npu.npu_rms_norm(x, self.weight,
self.variance_epsilon)
return x

RMSNorm.forward_oot = forward_oot
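
For readers without access to `torch_npu`, here is an unfused plain-PyTorch sketch of what the two fused kernels above are expected to compute, based on the standard RMSNorm definition (an assumption for illustration, not code from this commit): `npu_add_rms_norm` folds the residual addition into the normalization and returns the updated residual, while `npu_rms_norm` normalizes `x` alone.

```python
from typing import Optional, Tuple, Union

import torch

def rms_norm_reference(
    x: torch.Tensor,
    weight: torch.Tensor,
    eps: float,
    residual: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    """Unfused reference for the NPU kernels used in forward_oot (sketch)."""
    if residual is not None:
        x = x + residual                      # folded into npu_add_rms_norm
        residual = x                          # updated residual is returned too
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    x = x * torch.rsqrt(variance + eps) * weight
    if residual is not None:
        return x, residual
    return x
```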
14 changes: 7 additions & 7 deletions vllm_ascend_plugin/platform.py → vllm_ascend/platform.py
@@ -4,9 +4,9 @@
import torch

try:
-import torch_npu # noqa: F401
+import torch_npu
except ImportError:
print("Failed to import torch_npu.")
print("Failed to import torch_npu")

from vllm.config import VllmConfig
from vllm.platforms import Platform
@@ -32,7 +32,7 @@ class NPUPlatform(Platform):
_enum = "Ascend"
device_name: str = "npu"
device_type: str = "npu"
-torch_compile_backend: str = "npu"
+simple_compile_backend: str = "npu"
ray_device_key: str = "NPU"
visible_device_name: str = "ASCEND_RT"

@@ -72,19 +72,19 @@ def mem_get_info(cls) -> Tuple[int, int]:
@classmethod
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
# Register ops when setup.
-from vllm_ascend_plugin import ops # noqa: F401
+from vllm_ascend import ops

parallel_config = vllm_config.parallel_config
if parallel_config.worker_cls == "auto":
-parallel_config.worker_cls = "vllm_ascend_plugin.worker.NPUWorker"
+parallel_config.worker_cls = "vllm_ascend.worker.NPUWorker"
cache_config = vllm_config.cache_config
if cache_config and cache_config.block_size is None:
cache_config.block_size = 16

@classmethod
def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
kv_cache_dtype, block_size, use_v1):
return "vllm_ascend_plugin.attention.AscendAttentionBackend"
return "vllm_ascend.attention.AscendAttentionBackend"

@classmethod
def get_current_memory_usage(cls,
@@ -95,4 +95,4 @@ def get_current_memory_usage(cls,

@classmethod
def get_device_communicator_cls(cls) -> str:
return "vllm_ascend_plugin.communicator.NPUCommunicator"
return "vllm_ascend.communicator.NPUCommunicator"
2 changes: 1 addition & 1 deletion vllm_ascend_plugin/worker.py → vllm_ascend/worker.py
@@ -18,7 +18,7 @@
from vllm.worker.worker import Worker
from vllm.worker.worker_base import WorkerBase

-from vllm_ascend_plugin.model_runner import NPUModelRunner
+from vllm_ascend.model_runner import NPUModelRunner


class NPUWorker(Worker):
10 changes: 0 additions & 10 deletions vllm_ascend_plugin/ops/__init__.py

This file was deleted.

23 changes: 0 additions & 23 deletions vllm_ascend_plugin/ops/layernorm.py

This file was deleted.
