diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml new file mode 100644 index 0000000..0226cf0 --- /dev/null +++ b/.github/workflows/actionlint.yml @@ -0,0 +1,40 @@ +name: Lint GitHub Actions workflows +on: + push: + branches: + - "main" + paths: + - '.github/workflows/*.ya?ml' + - '.github/workflows/actionlint.*' + - '.github/workflows/matchers/actionlint.json' + pull_request: + branches: + - "main" + paths: + - '.github/workflows/*.ya?ml' + - '.github/workflows/actionlint.*' + - '.github/workflows/matchers/actionlint.json' + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + actionlint: + runs-on: ubuntu-latest + steps: + - name: "Checkout" + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + + - name: "Run actionlint" + run: | + echo "::add-matcher::.github/workflows/matchers/actionlint.json" + tools/actionlint.sh -color diff --git a/.github/workflows/matchers/actionlint.json b/.github/workflows/matchers/actionlint.json new file mode 100644 index 0000000..4613e16 --- /dev/null +++ b/.github/workflows/matchers/actionlint.json @@ -0,0 +1,17 @@ +{ + "problemMatcher": [ + { + "owner": "actionlint", + "pattern": [ + { + "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$", + "file": 1, + "line": 2, + "column": 3, + "message": 4, + "code": 5 + } + ] + } + ] +} diff --git a/.github/workflows/matchers/mypy.json b/.github/workflows/matchers/mypy.json new file mode 100644 index 0000000..f048fce --- /dev/null +++ b/.github/workflows/matchers/mypy.json @@ -0,0 +1,16 @@ +{ + "problemMatcher": [ + { + "owner": "mypy", + "pattern": [ + { + "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$", + "file": 1, + "line": 2, + "severity": 3, + "message": 4 + } + ] + } + ] +} diff --git a/.github/workflows/matchers/ruff.json b/.github/workflows/matchers/ruff.json new file mode 100644 index 0000000..f6d4479 --- /dev/null +++ b/.github/workflows/matchers/ruff.json @@ -0,0 +1,17 @@ +{ + "problemMatcher": [ + { + "owner": "ruff", + "pattern": [ + { + "regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$", + "file": 1, + "line": 2, + "column": 3, + "code": 4, + "message": 5 + } + ] + } + ] + } diff --git a/.gitignore b/.gitignore index adcb690..8da361f 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,12 @@ __pycache__/ # egg-info vllm_ascend.egg-info/ + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Linting +actionlint +shellcheck*/ diff --git a/README.md b/README.md index 81d13bf..b06806a 100644 --- a/README.md +++ b/README.md @@ -109,6 +109,3 @@ bash format.sh # 3. Commit changed files with message 'Run yapf and ruff' git commit -m "Run yapf and ruff" ``` - -> [!NOTE] -> The warning "F401 `torch_npu` imported but unused" doesn't matter because the api `torch.npu` will call this library indirectly while there are not `torch_npu` in the codes explicitly. 
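The actionlint problem matcher above has to strip ANSI color escapes, since the workflow invokes `tools/actionlint.sh -color`. Below is a minimal sketch of how that regex parses one diagnostic line; the sample line and its exact color codes are hypothetical (actionlint may color its output differently), but the pattern is the one from matchers/actionlint.json, written as a raw Python string instead of JSON's double-escaped form. The mypy and ruff matchers added alongside it follow the same idea with much simpler patterns, presumably because those tools are run without colored output.

    import re

    # Same pattern as .github/workflows/matchers/actionlint.json.
    ACTIONLINT_RE = re.compile(
        r"^(?:\x1b\[\d+m)?(.+?)(?:\x1b\[\d+m)*:(?:\x1b\[\d+m)*(\d+)(?:\x1b\[\d+m)*:"
        r"(?:\x1b\[\d+m)*(\d+)(?:\x1b\[\d+m)*: (?:\x1b\[\d+m)*(.+?)(?:\x1b\[\d+m)*"
        r" \[(.+?)\]$")

    # Hypothetical diagnostic in the shape `actionlint -color` prints:
    # <file>:<line>:<col>: <message> [<rule>], wrapped in ANSI escapes.
    sample = ("\x1b[36m.github/workflows/actionlint.yml\x1b[0m:"
              "\x1b[32m7\x1b[0m:\x1b[32m11\x1b[0m: "
              "\x1b[1mproperty \"foo\" is not defined\x1b[0m [expression]")

    match = ACTIONLINT_RE.match(sample)
    assert match is not None
    # Capture groups 1-5 map to file, line, column, message and code,
    # exactly as declared in the JSON matcher.
    path, line, col, message, code = match.groups()
    print(f"{path}:{line}:{col}: {message} [{code}]")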
diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference_audio_language.py index a0b7aff..57fa98a 100644 --- a/examples/offline_inference_audio_language.py +++ b/examples/offline_inference_audio_language.py @@ -7,7 +7,6 @@ """ from transformers import AutoTokenizer - from vllm import LLM, SamplingParams from vllm.assets.audio import AudioAsset from vllm.utils import FlexibleArgumentParser diff --git a/format.sh b/format.sh index 5ffe784..11e6240 100755 --- a/format.sh +++ b/format.sh @@ -43,7 +43,7 @@ ISORT_VERSION=$(isort --vn) CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}') SPHINX_LINT_VERSION=$(sphinx-lint --version | awk '{print $2}') -# # params: tool name, tool version, required version +# params: tool name, tool version, required version tool_version_check() { expected=$(grep "$1" requirements-lint.txt | cut -d'=' -f3) if [[ "$2" != "$expected" ]]; then @@ -112,9 +112,9 @@ fi echo 'vLLM yapf: Done' # Run mypy -# echo 'vLLM mypy:' -# tools/mypy.sh -# echo 'vLLM mypy: Done' +echo 'vLLM mypy:' +tools/mypy.sh +echo 'vLLM mypy: Done' # If git diff returns a file that is in the skip list, the file may be checked anyway: @@ -316,6 +316,6 @@ else echo "✨🎉 Format check passed! Congratulations! 🎉✨" fi -echo 'vLLM sphinx-lint:' -tools/sphinx-lint.sh -echo 'vLLM sphinx-lint: Done' +# echo 'vLLM sphinx-lint:' +# tools/sphinx-lint.sh +# echo 'vLLM sphinx-lint: Done' diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 0000000..fe0fd66 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,10 @@ +[mypy] +; warn_return_any = True +warn_unused_configs = True + +; Suppress all missing import errors from torch_npu for mypy. +[mypy-torch_npu.*] +ignore_missing_imports = True + +[mypy-transformers.*] +ignore_missing_imports = True diff --git a/setup.py b/setup.py index d34c5ac..59fdd41 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,6 @@ import os -from typing import Dict, List +from typing import List + from setuptools import setup ROOT_DIR = os.path.dirname(__file__) @@ -8,6 +9,7 @@ def get_path(*filepath) -> str: return os.path.join(ROOT_DIR, *filepath) + def get_requirements() -> List[str]: """Get Python package dependencies from requirements.txt.""" @@ -23,7 +25,7 @@ def _read_requirements(filename: str) -> List[str]: else: resolved_requirements.append(line) return resolved_requirements - + try: requirements = _read_requirements("requirements.txt") except ValueError: @@ -31,17 +33,17 @@ def _read_requirements(filename: str) -> List[str]: return requirements -setup(name='vllm_ascend', - version='0.1', - packages=['vllm_ascend'], - install_requires=get_requirements(), - extras_require={ +setup( + name='vllm_ascend', + version='0.1', + packages=['vllm_ascend'], + install_requires=get_requirements(), + extras_require={ "tensorizer": ["tensorizer>=2.9.0"], "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"], "audio": ["librosa", "soundfile"], # Required for audio processing "video": ["decord"] # Required for video processing - }, - entry_points={ - 'vllm.platform_plugins': - ["ascend_plugin = vllm_ascend:register"] - }) + }, + entry_points={ + 'vllm.platform_plugins': ["ascend_plugin = vllm_ascend:register"] + }) diff --git a/tools/actionlint.sh b/tools/actionlint.sh new file mode 100755 index 0000000..f6a8b5e --- /dev/null +++ b/tools/actionlint.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +if command -v actionlint &> /dev/null; then + actionlint "$@" + exit 0 +elif [ -x ./actionlint ]; then + ./actionlint "$@" + exit 0 +fi + +# download a binary to the 
current directory - v1.7.3 +bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash) +./actionlint "$@" diff --git a/tools/check_repo.sh b/tools/check_repo.sh new file mode 100644 index 0000000..48eba5b --- /dev/null +++ b/tools/check_repo.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# Checks whether the repo is clean and whether tags are available (necessary to correctly produce vllm version at build time) + +if ! git diff --quiet; then + echo "Repo is dirty" >&2 + + exit 1 +fi + +if ! git describe --tags; then + echo "No tags are present. Is this a shallow clone? git fetch --unshallow --tags" >&2 + + exit 1 +fi diff --git a/tools/mypy.sh b/tools/mypy.sh new file mode 100755 index 0000000..5c789fa --- /dev/null +++ b/tools/mypy.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +CI=${1:-0} +PYTHON_VERSION=${2:-3.9} + +if [ "$CI" -eq 1 ]; then + set -e +fi + +run_mypy() { + echo "Running mypy on $1" + if [ "$CI" -eq 1 ] && [ -z "$1" ]; then + mypy --python-version "${PYTHON_VERSION}" "$@" + return + fi + mypy --follow-imports skip --python-version "${PYTHON_VERSION}" "$@" +} + +run_mypy # Note that this is less strict than CI +run_mypy vllm_ascend +run_mypy examples diff --git a/tools/png-lint.sh b/tools/png-lint.sh new file mode 100755 index 0000000..a80fe98 --- /dev/null +++ b/tools/png-lint.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# Ensure that *.excalidraw.png files have the excalidraw metadata +# embedded in them. This ensures they can be loaded back into +# the tool and edited in the future. + +find . -iname '*.excalidraw.png' | while read -r file; do + if git check-ignore -q "$file"; then + continue + fi + if ! grep -q "excalidraw+json" "$file"; then + echo "$file was not exported from excalidraw with 'Embed Scene' enabled." + exit 1 + fi +done diff --git a/tools/shellcheck.sh b/tools/shellcheck.sh new file mode 100755 index 0000000..bd46f7c --- /dev/null +++ b/tools/shellcheck.sh @@ -0,0 +1,24 @@ +#!/bin/bash +set -e + +scversion="stable" + +if [ -d "shellcheck-${scversion}" ]; then + PATH="$PATH:$(pwd)/shellcheck-${scversion}" + export PATH +fi + +if ! [ -x "$(command -v shellcheck)" ]; then + if [ "$(uname -s)" != "Linux" ] || [ "$(uname -m)" != "x86_64" ]; then + echo "Please install shellcheck: https://github.com/koalaman/shellcheck?tab=readme-ov-file#installing" + exit 1 + fi + + # automatic local install if linux x86_64 + wget -qO- "https://github.com/koalaman/shellcheck/releases/download/${scversion?}/shellcheck-${scversion?}.linux.x86_64.tar.xz" | tar -xJv + PATH="$PATH:$(pwd)/shellcheck-${scversion}" + export PATH +fi + +# TODO - fix warnings in .buildkite/run-amd-test.sh +find . 
-name "*.sh" -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck "{}"' diff --git a/tools/sphinx-lint.sh b/tools/sphinx-lint.sh new file mode 100755 index 0000000..04f8075 --- /dev/null +++ b/tools/sphinx-lint.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +sphinx-lint --disable trailing-whitespace,missing-final-newline docs diff --git a/vllm_ascend/attention.py b/vllm_ascend/attention.py index 2c0ab1f..199130f 100644 --- a/vllm_ascend/attention.py +++ b/vllm_ascend/attention.py @@ -5,9 +5,9 @@ import torch try: - import torch_npu + import torch_npu # noqa: F401 except ImportError: - print("Failed to import torch_npu") + print("Failed to import torch_npu.") from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) @@ -375,7 +375,7 @@ def _add_seq_group( # TODO(sang): Combine chunked prefill and prefix caching by # only allowing multiple of block_size chunk size. # NOTE: This only works for oooooooxxx style attention. - block_table = [] + block_table: List[int] = [] prefix_cache_hit = any([ inter_data.prefix_cache_hit for inter_data in self.input_builder.inter_data_list @@ -383,7 +383,8 @@ def _add_seq_group( if prefix_cache_hit: # NOTE(woosuk): For flash-attn, the block table should # include the entries for the incoming prefill tokens. - block_table = block_tables[seq_id] + if block_tables is not None: + block_table = block_tables[seq_id] elif ((chunked_prefill_enabled or not is_prompt) and block_tables is not None): if curr_sliding_window_block == 0: diff --git a/vllm_ascend/model_runner.py b/vllm_ascend/model_runner.py index 7ffaa00..b1dfb54 100644 --- a/vllm_ascend/model_runner.py +++ b/vllm_ascend/model_runner.py @@ -3,13 +3,12 @@ import torch import torch.distributed - from vllm.distributed import get_pp_group from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest from vllm.model_executor import SamplingMetadata -from vllm.multimodal import MultiModalKwargs +from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderMap from vllm.platforms import current_platform from vllm.prompt_adapter.layers import PromptAdapterMapping from vllm.prompt_adapter.request import PromptAdapterRequest @@ -214,6 +213,218 @@ def build(self) -> ModelInputForGPU: prompt_adapter_mapping=prompt_adapter_mapping, prompt_adapter_requests=prompt_adapter_requests) + class InterDataForSeqGroup: + """Intermediate data for the current sequence group.""" + + def simple_reinit(self): + self.input_tokens[0].clear() # type: ignore + self.input_positions[0].clear() # type: ignore + self.token_types[0].clear() # type: ignore + self.mrope_input_positions = None # type: ignore + self.seq_lens[0] = 0 # type: ignore + self.orig_seq_lens[0] = 0 # type: ignore + self.query_lens[0] = 0 # type: ignore + self.context_lens[0] = 0 # type: ignore + self.curr_sliding_window_blocks[0] = 0 # type: ignore + self.lora_index_mapping.clear() # type: ignore + self.lora_prompt_mapping.clear() # type: ignore + self.lora_requests.clear() # type: ignore + self.prompt_adapter_index_mapping.clear() # type: ignore + self.prompt_adapter_prompt_mapping.clear() # type: ignore + + def __init__( + self, + *, + # From sequence group metadata. + request_id: str, + seq_ids: List[int], + is_prompt: bool, + block_tables: Optional[Dict[int, List[int]]], + computed_block_nums: List[int], + n_seqs: int = 0, + + # Input tokens and positions. 
+ input_tokens: Optional[List[List[int]]] = None, + input_positions: Optional[List[List[int]]] = None, + token_types: Optional[List[List[int]]] = None, + mrope_input_positions: Optional[List[List[List[int]]]] = None, + + # The sequence length (may be capped to the sliding window). + seq_lens: Optional[List[int]] = None, + # The original sequence length (before applying sliding window). + # This is used to compute slot mapping. + orig_seq_lens: Optional[List[int]] = None, + # The query length. + query_lens: Optional[List[int]] = None, + # The number of tokens that are already computed. + context_lens: Optional[List[int]] = None, + # The current sliding window block. + curr_sliding_window_blocks: Optional[List[int]] = None, + + # LoRA inputs. + lora_index_mapping: Optional[List[List[int]]] = None, + lora_prompt_mapping: Optional[List[List[int]]] = None, + lora_requests: Optional[Set[LoRARequest]] = None, + + # Prompt adapter inputs. + prompt_adapter_index_mapping: Optional[List[int]] = None, + prompt_adapter_prompt_mapping: Optional[List[int]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + + # Multi-modal inputs. + multi_modal_kwargs: Optional[MultiModalKwargs] = None, + multi_modal_placeholder_maps: Optional[Dict[ + str, MultiModalPlaceholderMap]] = None, + + # Whether the prefix cache is hit (prefill only). + prefix_cache_hit: bool = False, + reinit: bool = False, + reinit_use_defaults: bool = False, + encoder_seq_len: int = 0, + ): + if reinit: + assert len(self.seq_ids) == len(seq_ids) # type: ignore + for i, seq_id in enumerate(seq_ids): + self.seq_ids[i] = seq_id # type: ignore + else: + self.seq_ids = seq_ids + + self.request_id = request_id + self.is_prompt = is_prompt + self.block_tables = block_tables + self.computed_block_nums = computed_block_nums + self.n_seqs = n_seqs + self.encoder_seq_len = encoder_seq_len + + if reinit: + if len(self.seq_ids) == 1 and reinit_use_defaults: + self.simple_reinit() + else: + if input_tokens: + self.input_tokens = input_tokens + else: + for seq_id in range(len(self.seq_ids)): + self.input_tokens[seq_id].clear() + + if input_positions: + self.input_positions = input_positions + else: + for seq_id in range(len(self.seq_ids)): + self.input_positions[seq_id].clear() + + if token_types: + self.token_types = token_types + else: + for seq_id in range(len(self.seq_ids)): + self.token_types[seq_id].clear() + + self.mrope_input_positions = None + + if seq_lens: + self.seq_lens = seq_lens + else: + for seq_id in range(len(self.seq_ids)): + self.seq_lens[seq_id] = 0 + + if orig_seq_lens: + self.orig_seq_lens = orig_seq_lens + else: + for seq_id in range(len(self.seq_ids)): + self.orig_seq_lens[seq_id] = 0 + + if query_lens: + self.query_lens = query_lens + else: + for seq_id in range(len(self.seq_ids)): + self.query_lens[seq_id] = 0 + + if context_lens: + self.context_lens = context_lens + else: + for seq_id in range(len(self.seq_ids)): + self.context_lens[seq_id] = 0 + + if curr_sliding_window_blocks: + self.curr_sliding_window_blocks = \ + curr_sliding_window_blocks + else: + for seq_id in range(len(self.seq_ids)): + self.curr_sliding_window_blocks[seq_id] = 0 + + if lora_index_mapping: + self.lora_index_mapping = lora_index_mapping + else: + self.lora_index_mapping.clear() + + if lora_prompt_mapping: + self.lora_prompt_mapping = lora_prompt_mapping + else: + self.lora_prompt_mapping.clear() + + if lora_requests: + self.lora_requests = lora_requests + else: + self.lora_requests.clear() + + if prompt_adapter_index_mapping: + 
self.prompt_adapter_index_mapping = \ + prompt_adapter_index_mapping + else: + self.prompt_adapter_index_mapping.clear() + + if prompt_adapter_prompt_mapping: + self.prompt_adapter_prompt_mapping = \ + prompt_adapter_prompt_mapping + else: + self.prompt_adapter_prompt_mapping.clear() + + else: + self.input_tokens = input_tokens or [] + self.input_positions = input_positions or [] + self.token_types = token_types or [] + self.mrope_input_positions = mrope_input_positions or None + self.seq_lens = seq_lens or [] + self.orig_seq_lens = orig_seq_lens or [] + self.query_lens = query_lens or [] + self.context_lens = context_lens or [] + self.curr_sliding_window_blocks = \ + curr_sliding_window_blocks or [] + + self.lora_index_mapping = lora_index_mapping or [] + self.lora_prompt_mapping = lora_prompt_mapping or [] + self.lora_requests = lora_requests or set() + + self.prompt_adapter_index_mapping = ( + prompt_adapter_index_mapping or []) + self.prompt_adapter_prompt_mapping = ( + prompt_adapter_prompt_mapping or []) + + self.prompt_adapter_request = prompt_adapter_request + self.multi_modal_kwargs = multi_modal_kwargs + self.multi_modal_placeholder_maps = multi_modal_placeholder_maps + self.prefix_cache_hit = prefix_cache_hit + + self.n_seqs = len(self.seq_ids) + + if not reinit: + self.__post_init__() + + def __post_init__(self): + self.n_seqs = len(self.seq_ids) + + self.input_tokens = [[] for _ in range(self.n_seqs)] + self.input_positions = [[] for _ in range(self.n_seqs)] + self.token_types = [[] for _ in range(self.n_seqs)] + self.mrope_input_positions = None + self.seq_lens = [0] * self.n_seqs + self.orig_seq_lens = [0] * self.n_seqs + self.query_lens = [0] * self.n_seqs + self.context_lens = [0] * self.n_seqs + self.curr_sliding_window_blocks = [0] * self.n_seqs + + self.lora_index_mapping = [] + self.lora_prompt_mapping = [] + class NPUModelRunner(ModelRunner): """ @@ -375,7 +586,7 @@ def prepare_model_input( self.sampling_metadata_cache, # TODO (cmq): enable this after supported in vllm # pad_for_invariant_seq_len=True, - ) + ) else: sampling_metadata = None is_prompt = (seq_group_metadata_list[0].is_prompt diff --git a/vllm_ascend/ops/__init__.py b/vllm_ascend/ops/__init__.py index 7162ac1..424da25 100644 --- a/vllm_ascend/ops/__init__.py +++ b/vllm_ascend/ops/__init__.py @@ -1 +1 @@ -import vllm_ascend.ops.layernorm # noqa +import vllm_ascend.ops.layernorm # noqa diff --git a/vllm_ascend/ops/layernorm.py b/vllm_ascend/ops/layernorm.py index 30310e6..a03fb43 100644 --- a/vllm_ascend/ops/layernorm.py +++ b/vllm_ascend/ops/layernorm.py @@ -1,9 +1,9 @@ from typing import Optional, Tuple, Union import torch - from vllm.model_executor.layers.layernorm import RMSNorm + def forward_oot( self, x: torch.Tensor, @@ -12,12 +12,12 @@ def forward_oot( import torch_npu if residual is not None: - x, _, residual = torch_npu.npu_add_rms_norm( - x, residual, self.weight, self.variance_epsilon) + x, _, residual = torch_npu.npu_add_rms_norm(x, residual, self.weight, + self.variance_epsilon) return x, residual - x, residual = torch_npu.npu_rms_norm(x, self.weight, - self.variance_epsilon) + x, residual = torch_npu.npu_rms_norm(x, self.weight, self.variance_epsilon) return x + RMSNorm.forward_oot = forward_oot diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 6ed8bd6..41a23a3 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -4,9 +4,9 @@ import torch try: - import torch_npu + import torch_npu # noqa: F401 except ImportError: - print("Failed to import torch_npu") + 
print("Failed to import torch_npu.") from vllm.config import VllmConfig from vllm.platforms import Platform @@ -72,7 +72,7 @@ def mem_get_info(cls) -> Tuple[int, int]: @classmethod def check_and_update_config(cls, vllm_config: VllmConfig) -> None: # Register ops when setup. - from vllm_ascend import ops + from vllm_ascend import ops # noqa: F401 parallel_config = vllm_config.parallel_config if parallel_config.worker_cls == "auto":