Commit 55809c9

Support mla (vllm-project#775)
1 parent baf04c8 commit 55809c9

7 files changed (+94 -88 lines)

benchmarks/benchmark_serving.py (+1)

@@ -549,6 +549,7 @@ async def benchmark(
         # multi-modal benchmark is only available on OpenAI Chat backend.
         raise ValueError(
             "Multi-modal content is only supported on 'openai-chat' backend.")
+    test_output_len = 10
     test_input = RequestFuncInput(
         model=model_id,
         model_name=model_name,

scripts/run_example_tp.py (+43 -41)

@@ -3,8 +3,8 @@
 import argparse
 import os
 
-#model_path = "/software/data/DeepSeek-R1/"
-model_path = "deepseek-ai/DeepSeek-V2-Lite"
+model_path = "/data/models/DeepSeek-R1/"
+# model_path = "deepseek-ai/DeepSeek-V2-Lite"
 
 # Parse the command-line arguments.
 parser = argparse.ArgumentParser()
@@ -13,51 +13,53 @@
 parser.add_argument("--tokenizer", type=str, default=model_path, help="The model path.")
 #parser.add_argument("--model", type=str, default="/data/models/DeepSeek-R1-bf16-small/", help="The model path.")
 #parser.add_argument("--tokenizer", type=str, default="opensourcerelease/DeepSeek-R1-bf16", help="The model path.")
-parser.add_argument("--tp_size", type=int, default=1, help="The number of threads.")
+parser.add_argument("--tp_size", type=int, default=8, help="The number of threads.")
 args = parser.parse_args()
 
 os.environ["VLLM_SKIP_WARMUP"] = "true"
 os.environ["HABANA_VISIBLE_DEVICES"] = "ALL"
 os.environ["PT_HPU_ENABLE_LAZY_COLLECTIVES"] = "true"
-os.environ["VLLM_RAY_DISABLE_LOG_TO_DRIVER"] = "1"
-os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] = "1"
+# os.environ["VLLM_RAY_DISABLE_LOG_TO_DRIVER"] = "1"
+# os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] = "1"
 os.environ["VLLM_MOE_N_SLICE"] = "8"
+os.environ["VLLM_MLA_DISABLE_REQUANTIZATION"] = "1"
 
+if __name__ == "__main__":
 
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0, max_tokens=50)
-model = args.model
-if args.tp_size == 1:
-    llm = LLM(
-        model=model,
-        tokenizer=args.tokenizer,
-        trust_remote_code=True,
-        dtype="bfloat16",
-        max_model_len=1024,
-    )
-else:
-    llm = LLM(
-        model=model,
-        tokenizer=args.tokenizer,
-        tensor_parallel_size=args.tp_size,
-        distributed_executor_backend='ray',
-        trust_remote_code=True,
-        max_model_len=1024,
-        dtype="bfloat16",
-    )
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    # Create a sampling params object.
+    sampling_params = SamplingParams(temperature=0, max_tokens=50)
+    model = args.model
+    if args.tp_size == 1:
+        llm = LLM(
+            model=model,
+            tokenizer=args.tokenizer,
+            trust_remote_code=True,
+            dtype="bfloat16",
+            max_model_len=1024,
+        )
+    else:
+        llm = LLM(
+            model=model,
+            tokenizer=args.tokenizer,
+            tensor_parallel_size=args.tp_size,
+            distributed_executor_backend='mp',
+            trust_remote_code=True,
+            max_model_len=1024,
+            dtype="bfloat16",
+        )
 
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
+    # Print the outputs.
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

scripts/run_static-online.sh (+6 -6)

@@ -1,6 +1,6 @@
 #!/bin/bash
 tp_parrallel=8
-bs=32
+bs=96
 in_len=1024
 out_len=1024
 multi_step=1
@@ -10,12 +10,13 @@ VLLM_DECODE_BLOCK_BUCKET_MAX=$((total_len * bs / 128 + 128))
 
 # model="/data/models/DeepSeek-R1/"
 # tokenizer="/data/models/DeepSeek-R1/"
-model="/software/data/DeepSeek-R1/"
-tokenizer="/software/data/DeepSeek-R1/"
+model="/data/models/DeepSeek-R1/"
+tokenizer="/data/models/DeepSeek-R1/"
 model_name="DeepSeek-R1"
 
 HABANA_VISIBLE_DEVICES="ALL" \
-VLLM_MOE_N_SLICE=8 \
+VLLM_MOE_N_SLICE=4 \
+VLLM_MLA_DISABLE_REQUANTIZATION=1 \
 PT_HPU_ENABLE_LAZY_COLLECTIVES="true" \
 VLLM_RAY_DISABLE_LOG_TO_DRIVER="1" \
 RAY_IGNORE_UNHANDLED_ERRORS="1" \
@@ -37,7 +38,6 @@ python -m vllm.entrypoints.openai.api_server \
     --use-v2-block-manager \
     --num_scheduler_steps ${multi_step}\
     --max-model-len 2048 \
-    --max-num-batched-tokens 2048 \
     --distributed_executor_backend ray \
     --gpu_memory_utilization 0.9 \
     --trust_remote_code 2>&1 | tee benchmark_logs/serving.log &
@@ -53,7 +53,7 @@ done
 sleep 5s
 echo ${pid}
 
-num_prompts=32
+num_prompts=300
 request_rate=1
 start_time=$(date +%s)
 echo "Start to benchmark"

vllm/attention/backends/hpu_attn.py (+27 -27)

@@ -73,10 +73,9 @@ def get_kv_cache_shape(
         num_blocks: int,
         block_size: int,
         num_kv_heads: int,
-        kv_lora_rank: int,
+        head_size: int,
     ) -> Tuple[int, ...]:
-        k_pe_size = kv_lora_rank // 8
-        return (num_blocks, block_size, kv_lora_rank + k_pe_size), True
+        return (num_blocks, block_size, head_size), (num_blocks, block_size, head_size//9*8)
 
     @staticmethod
     def get_impl_cls() -> Type["HPUAttentionImpl"]:
@@ -137,7 +136,8 @@ def __init__(
         self.matmul_av = Matmul()
         self.batch2block_matmul = Matmul()
         self.block2batch_matmul = Matmul()
-        self.latent_cache = VLLMKVCache()
+        self.latent_cache_k = VLLMKVCache()
+        self.latent_cache_v = VLLMKVCache()
         HPUFusedSDPA = kernels.fsdpa()
         self.fused_scaled_dot_product_attention = None if HPUFusedSDPA is None \
             else ModuleFusedSDPA(HPUFusedSDPA)
@@ -186,9 +186,6 @@ def forward(
             q_pe = torch.matmul(hidden_states_or_q_c, self.W_QR)\
                 .view(-1, self.num_heads, self.qk_rope_head_dim)
             input_positions = attn_metadata.input_positions.view(-1)
-            print("q_pe", q_pe.shape)
-            print("k_pe", k_pe.shape)
-            print("input_positions", attn_metadata.input_positions.shape)
             q_pe, k_pe = \
                 self.rotary_emb(input_positions, q_pe, k_pe)
         else:
@@ -197,9 +194,6 @@ def forward(
 
             q_pe = q[..., self.qk_nope_head_dim:]
 
-            # print("q_pe shape", q_pe.shape)
-            # print("k_pe shape", k_pe.shape)
-            # print("input_positions shape", attn_metadata.input_positions.shape)
             input_positions = attn_metadata.input_positions.view(-1)
             # TODO(lucas): there must be a nicer way to write this line
             q[..., self.qk_nope_head_dim:], k_pe = \
@@ -208,15 +202,29 @@ def forward(
         block_indices = attn_metadata.block_indices
         block_offsets = attn_metadata.block_offsets
 
-        latent_vec = torch.concat(
+        latent_vec_k = torch.concat(
             (k_c_normed, k_pe.view(batch_size, -1, self.qk_rope_head_dim)), dim=-1)
         # assert layer._k_scale == 0, f"got _k_scale={layer._k_scale}"
-        # print(f"layer._k_scale={layer._k_scale}")
+        latent_vec_k = latent_vec_k.view(-1, self.qk_rope_head_dim + self.kv_lora_rank)
+        latent_vec_v = k_c_normed.view(-1, self.kv_lora_rank)
+        if is_prefill:
+            latent_vec_k = latent_vec_k.unflatten(0, (block_indices.size(0), -1))
+            latent_vec_v = latent_vec_v.unflatten(0, (block_indices.size(0), -1))
+            # print("latent_vec", latent_vec.shape)
+
 
         # write the latent and rope to kv cache
-        if kv_cache is not None:
-            kv_cache = self.latent_cache(latent_vec, kv_cache, block_indices,
+        if kv_cache is not None and len(kv_cache) == 2:
+            # print(f"k cache shape: {kv_cache[0].shape}")
+            # print(f"v cache shape: {kv_cache[1].shape}")
+            # print(f"latent vec k shape: {latent_vec_k.shape}")
+            # print(f"latent vec v shape: {latent_vec_v.shape}")
+
+            k_cache = self.latent_cache_k(latent_vec_k, kv_cache[0], block_indices,
                                           block_offsets)
+            v_cache = self.latent_cache_v(latent_vec_v, kv_cache[1], block_indices,
+                                          block_offsets)
+            kv_cache = (k_cache, v_cache)
 
         if is_prefill:
             return self._forward_prefill(q, k_c_normed, k_pe, attn_metadata, batch_size)
@@ -268,20 +276,14 @@ def _forward_decode(
         self,
         q_nope: torch.Tensor,
         q_pe: torch.Tensor,
-        kv_c_and_k_pe_cache: torch.Tensor,
+        kv_cache: torch.Tensor,
         attn_metadata: HPUAttentionMetadata,
         batch_size: int
     ) -> torch.Tensor:
-        print(f"q_nope shape: {q_nope.shape}")
-        print(f"q_pe shape: {q_pe.shape}")
-
         q = torch.cat([q_nope, q_pe], dim=-1)
-        kv_c_and_k_pe_cache = kv_c_and_k_pe_cache.unsqueeze(2)
-        kv_c_cache = kv_c_and_k_pe_cache[..., :self.kv_lora_rank]
+        kv_c_and_k_pe_cache = kv_cache[0].unsqueeze(2)
+        kv_c_cache = kv_cache[1].unsqueeze(2)
 
-        print(f"q shape: {q.shape}")
-        print(f"kv_c_and_k_pe_cache shape: {kv_c_and_k_pe_cache.shape}")
-        print(f"kv_c_cache shape: {kv_c_cache.shape}")
         output = HPUPagedAttention.forward_decode(
             query=q,
             key_cache=kv_c_and_k_pe_cache,
@@ -296,13 +298,11 @@ def _forward_decode(
             matmul_av_op=self.matmul_av,
             batch2block_matmul_op=self.batch2block_matmul,
             block2batch_matmul_op=self.block2batch_matmul,
-            keys_fetch_func=self.latent_cache.fetch_from_cache,
-            values_fetch_func=self.latent_cache.fetch_from_cache)
+            keys_fetch_func=self.latent_cache_k.fetch_from_cache,
+            values_fetch_func=self.latent_cache_v.fetch_from_cache)
         output = output.view(batch_size, 1, -1)
-        print("output", output.shape)
         result = self._v_up_proj_and_o_proj(output)
         result = result.view(batch_size, 1, -1)
-        print("result", result.shape)
         return result
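
Taken together with the worker changes below, this hunk switches the MLA path from a single joint latent cache to a (K, V) pair: K holds the latent plus rope part (head_size wide) and V holds only the latent part (8/9 of head_size). A minimal sketch of that convention, assuming DeepSeek-style dimensions (kv_lora_rank=512, qk_rope_head_dim=64, so head_size=576); the helper name is mine, not the backend's API:

from typing import Tuple

def mla_kv_cache_shapes(num_blocks: int, block_size: int,
                        head_size: int) -> Tuple[Tuple[int, ...], Tuple[int, ...]]:
    # K cache: joint latent + rope vector; V cache: latent only (8/9 of head_size).
    return ((num_blocks, block_size, head_size),
            (num_blocks, block_size, head_size // 9 * 8))

# DeepSeek-style dims: kv_lora_rank=512 + qk_rope_head_dim=64 -> head_size=576.
k_shape, v_shape = mla_kv_cache_shapes(num_blocks=16, block_size=128, head_size=576)
print(k_shape)  # (16, 128, 576)  latent (512) + rope (64)
print(v_shape)  # (16, 128, 512)  latent only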

vllm/worker/cache_engine.py (+2 -1)

@@ -112,7 +112,8 @@ def get_cache_block_size(
         key_cache_block = cache_config.block_size * num_heads * head_size
         # For MLA there is no value cache, since the latent vector
         # is joint keys and values.
-        value_cache_block = key_cache_block if not model_config.use_mla else 0
+        # value_cache_block = key_cache_block if not model_config.use_mla else 0
+        value_cache_block = key_cache_block // 9 * 8
         total = num_attention_layers * (key_cache_block + value_cache_block)
         if cache_config.cache_dtype == "auto":
             dtype = model_config.dtype
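
A rough, illustrative sketch of the per-block accounting after this change. The numbers assume an MLA layer with num_heads=1 and head_size=576 (latent 512 + rope 64, as in the attention backend above); they are not taken from the commit:

# Hypothetical MLA cache-block sizing in elements (dtype factor omitted).
block_size, num_heads, head_size = 128, 1, 576
key_cache_block = block_size * num_heads * head_size   # 73728: latent + rope per block
value_cache_block = key_cache_block // 9 * 8           # 65536: latent-only per block
print(key_cache_block, value_cache_block,
      key_cache_block + value_cache_block)             # 73728 65536 139264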

vllm/worker/hpu_worker.py (+11 -12)

@@ -568,25 +568,24 @@ def _allocate_kv_cache(
             num_blocks, self.block_size, self.num_kv_heads, self.head_size)
 
         use_mla = False
-        if len(kv_cache_shape) == 2 and kv_cache_shape[1]:
+        if len(kv_cache_shape) == 2:
             use_mla = True
-            kv_cache_shape = kv_cache_shape[0]
+            k_cache_shape = kv_cache_shape[0]
+            v_cache_shape = kv_cache_shape[1]
+        else:
+            k_cache_shape = kv_cache_shape
+            v_cache_shape = kv_cache_shape
 
         kv_cache: List[Tuple[torch.Tensor, torch.Tensor]] = []
         dtype = self.dtype
         if device != 'hpu' and not is_fake_hpu() \
                 and self.dtype == torch.float8_e4m3fn:
             dtype = torch.uint8
         for _ in range(self.num_attention_layers):
-            if use_mla:
-                kv_layer = torch.zeros(kv_cache_shape,
-                                       dtype=dtype,
-                                       device=device)
-            else:
-                key_cache = torch.zeros(kv_cache_shape, dtype=dtype, device=device)
-                value_cache = torch.zeros(kv_cache_shape,
-                                          dtype=dtype,
-                                          device=device)
-                kv_layer = (key_cache, value_cache)
+            key_cache = torch.zeros(k_cache_shape, dtype=dtype, device=device)
+            value_cache = torch.zeros(v_cache_shape,
+                                      dtype=dtype,
+                                      device=device)
+            kv_layer = (key_cache, value_cache)
             kv_cache.append(kv_layer)
         return kv_cache
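
A compact sketch of the allocation flow this hunk settles on, using made-up shapes: a pair of shapes from the attention backend signals MLA, and every layer now gets a (key_cache, value_cache) tuple, just with different last dimensions in the MLA case:

import torch

def allocate_layer(kv_cache_shape, dtype=torch.bfloat16, device="cpu"):
    # MLA backends return a (k_shape, v_shape) pair; others return a single
    # shape tuple that is reused for both tensors.
    if len(kv_cache_shape) == 2 and isinstance(kv_cache_shape[0], tuple):
        k_shape, v_shape = kv_cache_shape
    else:
        k_shape = v_shape = kv_cache_shape
    return (torch.zeros(k_shape, dtype=dtype, device=device),
            torch.zeros(v_shape, dtype=dtype, device=device))

# Hypothetical MLA shapes (see the hpu_attn.py hunk above).
k, v = allocate_layer(((16, 128, 576), (16, 128, 512)))
print(k.shape, v.shape)  # torch.Size([16, 128, 576]) torch.Size([16, 128, 512])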

vllm/worker/model_runner_base.py (+4 -1)

@@ -48,7 +48,10 @@ def _init_attn_metadata_from_tensor_dict(
     valid_attn_kwargs = {}
     for field in dataclasses.fields(attn_backend.get_metadata_cls()):
         if field.name in tensor_dict:
-            valid_attn_kwargs[field.name] = tensor_dict.pop(field.name)
+            if field.name == "input_positions":
+                valid_attn_kwargs[field.name] = tensor_dict[field.name]
+            else:
+                valid_attn_kwargs[field.name] = tensor_dict.pop(field.name)
 
     attn_metadata = attn_backend.make_metadata(**valid_attn_kwargs)
     tensor_dict["attn_metadata"] = attn_metadata
