Commit 43d9905

Author: yi
Merge branch 'dev/yi/ds_r1' into dev/mengni/layer
2 parents 3311c48 + 896dca1

File tree: 4 files changed (+30 −15 lines)

scripts/QuantizeDeepSeek.md (+6 −5)

@@ -10,7 +10,7 @@
   - [Exporting Environment Variables](#exporting-environment-variables)
   - [Calibration](#calibration)
 - [Inference with FP8 Models on Two Nodes](#inference-with-fp8-models-on-two-nodes)
-- [Inference with FP8 Models on a Single Node WIP](#inference-with-fp8-models-on-a-single-node-wip)
+- [Inference with FP8 Models on a Single Node](#inference-with-fp8-models-on-a-single-node)
   - [Prerequisites](#prerequisites)
   - [Running the Example](#running-the-example)
 - [Accuracy Evaluation WIP](#accuracy-evaluation-wip)

@@ -63,6 +63,8 @@ For more details, please refer to the <https://github.com/yangulei/vllm-fork/blo
 git clone https://github.com/intel/neural-compressor.git inc
 cd inc
 git checkout dev/ds_r1
+pip install -r requirements.txt
+pip install -r requirements_pt.txt
 python setup.py pt develop
 ```

@@ -147,7 +149,7 @@ export QUANT_CONFIG=inc_quant_with_fp8kv_config.json
 python inc_example_two_nodes.py --mode quant --fp8_kvcache
 ```

-## Inference with FP8 Models on a Single Node (WIP)
+## Inference with FP8 Models on a Single Node

 In this section, we load the BF16 model on DRAM and quantize it to FP8 model using unified measurement results obtained from the two-node calibration.

@@ -156,10 +158,9 @@ In this section, we load the BF16 model on DRAM and quantize it to FP8 model usi
 - Hardware: 1x8G3 or 1x8G2(WIP)
 - Docker: 1.20.0-521

-### Running the Example
-
-Quantize model weights to FP8 and using BF16 KVCache(WIP)
+> [!NOTE] The DRAM requirement can be decreased to less than 1T in a few days.

+### Running the Example

 - BF16 KVCache
 ```bash
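
For orientation: the README steps above drive INC through the JSON config referenced by `QUANT_CONFIG`. Below is a minimal, hypothetical sketch of the measure/quantize flow that a script like `inc_example_two_nodes.py` typically wraps, assuming the upstream INC 3.x PyTorch FP8 API (`FP8Config`, `prepare`, `convert`, `finalize_calibration`); attribute names and behavior on the `dev/ds_r1` branch may differ, and the model here is a tiny stand-in, not DeepSeek.

```python
import torch
from neural_compressor.torch.quantization import (FP8Config, convert,
                                                   finalize_calibration, prepare)

# Hypothetical sketch of the measure -> quantize flow; not the dev/ds_r1 example script.
config = FP8Config.from_json_file("inc_quant_with_fp8kv_config.json")  # config name from the README

model = torch.nn.Linear(16, 16, dtype=torch.bfloat16)  # stand-in for the BF16 DeepSeek model

if config.measure:
    model = prepare(model, config)   # insert observers for calibration
elif config.quantize:
    model = convert(model, config)   # apply FP8 using the saved measurement results

with torch.no_grad():                # run a few batches (calibration or a smoke test)
    model(torch.randn(2, 16, dtype=torch.bfloat16))

if config.measure:
    finalize_calibration(model)      # dump measurement files used later for quantization
```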

vllm/attention/backends/mla/utils.py (+10 −2)

@@ -15,6 +15,7 @@
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                LinearBase, RowParallelLinear,
                                                UnquantizedLinearMethod)
+from vllm.logger import ForkedPdb
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
     CompressedTensorsLinearMethod)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (

@@ -35,7 +36,7 @@ class MLACommonMetadata(AttentionMetadata):
     input_positions: torch.Tensor


-class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
+class MLACommonImpl(MLAAttentionImpl[T], Generic[T], torch.nn.Module):
     """
     Common class for implementing repeated parts

@@ -154,6 +155,9 @@ def __init__(
         kv_b_proj: ColumnParallelLinear,
         o_proj: RowParallelLinear,
     ) -> None:
+        # NOTE: Make `MLACommonImpl` an `nn.Module` and `W_UV_O`, `W_Q_UK`, and `W_UK` `nn.Parameter`s,
+        # so that we can transfer them to the accelerator in case they are initialized on the CPU.
+        torch.nn.Module.__init__(self)
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)

@@ -386,7 +390,11 @@ def get_and_maybe_dequant_weights(layer: LinearBase):
                 self.W_UV_O_scales = W_UV_O_scales.T.contiguous()
             else:
                 self.W_UV_O = W_UV_O.to(act_dtype)
-
+            # NOTE: We need transfer them to the accelerator in case they are initialized on the CPU.
+            self.W_UV_O = torch.nn.Parameter(self.W_UV_O, requires_grad=False)
+            self.W_Q_UK = torch.nn.Parameter(self.W_Q_UK, requires_grad=False)
+            self.W_UK = torch.nn.Parameter(self.W_UK, requires_grad=False)
+            self.W_QR = torch.nn.Parameter(self.W_QR, requires_grad=False)
             self.tp_size = get_tensor_model_parallel_world_size()
         else:
             if is_fp8(weight_dtype):
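
The substance of this change: `MLACommonImpl` now also inherits from `torch.nn.Module`, and the absorbed projection weights (`W_UV_O`, `W_Q_UK`, `W_UK`, `W_QR`) are registered as `nn.Parameter`s, so a later `.to(device)` on the module carries them to the accelerator even when they were materialized on the CPU. A self-contained sketch (toy class, not the vLLM code) of why plain tensor attributes are not enough:

```python
import torch

class AbsorbedWeights(torch.nn.Module):  # toy stand-in, not MLACommonImpl
    def __init__(self, w_plain: torch.Tensor, w_param: torch.Tensor):
        super().__init__()
        # A plain tensor attribute is invisible to Module.to() and state_dict().
        self.W_plain = w_plain
        # A registered Parameter moves together with the module.
        self.W_param = torch.nn.Parameter(w_param, requires_grad=False)

m = AbsorbedWeights(torch.randn(4, 4), torch.randn(4, 4))
device = "cuda" if torch.cuda.is_available() else "cpu"  # "hpu" on Gaudi after importing habana_frameworks.torch
m.to(device)
print(m.W_plain.device)  # stays on cpu: .to() ignored the unregistered tensor
print(m.W_param.device)  # follows the module to the accelerator
```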

vllm/logger.py (+3 −3)

@@ -181,17 +181,17 @@ def show_mem_info(logger=None, msg="", loglevel="info"):
     rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else -1
     if rank == 0:
         show_fn(f"[Rank {rank}] {msg}")
-        show_fn(f"[Rank {rank}] Used HPU memory: {hpu_mem_mb // 1000} GB {hpu_mem_mb % 1000} MB")
         cpu_mem_mb = get_used_cpu_mem_MB()
-        show_fn(f"[Rank {rank}] Used CPU memory: {cpu_mem_mb // 1000} GB {cpu_mem_mb % 1000} MB")
+        show_fn(f"[Rank {rank}] Used HPU: {hpu_mem_mb // 1000} GB {hpu_mem_mb % 1000:.2f} MB; CPU: {cpu_mem_mb // 1000} GB {cpu_mem_mb % 1000:.2f} MB")


 def get_used_hpu_mem_MB():
     """Get HPU used memory: MiB."""
     import torch
     import numpy as np
+    import habana_frameworks.torch as htorch
     from habana_frameworks.torch.hpu import memory_stats
-
+    htorch.core.mark_step()
     torch.hpu.synchronize()
     mem_stats = memory_stats()
     used_hpu_mem = np.round(mem_stats["InUse"] / 1024**2, 3)
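
This change folds HPU and CPU usage into a single log line per call, and `get_used_hpu_mem_MB()` now issues `htorch.core.mark_step()` before synchronizing, presumably so pending lazy-mode work is flushed before `memory_stats()` is read. The CPU-side helper `get_used_cpu_mem_MB()` is not part of this diff; a hypothetical sketch of such a helper, assuming `psutil` is available:

```python
import numpy as np

def get_used_cpu_mem_MB():
    """Hypothetical counterpart to get_used_hpu_mem_MB(): this process's RSS in MiB.
    The real helper in vllm/logger.py is not shown in this diff."""
    import psutil  # assumption: psutil is installed in the environment
    rss_bytes = psutil.Process().memory_info().rss
    return np.round(rss_bytes / 1024**2, 3)

print(get_used_cpu_mem_MB())
```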

vllm/model_executor/models/deepseek_v3.py (+11 −5)

@@ -114,6 +114,7 @@ def __init__(
         self.routed_scaling_factor = config.routed_scaling_factor
         self.n_shared_experts = config.n_shared_experts
         self.routed_scaling_factor = config.routed_scaling_factor
+        self._prefix = prefix
         if self.tp_size > config.n_routed_experts:
             raise ValueError(
                 f"Tensor parallel size {self.tp_size} is greater than "

@@ -164,6 +165,7 @@ def __init__(


     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # show_mem_info(logger, f"{self._prefix}: before gate")
         batch_size, seq_len, hidden_dim = hidden_states.shape
         num_tokens = batch_size * seq_len
         hidden_states = hidden_states.view(-1, hidden_dim)

@@ -172,15 +174,18 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
         hidden_states = hidden_states.reshape(batch_size, seq_len, hidden_dim)
+        # show_mem_info(logger, f"{self._prefix}: shared_output shape {shared_output.shape}, router_logits shape {router_logits.shape}, hidden_states shape {hidden_states.shape}")
+        # show_mem_info(logger, f"{self._prefix}: before experts")
         final_hidden_states = self.experts(
             hidden_states=hidden_states,
             router_logits=router_logits) * self.routed_scaling_factor
+        # show_mem_info(logger, f"{self._prefix}: after experts")
         if shared_output is not None:
             final_hidden_states = final_hidden_states + shared_output
         if self.ep_size == 1 and self.tp_size > 1:
             final_hidden_states = tensor_model_parallel_all_reduce(
                 final_hidden_states)
-
+        # show_mem_info(logger, f"{self._prefix}: before return")
         return final_hidden_states.view(batch_size, seq_len, hidden_dim)


@@ -536,6 +541,7 @@ def __init__(
         # DecoderLayers are created with `make_layers` which passes the prefix
         # with the layer's index.
         layer_idx = int(prefix.split(sep='.')[-1])
+        self._prefix = prefix
         if model_config.use_mla:
             attn_cls = DeepseekV3MLAAttention
         else:

@@ -594,20 +600,20 @@ def forward(
         hidden_states, residual = self.input_layernorm(
             hidden_states, residual)
         # logger.info(f"hidden_states shape : {hidden_states.shape}")
-        # show_mem_info(logger, "DeepseekV3DecoderLayer: before self_attn")
+        # show_mem_info(logger, f"{self._prefix}: before self_attn")
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
             kv_cache=kv_cache,
             attn_metadata=attn_metadata,
         )
-        # show_mem_info(logger, "DeepseekV3DecoderLayer: after self_attn")
-        htorch.core.mark_step()
+        # htorch.core.mark_step()
+        # show_mem_info(logger, f"{self._prefix}: after self_attn")
         # Fully Connected
         hidden_states, residual = self.post_attention_layernorm(
             hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
-        # show_mem_info(logger, "DeepseekV3DecoderLayer: after mlp")
+        # show_mem_info(logger, f"{self._prefix}: after mlp")
         return hidden_states, residual
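
Both the MoE block and the decoder layer now keep the `make_layers` prefix in `self._prefix`, so the (currently commented-out) `show_mem_info` probes can tag memory readings with the exact layer instead of the generic `DeepseekV3DecoderLayer` label; the per-layer `htorch.core.mark_step()` after attention is commented out as well. A small illustration of how that prefix is parsed and used; the prefix value is an assumed example:

```python
# Illustration only: the prefix string is an assumed example of what make_layers passes in.
prefix = "model.layers.12"
layer_idx = int(prefix.split(sep='.')[-1])  # same parsing as in DeepseekV3DecoderLayer.__init__
print(layer_idx)                            # -> 12
# Once the probes are re-enabled, log lines become attributable to a specific layer:
print(f"{prefix}: after self_attn")         # instead of "DeepseekV3DecoderLayer: after self_attn"
```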
