Commit 43d9905

Author: yi
Merge branch 'dev/yi/ds_r1' into dev/mengni/layer
2 parents 3311c48 + 896dca1

File tree: 4 files changed (+30 −15 lines)

scripts/QuantizeDeepSeek.md (+6 −5)

@@ -10,7 +10,7 @@
   - [Exporting Environment Variables](#exporting-environment-variables)
   - [Calibration](#calibration)
 - [Inference with FP8 Models on Two Nodes](#inference-with-fp8-models-on-two-nodes)
-- [Inference with FP8 Models on a Single Node WIP](#inference-with-fp8-models-on-a-single-node-wip)
+- [Inference with FP8 Models on a Single Node](#inference-with-fp8-models-on-a-single-node)
   - [Prerequisites](#prerequisites)
   - [Running the Example](#running-the-example)
 - [Accuracy Evaluation WIP](#accuracy-evaluation-wip)

@@ -63,6 +63,8 @@ For more details, please refer to the <https://github.com/yangulei/vllm-fork/blo
 git clone https://github.com/intel/neural-compressor.git inc
 cd inc
 git checkout dev/ds_r1
+pip install -r requirements.txt
+pip install -r requirements_pt.txt
 python setup.py pt develop
 ```

@@ -147,7 +149,7 @@ export QUANT_CONFIG=inc_quant_with_fp8kv_config.json
 python inc_example_two_nodes.py --mode quant --fp8_kvcache
 ```

-## Inference with FP8 Models on a Single Node (WIP)
+## Inference with FP8 Models on a Single Node

 In this section, we load the BF16 model on DRAM and quantize it to FP8 model using unified measurement results obtained from the two-node calibration.

@@ -156,10 +158,9 @@ In this section, we load the BF16 model on DRAM and quantize it to FP8 model usi
 - Hardware: 1x8G3 or 1x8G2(WIP)
 - Docker: 1.20.0-521

-### Running the Example
-
-Quantize model weights to FP8 and using BF16 KVCache(WIP)
+> [!NOTE] The DRAM requirement can be decreased to less than 1T in a few days.

+### Running the Example

 - BF16 KVCache
 ```bash
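
For orientation: the README steps above drive INC through the JSON config referenced by `QUANT_CONFIG`. Below is a minimal, hypothetical sketch of the measure/quantize flow that a script like `inc_example_two_nodes.py` typically wraps, assuming the upstream INC 3.x PyTorch FP8 API (`FP8Config`, `prepare`, `convert`, `finalize_calibration`); attribute names and behavior on the `dev/ds_r1` branch may differ, and the model here is a tiny stand-in, not DeepSeek.

```python
import torch
from neural_compressor.torch.quantization import (FP8Config, convert,
                                                   finalize_calibration, prepare)

# Hypothetical sketch of the measure -> quantize flow; not the dev/ds_r1 example script.
config = FP8Config.from_json_file("inc_quant_with_fp8kv_config.json")  # config name from the README

model = torch.nn.Linear(16, 16, dtype=torch.bfloat16)  # stand-in for the BF16 DeepSeek model

if config.measure:
    model = prepare(model, config)   # insert observers for calibration
elif config.quantize:
    model = convert(model, config)   # apply FP8 using the saved measurement results

with torch.no_grad():                # run a few batches (calibration or a smoke test)
    model(torch.randn(2, 16, dtype=torch.bfloat16))

if config.measure:
    finalize_calibration(model)      # dump measurement files used later for quantization
```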

vllm/attention/backends/mla/utils.py (+10 −2)

@@ -15,6 +15,7 @@
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                LinearBase, RowParallelLinear,
                                                UnquantizedLinearMethod)
+from vllm.logger import ForkedPdb
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
     CompressedTensorsLinearMethod)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (

@@ -35,7 +36,7 @@ class MLACommonMetadata(AttentionMetadata):
     input_positions: torch.Tensor


-class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
+class MLACommonImpl(MLAAttentionImpl[T], Generic[T], torch.nn.Module):
     """
     Common class for implementing repeated parts

@@ -154,6 +155,9 @@ def __init__(
         kv_b_proj: ColumnParallelLinear,
         o_proj: RowParallelLinear,
     ) -> None:
+        # NOTE: Make `MLACommonImpl` an `nn.Module` and `W_UV_O`, `W_Q_UK`, and `W_UK` `nn.Parameter`s,
+        # so that we can transfer them to the accelerator in case they are initialized on the CPU.
+        torch.nn.Module.__init__(self)
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)

@@ -386,7 +390,11 @@ def get_and_maybe_dequant_weights(layer: LinearBase):
                 self.W_UV_O_scales = W_UV_O_scales.T.contiguous()
             else:
                 self.W_UV_O = W_UV_O.to(act_dtype)
-
+            # NOTE: We need transfer them to the accelerator in case they are initialized on the CPU.
+            self.W_UV_O = torch.nn.Parameter(self.W_UV_O, requires_grad=False)
+            self.W_Q_UK = torch.nn.Parameter(self.W_Q_UK, requires_grad=False)
+            self.W_UK = torch.nn.Parameter(self.W_UK, requires_grad=False)
+            self.W_QR = torch.nn.Parameter(self.W_QR, requires_grad=False)
             self.tp_size = get_tensor_model_parallel_world_size()
         else:
             if is_fp8(weight_dtype):
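
The substance of this change: `MLACommonImpl` now also inherits from `torch.nn.Module`, and the absorbed projection weights (`W_UV_O`, `W_Q_UK`, `W_UK`, `W_QR`) are registered as `nn.Parameter`s, so a later `.to(device)` on the module carries them to the accelerator even when they were materialized on the CPU. A self-contained sketch (toy class, not the vLLM code) of why plain tensor attributes are not enough:

```python
import torch

class AbsorbedWeights(torch.nn.Module):  # toy stand-in, not MLACommonImpl
    def __init__(self, w_plain: torch.Tensor, w_param: torch.Tensor):
        super().__init__()
        # A plain tensor attribute is invisible to Module.to() and state_dict().
        self.W_plain = w_plain
        # A registered Parameter moves together with the module.
        self.W_param = torch.nn.Parameter(w_param, requires_grad=False)

m = AbsorbedWeights(torch.randn(4, 4), torch.randn(4, 4))
device = "cuda" if torch.cuda.is_available() else "cpu"  # "hpu" on Gaudi after importing habana_frameworks.torch
m.to(device)
print(m.W_plain.device)  # stays on cpu: .to() ignored the unregistered tensor
print(m.W_param.device)  # follows the module to the accelerator
```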

vllm/logger.py (+3 −3)

@@ -181,17 +181,17 @@ def show_mem_info(logger=None, msg="", loglevel="info"):
     rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else -1
     if rank == 0:
         show_fn(f"[Rank {rank}] {msg}")
-        show_fn(f"[Rank {rank}] Used HPU memory: {hpu_mem_mb // 1000} GB {hpu_mem_mb % 1000} MB")
         cpu_mem_mb = get_used_cpu_mem_MB()
-        show_fn(f"[Rank {rank}] Used CPU memory: {cpu_mem_mb // 1000} GB {cpu_mem_mb % 1000} MB")
+        show_fn(f"[Rank {rank}] Used HPU: {hpu_mem_mb // 1000} GB {hpu_mem_mb % 1000:.2f} MB; CPU: {cpu_mem_mb // 1000} GB {cpu_mem_mb % 1000:.2f} MB")


 def get_used_hpu_mem_MB():
     """Get HPU used memory: MiB."""
     import torch
     import numpy as np
+    import habana_frameworks.torch as htorch
     from habana_frameworks.torch.hpu import memory_stats
-
+    htorch.core.mark_step()
     torch.hpu.synchronize()
     mem_stats = memory_stats()
     used_hpu_mem = np.round(mem_stats["InUse"] / 1024**2, 3)
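
This change folds HPU and CPU usage into a single log line per call, and `get_used_hpu_mem_MB()` now issues `htorch.core.mark_step()` before synchronizing, presumably so pending lazy-mode work is flushed before `memory_stats()` is read. The CPU-side helper `get_used_cpu_mem_MB()` is not part of this diff; a hypothetical sketch of such a helper, assuming `psutil` is available:

```python
import numpy as np

def get_used_cpu_mem_MB():
    """Hypothetical counterpart to get_used_hpu_mem_MB(): this process's RSS in MiB.
    The real helper in vllm/logger.py is not shown in this diff."""
    import psutil  # assumption: psutil is installed in the environment
    rss_bytes = psutil.Process().memory_info().rss
    return np.round(rss_bytes / 1024**2, 3)

print(get_used_cpu_mem_MB())
```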

vllm/model_executor/models/deepseek_v3.py (+11 −5)

@@ -114,6 +114,7 @@ def __init__(
         self.routed_scaling_factor = config.routed_scaling_factor
         self.n_shared_experts = config.n_shared_experts
         self.routed_scaling_factor = config.routed_scaling_factor
+        self._prefix = prefix
         if self.tp_size > config.n_routed_experts:
             raise ValueError(
                 f"Tensor parallel size {self.tp_size} is greater than "

@@ -164,6 +165,7 @@ def __init__(


     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # show_mem_info(logger, f"{self._prefix}: before gate")
         batch_size, seq_len, hidden_dim = hidden_states.shape
         num_tokens = batch_size * seq_len
         hidden_states = hidden_states.view(-1, hidden_dim)

@@ -172,15 +174,18 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
         hidden_states = hidden_states.reshape(batch_size, seq_len, hidden_dim)
+        # show_mem_info(logger, f"{self._prefix}: shared_output shape {shared_output.shape}, router_logits shape {router_logits.shape}, hidden_states shape {hidden_states.shape}")
+        # show_mem_info(logger, f"{self._prefix}: before experts")
         final_hidden_states = self.experts(
             hidden_states=hidden_states,
             router_logits=router_logits) * self.routed_scaling_factor
+        # show_mem_info(logger, f"{self._prefix}: after experts")
         if shared_output is not None:
             final_hidden_states = final_hidden_states + shared_output
         if self.ep_size == 1 and self.tp_size > 1:
             final_hidden_states = tensor_model_parallel_all_reduce(
                 final_hidden_states)
-
+        # show_mem_info(logger, f"{self._prefix}: before return")
         return final_hidden_states.view(batch_size, seq_len, hidden_dim)


@@ -536,6 +541,7 @@ def __init__(
         # DecoderLayers are created with `make_layers` which passes the prefix
         # with the layer's index.
         layer_idx = int(prefix.split(sep='.')[-1])
+        self._prefix = prefix
         if model_config.use_mla:
             attn_cls = DeepseekV3MLAAttention
         else:

@@ -594,20 +600,20 @@ def forward(
         hidden_states, residual = self.input_layernorm(
             hidden_states, residual)
         # logger.info(f"hidden_states shape : {hidden_states.shape}")
-        # show_mem_info(logger, "DeepseekV3DecoderLayer: before self_attn")
+        # show_mem_info(logger, f"{self._prefix}: before self_attn")
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
             kv_cache=kv_cache,
             attn_metadata=attn_metadata,
         )
-        # show_mem_info(logger, "DeepseekV3DecoderLayer: after self_attn")
-        htorch.core.mark_step()
+        # htorch.core.mark_step()
+        # show_mem_info(logger, f"{self._prefix}: after self_attn")
         # Fully Connected
         hidden_states, residual = self.post_attention_layernorm(
             hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
-        # show_mem_info(logger, "DeepseekV3DecoderLayer: after mlp")
+        # show_mem_info(logger, f"{self._prefix}: after mlp")
         return hidden_states, residual
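
Both the MoE block and the decoder layer now keep the `make_layers` prefix in `self._prefix`, so the (currently commented-out) `show_mem_info` probes can tag memory readings with the exact layer instead of the generic `DeepseekV3DecoderLayer` label; the per-layer `htorch.core.mark_step()` after attention is commented out as well. A small illustration of how that prefix is parsed and used; the prefix value is an assumed example:

```python
# Illustration only: the prefix string is an assumed example of what make_layers passes in.
prefix = "model.layers.12"
layer_idx = int(prefix.split(sep='.')[-1])  # same parsing as in DeepseekV3DecoderLayer.__init__
print(layer_idx)                            # -> 12
# Once the probes are re-enabled, log lines become attributable to a specific layer:
print(f"{prefix}: after self_attn")         # instead of "DeepseekV3DecoderLayer: after self_attn"
```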
