Add qdq eval (#2121)

yiliu30 · Yi4Liu · web-flow · commit 45a2c1af3d11 · 2025-02-12T17:43:10.000+08:00
* add eval

Change-Id: I7ce64ede965976dd79e979aace82f4d251cc6803
Signed-off-by: Yi Liu <yiliu4@habana.ai>

* fix

Change-Id: I72305d9d6ef6e3588bc8361f62baeeca06f42848
Signed-off-by: Yi Liu <yiliu4@habana.ai>

* add float model

Change-Id: Ia46444d77d349b1a976e6d7031d06bb621d6d7e4
Signed-off-by: Yi Liu <yiliu4@habana.ai>

* add prompt

Change-Id: Ie7b35f45d8f67a655dc9fb06eda824eb8a7f56c1
Signed-off-by: Yi Liu <yiliu4@habana.ai>

---------

Signed-off-by: Yi Liu <yiliu4@habana.ai>
Co-authored-by: Yi Liu <yiliu4@habana.ai>
diff --git a/examples/ds/eval.py b/examples/ds/eval.py
@@ -0,0 +1,143 @@
+import os
+import torch
+import tqdm
+from loguru import logger
+import logging
+import safetensors
+from safetensors import safe_open
+from safetensors.torch import save_file
+import json
+
+logging.basicConfig(level=logging.DEBUG)
+torch.set_grad_enabled(False)
+
+# CONSTANTS
+# File-format tag, reused to build SCALE_FILE_NAME below.
+SAFETENSORS = "safetensors"
+# Names for the weight and input scales
+# (FP8QDQLinear stores its weight scale under the attribute "scale_weight").
+WEIGHT_SCALE_NAME = "scale_weight"
+INPUT_SCALE_NAME = "scale_input"
+# NOTE(review): presumably the dtype in which scales are stored; not referenced in this file -- confirm.
+SCALE_DTYPE = torch.bfloat16
+SCALE_FILE_NAME = f"scales.{SAFETENSORS}"
+# Largest finite value representable in float8_e4m3fn.
+FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max
+# Weight backoff factor; see the quoted Habana documentation in the string below.
+WEIGHT_BACKOFF = 0.5
+# NOTE(review): presumably the module types eligible for quantization -- confirm against callers.
+QUANT_MODULE_TYPES = (torch.nn.Linear,)
+# Substrings checked by skip_weight(): a weight whose name contains any of them is skipped.
+# NOTE: this is a set, despite the "LST" suffix.
+SKIP_WEIGHT_LST = {
+    "model.norm",
+    "layernorm",
+    "e_score_correction_bias",
+    # "lm_head.weight",
+    "embed_tokens",
+    "mlp.gate.weight",  # mlp.gate is not linear
+}
+"""
+# https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html?highlight=backoff#supported-json-config-file-options
+Similarly, the maxabs value of a weight is scaled to weight_backoff*FP8_143_FULLSCALE. The default values are input_backoff=0.25 and weight_backoff=0.5.
+"""
+# File name of the checkpoint's safetensors index.
+MODEL_STATE_DICT_MAPPING_FILENAME = "model.safetensors.index.json"
+
+
+def skip_weight(weight_name):
+    """Return True when ``weight_name`` contains any entry of ``SKIP_WEIGHT_LST``.
+
+    Matching is by substring, so e.g. every name containing "layernorm"
+    is reported as skippable.
+
+    Args:
+        weight_name: Name of the weight to check.
+
+    Returns:
+        bool: True if at least one skip pattern occurs in ``weight_name``.
+    """
+    # A generator lets ``any`` stop at the first match instead of building
+    # the complete list of results first.
+    return any(skip_name in weight_name for skip_name in SKIP_WEIGHT_LST)
+
+
+def get_cpu_mem_size_in_gb():
+    """Return the currently available system memory in GiB.
+
+    ``psutil`` is imported inside the function, so it is only required when
+    this helper is actually called.
+
+    Returns:
+        float: Available memory, in units of 1024**3 bytes.
+    """
+    import psutil
+
+    # ``virtual_memory().available`` is reported in bytes; convert it so the
+    # result matches the unit promised by the function name.
+    return psutil.virtual_memory().available / (1024**3)
+
+
+from quant import quant_tensor
+
+
+from torch import nn
+
+
+# Adapted from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/1d044fd82b15f1cedb197a288e50cc96a2c27205/inference/model.py#L91-L108
+class FP8QDQLinear(torch.nn.Linear):
+    """Linear layer that keeps an FP8 weight and emulates FP8 compute via quantize-dequantize.
+
+    The weight is stored as ``float8_e4m3fn`` next to a ``scale_weight``
+    parameter. In ``forward`` the weight is dequantized to bfloat16, the input
+    goes through a quantize -> dequantize round trip, and a regular linear op
+    is run on the two results.
+    """
+
+    # dtype of dequantized tensors and of the weight scale
+    dtype = torch.bfloat16
+    # storage dtype of the quantized weight
+    fp8_dtype = torch.float8_e4m3fn
+
+    def __init__(self, in_features: int, out_features: int, bias: bool = True, device=None):
+        """Create the layer with an uninitialised FP8 weight and a zero weight scale.
+
+        Args:
+            in_features: Size of each input sample.
+            out_features: Size of each output sample.
+            bias: When False, no bias parameter is registered.
+            device: Device for the parameters created here; ``None`` keeps
+                the torch default.
+        """
+        super().__init__(in_features, out_features, bias=bias)
+        self.in_features = in_features
+        self.out_features = out_features
+        # Replace the parameters created by nn.Linear with FP8-specific ones,
+        # honouring the requested device.
+        self.weight = nn.Parameter(
+            torch.empty(out_features, in_features, dtype=FP8QDQLinear.fp8_dtype, device=device),
+            requires_grad=True,
+        )
+        self.scale_weight = nn.Parameter(
+            torch.tensor(0, dtype=FP8QDQLinear.dtype, device=device), requires_grad=False
+        )
+        if bias:
+            self.bias = nn.Parameter(torch.empty(out_features, device=device))
+        else:
+            self.register_parameter("bias", None)
+
+    def dequant_weight_online(self):
+        """Return the weight cast to bfloat16 and multiplied by ``scale_weight``."""
+        fp8_weight = self.weight
+        qdq_weight = fp8_weight.to(FP8QDQLinear.dtype) * self.scale_weight
+        return qdq_weight
+
+    def qdq_input(self, bf16_input: torch.Tensor):
+        """Quantize ``bf16_input`` with ``quant_tensor`` and dequantize it back to bfloat16."""
+        input_scale, input_fp8 = quant_tensor(bf16_input)
+        qdq_input_bf16 = input_fp8.to(FP8QDQLinear.dtype) * input_scale
+        return qdq_input_bf16
+
+    @classmethod
+    def create_from_linear(cls, linear: nn.Linear):
+        """Build a layer shaped like ``linear`` that reuses its weight and bias tensors.
+
+        NOTE(review): the weight data is taken over as-is (no FP8 quantization
+        happens here) and ``scale_weight`` keeps its initial value of 0 --
+        presumably the caller fills in both afterwards; confirm.
+
+        Args:
+            linear: Source layer.
+
+        Returns:
+            FP8QDQLinear: New layer sharing ``linear``'s weight data and bias.
+        """
+        has_bias = linear.bias is not None
+        # Mirror the source layer's bias setting; otherwise a bias-free layer
+        # would end up with an uninitialised bias that forward() adds.
+        qdq_linear = cls(linear.in_features, linear.out_features, bias=has_bias)
+        qdq_linear.weight.data = linear.weight.data
+        if has_bias:
+            qdq_linear.bias = linear.bias
+        return qdq_linear
+
+    def forward(self, bf16_input: torch.Tensor) -> torch.Tensor:
+        """Apply the linear transform to the QDQ'd input using the dequantized weight."""
+        qdq_input = self.qdq_input(bf16_input)
+        qdq_weight = self.dequant_weight_online()
+        out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias)
+        return out
+
+
+def patch_lin():
+    """Replace ``torch.nn.Linear`` with ``FP8QDQLinear`` for the rest of the process.
+
+    A warning is logged first because the swap is global.
+    """
+    logger.warning("Patching torch.nn.Linear to FP8QDQLinear")
+    nn_namespace = torch.nn
+    nn_namespace.Linear = FP8QDQLinear
+
+
+def qdq_eval(model_path, not_patch_lin=False):
+    """Load the model at ``model_path`` and log its generation for a fixed prompt.
+
+    Args:
+        model_path: Passed to ``from_pretrained`` for both the model and the
+            tokenizer.
+        not_patch_lin: When True, ``torch.nn.Linear`` is left untouched;
+            otherwise ``patch_lin()`` swaps it out before the model is built.
+    """
+    import transformers
+    from transformers.modeling_utils import no_init_weights
+    from patch_for_ds import patch_transformers
+
+    use_qdq_linear = not not_patch_lin
+    if use_qdq_linear:
+        patch_lin()
+
+    def _patch__initialize_weights(self, module):
+        # Stand-in for PreTrainedModel._initialize_weights: it only flags the
+        # module as already initialised.
+        print(f"Skipping init_weights ")
+        module._is_hf_initialized = True
+
+    transformers.modeling_utils.PreTrainedModel._initialize_weights = _patch__initialize_weights
+    patch_transformers()
+
+    load_kwargs = dict(
+        torch_dtype="auto",
+        low_cpu_mem_usage=True,
+        trust_remote_code=True,
+    )
+    with no_init_weights():
+        model = transformers.AutoModelForCausalLM.from_pretrained(model_path, **load_kwargs)
+    logger.info(f"Patched model: {model}")
+    model.eval()
+
+    tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
+    prompt = "Hi, who"
+    input_ids = tokenizer.encode(prompt, return_tensors="pt")
+    with torch.no_grad():
+        generated = model.generate(input_ids, max_length=10)
+        output = tokenizer.decode(generated[0], skip_special_tokens=True)
+    logger.info(f"Prompt: {prompt}")
+    logger.info(f"Output: {output}")
+
+
+if __name__ == "__main__":
+    import argparse
+
+    # Command-line entry point: --qmodel_path is mandatory, --not_patch_lin
+    # is an opt-in flag forwarded to qdq_eval().
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--qmodel_path", type=str, required=True)
+    cli.add_argument("--not_patch_lin", action="store_true", help="Measure float model")
+    parsed = cli.parse_args()
+    qdq_eval(parsed.qmodel_path, not_patch_lin=parsed.not_patch_lin)
diff --git a/examples/ds/patch_for_ds.py b/examples/ds/patch_for_ds.py
@@ -1,5 +1,5 @@
 # ==--------------------------------------------------------------------------==
-# Patch for loading DS models
+# Patch for loading DS models from transformers
 from typing import Union, Optional
 import torch
 import os
@@ -101,7 +101,7 @@ def load_state_dict(
                 "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True."
             )
 
-
+#  https://github.com/huggingface/transformers/pull/35493
 def set_initialized_submodules(model, state_dict_keys):
     """
     Sets the `_is_hf_initialized` flag in all submodules of a given model when all its weights are in the loaded state