
Commit 291000b

fix bug for linear
Signed-off-by: xinhe3 <xinhe3@habana.ai>
1 parent 2406762 commit 291000b

File tree

3 files changed: +110 −41 lines changed


examples/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/run_llm.py (+57 −16)

@@ -1,7 +1,7 @@
 import os
 os.environ["EXPERIMENTAL_WEIGHT_SHARING"] = "False"
 os.environ["USE_GAUDI2_SCALE"] = "True"
-os.environ.pop("USE_GAUDI2_SCALE")  # gaudi2 scale does not work
+os.environ.pop("USE_GAUDI2_SCALE")  # gaudi scale works
 # os.environ["GRAPH_VISUALIZATION"] = "True"
 import shutil
 shutil.rmtree(".graph_dumps", ignore_errors=True)
@@ -14,12 +14,13 @@
 import torch.nn.functional as F
 import deepspeed
 import transformers
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 import habana_frameworks.torch.core as htcore
 import numpy as np
 import lm_eval
 import lm_eval.tasks
 import lm_eval.evaluator
+from accelerate import init_empty_weights, load_checkpoint_and_dispatch


 torch.set_grad_enabled(False)
@@ -110,11 +111,16 @@ def itrex_bootstrap_stderr(f, xs, iters):
             token=None,
         )
     else:
-        user_model = AutoModelForCausalLM.from_pretrained(
-            args.model,
-            device_map='hpu',
-            torch_dtype=model_dtype,
-        )
+        if args.load:
+            config = AutoConfig.from_pretrained(args.model, torch_dtype=model_dtype)
+            with init_empty_weights():
+                user_model = AutoModelForCausalLM.from_config(config)
+        else:
+            user_model = AutoModelForCausalLM.from_pretrained(
+                args.model,
+                device_map='hpu',
+                torch_dtype=model_dtype,
+            )
 elif re.search("chatglm", args.model.lower()):
     from models.modeling_chatglm import ChatGLMForConditionalGeneration
     user_model = ChatGLMForConditionalGeneration.from_pretrained(
@@ -126,13 +132,18 @@ def itrex_bootstrap_stderr(f, xs, iters):
     # print(user_model.transformer.output_layer.weight.dtype)  # always fp16
     user_model.float()  # static fp8 needs float32 for graph compiler
 else:
-    user_model = AutoModelForCausalLM.from_pretrained(
-        args.model,
-        trust_remote_code=args.trust_remote_code,
-        revision=args.revision,
-        device_map='hpu',
-        torch_dtype=model_dtype,
-    )
+    if args.load:
+        config = AutoConfig.from_pretrained(args.model, torch_dtype=model_dtype)
+        with init_empty_weights():
+            user_model = AutoModelForCausalLM.from_config(config)
+    else:
+        user_model = AutoModelForCausalLM.from_pretrained(
+            args.model,
+            trust_remote_code=args.trust_remote_code,
+            revision=args.revision,
+            device_map='hpu',
+            torch_dtype=model_dtype,
+        )

 # tokenizer
 if re.search("baichuan", args.model.lower()):
@@ -219,11 +230,40 @@ def replace_torch_mm_bmm():
     _check_params_as_const(user_model)
     # saving
     user_model.save("saved_results")
-    print(user_model, flush=True)
+    # print(user_model, flush=True)
+    def show_msg():
+        import numpy as np
+        import glob
+        from habana_frameworks.torch.hpu import memory_stats
+        print("Number of HPU graphs:", len(glob.glob(".graph_dumps/*PreGraph*")))
+        mem_stats = memory_stats()
+        mem_dict = {
+            "memory_allocated (GB)": np.round(mem_stats["InUse"] / 1024**3, 2),
+            "max_memory_allocated (GB)": np.round(mem_stats["MaxInUse"] / 1024**3, 2),
+            "total_memory_available (GB)": np.round(mem_stats["Limit"] / 1024**3, 2),
+        }
+        for k, v in mem_dict.items():
+            print("{:35} = {} GB".format(k[:-5].replace("_", " ").capitalize(), v))
+    show_msg()

 if args.load:
+    def show_msg():
+        import numpy as np
+        import glob
+        from habana_frameworks.torch.hpu import memory_stats
+        print("Number of HPU graphs:", len(glob.glob(".graph_dumps/*PreGraph*")))
+        mem_stats = memory_stats()
+        mem_dict = {
+            "memory_allocated (GB)": np.round(mem_stats["InUse"] / 1024**3, 2),
+            "max_memory_allocated (GB)": np.round(mem_stats["MaxInUse"] / 1024**3, 2),
+            "total_memory_available (GB)": np.round(mem_stats["Limit"] / 1024**3, 2),
+        }
+        for k, v in mem_dict.items():
+            print("{:35} = {} GB".format(k[:-5].replace("_", " ").capitalize(), v))
+    show_msg()
     from neural_compressor.torch.quantization import load
     user_model = load(user_model, "saved_results")
+    show_msg()
     # replace torch.matmul and torch.bmm by injection
     def replace_torch_mm_bmm():
         from neural_compressor.torch.amp.fp8.functions import fp8_matmul
@@ -235,7 +275,8 @@ def replace_torch_mm_bmm():
     from habana_frameworks.torch.core.quantization import _check_params_as_const, _mark_params_as_const
     _mark_params_as_const(user_model)  # can reduce memory allocated and speed up
     _check_params_as_const(user_model)
-    print(user_model, flush=True)
+    # print(user_model, flush=True)
+    show_msg()

 if args.to_graph:
     import habana_frameworks.torch.hpu.graphs as htgraphs
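Note on the new args.load branch above: building the model from its AutoConfig inside init_empty_weights() yields a skeleton whose parameters live on the meta device, so no weights are fetched or placed on the HPU until the quantized checkpoint is attached later by neural_compressor's load(). A minimal sketch of that pattern, assuming a placeholder Hugging Face model id ("facebook/opt-125m") and bfloat16 dtype rather than the script's actual arguments:

import torch
from accelerate import init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

# Build only the architecture; every parameter is a meta tensor with no storage.
config = AutoConfig.from_pretrained("facebook/opt-125m", torch_dtype=torch.bfloat16)
with init_empty_weights():
    skeleton = AutoModelForCausalLM.from_config(config)

print(next(skeleton.parameters()).device)  # meta -> nothing allocated on host or HPU yet
# Real (quantized) tensors are attached later, e.g. by
# neural_compressor.torch.quantization.load(skeleton, "saved_results").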

neural_compressor/torch/algorithms/habana_fp8/modules.py (+51 −24)

@@ -212,11 +212,10 @@ class FP8Linear(torch.nn.Module):
     def __init__(self, org_module, dtype) -> None:
         super().__init__()
         # attributes
-        org_module.to("hpu")
-        self.dtype = dtype
-        self.dtype_amax = E4M3_AMAX if self.dtype == torch.float8_e4m3fn else E5M2_AMAX
         self.in_features = org_module.in_features
         self.out_features = org_module.out_features
+        self.dtype = dtype
+        self.dtype_amax = E4M3_AMAX if self.dtype == torch.float8_e4m3fn else E5M2_AMAX
         self.weight_dtype = self.dtype
         self.out_dtype = org_module.weight.dtype
         self.register_buffer(
@@ -228,50 +227,78 @@ def __init__(self, org_module, dtype) -> None:
                 dtype=self.weight_dtype,
             ),
         )
+        if org_module.bias is not None:
+            self.register_buffer(
+                "bias",
+                torch.empty(
+                    self.out_features,
+                    device="hpu",
+                    dtype=self.out_dtype,
+                ),
+            )
+        else:
+            self.bias = None
+        input_scale = _map_guadi2_scale(org_module.scale) if hasattr(org_module, "scale") else torch.tensor(1.0)
         self.register_buffer(
-            "bias",
-            torch.empty(
-                self.out_features,
+            "input_scale",
+            torch.tensor(
+                input_scale,
                 device="hpu",
-                dtype=self.out_dtype,
+                dtype=torch.float32,
             ),
         )
-        scale = org_module.scale if hasattr(org_module, "scale") else 1.0
         self.register_buffer(
-            "scale",
+            "input_scale_inv",
             torch.tensor(
-                scale,
+                torch.reciprocal(input_scale),
                 device="hpu",
                 dtype=torch.float32,
             ),
         )
-
-        self.weight_scale = self.dtype_amax / org_module.weight.data.abs().max()
-        self.weight_scale = _map_guadi2_scale(self.weight_scale)
-        self.weight_scale_inv = torch.reciprocal(self.weight_scale)
-        self.weight.data.copy_(
-            torch.ops.hpu.cast_to_fp8_v2(org_module.weight.data, self.weight_scale, False, False, self.dtype)[0]
+        if not org_module.weight.device.type == "meta":
+            weight_scale = self.dtype_amax / org_module.weight.data.abs().max()
+            weight_scale = _map_guadi2_scale(weight_scale)
+        else:
+            weight_scale = torch.tensor(1.0)
+        self.register_buffer(
+            "weight_scale",
+            torch.tensor(
+                weight_scale,
+                device="hpu",
+                dtype=torch.float32,
+            )
+        )
+        self.register_buffer(
+            "weight_scale_inv",
+            torch.tensor(
+                torch.reciprocal(weight_scale),
+                device="hpu",
+                dtype=torch.float32,
+            ),
         )
+        # copy weight and bias
+        if not org_module.weight.device.type == "meta":
+            org_module.to("hpu")
+            self.weight.data.copy_(
+                torch.ops.hpu.cast_to_fp8_v2(org_module.weight.data, self.weight_scale, False, False, self.dtype)[0]
+            )
+            if org_module.bias is not None:
+                self.bias.data.copy_(org_module.bias.data.type(self.out_dtype))

-        if org_module.bias is not None:
-            self.bias.data.copy_(org_module.bias.data.type(self.out_dtype))
-        else:
-            self.bias = None

     def forward(self, inp):
         assert inp.shape[-1] == self.in_features, "GEMM not possible"
         org_middle_shape = inp.shape[1:-1]
         inp = inp.view((-1, self.in_features))
-        inp = torch.ops.hpu.cast_to_fp8_v2(inp, self.scale, False, False, self.dtype)[0]
-        self.scale_inv = torch.reciprocal(self.scale)
+        inp = torch.ops.hpu.cast_to_fp8_v2(inp, self.input_scale, False, False, self.dtype)[0]
         out = torch.ops.hpu.fp8_gemm_v2(
             inp,
             False,
             self.weight,
             True,
             None,
             self.out_dtype,
-            self.scale_inv,  # inv is used to recover scale
+            self.input_scale_inv,  # inv is used to recover scale
             self.weight_scale_inv,
             self.bias,
             False,
@@ -284,7 +311,7 @@ def extra_repr(self) -> str:
             self.in_features,
             self.out_features,
             self.bias is not None,
-            self.scale,
+            self.input_scale,
             self.dtype,
         )
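Note on the scale bookkeeping in FP8Linear: the per-tensor weight scale maps the weight's absolute maximum onto the FP8 format's largest representable value, and the stored reciprocals (input_scale_inv, weight_scale_inv) are what fp8_gemm_v2 uses to restore the original magnitude after the FP8 GEMM. A CPU-only sketch of that arithmetic, assuming E4M3_AMAX is 240 (treat the exact value as an assumption) and using plain tensor math in place of the HPU cast/GEMM ops:

import torch

E4M3_AMAX = torch.tensor(240.0)  # assumed Gaudi E4M3 maximum; OCP E4M3 tops out at 448

weight = torch.randn(64, 128)
weight_scale = E4M3_AMAX / weight.abs().max()      # stretch |W| to fill the FP8 range
weight_scale_inv = torch.reciprocal(weight_scale)  # stored so the GEMM can undo the scaling

scaled = weight * weight_scale                     # roughly what cast_to_fp8_v2 quantizes
recovered = scaled * weight_scale_inv              # what the inverse scale restores in the GEMM

print(torch.allclose(weight, recovered))           # True (exact here; real FP8 adds rounding error)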

neural_compressor/torch/algorithms/habana_fp8/save_load.py (+2 −1)

@@ -92,7 +92,8 @@ def load(model, output_dir="./saved_results"):
         module = FP8Cast(dtype=dtype)
         set_module(model, op_name, module)
     htcore.mark_step()
-    model.load_state_dict(stat_dict)
+    model.load_state_dict(stat_dict, assign=True)
+    model.to('hpu')
     htcore.mark_step()
     logger.info("Quantized model loading successful.")
     return model
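Note on the assign=True change: with PyTorch 2.1 or newer, load_state_dict(..., assign=True) replaces the module's parameters with the tensors from the state dict instead of copying into existing storage, which is what lets the caller pass in a meta-device skeleton built with init_empty_weights(); the following model.to('hpu') then places everything on the device. A small stand-alone illustration using a plain CPU Linear layer in place of the quantized model:

import torch

# Parameters created under the meta device have no storage to copy into.
with torch.device("meta"):
    lin = torch.nn.Linear(4, 2)

state = {"weight": torch.randn(2, 4), "bias": torch.zeros(2)}

# A copy-based load cannot populate meta tensors; assign=True swaps the
# parameters for the real tensors from the state dict instead.
lin.load_state_dict(state, assign=True)
print(lin.weight.device, lin(torch.ones(1, 4)).shape)  # cpu torch.Size([1, 2])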
