
Commit f9c14b2

design and implement save&load

Signed-off-by: xinhe3 <xinhe3@habana.ai>
1 parent e40fa02 · commit f9c14b2

File tree: 14 files changed (+1900 -131 lines)

examples/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/run_llm.py (+5 -5)

@@ -166,7 +166,7 @@ def itrex_bootstrap_stderr(f, xs, iters):
 
 if args.approach in ["dynamic", "static"]:
     print("device:", next(user_model.parameters()).device)
-    from neural_compressor.torch.quantization.config import FP8QConfig, get_default_fp8_qconfig
+    from neural_compressor.torch.quantization.config import FP8Config, get_default_fp8_config
     from neural_compressor.torch.algorithms.habana_fp8 import quantize_dynamic
     from neural_compressor.torch.quantization import quantize
     if args.precision == "fp8_e4m3":
@@ -175,15 +175,15 @@ def itrex_bootstrap_stderr(f, xs, iters):
         dtype = torch.float8_e5m2
     if args.approach == "dynamic":
         #user_model = quantize_dynamic(user_model, dtype, inplace=True)
-        qconfig = FP8QConfig(weight_dtype=dtype, act_dtype=dtype, approach="dynamic")
+        qconfig = FP8Config(weight_dtype=dtype, act_dtype=dtype, approach="dynamic")
         if args.skip_lm_head:
-            fp32_config = FP8QConfig(weight_dtype=torch.float32, act_dtype=torch.float32)
+            fp32_config = FP8Config(weight_dtype=torch.float32, act_dtype=torch.float32)
            qconfig.set_local("lm_head", fp32_config)
         user_model = quantize_dynamic(user_model, qconfig, inplace=True)
     elif args.approach == "static":
-        qconfig = FP8QConfig(weight_dtype=dtype, act_dtype=dtype, approach="static")
+        qconfig = FP8Config(weight_dtype=dtype, act_dtype=dtype, approach="static")
         if args.skip_lm_head:
-            fp32_config = FP8QConfig(weight_dtype=torch.float32, act_dtype=torch.float32)
+            fp32_config = FP8Config(weight_dtype=torch.float32, act_dtype=torch.float32)
             qconfig.set_local("lm_head", fp32_config)
         # dataset
         from datasets import load_dataset
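For reference, a minimal usage sketch of the renamed config class, based only on the calls visible in this hunk; user_model stands for any torch.nn.Module already placed on the HPU device and is a placeholder here:

import torch
from neural_compressor.torch.quantization.config import FP8Config
from neural_compressor.torch.algorithms.habana_fp8 import quantize_dynamic

# dynamic FP8 quantization with lm_head kept in fp32, mirroring the run_llm.py flow above
qconfig = FP8Config(weight_dtype=torch.float8_e4m3fn, act_dtype=torch.float8_e4m3fn, approach="dynamic")
fp32_config = FP8Config(weight_dtype=torch.float32, act_dtype=torch.float32)
qconfig.set_local("lm_head", fp32_config)
user_model = quantize_dynamic(user_model, qconfig, inplace=True)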

neural_compressor/torch/algorithms/habana_fp8/fp8_quant.py (+28 -12)

@@ -17,6 +17,7 @@
 import os
 
 import habana_frameworks.torch.core as htcore
+from habana_frameworks.torch.core.quantization import _check_params_as_const, _mark_params_as_const
 import torch
 from deepspeed.module_inject import LinearAllreduce, LinearLayer
 from deepspeed.module_inject.layers import LmHeadLinearAllreduce
@@ -40,6 +41,10 @@
     FP8LinearLayer,
     FP8LmHeadLinearAllreduce,
     FP8Matmul,
+    # dtype amax
+    E4M3_AMAX,
+    E5M2_AMAX,
+    _map_guadi2_scale,
 )
 
 quantization_mapping = {
@@ -55,20 +60,20 @@
 white_list = tuple(quantization_mapping.keys())
 
 
-# without scale factor 0.9, the output will be abnormal.
-E4M3_AMAX = torch.tensor(240 * 0.9, dtype=torch.float).to("hpu")
-E5M2_AMAX = torch.tensor(57344 * 0.9, dtype=torch.float).to("hpu")
-FP8_DTYPE = [torch.float8_e5m2, torch.float8_e4m3fn]
+FP8_DTYPE = [torch.float8_e5m2, torch.float8_e4m3fn, "fp8_e5m2", "fp8_e4m3"]
+dtype_mapping = {"fp8_e5m2": torch.float8_e5m2, "fp8_e4m3": torch.float8_e4m3fn}
+# enable inference optimizations
+htcore.hpu_initialize()
 
 
 def _replace_module(module, qconfig):
     if qconfig.approach == "static":
         if isinstance(module, white_list):
             QModule = quantization_mapping[type(module)]
-            assert qconfig.weight_dtype == qconfig.act_dtype, "weight and activation should be the same dtype."
-            module = QModule(module, qconfig.act_dtype)
+            assert qconfig.w_dtype == qconfig.act_dtype, "weight and activation should be the same dtype."
+            module = QModule(module, dtype_mapping[qconfig.act_dtype])
     elif qconfig.approach == "dynamic":
-        dtype = qconfig.act_dtype
+        dtype = dtype_mapping[qconfig.act_dtype]
         if isinstance(module, torch.nn.Linear):
             # need module for initialization
             module = FP8DynamicLinear(module, dtype)
@@ -84,6 +89,8 @@ def _replace_module(module, qconfig):
 
 def quantize_dynamic(model, dtype=torch.float8_e4m3fn, inplace=True):
     q_model = model if inplace else copy.deepcopy(model)
+    if isinstance(dtype, str):
+        dtype = dtype_mapping[dtype]
     for n, m in q_model.named_modules():
         if isinstance(m, torch.nn.Linear):
             new_m = FP8DynamicLinear(m, dtype)  # need m for init
@@ -98,6 +105,8 @@ def quantize_dynamic(model, dtype=torch.float8_e4m3fn, inplace=True):
             new_m = FP8Cast(dtype=dtype)
             set_module(q_model, n, new_m)
     htcore.mark_step()
+    _mark_params_as_const(q_model)
+    _check_params_as_const(q_model)
     return q_model
 
 
@@ -133,7 +142,7 @@ def _remove_observer(module, qconfig):
     import deepspeed.comm as dist
     from torch.distributed import ReduceOp
 
-    HF_max = E4M3_AMAX if qconfig.act_dtype == torch.float8_e4m3fn else E5M2_AMAX
+    HF_max = E4M3_AMAX if qconfig.act_dtype == "fp8_e4m3" else E5M2_AMAX
     if hasattr(module, "input_activation_post_process"):
         if hasattr(module.input_activation_post_process, "_non_linear_param_search"):  # kl
             min_val, max_val = module.input_activation_post_process._non_linear_param_search()
@@ -145,7 +154,11 @@ def _remove_observer(module, qconfig):
         amax = amax.to("hpu")
         dist.all_reduce(amax, op=ReduceOp.MAX)
         scale = HF_max / amax
-        module.register_parameter("scale", torch.nn.Parameter(scale))
+        scale = _map_guadi2_scale(scale)
+        if hasattr(module, "input_activation_post_process1"):
+            module.register_parameter("scale1", torch.nn.Parameter(scale))
+        else:
+            module.register_parameter("scale", torch.nn.Parameter(scale))
         delattr(module, "input_activation_post_process")
     if hasattr(module, "input_activation_post_process1"):
         if hasattr(module.input_activation_post_process1, "_non_linear_param_search"):
@@ -158,7 +171,8 @@ def _remove_observer(module, qconfig):
         amax = amax.to("hpu")
         dist.all_reduce(amax, op=ReduceOp.MAX)
         scale = HF_max / amax
-        module.register_parameter("scale1", torch.nn.Parameter(scale))
+        scale = _map_guadi2_scale(scale)
+        module.register_parameter("scale2", torch.nn.Parameter(scale))
         delattr(module, "input_activation_post_process1")
 
     # remove observer hooks
@@ -175,7 +189,7 @@ def prepare(model, qconfig_mapping):
     for (op_name, op_type), qconfig in qconfig_mapping.items():
        if qconfig.approach == "dynamic":
            continue
-        if qconfig.weight_dtype not in FP8_DTYPE:
+        if qconfig.w_dtype not in FP8_DTYPE:
            continue
        module = fetch_module(model, op_name)
        if module is None:
@@ -188,7 +202,7 @@ def prepare(model, qconfig_mapping):
 
 def convert(model, qconfig_mapping):
     for (op_name, op_type), qconfig in qconfig_mapping.items():
-        if qconfig.weight_dtype not in FP8_DTYPE:
+        if qconfig.w_dtype not in FP8_DTYPE:
            continue
        module = fetch_module(model, op_name)
        if module is None:
@@ -211,4 +225,6 @@ def quantize(model, qconfig_mapping, run_fn=None, run_args=None, inplace=True):
     else:
         run_fn(q_model)
     q_model = convert(q_model, qconfig_mapping)
+    _mark_params_as_const(q_model)
+    _check_params_as_const(q_model)
     return q_model
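As a quick usage sketch of the string-dtype handling added above (dtype_mapping plus the isinstance(dtype, str) branch in quantize_dynamic); the toy module and the .to("hpu") placement are hypothetical and only for illustration:

import torch
from neural_compressor.torch.algorithms.habana_fp8 import quantize_dynamic

toy_model = torch.nn.Sequential(torch.nn.Linear(16, 16)).to("hpu")  # hypothetical toy model
# "fp8_e4m3" is now accepted directly and mapped to torch.float8_e4m3fn via dtype_mapping
q_model = quantize_dynamic(toy_model, dtype="fp8_e4m3", inplace=True)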

neural_compressor/torch/algorithms/habana_fp8/modules.py (+29 -13)

@@ -53,6 +53,17 @@ def forward(self, x):
 
 
 ##################### FP8 modules #######################
+def _map_guadi2_scale(scale):
+    USE_GUADI2_SCALE = os.environ.get("USE_GUADI2_SCALE")
+    if USE_GUADI2_SCALE:
+        scale_list = torch.tensor([16, 1, 1/16, 1/256])
+        for i in scale_list:
+            if scale > i or i == torch.tensor(1/256):
+                return i
+    else:
+        return scale
+
+
 class FP8DynamicLinear(torch.nn.Module):
     def __init__(self, org_module, dtype=torch.float8_e4m3fn) -> None:
         super().__init__()
@@ -86,6 +97,7 @@ def __init__(self, org_module, dtype=torch.float8_e4m3fn) -> None:
         # scale = HF_max /amax
         if self.use_amax:
             self.weight_scale = self.dtype_amax / org_module.weight.data.abs().max()
+            self.weight_scale = _map_guadi2_scale(self.weight_scale)
             self.weight_scale_inv = torch.reciprocal(self.weight_scale)
         else:
             self.weight_scale = None
@@ -233,9 +245,9 @@ def __init__(self, org_module, dtype) -> None:
                 dtype=torch.float32,
             ),
         )
-        self.scale_inv = torch.reciprocal(self.scale)
 
         self.weight_scale = self.dtype_amax / org_module.weight.data.abs().max()
+        self.weight_scale = _map_guadi2_scale(self.weight_scale)
         self.weight_scale_inv = torch.reciprocal(self.weight_scale)
         self.weight.data.copy_(
             torch.ops.hpu.cast_to_fp8_v2(org_module.weight.data, self.weight_scale, False, False, self.dtype)[0]
@@ -251,6 +263,7 @@ def forward(self, inp):
         org_middle_shape = inp.shape[1:-1]
         inp = inp.view((-1, self.in_features))
         inp = torch.ops.hpu.cast_to_fp8_v2(inp, self.scale, False, False, self.dtype)[0]
+        self.scale_inv = torch.reciprocal(self.scale)
         out = torch.ops.hpu.fp8_gemm_v2(
             inp,
             False,
@@ -283,26 +296,24 @@ def __init__(self, org_module, dtype) -> None:
         self.dtype = dtype
         self.dtype_amax = E4M3_AMAX if self.dtype == torch.float8_e4m3fn else E5M2_AMAX
         self.out_dtype = torch.float32
-        scale = org_module.scale if hasattr(org_module, "scale") else 1.0
         scale1 = org_module.scale1 if hasattr(org_module, "scale1") else 1.0
+        scale2 = org_module.scale2 if hasattr(org_module, "scale2") else 1.0
         self.register_buffer(
-            "scale",
+            "scale1",
             torch.tensor(
-                scale,
+                scale1,
                 device="hpu",
                 dtype=self.out_dtype,
             ),
         )
         self.register_buffer(
-            "scale1",
+            "scale2",
             torch.tensor(
-                scale1,
+                scale2,
                 device="hpu",
                 dtype=self.out_dtype,
             ),
         )
-        self.input1_scale_inv = torch.reciprocal(self.scale)
-        self.input2_scale_inv = torch.reciprocal(self.scale1)
 
     def forward(self, input1, input2):
         dim1 = input1.shape[-1]
@@ -311,12 +322,14 @@ def forward(self, input1, input2):
 
         if input1.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]:
             self.out_dtype = input1.dtype
-            input1 = torch.ops.hpu.cast_to_fp8_v2(input1, self.scale, False, False, self.dtype)[0]
+            input1 = torch.ops.hpu.cast_to_fp8_v2(input1, self.scale1, False, False, self.dtype)[0]
+            self.input1_scale_inv = torch.reciprocal(self.scale1)
         else:
             self.input1_scale_inv = None
         if input2.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]:
             self.out_dtype = input2.dtype
-            input2 = torch.ops.hpu.cast_to_fp8_v2(input2, self.scale1, False, False, self.dtype)[0]
+            input2 = torch.ops.hpu.cast_to_fp8_v2(input2, self.scale2, False, False, self.dtype)[0]
+            self.input2_scale_inv = torch.reciprocal(self.scale2)
         else:
             self.input2_scale_inv = None
         out = torch.ops.hpu.fp8_gemm_v2(
@@ -407,10 +420,10 @@ def __init__(self, org_module, dtype) -> None:
                 dtype=torch.float32,
             ),
         )
-        self.scale_inv = 1.0 / self.scale
         # user configuration
         # scale = HF_max /amax
         self.weight_scale = self.dtype_amax / org_module.weight.data.abs().max()
+        self.weight_scale = _map_guadi2_scale(self.weight_scale)
         self.weight_scale_inv = 1.0 / self.weight_scale
         self.weight = torch.ops.hpu.cast_to_fp8_v2(org_module.weight.data, self.weight_scale, False, False, self.dtype)[
             0
@@ -432,6 +445,7 @@ def forward(self, inp):
         assert inp.shape[-1] == self.in_features, "GEMM not possible"
         inputmat = inp.view((-1, self.in_features))
         inputmat = torch.ops.hpu.cast_to_fp8_v2(inputmat, self.scale, False, False, self.dtype)[0]
+        self.scale_inv = torch.reciprocal(self.scale)
         out = torch.ops.hpu.fp8_gemm_v2(
             inputmat,
             False,
@@ -487,10 +501,10 @@ def __init__(self, org_module, dtype) -> None:
                 dtype=torch.float32,
             ),
         )
-        self.scale_inv = 1.0 / self.scale
         # user configuration
         # scale = HF_max /amax
         self.weight_scale = self.dtype_amax / org_module.weight.data.abs().max()
+        self.weight_scale = _map_guadi2_scale(self.weight_scale)
         self.weight_scale_inv = 1.0 / self.weight_scale
         self.weight = torch.ops.hpu.cast_to_fp8_v2(org_module.weight.data, self.weight_scale, False, False, self.dtype)[
             0
@@ -513,6 +527,7 @@ def forward(self, inp):
         assert inp.shape[-1] == self.in_features, "GEMM not possible"
         inputmat = inp.view((-1, self.in_features))
         inputmat = torch.ops.hpu.cast_to_fp8_v2(inputmat, self.scale, False, False, self.dtype)[0]
+        self.scale_inv = torch.reciprocal(self.scale)
         out = torch.ops.hpu.fp8_gemm_v2(
             inputmat,
             False,
@@ -572,10 +587,10 @@ def __init__(self, org_module, dtype) -> None:
                 dtype=torch.float32,
             ),
         )
-        self.scale_inv = 1.0 / self.scale
         # user configuration
         # scale = HF_max /amax
         self.weight_scale = self.dtype_amax / org_module.weight.data.abs().max()
+        self.weight_scale = _map_guadi2_scale(self.weight_scale)
         self.weight_scale_inv = 1.0 / self.weight_scale
         self.weight = torch.ops.hpu.cast_to_fp8_v2(org_module.weight.data, self.weight_scale, False, False, self.dtype)[
             0
@@ -608,6 +623,7 @@ def forward(self, inp):
         input_shard = inp.shape[-1] // self.world_size
         inputmat = inp[:, :, self.rank * input_shard : (self.rank + 1) * input_shard]
         inputmat = torch.ops.hpu.cast_to_fp8_v2(inputmat, self.scale, False, False, self.dtype)[0]
+        self.scale_inv = torch.reciprocal(self.scale)
         out = torch.ops.hpu.fp8_gemm_v2(
             inputmat,
             False,
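To make the effect of the new Gaudi2 scale mapping concrete, here is a small standalone sketch of the same bucketing idea; it re-implements the helper for illustration (thresholds taken from the diff above) rather than importing the real _map_guadi2_scale, and the function name here is hypothetical:

import os
import torch

def map_gaudi2_scale(scale: torch.Tensor) -> torch.Tensor:
    # Snap a computed scale to the nearest hardware-friendly bucket (16, 1, 1/16, 1/256),
    # mirroring _map_guadi2_scale; only active when USE_GUADI2_SCALE is set in the environment.
    if os.environ.get("USE_GUADI2_SCALE"):
        for candidate in torch.tensor([16.0, 1.0, 1 / 16, 1 / 256]):
            if scale > candidate or candidate == torch.tensor(1 / 256):
                return candidate
    return scale

print(map_gaudi2_scale(torch.tensor(3.7)))  # 1.0 when USE_GUADI2_SCALE is set, otherwise 3.7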

neural_compressor/torch/algorithms/habana_fp8/observer.py (+1 -3)

@@ -17,9 +17,7 @@
 import torch
 from torch.ao.quantization.observer import *
 
-# without scale factor 0.9, the output will be abnormal.
-E4M3_AMAX = torch.tensor(240 * 0.9, dtype=torch.float).to("hpu")
-E5M2_AMAX = torch.tensor(57344 * 0.9, dtype=torch.float).to("hpu")
+from .modules import E4M3_AMAX, E5M2_AMAX
 
 
 class FP8HistogramObserver(HistogramObserver):

neural_compressor/torch/algorithms/habana_fp8/save_load.py (+22 -5)

@@ -14,17 +14,27 @@
     FP8DynamicMatmul,
     FP8Cast,
 )
+from .fp8_quant import FP8_DTYPE, dtype_mapping
+
 
 def save(model, output_dir="./saved_results"):
     if not os.path.exists(output_dir):
         os.mkdir(output_dir)
     qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), "quantized_model.pt")
     qconfig_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), "qconfig.json")
     # saving process
-    torch.save(model.stat_dict(), qmodel_file_path)
-    logger.info("Save state_dict of quantized model to {}.".format(qmodel_file_path))
     with open(qconfig_file_path, "w") as f:
         json.dump(model.qconfig, f, indent=4)
+
+    import fp8_convert
+    stat_dict = {}
+    for k, v in model.state_dict().items():
+        if v.dtype in FP8_DTYPE:
+            v = fp8_convert.to_u8(v.to('cpu'))
+        stat_dict[k] = v.to('cpu')
+    torch.save(stat_dict, qmodel_file_path)
+
+    logger.info("Save state_dict of quantized model to {}.".format(qmodel_file_path))
     logger.info("Save configuration of quantized model to {}.".format(qconfig_file_path))
 
 
@@ -36,12 +46,17 @@ def load(model, output_dir="./saved_results"):
     with open(qconfig_file_path, "r") as f:
         model_qconfig = json.load(f)
     # load quantization configuration
-    from .fp8_quant import FP8_DTYPE
-    for (op_name, op_type), op_qconfig in model_qconfig.items():
-        dtype = op_qconfig['weight_dtype']
+    stat_dict = torch.load(qmodel_file_path)
+    import fp8_convert
+    for op_name, op_qconfig in model_qconfig["per_module_qconfig"].items():
+        dtype = op_qconfig['w_dtype']
+        choice = 1 if dtype=="fp8_e4m3" else 0
+        if op_name+".weight" in stat_dict:
+            stat_dict[op_name+".weight"] = fp8_convert.from_u8(stat_dict[op_name+".weight"], choice)
         if dtype not in FP8_DTYPE:
             continue
         module = fetch_module(model, op_name)
+        dtype = dtype_mapping[dtype]
         if op_qconfig['approach'] == "static":
             if isinstance(module, white_list):
                 QModule = quantization_mapping[type(module)]
@@ -58,5 +73,7 @@ def load(model, output_dir="./saved_results"):
             module = FP8Cast(dtype=dtype)
         set_module(model, op_name, module)
         htcore.mark_step()
+    model.load_state_dict(stat_dict)
+    htcore.mark_step()
     logger.info("Quantized model loading successful.")
     return model
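A minimal round-trip sketch of the new save/load flow introduced by this commit, assuming save and load are importable from the module path of the file above and that q_model/user_model come from the quantization flow shown earlier (both names are placeholders):

from neural_compressor.torch.algorithms.habana_fp8.save_load import save, load

# q_model: an FP8-quantized model returned by quantize()/quantize_dynamic()
save(q_model, output_dir="./saved_results")        # writes quantized_model.pt and qconfig.json
# user_model: a fresh FP32 instance of the same architecture, to be repopulated from disk
loaded_model = load(user_model, output_dir="./saved_results")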
New file (+13 -0)

@@ -0,0 +1,13 @@
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
