import os
import argparse
import tqdm

# Ensure that unnecessary memory is released during quantization.
os.environ.setdefault("PT_HPU_WEIGHT_SHARING", "0")
if int(os.getenv("WORLD_SIZE", "0")) > 0:
    os.environ.setdefault("PT_HPU_LAZY_ACC_PAR_MODE", "0")
    os.environ.setdefault("PT_HPU_ENABLE_LAZY_COLLECTIVES", "true")


import torch
import habana_frameworks.torch.core as htcore

from neural_compressor.torch.quantization import (
    FP8Config,
    prepare,
    convert,
    finalize_calibration,
    save,
    load,
)
from neural_compressor.torch.utils import get_used_hpu_mem_MB, get_used_cpu_mem_MB, logger, forward_wrapper
from neural_compressor.torch.utils.block_wise import block_wise_calibration
from neural_compressor.torch.utils.llm_utility import (
    initialize_model_and_tokenizer,
    get_default_llm_dataloader,
    llm_benchmark,
)

# Use no_grad mode for quantization.
torch.set_grad_enabled(False)
htcore.hpu_set_env()
hpu_mem_0 = get_used_hpu_mem_MB()
cpu_mem_0 = get_used_cpu_mem_MB()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Habana FP8 quantization.", formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("--model_name_or_path", type=str, default="meta-llama/Meta-Llama-3.1-405B", help="model name or path")
    parser.add_argument("--quantize", action="store_true", help="whether to quantize the model")
    parser.add_argument("--scale_method", type=str, default="maxabs_hw", help="scale method", choices=[
        # per-tensor
        "unit_scale", "hw_aligned_single_scale", "maxabs_hw", "maxabs_pow2",
        "maxabs_arbitrary", "maxabs_hw_opt_weight", "maxabs_pow2_opt_weight",
        # per-channel
        "act_maxabs_hw_weights_pcs_maxabs_pow2", "act_maxabs_hw_weights_pcs_opt_pow2",
        "act_maxabs_pow2_weights_pcs_maxabs_pow2", "act_maxabs_pow2_weights_pcs_opt_pow2",
    ])
    parser.add_argument("--use_hpu_graph", action="store_true", help="whether to use HPU graph mode to accelerate performance")
    parser.add_argument("--enable_block_wise_calibration", action="store_true", help="whether to use block-wise calibration")
    parser.add_argument("--disable_optimum_habana", action="store_true", help="whether to disable adapt_transformers_to_gaudi")
    parser.add_argument("--save", action="store_true", help="whether to save the quantized model")
    parser.add_argument("--load", action="store_true", help="whether to load the quantized model")
    parser.add_argument("--save_path", type=str, default="saved_results", help="path to save the quantized model")
    parser.add_argument("--accuracy", action="store_true", help="accuracy measurement")
    parser.add_argument("--performance", action="store_true", help="performance measurement")
    parser.add_argument("--local_rank", type=int, default=0, metavar="N", help="local process rank")
    parser.add_argument("--batch_size", default=1, type=int, help="batch size for the calibration dataloader and accuracy/performance measurement")
    parser.add_argument("--num_fewshot", default=0, type=int, help="num_fewshot for lm_eval")
    parser.add_argument("--dump_stats_path", type=str, default="./hqt_output/measure", help="path and prefix of the calibration info file")
    parser.add_argument("--tasks", default="lambada_openai", type=str,
                        help="tasks for accuracy validation; text-generation and code-generation tasks are different")
    parser.add_argument("--dataset_name", type=str, default="NeelNanda/pile-10k", help="dataset name for the calibration dataloader")
    parser.add_argument("--nsamples", type=int, default=128, help="number of samples for the calibration dataloader")
    parser.add_argument("--seq_len", type=int, default=128, help="sequence length for the calibration dataloader and benchmarking")
    args = parser.parse_args()
    if not args.disable_optimum_habana:
        # Tweak generation so that it runs faster on Gaudi.
        import transformers
        from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi

        if args.quantize:
            orig_check_support_param_buffer_assignment = transformers.modeling_utils.check_support_param_buffer_assignment
            adapt_transformers_to_gaudi()
            # Restore the original function to protect memory-mapping usage during quantization.
            transformers.modeling_utils.check_support_param_buffer_assignment = orig_check_support_param_buffer_assignment
        else:
            adapt_transformers_to_gaudi()

    model, tokenizer = initialize_model_and_tokenizer(args.model_name_or_path, use_load=args.load, device="hpu")
    # show used memory
    logger.info(f"After loading model, used HPU memory: {round((get_used_hpu_mem_MB() - hpu_mem_0)/1024, 3)} GiB")
    logger.info(f"After loading model, used CPU memory: {round((get_used_cpu_mem_MB() - cpu_mem_0)/1024, 3)} GiB")

    if args.quantize:
        if args.enable_block_wise_calibration:
            logger.warning("Block-wise calibration is enabled, lm_head will be excluded from calibration.")

        # prepare
        qconfig = FP8Config(
            fp8_config="E4M3",
            scale_method=args.scale_method,
            blocklist={"names": ["lm_head"]} if args.enable_block_wise_calibration else {},  # block-wise calibration cannot calibrate lm_head
            measure_on_hpu=not args.enable_block_wise_calibration,  # measure on CPU to avoid device mapping of the model
            dump_stats_path=args.dump_stats_path,
        )
        if args.scale_method in ["unit_scale", "hw_aligned_single_scale"]:
            # these scale methods need no calibration data
            model = convert(model, qconfig)
        else:
            model = prepare(model, qconfig)

            # calibration
            dataloader = get_default_llm_dataloader(
                tokenizer,
                dataset_name=args.dataset_name,
                bs=args.batch_size,
                nsamples=args.nsamples,
                seq_len=args.seq_len,
                seed=42,
            )
            if args.enable_block_wise_calibration:
                block_wise_calibration(model, dataloader)
            else:
                if args.use_hpu_graph:
                    from habana_frameworks.torch.hpu import wrap_in_hpu_graph
                    model = wrap_in_hpu_graph(model)
                logger.info("Calibration started")
                for data in tqdm.tqdm(dataloader):
                    forward_wrapper(model, data)
                logger.info("Calibration done")

            # convert
            model = convert(model)

        # show used memory
        logger.info(f"Used HPU memory: {round((get_used_hpu_mem_MB() - hpu_mem_0)/1024, 3)} GiB")
        logger.info(f"Used CPU memory: {round((get_used_cpu_mem_MB() - cpu_mem_0)/1024, 3)} GiB")
        if args.save:
            logger.info(f"Saving quantized model to {args.save_path}")
            save(model, args.save_path, format="huggingface")
            tokenizer.save_pretrained(args.save_path)
            logger.info(f"Saved quantized model to {args.save_path}")
            exit(0)  # the model is wrapped during calibration, so exit before accuracy and performance measurement

    # preprocess model for accuracy and performance measurement
    if not args.load:
        # compare FP8 with BF16, not FP32
        model = model.to(torch.bfloat16)
    model = model.eval().to("hpu")
    if args.use_hpu_graph:
        from habana_frameworks.torch.hpu import wrap_in_hpu_graph
        model = wrap_in_hpu_graph(model)
    htcore.hpu_inference_initialize(model, mark_only_scales_as_const=True)

    if args.accuracy:
        from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser
        eval_args = LMEvalParser(
            model="hf",
            user_model=model,
            tokenizer=tokenizer,
            batch_size=args.batch_size,
            tasks=args.tasks,
            device="hpu",
            pad_to_buckets=True,
            num_fewshot=args.num_fewshot,
        )
        results = evaluate(eval_args)
        # show used memory
        logger.info(f"Used HPU memory: {round((get_used_hpu_mem_MB() - hpu_mem_0)/1024, 3)} GiB")
        logger.info(f"Used CPU memory: {round((get_used_cpu_mem_MB() - cpu_mem_0)/1024, 3)} GiB")

    if args.performance:
        llm_benchmark(model, args.batch_size, args.seq_len)
        # show used memory
        logger.info(f"Used HPU memory: {round((get_used_hpu_mem_MB() - hpu_mem_0)/1024, 3)} GiB")
        logger.info(f"Used CPU memory: {round((get_used_cpu_mem_MB() - cpu_mem_0)/1024, 3)} GiB")
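
# Illustrative usage sketch based on the arguments defined above (the script filename
# "quantize.py" is a placeholder, not part of the source, and the exact launch command
# may differ for multi-card runs):
#
#   # calibrate on the default dataset, convert the model to FP8, and save the result
#   python quantize.py --model_name_or_path meta-llama/Meta-Llama-3.1-405B \
#       --quantize --scale_method maxabs_hw --use_hpu_graph --save --save_path saved_results
#
#   # reload the saved FP8 model and run accuracy and performance measurement
#   # (assumes initialize_model_and_tokenizer loads from --model_name_or_path when --load is set)
#   python quantize.py --model_name_or_path saved_results --load --use_hpu_graph --accuracy --performance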