Commit 878b474

Fp8 implementation (#1100)
* Fp8 implementation
* All datasets support
* Added test
* Update test
* Correctness
* Correctness
* Update docs/source/openvino/export.mdx

Co-authored-by: Alexander Kozlov <alexander.kozlov@intel.com>

* Change test model
* Apply comments

---------

Co-authored-by: Alexander Kozlov <alexander.kozlov@intel.com>
1 parent feaf027 commit 878b474

6 files changed: +65 −66 lines

docs/source/openvino/export.mdx (+4 −5)

@@ -31,7 +31,7 @@ Check out the help for more options:
 
 ```text
 usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,tf}] [--trust-remote-code]
-                                   [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8}]
+                                   [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8,f8e4m3,f8e5m2}]
                                    [--library {transformers,diffusers,timm,sentence_transformers,open_clip}]
                                    [--cache_dir CACHE_DIR] [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym]
                                    [--group-size GROUP_SIZE] [--backup-precision {none,int8_sym,int8_asym}]
@@ -67,10 +67,9 @@ Optional arguments:
                         on your local machine arbitrary code present in the model repository.
   --weight-format {fp32,fp16,int8,int4,mxfp4,nf4}
                         The weight format of the exported model.
-  --quant-mode {int8}
+  --quant-mode {int8,f8e4m3,f8e5m2}
                         Quantization precision mode. This is used for applying full model quantization including
-                        activations. The only currently supported choice is 'int8' for int8 quantization of both
-                        weights and activations.
+                        activations.
   --library {transformers,diffusers,timm,sentence_transformers,open_clip}
                         The library used to load the model before export. If not provided, will attempt to infer the
                         local checkpoint's library
@@ -166,7 +165,7 @@ Models larger than 1 billion parameters are exported to the OpenVINO format with
 </Tip>
 
 
-Besides weight-only quantization, you can also apply full model quantization including activations by setting `--quant-mode` to `int8`. This will quantize both weights and activations of Linear, Convolutional and some other layers to int8. Currently this is only supported for speech-to-text models. Please see example below.
+Besides weight-only quantization, you can also apply full model quantization including activations by setting `--quant-mode` to the preferred precision. This will quantize both weights and activations of Linear, Convolutional and some other layers to the selected mode. Please see the example below.
 
 ```bash
 optimum-cli export openvino -m openai/whisper-large-v3-turbo --quant-mode int8 --dataset librispeech --num-samples 32 --smooth-quant-alpha 0.9 ./whisper-large-v3-turbo
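
With the new choices, the same command extends to FP8. A hedged sketch for the `f8e4m3` mode, assuming a causal-LM checkpoint (the model name and output directory are illustrative; `wikitext2` is the dataset the new test below uses):

```bash
optimum-cli export openvino -m meta-llama/Llama-3.2-1B --quant-mode f8e4m3 --dataset wikitext2 --num-samples 32 --smooth-quant-alpha 0.9 ./llama-f8e4m3
```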

optimum/commands/export/openvino.py (+1 −5)

@@ -78,11 +78,10 @@ def parse_args_openvino(parser: "ArgumentParser"):
    optional_group.add_argument(
        "--quant-mode",
        type=str,
-        choices=["int8"],
+        choices=["int8", "f8e4m3", "f8e5m2"],
        default=None,
        help=(
            "Quantization precision mode. This is used for applying full model quantization including activations. "
-            "The only currently supported choice is 'int8' for int8 quantization of both weights and activations."
        ),
    )
    optional_group.add_argument(
@@ -365,9 +364,6 @@ def run(self):
            quantization_config["trust_remote_code"] = self.args.trust_remote_code
            ov_config = OVConfig(quantization_config=quantization_config)
        else:
-            if self.args.quant_mode != "int8":
-                raise ValueError("Only 'int8' quantization mode is currently supported.")
-
            quantization_config = {
                "weight_format": self.args.quant_mode,
                "activation_format": self.args.quant_mode,

optimum/intel/openvino/configuration.py (+11 −20)

@@ -26,7 +26,7 @@
 from optimum.configuration_utils import BaseConfig
 
 from ..utils.import_utils import is_nncf_available
-from .utils import PREDEFINED_SD_DATASETS, PREDEFINED_SPEECH_TO_TEXT_DATASETS, PREDEFINED_VISUAL_LM_DATASETS
+from .utils import PREDEFINED_SD_DATASETS, PREDEFINED_VISUAL_LM_DATASETS
 
 
 if is_nncf_available():
@@ -638,9 +638,9 @@ def __init__(
                SmoothQuant alpha parameter that improves the distribution of activations before MatMul layers and
                reduces quantization error.
            weight_format (`str`, defaults to "int8"):
-                Data format weights are quantized to. Possible values: ['int8'].
+                Data format weights are quantized to. Possible values: ['int8', 'f8e4m3', 'f8e5m2'].
            activation_format (`str`, defaults to "int8"):
-                Data format activations are compressed to. Possible values: ['int8'].
+                Data format activations are compressed to. Possible values: ['int8', 'f8e4m3', 'f8e5m2'].
        """
        super().__init__(
            bits=bits,
@@ -658,6 +658,13 @@ def __init__(
        self.overflow_fix = overflow_fix
        self.smooth_quant_alpha = smooth_quant_alpha
        self.activation_format = activation_format
+
+        f8_formats = ["f8e4m3", "f8e5m2"]
+        if self.activation_format in f8_formats and self.weight_format in f8_formats:
+            logger.info(
+                f"{self.activation_format} for activations and {self.weight_format} weights were found. A symmetrical scheme will be used."
+            )
+            self.sym = True
        self.post_init()
 
    def post_init(self):
@@ -669,24 +676,11 @@ def post_init(self):
        if self.bits != 8:
            raise ValueError(f"Only support 8-bit for static quantization but found {self.bits}")
 
-        if self.dataset is not None:
-            if self.dataset not in PREDEFINED_SPEECH_TO_TEXT_DATASETS:
-                raise ValueError(
-                    f"You have entered the following string value for dataset: {self.dataset}. But it is not supported."
-                    f" Currently you can only choose {list(PREDEFINED_SPEECH_TO_TEXT_DATASETS.keys())}."
-                )
-
        if self.smooth_quant_alpha is not None and not (0 <= self.smooth_quant_alpha <= 1):
            raise ValueError(
                f"SmoothQuant alpha parameter must be in range [0, 1], but found {self.smooth_quant_alpha}"
            )
 
-        if self.weight_format != "int8":
-            raise ValueError("Only 'int8' weight format is currently supported.")
-
-        if self.activation_format != "int8":
-            raise ValueError("Only 'int8' activation format is currently supported.")
-
 
 class OVConfig(BaseConfig):
    CONFIG_NAME = "openvino_config.json"
@@ -711,10 +705,7 @@ def __init__(
            "compression", None
        )  # A field for backward-compatability of training-time compression parameters
        if self.quantization_config is not None:
-            if isinstance(self.quantization_config, OVWeightQuantizationConfig):
-                self.dtype = self.quantization_config.weight_format
-            else:
-                self.dtype = "int8"
+            self.dtype = self.quantization_config.weight_format
        else:
            self.dtype = dtype
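
A minimal usage sketch of the new symmetry behaviour, assuming the `__init__` shown above belongs to `OVQuantizationConfig` and that it is importable from `optimum.intel`:

```python
from optimum.intel import OVQuantizationConfig

# When both weights and activations use an FP8 format, __init__ above
# logs a message and forces a symmetrical quantization scheme.
config = OVQuantizationConfig(
    bits=8,                      # post_init() still enforces 8-bit
    weight_format="f8e4m3",      # both weights ...
    activation_format="f8e4m3",  # ... and activations in an FP8 format
)
assert config.sym  # set automatically by the f8_formats check
```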

optimum/intel/openvino/quantization.py (+8 −5)

@@ -458,11 +458,6 @@ def _quantize_ovbasemodel(
        if calibration_dataset is None:
            raise ValueError("Calibration dataset is required to run quantization.")
 
-        if quantization_config.weight_format != "int8":
-            raise ValueError("Only 'int8' weight format is currently supported.")
-        if quantization_config.activation_format != "int8":
-            raise ValueError("Only 'int8' activation format is currently supported.")
-
        # Quantize model(s)
        if isinstance(self.model, _OVModelForWhisper):
            self._quantize_whisper_model(quantization_config, calibration_dataset, **kwargs)
@@ -1077,6 +1072,14 @@ def _full_quantization(
            matmul=quantization_config.smooth_quant_alpha
        )
 
+    q_mode_map = {
+        "f8e4m3": nncf.QuantizationMode.FP8_E4M3,
+        "f8e5m2": nncf.QuantizationMode.FP8_E5M2,
+    }
+
+    if quantization_config.activation_format in q_mode_map:
+        kwargs.update({"mode": q_mode_map[quantization_config.activation_format]})
+
    quantized_model = nncf.quantize(
        model,
        calibration_dataset,
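
The mapping above forwards the FP8 formats to NNCF's `mode` argument. A hedged sketch of the call this path reduces to, assuming an `openvino.Model` (`model`) and an `nncf.Dataset` (`calibration_dataset`) are already prepared and with an illustrative `subset_size`:

```python
import nncf

# FP8 quantization via nncf.quantize()'s `mode` parameter, as wired up above.
quantized_model = nncf.quantize(
    model,                 # an openvino.Model
    calibration_dataset,   # an nncf.Dataset of calibration samples
    mode=nncf.QuantizationMode.FP8_E4M3,  # or nncf.QuantizationMode.FP8_E5M2
    subset_size=32,        # illustrative number of calibration samples
)
```

FP8 modes insert FakeConvert rather than FakeQuantize operations into the resulting graph, which is why the test utilities below start counting both node types.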

tests/openvino/test_exporters_cli.py (+22 −12)

@@ -118,10 +118,19 @@ class OVCLIExportTestCase(unittest.TestCase):
        (
            "automatic-speech-recognition",
            "whisper",
-            "--quant-mode int8 --dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code",
+            "int8",
+            "--dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code",
            (14, 22, 21) if is_transformers_version("<=", "4.36.0") else (14, 22, 25),
            (14, 21, 17) if is_transformers_version("<=", "4.36.0") else (14, 22, 18),
        ),
+        (
+            "text-generation",
+            "llama",
+            "f8e4m3",
+            "--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code",
+            (13,),
+            (16,),
+        ),
    ]
 
    TEST_4BIT_CONFIGURATIONS = [
@@ -411,30 +420,31 @@ def test_exporters_cli_full_quantization(
        self,
        task: str,
        model_type: str,
+        quant_mode: str,
        option: str,
-        expected_num_fq_nodes_per_model: Tuple[int],
+        expected_num_f_nodes_per_model: Tuple[int],
        expected_num_weight_nodes_per_model: Tuple[int],
    ):
        with TemporaryDirectory() as tmpdir:
            subprocess.run(
-                f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} {option} {tmpdir}",
+                f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --quant-mode {quant_mode} {option} {tmpdir}",
                shell=True,
                check=True,
            )
            model = eval(_HEAD_TO_AUTOMODELS[task]).from_pretrained(tmpdir)
 
-            submodels = []
+            models = [model]
            if task == "automatic-speech-recognition":
-                submodels = [model.encoder, model.decoder]
+                models = [model.encoder, model.decoder]
                if model.decoder_with_past is not None:
-                    submodels.append(model.decoder_with_past)
+                    models.append(model.decoder_with_past)
                else:
-                    expected_num_fq_nodes_per_model = expected_num_fq_nodes_per_model[:-1]
-            self.assertEqual(len(expected_num_fq_nodes_per_model), len(submodels))
-            for i, model in enumerate(submodels):
-                actual_num_fq_nodes, actual_num_weight_nodes = get_num_quantized_nodes(model)
-                self.assertEqual(expected_num_fq_nodes_per_model[i], actual_num_fq_nodes)
-                self.assertEqual(expected_num_weight_nodes_per_model[i], actual_num_weight_nodes["int8"])
+                    expected_num_f_nodes_per_model = expected_num_f_nodes_per_model[:-1]
+            self.assertEqual(len(expected_num_f_nodes_per_model), len(models))
+            for i, model in enumerate(models):
+                actual_num_f_nodes, actual_num_weight_nodes = get_num_quantized_nodes(model)
+                self.assertEqual(expected_num_f_nodes_per_model[i], actual_num_f_nodes)
+                self.assertEqual(expected_num_weight_nodes_per_model[i], actual_num_weight_nodes[quant_mode])
 
    def test_exporters_cli_int4_with_local_model_and_default_config(self):
        with TemporaryDirectory() as tmpdir:
tests/openvino/utils_tests.py (+19 −19)

@@ -206,31 +206,31 @@
 
 
 def get_num_quantized_nodes(model):
-    num_fake_quantize = 0
-    num_weight_nodes = {
-        "int8": 0,
-        "int4": 0,
-        "f4e2m1": 0,
-        "f8e8m0": 0,
-        "nf4": 0,
+    num_fake_nodes = 0
+    types_map = {
+        "i8": "int8",
+        "u8": "int8",
+        "i4": "int4",
+        "u4": "int4",
+        "f4e2m1": "f4e2m1",
+        "f8e8m0": "f8e8m0",
+        "nf4": "nf4",
+        "f8e4m3": "f8e4m3",
+        "f8e5m2": "f8e5m2",
    }
+    num_weight_nodes = {n: 0 for n in types_map.values()}
    ov_model = model if isinstance(model, ov.Model) else model.model
    for elem in ov_model.get_ops():
        if "FakeQuantize" in elem.name:
-            num_fake_quantize += 1
+            num_fake_nodes += 1
+        if "FakeConvert" in elem.name:
+            num_fake_nodes += 1
        for i in range(elem.get_output_size()):
            type_name = elem.get_output_element_type(i).get_type_name()
-            if type_name in ["i8", "u8"]:
-                num_weight_nodes["int8"] += 1
-            if type_name in ["i4", "u4"]:
-                num_weight_nodes["int4"] += 1
-            if type_name == "f4e2m1":
-                num_weight_nodes["f4e2m1"] += 1
-            if type_name == "f8e8m0":
-                num_weight_nodes["f8e8m0"] += 1
-            if type_name == "nf4":
-                num_weight_nodes["nf4"] += 1
-    return num_fake_quantize, num_weight_nodes
+            if type_name in types_map:
+                name = types_map[type_name]
+                num_weight_nodes[name] += 1
+    return num_fake_nodes, num_weight_nodes
 
 
 @contextmanager
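
A short usage sketch of the updated helper (the model class is real, but the local path is illustrative and the import assumes the test layout above, i.e. running from `tests/openvino/`):

```python
from optimum.intel import OVModelForCausalLM

from utils_tests import get_num_quantized_nodes  # tests/openvino/utils_tests.py

# Count fake (de)quantization ops and weight dtypes in an exported model.
model = OVModelForCausalLM.from_pretrained("./llama-f8e4m3")
num_fake_nodes, num_weight_nodes = get_num_quantized_nodes(model)
print(num_fake_nodes)              # FakeQuantize + FakeConvert nodes
print(num_weight_nodes["f8e4m3"])  # outputs typed as f8e4m3
```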
