Skip to content

Commit 569fe61

Browse files
Add int4_f8e4m3 quant mode
1 parent 50c77bf commit 569fe61

File tree

4 files changed

+47
-5
lines changed

4 files changed

+47
-5
lines changed

docs/source/openvino/export.mdx

+3-2
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ Check out the help for more options:
3131

3232
```text
3333
usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,tf}] [--trust-remote-code]
34-
[--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3}]
34+
[--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}]
35+
[--quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3,int4_f8e4m3}]
3536
[--library {transformers,diffusers,timm,sentence_transformers,open_clip}]
3637
[--cache_dir CACHE_DIR] [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym]
3738
[--group-size GROUP_SIZE] [--backup-precision {none,int8_sym,int8_asym}]
@@ -67,7 +68,7 @@ Optional arguments:
6768
on your local machine arbitrary code present in the model repository.
6869
--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}
6970
The weight format of the exported model.
70-
--quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3}
71+
--quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3,int4_f8e4m3}
7172
Quantization precision mode. This is used for applying full model quantization including
7273
activations.
7374
--library {transformers,diffusers,timm,sentence_transformers,open_clip}

optimum/commands/export/openvino.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def parse_args_openvino(parser: "ArgumentParser"):
7878
optional_group.add_argument(
7979
"--quant-mode",
8080
type=str,
81-
choices=["int8", "f8e4m3", "f8e5m2", "nf4_f8e4m3"],
81+
choices=["int8", "f8e4m3", "f8e5m2", "nf4_f8e4m3", "int4_f8e4m3"],
8282
default=None,
8383
help=(
8484
"Quantization precision mode. This is used for applying full model quantization including activations. "
@@ -359,9 +359,10 @@ def run(self):
359359
"Dataset is required for full quantization. Please provide it with --dataset argument."
360360
)
361361

362-
if self.args.quant_mode == "nf4_f8e4m3":
362+
if self.args.quant_mode in ["nf4_f8e4m3", "int4_f8e4m3"]:
363363
wc_config = prepare_wc_config(self.args, _DEFAULT_4BIT_CONFIG)
364-
wc_config["dtype"] = "nf4"
364+
weight_dtype_map = {"nf4_f8e4m3": "nf4", "int4_f8e4m3": "int4"}
365+
wc_config["dtype"] = weight_dtype_map[self.args.quant_mode]
365366

366367
q_config = prepare_q_config(self.args)
367368
q_config["dtype"] = "f8e4m3"

tests/openvino/test_exporters_cli.py

+24
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,30 @@ class OVCLIExportTestCase(unittest.TestCase):
157157
{"int8": 4, "nf4": 14},
158158
],
159159
),
160+
(
161+
"text-generation",
162+
"llama",
163+
"int4_f8e4m3",
164+
"--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 --group-size 16 --trust-remote-code",
165+
[
166+
13,
167+
],
168+
[
169+
{"int8": 4, "int4": 28},
170+
],
171+
),
172+
(
173+
"text-generation",
174+
"llama",
175+
"int4_f8e4m3",
176+
"--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 --group-size 16 --trust-remote-code --sym",
177+
[
178+
13,
179+
],
180+
[
181+
{"int8": 4, "int4": 14},
182+
],
183+
),
160184
]
161185

162186
TEST_4BIT_CONFIGURATIONS = [

tests/openvino/test_quantization.py

+16
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,22 @@ class OVQuantizerTest(unittest.TestCase):
174174
{"int8": 4, "f8e4m3": 4, "nf4": 6},
175175
],
176176
),
177+
(
178+
OVModelForCausalLM,
179+
"llama",
180+
OVMixedQuantizationConfig(
181+
weight_quantization_config=OVWeightQuantizationConfig(bits=4, group_size=16),
182+
full_quantization_config=OVQuantizationConfig(dtype="f8e4m3"),
183+
dataset="wikitext2",
184+
num_samples=1,
185+
),
186+
[
187+
13,
188+
],
189+
[
190+
{"int8": 4, "int4": 28},
191+
],
192+
),
177193
]
178194

179195
@parameterized.expand(SUPPORTED_ARCHITECTURES_TORCH_MODEL)

0 commit comments

Comments (0)