[OV] Add --all-layers argument to CLI #713

Merged · 2 commits · May 17, 2024

11 changes: 11 additions & 0 deletions optimum/commands/export/openvino.py
@@ -119,6 +119,15 @@ def parse_args_openvino(parser: "ArgumentParser"):
"or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models."
),
)
optional_group.add_argument(
"--all-layers",
action="store_true",
default=None,
help=(
"Whether embeddings and last MatMul layers should be compressed to INT4. If not provided an weight "
"compression is applied, they are compressed to INT8."
),
)
optional_group.add_argument(
"--disable-stateful",
action="store_true",
@@ -198,6 +207,7 @@ def run(self):
and self.args.ratio is None
and self.args.group_size is None
and self.args.sym is None
and self.args.all_layers is None
and self.args.model in _DEFAULT_4BIT_CONFIGS
):
quantization_config = _DEFAULT_4BIT_CONFIGS[self.args.model]
@@ -207,6 +217,7 @@
"ratio": 1 if is_int8 else (self.args.ratio or 0.8),
"sym": self.args.sym or False,
"group_size": -1 if is_int8 else self.args.group_size,
"all_layers": None if is_int8 else self.args.all_layers,
}

if self.args.weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}:
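For reference, here is a minimal sketch of how the new flag feeds the weight-compression settings assembled in run() above. It simply mirrors the mapping shown in the diff; the standalone function name is illustrative and not part of the PR:

def build_weight_compression_kwargs(weight_format, ratio=None, sym=None, group_size=None, all_layers=None):
    # Mirrors the logic in run(): INT8 ignores the INT4-only options,
    # while INT4 forwards --all-layers to the weight-compression config.
    is_int8 = weight_format == "int8"
    return {
        "ratio": 1 if is_int8 else (ratio or 0.8),
        "sym": sym or False,
        "group_size": -1 if is_int8 else group_size,
        "all_layers": None if is_int8 else all_layers,
    }

On the command line, the flag is combined with an INT4 weight format, for example (model id and output directory are placeholders):

optimum-cli export openvino --model <model_id> --weight-format int4 --all-layers <output_dir>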
21 changes: 9 additions & 12 deletions tests/openvino/test_exporters_cli.py
@@ -18,7 +18,6 @@

from parameterized import parameterized
from utils_tests import (
_ARCHITECTURES_TO_EXPECTED_INT4_INT8,
_ARCHITECTURES_TO_EXPECTED_INT8,
MODEL_NAMES,
get_num_quantized_nodes,
@@ -84,14 +83,13 @@ class OVCLIExportTestCase(unittest.TestCase):
("latent-consistency", 50, 135),
)

SUPPORTED_4BIT_ARCHITECTURES = (("text-generation-with-past", "opt125m"),)

SUPPORTED_4BIT_OPTIONS = ["int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"]

TEST_4BIT_CONFIGURATONS = []
for arch in SUPPORTED_4BIT_ARCHITECTURES:
for option in SUPPORTED_4BIT_OPTIONS:
TEST_4BIT_CONFIGURATONS.append([arch[0], arch[1], option])
TEST_4BIT_CONFIGURATONS = [
("text-generation-with-past", "opt125m", "int4_sym_g128", 62, 86),
("text-generation-with-past", "opt125m", "int4_asym_g128", 62, 86),
("text-generation-with-past", "opt125m", "int4_sym_g64", 62, 86),
("text-generation-with-past", "opt125m", "int4_asym_g64", 62, 86),
("text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 16 --all-layers", 0, 32),
]

def _openvino_export(
self, model_name: str, task: str, compression_option: str = None, compression_ratio: float = None
@@ -197,17 +195,16 @@ def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: int):
self.assertEqual(exp_num_fq, num_fq)

@parameterized.expand(TEST_4BIT_CONFIGURATONS)
def test_exporters_cli_int4(self, task: str, model_type: str, option: str):
def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expected_int8: int, expected_int4: int):
with TemporaryDirectory() as tmpdir:
subprocess.run(
f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}",
f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}",
shell=True,
check=True,
)
model_kwargs = {"use_cache": task.endswith("with-past")} if "generation" in task else {}
model = eval(_HEAD_TO_AUTOMODELS[task.replace("-with-past", "")]).from_pretrained(tmpdir, **model_kwargs)

expected_int8, expected_int4 = _ARCHITECTURES_TO_EXPECTED_INT4_INT8[model_type]
_, num_int8, num_int4 = get_num_quantized_nodes(model)
self.assertEqual(expected_int8, num_int8)
self.assertEqual(expected_int4, num_int4)
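As a concrete example, the new llama_awq entry in TEST_4BIT_CONFIGURATONS expands, via the f-string in test_exporters_cli_int4 above, to a command of the following form (the model id behind MODEL_NAMES["llama_awq"] and the output directory are left symbolic here):

optimum-cli export openvino --model <MODEL_NAMES["llama_awq"]> --task text-generation-with-past --weight-format int4 --ratio 1.0 --sym --group-size 16 --all-layers <tmpdir>

For this case the test expects 0 INT8 and 32 INT4 weight nodes, i.e. with --all-layers the embedding and last MatMul weights are compressed to INT4 rather than kept at INT8.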
2 changes: 0 additions & 2 deletions tests/openvino/utils_tests.py
@@ -145,8 +145,6 @@
"stable-diffusion-xl-refiner": (366, 34, 42, 66),
}

_ARCHITECTURES_TO_EXPECTED_INT4_INT8 = {"opt125m": (62, 86)}


def get_num_quantized_nodes(ov_model):
num_fake_quantize = 0
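The body of get_num_quantized_nodes is truncated in the diff above. Below is a rough sketch of how such a helper can be written with the OpenVINO Python API; it is an assumption about its behaviour (counting FakeQuantize ops and INT8/INT4 output types), not code taken from this PR:

def get_num_quantized_nodes(ov_model):
    # Assumed sketch: ov_model is an exported OVModel*, whose .model attribute
    # is an openvino.runtime.Model whose operations can be inspected.
    num_fake_quantize = 0
    num_int8 = 0
    num_int4 = 0
    for op in ov_model.model.get_ops():
        if op.get_type_name() == "FakeQuantize":
            num_fake_quantize += 1
        for i in range(op.get_output_size()):
            type_name = op.get_output_element_type(i).get_type_name()
            if type_name in ("i8", "u8"):
                num_int8 += 1
            elif type_name in ("i4", "u4"):
                num_int4 += 1
    return num_fake_quantize, num_int8, num_int4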