From 2b424b0cbd56cdd1aa8a32ad3f227376d963ab89 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Thu, 16 May 2024 11:11:53 +0200
Subject: [PATCH 1/2] Add --all-layers argument to CLI

---
 optimum/commands/export/openvino.py  | 10 ++++++++++
 tests/openvino/test_exporters_cli.py | 21 +++++++++------------
 tests/openvino/utils_tests.py        |  2 --
 3 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 025a40e057..c5f98442af 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -119,6 +119,14 @@ def parse_args_openvino(parser: "ArgumentParser"):
             "or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models."
         ),
     )
+    optional_group.add_argument(
+        "--all-layers",
+        action="store_true",
+        default=None,
+        help=(
+            "Whether embeddings and last MatMul layers should be compressed to a primary precision (usually, INT4)."
+        ),
+    )
     optional_group.add_argument(
         "--disable-stateful",
         action="store_true",
@@ -198,6 +206,7 @@ def run(self):
             and self.args.ratio is None
             and self.args.group_size is None
             and self.args.sym is None
+            and self.args.all_layers is None
             and self.args.model in _DEFAULT_4BIT_CONFIGS
         ):
             quantization_config = _DEFAULT_4BIT_CONFIGS[self.args.model]
@@ -207,6 +216,7 @@ def run(self):
                 "ratio": 1 if is_int8 else (self.args.ratio or 0.8),
                 "sym": self.args.sym or False,
                 "group_size": -1 if is_int8 else self.args.group_size,
+                "all_layers": None if is_int8 else self.args.all_layers,
             }
 
         if self.args.weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}:
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index cac79abaee..cce25bbae1 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -18,7 +18,6 @@
 from parameterized import parameterized
 from utils_tests import (
-    _ARCHITECTURES_TO_EXPECTED_INT4_INT8,
     _ARCHITECTURES_TO_EXPECTED_INT8,
     MODEL_NAMES,
     get_num_quantized_nodes,
 )
@@ -84,14 +83,13 @@ class OVCLIExportTestCase(unittest.TestCase):
         ("latent-consistency", 50, 135),
     )
 
-    SUPPORTED_4BIT_ARCHITECTURES = (("text-generation-with-past", "opt125m"),)
-
-    SUPPORTED_4BIT_OPTIONS = ["int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"]
-
-    TEST_4BIT_CONFIGURATONS = []
-    for arch in SUPPORTED_4BIT_ARCHITECTURES:
-        for option in SUPPORTED_4BIT_OPTIONS:
-            TEST_4BIT_CONFIGURATONS.append([arch[0], arch[1], option])
+    TEST_4BIT_CONFIGURATONS = [
+        ("text-generation-with-past", "opt125m", "int4_sym_g128", 62, 86),
+        ("text-generation-with-past", "opt125m", "int4_asym_g128", 62, 86),
+        ("text-generation-with-past", "opt125m", "int4_sym_g64", 62, 86),
+        ("text-generation-with-past", "opt125m", "int4_asym_g64", 62, 86),
+        ("text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 16 --all-layers", 0, 32),
+    ]
 
     def _openvino_export(
         self, model_name: str, task: str, compression_option: str = None, compression_ratio: float = None
@@ -197,17 +195,16 @@ def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: in
         self.assertEqual(exp_num_fq, num_fq)
 
     @parameterized.expand(TEST_4BIT_CONFIGURATONS)
-    def test_exporters_cli_int4(self, task: str, model_type: str, option: str):
+    def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expected_int8: int, expected_int4: int):
        with TemporaryDirectory() as tmpdir:
             subprocess.run(
-                f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}",
+                f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}",
                 shell=True,
                 check=True,
             )
             model_kwargs = {"use_cache": task.endswith("with-past")} if "generation" in task else {}
             model = eval(_HEAD_TO_AUTOMODELS[task.replace("-with-past", "")]).from_pretrained(tmpdir, **model_kwargs)
 
-            expected_int8, expected_int4 = _ARCHITECTURES_TO_EXPECTED_INT4_INT8[model_type]
             _, num_int8, num_int4 = get_num_quantized_nodes(model)
             self.assertEqual(expected_int8, num_int8)
             self.assertEqual(expected_int4, num_int4)
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index aa3ea5f33a..441b1224c4 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -145,8 +145,6 @@
     "stable-diffusion-xl-refiner": (366, 34, 42, 66),
 }
 
-_ARCHITECTURES_TO_EXPECTED_INT4_INT8 = {"opt125m": (62, 86)}
-
 
 def get_num_quantized_nodes(ov_model):
     num_fake_quantize = 0
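
For illustration, a minimal sketch of how the new flag feeds into the weight-compression config assembled in `run()` above; `build_quantization_config`, `args`, and `is_int8` are illustrative names rather than code from this patch, and other fields of the real config (e.g. `bits`) are omitted:

    # Simplified sketch of the config construction in run(); names are illustrative, not from the patch.
    def build_quantization_config(args) -> dict:
        is_int8 = args.weight_format == "int8"  # assumption: mirrors how run() distinguishes INT8 from 4-bit export
        return {
            "ratio": 1 if is_int8 else (args.ratio or 0.8),
            "sym": args.sym or False,
            "group_size": -1 if is_int8 else args.group_size,
            # Added by this patch: --all-layers only applies to 4-bit compression, so it stays None for INT8.
            "all_layers": None if is_int8 else args.all_layers,
        }
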
From 6bb1330905bbfaf3b60e711a52d9347b3e5474c3 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Thu, 16 May 2024 13:02:55 +0200
Subject: [PATCH 2/2] Update description

---
 optimum/commands/export/openvino.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index c5f98442af..ffd084d4e6 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -124,7 +124,8 @@ def parse_args_openvino(parser: "ArgumentParser"):
         action="store_true",
         default=None,
         help=(
-            "Whether embeddings and last MatMul layers should be compressed to a primary precision (usually, INT4)."
+            "Whether embeddings and last MatMul layers should be compressed to INT4. If not provided, these layers "
+            "are compressed to INT8 when 4-bit weight compression is applied."
         ),
     )
     optional_group.add_argument(
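
As a usage sketch, the new flag can be exercised end to end the same way the updated test does; the model id, task, and compression options below are illustrative placeholders combining the options covered by the new 4-bit test cases, not part of the patch:

    # Usage sketch (assumes optimum-intel with OpenVINO support installed); values are illustrative.
    import subprocess
    from tempfile import TemporaryDirectory

    from optimum.intel import OVModelForCausalLM

    with TemporaryDirectory() as tmpdir:
        # --all-layers extends 4-bit compression to the embeddings and the last MatMul layer as well.
        subprocess.run(
            "optimum-cli export openvino --model facebook/opt-125m --task text-generation-with-past "
            f"--weight-format int4 --ratio 1.0 --sym --group-size 16 --all-layers {tmpdir}",
            shell=True,
            check=True,
        )
        model = OVModelForCausalLM.from_pretrained(tmpdir)
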