From 2b424b0cbd56cdd1aa8a32ad3f227376d963ab89 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Thu, 16 May 2024 11:11:53 +0200
Subject: [PATCH 1/2] Add --all-layers argument to CLI

---
 optimum/commands/export/openvino.py  | 10 ++++++++++
 tests/openvino/test_exporters_cli.py | 21 +++++++++------------
 tests/openvino/utils_tests.py        |  2 --
 3 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 025a40e057..c5f98442af 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -119,6 +119,14 @@ def parse_args_openvino(parser: "ArgumentParser"):
             "or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models."
         ),
     )
+    optional_group.add_argument(
+        "--all-layers",
+        action="store_true",
+        default=None,
+        help=(
+            "Whether embeddings and last MatMul layers should be compressed to a primary precision (usually, INT4)."
+        ),
+    )
     optional_group.add_argument(
         "--disable-stateful",
         action="store_true",
@@ -198,6 +206,7 @@ def run(self):
             and self.args.ratio is None
             and self.args.group_size is None
             and self.args.sym is None
+            and self.args.all_layers is None
             and self.args.model in _DEFAULT_4BIT_CONFIGS
         ):
             quantization_config = _DEFAULT_4BIT_CONFIGS[self.args.model]
@@ -207,6 +216,7 @@ def run(self):
                 "ratio": 1 if is_int8 else (self.args.ratio or 0.8),
                 "sym": self.args.sym or False,
                 "group_size": -1 if is_int8 else self.args.group_size,
+                "all_layers": None if is_int8 else self.args.all_layers,
             }
 
         if self.args.weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}:
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index cac79abaee..cce25bbae1 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -18,7 +18,6 @@
 from parameterized import parameterized
 from utils_tests import (
-    _ARCHITECTURES_TO_EXPECTED_INT4_INT8,
     _ARCHITECTURES_TO_EXPECTED_INT8,
     MODEL_NAMES,
     get_num_quantized_nodes,
 )
@@ -84,14 +83,13 @@ class OVCLIExportTestCase(unittest.TestCase):
         ("latent-consistency", 50, 135),
     )
 
-    SUPPORTED_4BIT_ARCHITECTURES = (("text-generation-with-past", "opt125m"),)
-
-    SUPPORTED_4BIT_OPTIONS = ["int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"]
-
-    TEST_4BIT_CONFIGURATONS = []
-    for arch in SUPPORTED_4BIT_ARCHITECTURES:
-        for option in SUPPORTED_4BIT_OPTIONS:
-            TEST_4BIT_CONFIGURATONS.append([arch[0], arch[1], option])
+    TEST_4BIT_CONFIGURATONS = [
+        ("text-generation-with-past", "opt125m", "int4_sym_g128", 62, 86),
+        ("text-generation-with-past", "opt125m", "int4_asym_g128", 62, 86),
+        ("text-generation-with-past", "opt125m", "int4_sym_g64", 62, 86),
+        ("text-generation-with-past", "opt125m", "int4_asym_g64", 62, 86),
+        ("text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 16 --all-layers", 0, 32),
+    ]
 
     def _openvino_export(
         self, model_name: str, task: str, compression_option: str = None, compression_ratio: float = None
@@ -197,17 +195,16 @@ def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: in
         self.assertEqual(exp_num_fq, num_fq)
 
     @parameterized.expand(TEST_4BIT_CONFIGURATONS)
-    def test_exporters_cli_int4(self, task: str, model_type: str, option: str):
+    def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expected_int8: int, expected_int4: int):
        with TemporaryDirectory() as tmpdir:
             subprocess.run(
-                f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}",
+                f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}",
                 shell=True,
                 check=True,
             )
             model_kwargs = {"use_cache": task.endswith("with-past")} if "generation" in task else {}
             model = eval(_HEAD_TO_AUTOMODELS[task.replace("-with-past", "")]).from_pretrained(tmpdir, **model_kwargs)
 
-            expected_int8, expected_int4 = _ARCHITECTURES_TO_EXPECTED_INT4_INT8[model_type]
             _, num_int8, num_int4 = get_num_quantized_nodes(model)
             self.assertEqual(expected_int8, num_int8)
             self.assertEqual(expected_int4, num_int4)
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index aa3ea5f33a..441b1224c4 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -145,8 +145,6 @@
     "stable-diffusion-xl-refiner": (366, 34, 42, 66),
 }
 
-_ARCHITECTURES_TO_EXPECTED_INT4_INT8 = {"opt125m": (62, 86)}
-
 
 def get_num_quantized_nodes(ov_model):
     num_fake_quantize = 0
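
For illustration, a minimal sketch of how the new flag feeds into the weight-compression config assembled in `run()` above; `build_quantization_config`, `args`, and `is_int8` are illustrative names rather than code from this patch, and other fields of the real config (e.g. `bits`) are omitted:

    # Simplified sketch of the config construction in run(); names are illustrative, not from the patch.
    def build_quantization_config(args) -> dict:
        is_int8 = args.weight_format == "int8"  # assumption: mirrors how run() distinguishes INT8 from 4-bit export
        return {
            "ratio": 1 if is_int8 else (args.ratio or 0.8),
            "sym": args.sym or False,
            "group_size": -1 if is_int8 else args.group_size,
            # Added by this patch: --all-layers only applies to 4-bit compression, so it stays None for INT8.
            "all_layers": None if is_int8 else args.all_layers,
        }
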
From 6bb1330905bbfaf3b60e711a52d9347b3e5474c3 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Thu, 16 May 2024 13:02:55 +0200
Subject: [PATCH 2/2] Update description

---
 optimum/commands/export/openvino.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index c5f98442af..ffd084d4e6 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -124,7 +124,8 @@ def parse_args_openvino(parser: "ArgumentParser"):
         action="store_true",
         default=None,
         help=(
-            "Whether embeddings and last MatMul layers should be compressed to a primary precision (usually, INT4)."
+            "Whether embeddings and last MatMul layers should be compressed to INT4. If not provided, these layers "
+            "are compressed to INT8 when 4-bit weight compression is applied."
         ),
     )
     optional_group.add_argument(
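
As a usage sketch, the new flag can be exercised end to end the same way the updated test does; the model id, task, and compression options below are illustrative placeholders combining the options covered by the new 4-bit test cases, not part of the patch:

    # Usage sketch (assumes optimum-intel with OpenVINO support installed); values are illustrative.
    import subprocess
    from tempfile import TemporaryDirectory

    from optimum.intel import OVModelForCausalLM

    with TemporaryDirectory() as tmpdir:
        # --all-layers extends 4-bit compression to the embeddings and the last MatMul layer as well.
        subprocess.run(
            "optimum-cli export openvino --model facebook/opt-125m --task text-generation-with-past "
            f"--weight-format int4 --ratio 1.0 --sym --group-size 16 --all-layers {tmpdir}",
            shell=True,
            check=True,
        )
        model = OVModelForCausalLM.from_pretrained(tmpdir)
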