Commit 56878bb

Add quantization with dataset after model export for text-generation models
1 parent 7114900 commit 56878bb
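
In short: `optimum-cli export openvino` gains `--quant-method`, `--sensitivity-metric`, and `--num-samples` options; task inference is factored out of `main_export` into a reusable `infer_task` helper; and when a text-generation model is exported with a calibration `--dataset`, weight quantization is now skipped during export and applied afterwards through `OVQuantizer`, since that step needs a loaded `OVModelForCausalLM` instance.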

File tree

3 files changed · +89 -16 lines changed


optimum/commands/export/openvino.py (+61 -2)
@@ -15,6 +15,7 @@
 
 import logging
 import sys
+import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING, Optional
 
@@ -128,6 +129,29 @@ def parse_args_openvino(parser: "ArgumentParser"):
             "compression is applied, they are compressed to INT8."
         ),
     )
+    optional_group.add_argument(
+        "--quant-method",
+        type=str,
+        default=None,
+        choices=["default", "awq", "hybrid"],
+        help=("The quantization method to apply. Can be one of the following: ['default', 'awq', 'hybrid']."),
+    )
+    optional_group.add_argument(
+        "--sensitivity-metric",
+        type=str,
+        default=None,
+        help=(
+            "The sensitivity metric for assigning quantization precision to layers. Can be one of the following: "
+            "['weight_quantization_error', 'hessian_input_activation', 'mean_activation_variance', "
+            "'max_activation_variance', 'mean_activation_magnitude']."
+        ),
+    )
+    optional_group.add_argument(
+        "--num-samples",
+        type=int,
+        default=None,
+        help=("The maximum number of samples composing the calibration dataset for quantization."),
+    )
     optional_group.add_argument(
         "--disable-stateful",
         action="store_true",
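
For context, these flags feed the `quantization_config` dict assembled in `run()` below. A minimal programmatic sketch of the same configuration, assuming `optimum.intel`'s `OVWeightQuantizationConfig` accepts the string forms of these fields (the model id and all values are illustrative):

```python
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

# Mirrors what the CLI assembles from --quant-method / --sensitivity-metric /
# --num-samples / --dataset; all values here are illustrative.
quantization_config = OVWeightQuantizationConfig(
    bits=4,
    sym=True,
    ratio=1.0,
    group_size=16,
    dataset="wikitext2",                           # --dataset
    num_samples=100,                               # --num-samples
    quant_method="awq",                            # --quant-method
    sensitivity_metric="max_activation_variance",  # --sensitivity-metric
)
model = OVModelForCausalLM.from_pretrained(
    "<model-id>",  # placeholder
    export=True,
    quantization_config=quantization_config,
)
```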
@@ -180,7 +204,7 @@ def parse_args(parser: "ArgumentParser"):
         return parse_args_openvino(parser)
 
     def run(self):
-        from ...exporters.openvino.__main__ import main_export
+        from ...exporters.openvino.__main__ import main_export, infer_task
         from ...intel.openvino.configuration import _DEFAULT_4BIT_CONFIGS, OVConfig
 
         if self.args.fp16:
@@ -208,6 +232,9 @@ def run(self):
             and self.args.group_size is None
             and self.args.sym is None
             and self.args.all_layers is None
+            and self.args.dataset is None
+            and self.args.quant_method is None
+            and self.args.sensitivity_metric is None
             and self.args.model in _DEFAULT_4BIT_CONFIGS
         ):
             quantization_config = _DEFAULT_4BIT_CONFIGS[self.args.model]
@@ -218,6 +245,10 @@ def run(self):
                 "sym": self.args.sym or False,
                 "group_size": -1 if is_int8 else self.args.group_size,
                 "all_layers": None if is_int8 else self.args.all_layers,
+                "dataset": self.args.dataset,
+                "num_samples": self.args.num_samples,
+                "quant_method": self.args.quant_method,
+                "sensitivity_metric": self.args.sensitivity_metric,
             }
 
             if self.args.weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}:
@@ -226,7 +257,6 @@ def run(self):
                 )
                 quantization_config["sym"] = "asym" not in self.args.weight_format
                 quantization_config["group_size"] = 128 if "128" in self.args.weight_format else 64
-            quantization_config["dataset"] = self.args.dataset
             ov_config = OVConfig(quantization_config=quantization_config)
 
         library_name = TasksManager.infer_library_from_model(self.args.model, library_name=self.args.library)
@@ -290,6 +320,19 @@ def run(self):
             if tokenizer_2 is not None:
                 export_tokenizer(tokenizer_2, output / "tokenizer_2")
         else:
+            task = infer_task(self.args.task, self.args.model)
+            quantization_config = ov_config.quantization_config if ov_config else None
+            quantize_after_export = (
+                task.startswith("text-generation")
+                and quantization_config is not None
+                and hasattr(quantization_config, "dataset")
+                and quantization_config.dataset is not None
+            )
+            if quantize_after_export:
+                # In order to quantize a text-generation model with a dataset, an instance of OVModelForCausalLM is
+                # required. That's why the quantization is skipped during export and applied explicitly after export.
+                ov_config.quantization_config = None
+
             # TODO : add input shapes
             main_export(
                 model_name_or_path=self.args.model,
@@ -305,3 +348,19 @@ def run(self):
                 library_name=library_name,
                 # **input_shapes,
             )
+
+            if quantize_after_export:
+                from optimum.intel import OVModelForCausalLM, OVQuantizer
+
+                model = OVModelForCausalLM.from_pretrained(self.args.output)
+                quantizer = OVQuantizer(model)
+                quantization_config.tokenizer = quantization_config.tokenizer or str(self.args.output)
+                # TODO: set save_directory=self.args.output once OV is updated to 2024.3
+                quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config))
+                with tempfile.TemporaryDirectory() as temp_dir:
+                    import shutil
+
+                    model.save_pretrained(temp_dir)
+                    ov_config.save_pretrained(self.args.output)
+                    shutil.copy(f"{temp_dir}/openvino_model.xml", f"{self.args.output}/openvino_model.xml")
+                    shutil.copy(f"{temp_dir}/openvino_model.bin", f"{self.args.output}/openvino_model.bin")
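
Pulled out of the command class, the post-export step amounts to the following standalone sketch. The output directory is a placeholder, and the `OVQuantizer.quantize` call shape is assumed from the diff above:

```python
import shutil
import tempfile

from optimum.intel import OVConfig, OVModelForCausalLM, OVQuantizer, OVWeightQuantizationConfig

output = "exported_model_dir"  # placeholder: directory produced by a prior export

# As in run(): the tokenizer falls back to the export directory when not set explicitly.
quantization_config = OVWeightQuantizationConfig(bits=4, dataset="wikitext2", tokenizer=output)

# Quantization was skipped during export, so load the exported model and apply it now.
model = OVModelForCausalLM.from_pretrained(output)
quantizer = OVQuantizer(model)
quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config))

# Save the quantized IR to a scratch directory, then overwrite the exported weights.
with tempfile.TemporaryDirectory() as temp_dir:
    model.save_pretrained(temp_dir)
    shutil.copy(f"{temp_dir}/openvino_model.xml", f"{output}/openvino_model.xml")
    shutil.copy(f"{temp_dir}/openvino_model.bin", f"{output}/openvino_model.bin")
```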

optimum/exporters/openvino/__main__.py (+17 -13)
@@ -44,6 +44,22 @@
 logger = logging.getLogger(__name__)
 
 
+def infer_task(task, model_name_or_path):
+    task = TasksManager.map_from_synonym(task)
+    if task == "auto":
+        try:
+            task = TasksManager.infer_task_from_model(model_name_or_path)
+        except KeyError as e:
+            raise KeyError(
+                f"The task could not be automatically inferred. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
+            )
+        except RequestsConnectionError as e:
+            raise RequestsConnectionError(
+                f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
+            )
+    return task
+
+
 def main_export(
     model_name_or_path: str,
     output: Union[str, Path],
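
A small usage sketch of the extracted helper (the model id is illustrative): explicit tasks pass through after synonym mapping, while "auto" triggers Hub-based inference.

```python
from optimum.exporters.openvino.__main__ import infer_task

# An explicit task is only mapped through TasksManager.map_from_synonym(...).
task = infer_task("text-generation-with-past", "unused-when-task-is-explicit")

# "auto" is resolved by querying the Hub; e.g. a GPT-2 checkpoint would resolve
# to "text-generation" (raises if the model is not hosted on the Hub).
task = infer_task("auto", "gpt2")
```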
@@ -174,7 +190,7 @@ def main_export(
         ov_config = OVConfig(quantization_config=q_config)
 
     original_task = task
-    task = TasksManager.map_from_synonym(task)
+    task = infer_task(task, model_name_or_path)
     framework = TasksManager.determine_framework(model_name_or_path, subfolder=subfolder, framework=framework)
     library_name_is_not_provided = library_name is None
     library_name = TasksManager.infer_library_from_model(
@@ -188,18 +204,6 @@ def main_export(
         )
         library_name = "transformers"
 
-    if task == "auto":
-        try:
-            task = TasksManager.infer_task_from_model(model_name_or_path)
-        except KeyError as e:
-            raise KeyError(
-                f"The task could not be automatically inferred. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
-            )
-        except RequestsConnectionError as e:
-            raise RequestsConnectionError(
-                f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
-            )
-
     do_gptq_patching = False
     custom_architecture = False
     loading_kwargs = {}

tests/openvino/test_exporters_cli.py (+11 -1)
@@ -89,6 +89,14 @@ class OVCLIExportTestCase(unittest.TestCase):
         ("text-generation-with-past", "opt125m", "int4_sym_g64", 62, 86),
         ("text-generation-with-past", "opt125m", "int4_asym_g64", 62, 86),
         ("text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 16 --all-layers", 0, 32),
+        (
+            "text-generation-with-past",
+            "llama_awq",
+            "int4 --ratio 1.0 --sym --group-size 16 --quant-method awq --dataset wikitext2 --num-samples 100 "
+            "--sensitivity-metric max_activation_variance",
+            4,
+            28,
+        ),
     ]
 
     def _openvino_export(
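
The new case exercises the AWQ path end to end; expanded, the command under test looks like the sketch below. The model id is a placeholder for `MODEL_NAMES["llama_awq"]`, which is elided here, and the final two tuple fields are the expected counts of INT8 and INT4 weight nodes (4 and 28).

```python
import subprocess

model_id = "<llama-test-checkpoint>"  # placeholder for MODEL_NAMES["llama_awq"]
subprocess.run(
    f"optimum-cli export openvino --model {model_id} --task text-generation-with-past "
    "--weight-format int4 --ratio 1.0 --sym --group-size 16 --quant-method awq "
    "--dataset wikitext2 --num-samples 100 --sensitivity-metric max_activation_variance out_dir",
    shell=True,
    check=True,
)
```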
@@ -197,17 +205,19 @@ def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: in
     @parameterized.expand(TEST_4BIT_CONFIGURATONS)
     def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expected_int8: int, expected_int4: int):
         with TemporaryDirectory() as tmpdir:
-            subprocess.run(
+            result = subprocess.run(
                 f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}",
                 shell=True,
                 check=True,
+                capture_output=True,
             )
             model_kwargs = {"use_cache": task.endswith("with-past")} if "generation" in task else {}
             model = eval(_HEAD_TO_AUTOMODELS[task.replace("-with-past", "")]).from_pretrained(tmpdir, **model_kwargs)
 
             _, num_int8, num_int4 = get_num_quantized_nodes(model)
             self.assertEqual(expected_int8, num_int8)
             self.assertEqual(expected_int4, num_int4)
+            self.assertTrue("--quant-method awq" not in option or b"Applying AWQ" in result.stdout)
 
     def test_exporters_cli_help(self):
         subprocess.run(
