Commit 4afbcfd

remove quantization from main_export
committed Apr 4, 2024 · 1 parent e597e13 · commit 4afbcfd

2 files changed: +57 −53 lines changed
 

optimum/commands/export/openvino.py (+18 −3)

@@ -153,7 +153,7 @@ def parse_args(parser: "ArgumentParser"):
         return parse_args_openvino(parser)

     def run(self):
-        from ...exporters.openvino.__main__ import main_export
+        from ...exporters.openvino.__main__ import main_export, get_relevant_task, export_optimized_diffusion_model
         from ...intel.openvino.configuration import _DEFAULT_4BIT_CONFIGS, OVConfig

         if self.args.fp16:
@@ -200,17 +200,32 @@ def run(self):
                 quantization_config["dataset"] = self.args.dataset
             ov_config = OVConfig(quantization_config=quantization_config)

+        library_name = TasksManager.infer_library_from_model(self.args.model)
+        task = get_relevant_task(self.args.task, self.args.model)
+        saved_dir = self.args.output
+
+        if library_name == "diffusers" and ov_config and ov_config.quantization_config.get("dataset"):
+            import tempfile
+            from copy import deepcopy
+            saved_dir = tempfile.mkdtemp()
+            quantization_config = deepcopy(ov_config.quantization_config)
+            ov_config.quantization_config = {}
+
         # TODO : add input shapes
         main_export(
             model_name_or_path=self.args.model,
-            output=self.args.output,
-            task=self.args.task,
+            output=saved_dir,
+            task=task,
             framework=self.args.framework,
             cache_dir=self.args.cache_dir,
             trust_remote_code=self.args.trust_remote_code,
             pad_token_id=self.args.pad_token_id,
             ov_config=ov_config,
             stateful=not self.args.disable_stateful,
             convert_tokenizer=self.args.convert_tokenizer,
+            library_name=library_name,
             # **input_shapes,
         )
+
+        if saved_dir != self.args.output:
+            export_optimized_diffusion_model(saved_dir, self.args.output, task, quantization_config)
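
For context, here is a minimal sketch of the two-stage flow that the reworked run() now performs when a diffusers model is exported with a calibration dataset. The model id, output path, and quantization keys below are illustrative assumptions, not values from this commit: the pipeline is first exported without quantization into a temporary directory, then export_optimized_diffusion_model applies hybrid quantization into the requested output directory.

    # Sketch of the flow run() now implements for diffusers models with a calibration
    # dataset; model id, paths, and quantization keys are illustrative assumptions.
    import tempfile

    from optimum.exporters.openvino.__main__ import (
        export_optimized_diffusion_model,
        get_relevant_task,
        main_export,
    )

    model_id = "runwayml/stable-diffusion-v1-5"  # illustrative model id
    final_output = "sd15_ov_int8"  # illustrative output directory
    quantization_config = {"bits": 8, "dataset": "conceptual_captions"}  # illustrative keys

    task = get_relevant_task("auto", model_id)  # resolves the concrete task, e.g. "stable-diffusion"
    intermediate_dir = tempfile.mkdtemp()

    # Stage 1: plain OpenVINO export; quantization is no longer applied inside main_export.
    main_export(model_name_or_path=model_id, output=intermediate_dir, task=task, library_name="diffusers")

    # Stage 2: hybrid quantization of the exported pipeline into the final output directory.
    export_optimized_diffusion_model(intermediate_dir, final_output, task, quantization_config)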

optimum/exporters/openvino/__main__.py (+39 −50)

@@ -77,7 +77,7 @@ def main_export(
         model_name_or_path (`str`):
             Model ID on huggingface.co or path on disk to the model repository to export.
         output (`Union[str, Path]`):
-            Path indicating the directory where to store the generated ONNX model.
+            Path indicating the directory where to store the generated OpenVINO model.

         > Optional parameters

@@ -161,24 +161,12 @@ def main_export(
         ov_config = OVConfig(quantization_config=q_config)

     original_task = task
-    task = TasksManager.map_from_synonym(task)
+    task = get_relevant_task(task, model_name_or_path)
     framework = TasksManager.determine_framework(model_name_or_path, subfolder=subfolder, framework=framework)
     library_name = TasksManager.infer_library_from_model(
         model_name_or_path, subfolder=subfolder, library_name=library_name
     )

-    if task == "auto":
-        try:
-            task = TasksManager.infer_task_from_model(model_name_or_path)
-        except KeyError as e:
-            raise KeyError(
-                f"The task could not be automatically inferred. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
-            )
-        except RequestsConnectionError as e:
-            raise RequestsConnectionError(
-                f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
-            )
-
     if convert_tokenizer and not is_openvino_tokenizers_available():
         logger.warning(
             "`convert_tokenizer` requires openvino-tokenizers, please install it with `pip install optimum-intel[openvino-tokenizers]`"
@@ -263,42 +251,6 @@ class StoreAttr(object):
         **loading_kwargs,
     )

-    # Apply quantization in hybrid mode to Stable Diffusion before export
-    if (
-        library_name == "diffusers"
-        and ov_config
-        and ov_config.quantization_config
-        and ov_config.quantization_config.get("dataset", None)
-    ):
-        class_name = model.__class__.__name__
-        if "LatentConsistencyModelPipeline" in class_name:
-            from optimum.intel import OVLatentConsistencyModelPipeline
-
-            model_cls = OVLatentConsistencyModelPipeline
-        elif "StableDiffusionXLPipeline" in class_name:
-            from optimum.intel import OVStableDiffusionXLPipeline
-
-            model_cls = OVStableDiffusionXLPipeline
-        elif "StableDiffusionPipeline" in class_name:
-            from optimum.intel import OVStableDiffusionPipeline
-
-            model_cls = OVStableDiffusionPipeline
-        else:
-            raise NotImplementedError(f"{class_name} doesn't support quantization in hybrid mode.")
-
-        model = model_cls.from_pretrained(
-            model_id=model_name_or_path,
-            export=True,
-            quantization_config=ov_config.quantization_config,
-            cache_dir=cache_dir,
-            trust_remote_code=trust_remote_code,
-            revision=revision,
-            force_download=force_download,
-            use_auth_token=use_auth_token,
-        )
-        model.save_pretrained(output)
-        return
-
     needs_pad_token_id = task == "text-classification" and getattr(model.config, "pad_token_id", None) is None

     if needs_pad_token_id:
@@ -391,3 +343,40 @@ class StoreAttr(object):
     if do_gptq_patching:
         torch.cuda.is_available = orig_cuda_check
         GPTQQuantizer.post_init_model = orig_post_init_model
+
+
+def get_relevant_task(task, model_name_or_path):
+    relevant_task = TasksManager.map_from_synonym(task)
+    if relevant_task == "auto":
+        try:
+            relevant_task = TasksManager.infer_task_from_model(model_name_or_path)
+        except KeyError as e:
+            raise KeyError(
+                f"The task could not be automatically inferred. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
+            )
+        except RequestsConnectionError as e:
+            raise RequestsConnectionError(
+                f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
+            )
+    return relevant_task
+
+
+def export_optimized_diffusion_model(model_name_or_path, output, task, quantization_config):
+    task = get_relevant_task(task, model_name_or_path)
+    if task == "latent-consistency":
+        from optimum.intel import OVLatentConsistencyModelPipeline
+
+        model_cls = OVLatentConsistencyModelPipeline
+    elif task == "stable-diffusion-xl":
+        from optimum.intel import OVStableDiffusionXLPipeline
+
+        model_cls = OVStableDiffusionXLPipeline
+    elif task == "stable-diffusion":
+        from optimum.intel import OVStableDiffusionPipeline
+
+        model_cls = OVStableDiffusionPipeline
+    else:
+        raise NotImplementedError(f"Quantization in hybrid mode isn't supported for {task}.")
+
+    model = model_cls.from_pretrained(model_id=model_name_or_path, quantization_config=quantization_config)
+    model.save_pretrained(output)
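
Once export_optimized_diffusion_model has written the quantized pipeline, it can be loaded back with the matching OV pipeline class. This is standard optimum-intel usage rather than part of this commit; the directory and prompt below are illustrative.

    # Load the quantized pipeline written by export_optimized_diffusion_model
    # (directory and prompt are illustrative); standard optimum-intel usage.
    from optimum.intel import OVStableDiffusionPipeline

    pipe = OVStableDiffusionPipeline.from_pretrained("sd15_ov_int8")
    image = pipe("a photo of an astronaut riding a horse").images[0]
    image.save("astronaut.png")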
