
Commit 4651ac2

l-bat and echarlaix authored Apr 18, 2024
Export hybrid StableDiffusion models via optimum-cli (#618)
* Export hybrid StableDiffusion models via optimum-cli
* Add doc and test
* Remove huggingface_hub
* remove quantization from main_export
* remove unused function
* Infer task by loading the diffusers config
* Fix style
* fix tests

---------

Co-authored-by: Ella Charlaix <ella@huggingface.co>
1 parent 228a3e0 commit 4651ac2

File tree

6 files changed (+95 -18 lines)


README.md

+7 -1

@@ -78,12 +78,18 @@ It is possible to export your model to the [OpenVINO IR](https://docs.openvino.a
 optimum-cli export openvino --model gpt2 ov_model
 ```
 
-You can also apply 8-bit weight-only quantization when exporting your model : the model linear and embedding weights will be quantized to INT8, the activations will be kept in floating point precision.
+You can also apply 8-bit weight-only quantization when exporting your model : the model linear, embedding and convolution weights will be quantized to INT8, the activations will be kept in floating point precision.
 
 ```plain
 optimum-cli export openvino --model gpt2 --weight-format int8 ov_model
 ```
 
+Quantization in hybrid mode can be applied to Stable Diffusion pipeline during model export. This involves applying hybrid post-training quantization to the UNet model and weight-only quantization for the rest of the pipeline components. In the hybrid mode, weights in MatMul and Embedding layers are quantized, as well as activations of other layers.
+
+```plain
+optimum-cli export openvino --model stabilityai/stable-diffusion-2-1 --dataset conceptual_captions --weight-format int8 ov_model
+```
+
 To apply quantization on both weights and activations, you can find more information in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov).
 
 #### Inference:
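Editor's note: the same hybrid quantization should also be reachable from the Python API rather than the CLI. The snippet below is a minimal sketch of that path and is not part of this diff; it assumes the `OVStableDiffusionPipeline` and `OVWeightQuantizationConfig` classes exported by `optimum.intel` and a `dataset` field on the quantization config, mirroring what the CLI branch added in this commit does.

```python
# Minimal sketch (not from this commit's diff): hybrid quantization via the Python API.
from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig

# Passing a dataset together with 8-bit weights triggers hybrid mode for diffusion
# pipelines: weight-only INT8 for most components, plus activation quantization in the UNet.
quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions")

pipeline = OVStableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1",
    export=True,
    quantization_config=quantization_config,
)
pipeline.save_pretrained("ov_model")
```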

optimum/commands/export/openvino.py

+65 -15

@@ -19,6 +19,7 @@
 from typing import TYPE_CHECKING, Optional
 
 from ...exporters import TasksManager
+from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available
 from ..base import BaseOptimumCLICommand, CommandInfo
 
 
@@ -104,6 +105,16 @@ def parse_args_openvino(parser: "ArgumentParser"):
         default=None,
         help=("The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization."),
     )
+    optional_group.add_argument(
+        "--dataset",
+        type=str,
+        default=None,
+        help=(
+            "The dataset used for data-aware compression or quantization with NNCF. "
+            "You can use the one from the list ['wikitext2','c4','c4-new','ptb','ptb-new'] for LLLMs "
+            "or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models."
+        ),
+    )
     optional_group.add_argument(
         "--disable-stateful",
         action="store_true",
@@ -195,20 +206,59 @@ def run(self):
             )
             quantization_config["sym"] = "asym" not in self.args.weight_format
             quantization_config["group_size"] = 128 if "128" in self.args.weight_format else 64
+            quantization_config["dataset"] = self.args.dataset
         ov_config = OVConfig(quantization_config=quantization_config)
 
-        # TODO : add input shapes
-        main_export(
-            model_name_or_path=self.args.model,
-            output=self.args.output,
-            task=self.args.task,
-            framework=self.args.framework,
-            cache_dir=self.args.cache_dir,
-            trust_remote_code=self.args.trust_remote_code,
-            pad_token_id=self.args.pad_token_id,
-            ov_config=ov_config,
-            stateful=not self.args.disable_stateful,
-            convert_tokenizer=self.args.convert_tokenizer,
-            library_name=self.args.library
-            # **input_shapes,
-        )
+        library_name = TasksManager.infer_library_from_model(self.args.model)
+
+        if (
+            library_name == "diffusers"
+            and ov_config
+            and ov_config.quantization_config
+            and ov_config.quantization_config.dataset is not None
+        ):
+            if not is_diffusers_available():
+                raise ValueError(DIFFUSERS_IMPORT_ERROR.format("Export of diffusers models"))
+
+            from diffusers import DiffusionPipeline
+
+            diffusers_config = DiffusionPipeline.load_config(self.args.model)
+            class_name = diffusers_config.get("_class_name", None)
+
+            if class_name == "LatentConsistencyModelPipeline":
+                from optimum.intel import OVLatentConsistencyModelPipeline
+
+                model_cls = OVLatentConsistencyModelPipeline
+
+            elif class_name == "StableDiffusionXLPipeline":
+                from optimum.intel import OVStableDiffusionXLPipeline
+
+                model_cls = OVStableDiffusionXLPipeline
+            elif class_name == "StableDiffusionPipeline":
+                from optimum.intel import OVStableDiffusionPipeline
+
+                model_cls = OVStableDiffusionPipeline
+            else:
+                raise NotImplementedError(f"Quantization in hybrid mode isn't supported for class {class_name}.")
+
+            model = model_cls.from_pretrained(
+                self.args.model, export=True, quantization_config=ov_config.quantization_config
+            )
+            model.save_pretrained(self.args.output)
+
+        else:
+            # TODO : add input shapes
+            main_export(
+                model_name_or_path=self.args.model,
+                output=self.args.output,
+                task=self.args.task,
+                framework=self.args.framework,
+                cache_dir=self.args.cache_dir,
+                trust_remote_code=self.args.trust_remote_code,
+                pad_token_id=self.args.pad_token_id,
+                ov_config=ov_config,
+                stateful=not self.args.disable_stateful,
+                convert_tokenizer=self.args.convert_tokenizer,
+                library_name=library_name,
+                # **input_shapes,
+            )
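A note on the class dispatch above: `DiffusionPipeline.load_config` only reads the pipeline's `model_index.json`, so `_class_name` can be inspected without instantiating the model or downloading its weights. A small illustrative sketch (the model ID is just an example, not taken from the diff):

```python
from diffusers import DiffusionPipeline

# Reads model_index.json from the Hub (or a local path) without loading any weights.
config = DiffusionPipeline.load_config("stabilityai/stable-diffusion-2-1")
print(config.get("_class_name"))  # e.g. "StableDiffusionPipeline"
```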

optimum/exporters/openvino/__main__.py

+1 -1

@@ -77,7 +77,7 @@ def main_export(
         model_name_or_path (`str`):
             Model ID on huggingface.co or path on disk to the model repository to export.
         output (`Union[str, Path]`):
-            Path indicating the directory where to store the generated ONNX model.
+            Path indicating the directory where to store the generated OpenVINO model.
 
         > Optional parameters
 

optimum/intel/openvino/modeling_diffusion.py

+1 -1

@@ -387,7 +387,7 @@ def transform_fn(data_item):
                 self.__call__(**inputs, height=height, width=width)
             else:
                 self.__call__(*inputs, height=height, width=width)
-            if len(calibration_data) > num_samples:
+            if len(calibration_data) >= num_samples:
                 break
 
         self.unet.request = self.unet.request.request
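The one-character change above tightens the calibration loop: with `>`, collection continued until strictly more than `num_samples` items had been gathered, while `>=` stops as soon as the target count is reached. A standalone sketch of the same stopping condition (the names here are illustrative, not taken from the file):

```python
# Illustrative only: gather at most `num_samples` items from a data source.
def collect_calibration_data(source, num_samples):
    calibration_data = []
    for item in source:
        calibration_data.append(item)
        if len(calibration_data) >= num_samples:  # stop once the target count is reached
            break
    return calibration_data

assert len(collect_calibration_data(range(1000), num_samples=32)) == 32
```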

optimum/intel/openvino/utils.py

+1

@@ -96,6 +96,7 @@
     "stable-diffusion": "OVStableDiffusionPipeline",
     "stable-diffusion-xl": "OVStableDiffusionXLPipeline",
     "pix2struct": "OVModelForPix2Struct",
+    "latent-consistency": "OVLatentConsistencyModelPipeline",
 }
 
 
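This mapping is consumed by the CLI tests below via `eval(_HEAD_TO_AUTOMODELS[model_type])`. An `eval`-free lookup along the same lines might look like the following sketch (illustrative only, not part of the commit):

```python
# Illustrative only: resolve an exported optimum-intel class from its name
# using attribute lookup on the package instead of eval().
import optimum.intel

_HEAD_TO_AUTOMODELS = {
    "stable-diffusion": "OVStableDiffusionPipeline",
    "latent-consistency": "OVLatentConsistencyModelPipeline",
}

def resolve_class(model_type: str):
    return getattr(optimum.intel, _HEAD_TO_AUTOMODELS[model_type])
```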

tests/openvino/test_exporters_cli.py

+20

@@ -26,6 +26,7 @@
 
 from optimum.exporters.openvino.__main__ import main_export
 from optimum.intel import ( # noqa
+    OVLatentConsistencyModelPipeline,
     OVModelForAudioClassification,
     OVModelForCausalLM,
     OVModelForFeatureExtraction,
@@ -77,6 +78,12 @@ class OVCLIExportTestCase(unittest.TestCase):
         "stable-diffusion-xl": 0, # not supported
     }
 
+    SUPPORTED_SD_HYBRID_ARCHITECTURES = (
+        ("stable-diffusion", 72, 195),
+        ("stable-diffusion-xl", 84, 331),
+        ("latent-consistency", 50, 135),
+    )
+
     SUPPORTED_4BIT_ARCHITECTURES = (("text-generation-with-past", "opt125m"),)
 
     SUPPORTED_4BIT_OPTIONS = ["int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"]
@@ -176,6 +183,19 @@ def test_exporters_cli_int8(self, task: str, model_type: str):
             _, num_int8, _ = get_num_quantized_nodes(model)
             self.assertEqual(expected_int8[i], num_int8)
 
+    @parameterized.expand(SUPPORTED_SD_HYBRID_ARCHITECTURES)
+    def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: int, exp_num_int8: int):
+        with TemporaryDirectory() as tmpdir:
+            subprocess.run(
+                f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --dataset laion/filtered-wit --weight-format int8 {tmpdir}",
+                shell=True,
+                check=True,
+            )
+            model = eval(_HEAD_TO_AUTOMODELS[model_type]).from_pretrained(tmpdir)
+            num_fq, num_int8, _ = get_num_quantized_nodes(model.unet)
+            self.assertEqual(exp_num_int8, num_int8)
+            self.assertEqual(exp_num_fq, num_fq)
+
     @parameterized.expand(TEST_4BIT_CONFIGURATONS)
     def test_exporters_cli_int4(self, task: str, model_type: str, option: str):
         with TemporaryDirectory() as tmpdir: