Commit 1138ff9

Add doc and test

1 parent 89b3487 commit 1138ff9

4 files changed: +47 -18 lines

README.md (+7 -1)
@@ -78,12 +78,18 @@ It is possible to export your model to the [OpenVINO IR](https://docs.openvino.a
 optimum-cli export openvino --model gpt2 ov_model
 ```

-You can also apply 8-bit weight-only quantization when exporting your model: the model linear and embedding weights will be quantized to INT8, while the activations will be kept in floating-point precision.
+You can also apply 8-bit weight-only quantization when exporting your model: the model linear, embedding and convolution weights will be quantized to INT8, while the activations will be kept in floating-point precision.

 ```plain
 optimum-cli export openvino --model gpt2 --weight-format int8 ov_model
 ```

+Quantization in hybrid mode can be applied to a Stable Diffusion pipeline during model export. This applies hybrid post-training quantization to the UNet model and weight-only quantization to the rest of the pipeline components: in hybrid mode, the weights of MatMul and Embedding layers are quantized, as well as the activations of other layers.
+
+```plain
+optimum-cli export openvino --model stabilityai/stable-diffusion-2-1 --dataset conceptual_captions --weight-format int8 ov_model
+```
+
 To apply quantization on both weights and activations, you can find more information in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov).

 #### Inference:
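
As a companion to the CLI flow documented above, here is a minimal sketch of the same hybrid export through the Python API. It is not part of this commit and assumes `OVWeightQuantizationConfig` accepts a `dataset` argument mirroring the CLI's `--dataset` flag; consult the linked documentation for the authoritative interface.

```python
# Sketch only: assumes the quantization_config keyword of from_pretrained
# mirrors the CLI's --weight-format int8 / --dataset options.
from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig

quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions")
pipeline = OVStableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1",
    export=True,  # convert the PyTorch checkpoint to OpenVINO IR on the fly
    quantization_config=quantization_config,  # triggers hybrid quantization of the UNet
)
pipeline.save_pretrained("ov_model")
```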

optimum/exporters/openvino/__main__.py (+18 -17)
@@ -247,27 +247,12 @@ class StoreAttr(object):

     GPTQQuantizer.post_init_model = post_init_model

-    model = TasksManager.get_model_from_task(
-        task,
-        model_name_or_path,
-        subfolder=subfolder,
-        revision=revision,
-        cache_dir=cache_dir,
-        use_auth_token=use_auth_token,
-        local_files_only=local_files_only,
-        force_download=force_download,
-        trust_remote_code=trust_remote_code,
-        framework=framework,
-        device=device,
-        library_name=library_name,
-        **loading_kwargs,
-    )
-
+    # Apply quantization in hybrid mode to Stable Diffusion before export
     if (
         library_name == "diffusers"
         and ov_config
         and ov_config.quantization_config
-        and "dataset" in ov_config.quantization_config
+        and ov_config.quantization_config.get("dataset", None)
     ):
         import huggingface_hub

@@ -301,6 +286,22 @@ class StoreAttr(object):
         model.save_pretrained(output)
         return

+    model = TasksManager.get_model_from_task(
+        task,
+        model_name_or_path,
+        subfolder=subfolder,
+        revision=revision,
+        cache_dir=cache_dir,
+        use_auth_token=use_auth_token,
+        local_files_only=local_files_only,
+        force_download=force_download,
+        trust_remote_code=trust_remote_code,
+        framework=framework,
+        device=device,
+        library_name=library_name,
+        **loading_kwargs,
+    )
+
     needs_pad_token_id = task == "text-classification" and getattr(model.config, "pad_token_id", None) is None

     if needs_pad_token_id:
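
Two things happen in this diff: the `TasksManager.get_model_from_task` call moves below the hybrid-quantization branch, which loads and saves the pipeline itself and returns early, so loading the model up front would be wasted work; and the condition switches from a membership test to `.get("dataset", None)`, which is falsy when the key is present but empty. A toy illustration of the second point, not taken from the commit:

```python
# Toy illustration: why `.get("dataset", None)` replaced `"dataset" in ...`.
quantization_config = {"bits": 8, "dataset": None}

print("dataset" in quantization_config)                # True: the old check would fire
print(bool(quantization_config.get("dataset", None)))  # False: the new check skips hybrid mode
```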

optimum/intel/openvino/utils.py (+1 -0)
@@ -96,6 +96,7 @@
     "stable-diffusion": "OVStableDiffusionPipeline",
     "stable-diffusion-xl": "OVStableDiffusionXLPipeline",
     "pix2struct": "OVModelForPix2Struct",
+    "latent-consistency": "OVLatentConsistencyModelPipeline",
 }
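
The new `_HEAD_TO_AUTOMODELS` entry is consumed wherever a task name must be resolved to a pipeline class; the test below does this with `eval`. A hypothetical equivalent using `getattr`, assuming the class is re-exported from `optimum.intel` like its siblings:

```python
# Hypothetical lookup sketch; assumes optimum.intel re-exports the class.
import optimum.intel
from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS

cls_name = _HEAD_TO_AUTOMODELS["latent-consistency"]  # "OVLatentConsistencyModelPipeline"
pipeline_cls = getattr(optimum.intel, cls_name)       # avoids eval on the class name
pipeline = pipeline_cls.from_pretrained("ov_model")   # load a previously exported pipeline
```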

tests/openvino/test_exporters_cli.py (+21 -0)
@@ -37,6 +37,7 @@
     OVModelForTokenClassification,
     OVStableDiffusionPipeline,
     OVStableDiffusionXLPipeline,
+    OVLatentConsistencyModelPipeline,
 )
 from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS
 from optimum.intel.utils.import_utils import is_openvino_tokenizers_available

@@ -77,6 +78,12 @@ class OVCLIExportTestCase(unittest.TestCase):
         "stable-diffusion-xl": 0,  # not supported
     }

+    SUPPORTED_SD_HYBRID_ARCHITECTURES = (
+        ("stable-diffusion", 72, 195),
+        ("stable-diffusion-xl", 84, 331),
+        ("latent-consistency", 50, 135),
+    )
+
     SUPPORTED_4BIT_ARCHITECTURES = (("text-generation-with-past", "opt125m"),)

     SUPPORTED_4BIT_OPTIONS = ["int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"]

@@ -176,6 +183,20 @@ def test_exporters_cli_int8(self, task: str, model_type: str):
                 _, num_int8, _ = get_num_quantized_nodes(model)
                 self.assertEqual(expected_int8[i], num_int8)

+    @parameterized.expand(SUPPORTED_SD_HYBRID_ARCHITECTURES)
+    def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: int, exp_num_int8: int):
+        with TemporaryDirectory() as tmpdir:
+            subprocess.run(
+                f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} "
+                f"--task {model_type} --dataset laion/filtered-wit --weight-format int8 {tmpdir}",
+                shell=True,
+                check=True,
+            )
+            model = eval(_HEAD_TO_AUTOMODELS[model_type]).from_pretrained(tmpdir)
+            num_fq, num_int8, _ = get_num_quantized_nodes(model.unet)
+            self.assertEqual(exp_num_int8, num_int8)
+            self.assertEqual(exp_num_fq, num_fq)
+
     @parameterized.expand(TEST_4BIT_CONFIGURATONS)
     def test_exporters_cli_int4(self, task: str, model_type: str, option: str):
         with TemporaryDirectory() as tmpdir:
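
To exercise only the new parameterized test locally, something like the following should work, assuming pytest is the runner as elsewhere in this test suite:

```plain
pytest tests/openvino/test_exporters_cli.py -k hybrid_quantization
```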
