diff --git a/README.md b/README.md
index 78ca130145..41537d8971 100644
--- a/README.md
+++ b/README.md
@@ -78,12 +78,18 @@ It is possible to export your model to the [OpenVINO IR](https://docs.openvino.a
 optimum-cli export openvino --model gpt2 ov_model
 ```
 
-You can also apply 8-bit weight-only quantization when exporting your model : the model linear and embedding weights will be quantized to INT8, the activations will be kept in floating point precision.
+You can also apply 8-bit weight-only quantization when exporting your model : the model linear, embedding and convolution weights will be quantized to INT8, the activations will be kept in floating point precision.
 
 ```plain
 optimum-cli export openvino --model gpt2 --weight-format int8 ov_model
 ```
 
+Quantization in hybrid mode can be applied to Stable Diffusion pipeline during model export. This involves applying hybrid post-training quantization to the UNet model and weight-only quantization for the rest of the pipeline components. In the hybrid mode, weights in MatMul and Embedding layers are quantized, as well as activations of other layers.
+
+```plain
+optimum-cli export openvino --model stabilityai/stable-diffusion-2-1 --dataset conceptual_captions --weight-format int8 ov_model
+```
+
 To apply quantization on both weights and activations, you can find more information in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov).
 
 #### Inference:
diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 6c17a333ef..40901fbf90 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -19,6 +19,7 @@
 from typing import TYPE_CHECKING, Optional
 
 from ...exporters import TasksManager
+from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available
 from ..base import BaseOptimumCLICommand, CommandInfo
 
 
@@ -104,6 +105,16 @@ def parse_args_openvino(parser: "ArgumentParser"):
         default=None,
         help=("The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization."),
     )
+    optional_group.add_argument(
+        "--dataset",
+        type=str,
+        default=None,
+        help=(
+            "The dataset used for data-aware compression or quantization with NNCF. "
+            "You can use the one from the list ['wikitext2','c4','c4-new','ptb','ptb-new'] for LLLMs "
+            "or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models."
+        ),
+    )
     optional_group.add_argument(
         "--disable-stateful",
         action="store_true",
@@ -195,20 +206,59 @@ def run(self):
             )
             quantization_config["sym"] = "asym" not in self.args.weight_format
             quantization_config["group_size"] = 128 if "128" in self.args.weight_format else 64
+            quantization_config["dataset"] = self.args.dataset
             ov_config = OVConfig(quantization_config=quantization_config)
 
-        # TODO : add input shapes
-        main_export(
-            model_name_or_path=self.args.model,
-            output=self.args.output,
-            task=self.args.task,
-            framework=self.args.framework,
-            cache_dir=self.args.cache_dir,
-            trust_remote_code=self.args.trust_remote_code,
-            pad_token_id=self.args.pad_token_id,
-            ov_config=ov_config,
-            stateful=not self.args.disable_stateful,
-            convert_tokenizer=self.args.convert_tokenizer,
-            library_name=self.args.library
-            # **input_shapes,
-        )
+        library_name = TasksManager.infer_library_from_model(self.args.model)
+
+        if (
+            library_name == "diffusers"
+            and ov_config
+            and ov_config.quantization_config
+            and ov_config.quantization_config.dataset is not None
+        ):
+            if not is_diffusers_available():
+                raise ValueError(DIFFUSERS_IMPORT_ERROR.format("Export of diffusers models"))
+
+            from diffusers import DiffusionPipeline
+
+            diffusers_config = DiffusionPipeline.load_config(self.args.model)
+            class_name = diffusers_config.get("_class_name", None)
+
+            if class_name == "LatentConsistencyModelPipeline":
+                from optimum.intel import OVLatentConsistencyModelPipeline
+
+                model_cls = OVLatentConsistencyModelPipeline
+
+            elif class_name == "StableDiffusionXLPipeline":
+                from optimum.intel import OVStableDiffusionXLPipeline
+
+                model_cls = OVStableDiffusionXLPipeline
+            elif class_name == "StableDiffusionPipeline":
+                from optimum.intel import OVStableDiffusionPipeline
+
+                model_cls = OVStableDiffusionPipeline
+            else:
+                raise NotImplementedError(f"Quantization in hybrid mode isn't supported for class {class_name}.")
+
+            model = model_cls.from_pretrained(
+                self.args.model, export=True, quantization_config=ov_config.quantization_config
+            )
+            model.save_pretrained(self.args.output)
+
+        else:
+            # TODO : add input shapes
+            main_export(
+                model_name_or_path=self.args.model,
+                output=self.args.output,
+                task=self.args.task,
+                framework=self.args.framework,
+                cache_dir=self.args.cache_dir,
+                trust_remote_code=self.args.trust_remote_code,
+                pad_token_id=self.args.pad_token_id,
+                ov_config=ov_config,
+                stateful=not self.args.disable_stateful,
+                convert_tokenizer=self.args.convert_tokenizer,
+                library_name=library_name,
+                # **input_shapes,
+            )
diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py
index 8b8cc09fc1..5f74c1de8b 100644
--- a/optimum/exporters/openvino/__main__.py
+++ b/optimum/exporters/openvino/__main__.py
@@ -77,7 +77,7 @@ def main_export(
         model_name_or_path (`str`):
             Model ID on huggingface.co or path on disk to the model repository to export.
         output (`Union[str, Path]`):
-            Path indicating the directory where to store the generated ONNX model.
+            Path indicating the directory where to store the generated OpenVINO model.
 
         > Optional parameters
 
diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py
index 1e562749b2..7bc7cca04c 100644
--- a/optimum/intel/openvino/modeling_diffusion.py
+++ b/optimum/intel/openvino/modeling_diffusion.py
@@ -387,7 +387,7 @@ def transform_fn(data_item):
                 self.__call__(**inputs, height=height, width=width)
             else:
                 self.__call__(*inputs, height=height, width=width)
-            if len(calibration_data) > num_samples:
+            if len(calibration_data) >= num_samples:
                 break
 
         self.unet.request = self.unet.request.request
diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py
index a0439d2129..4d1479f733 100644
--- a/optimum/intel/openvino/utils.py
+++ b/optimum/intel/openvino/utils.py
@@ -96,6 +96,7 @@
     "stable-diffusion": "OVStableDiffusionPipeline",
     "stable-diffusion-xl": "OVStableDiffusionXLPipeline",
     "pix2struct": "OVModelForPix2Struct",
+    "latent-consistency": "OVLatentConsistencyModelPipeline",
 }
 
 
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index 46c6e3c69a..7d618c530e 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -26,6 +26,7 @@
 
 from optimum.exporters.openvino.__main__ import main_export
 from optimum.intel import (  # noqa
+    OVLatentConsistencyModelPipeline,
     OVModelForAudioClassification,
     OVModelForCausalLM,
     OVModelForFeatureExtraction,
@@ -77,6 +78,12 @@ class OVCLIExportTestCase(unittest.TestCase):
         "stable-diffusion-xl": 0,  # not supported
     }
 
+    SUPPORTED_SD_HYBRID_ARCHITECTURES = (
+        ("stable-diffusion", 72, 195),
+        ("stable-diffusion-xl", 84, 331),
+        ("latent-consistency", 50, 135),
+    )
+
     SUPPORTED_4BIT_ARCHITECTURES = (("text-generation-with-past", "opt125m"),)
 
     SUPPORTED_4BIT_OPTIONS = ["int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"]
@@ -176,6 +183,19 @@ def test_exporters_cli_int8(self, task: str, model_type: str):
             _, num_int8, _ = get_num_quantized_nodes(model)
             self.assertEqual(expected_int8[i], num_int8)
 
+    @parameterized.expand(SUPPORTED_SD_HYBRID_ARCHITECTURES)
+    def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: int, exp_num_int8: int):
+        with TemporaryDirectory() as tmpdir:
+            subprocess.run(
+                f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --dataset laion/filtered-wit --weight-format int8 {tmpdir}",
+                shell=True,
+                check=True,
+            )
+            model = eval(_HEAD_TO_AUTOMODELS[model_type]).from_pretrained(tmpdir)
+            num_fq, num_int8, _ = get_num_quantized_nodes(model.unet)
+            self.assertEqual(exp_num_int8, num_int8)
+            self.assertEqual(exp_num_fq, num_fq)
+
     @parameterized.expand(TEST_4BIT_CONFIGURATONS)
     def test_exporters_cli_int4(self, task: str, model_type: str, option: str):
         with TemporaryDirectory() as tmpdir:
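The CLI branch added above routes diffusers models through the OpenVINO pipeline classes whenever `--dataset` is supplied together with an 8-bit weight format. For illustration only, the sketch below shows a roughly equivalent way to trigger the same hybrid quantization from the Python API; it assumes that `OVWeightQuantizationConfig` (with a `dataset` argument) is exposed by the installed optimum-intel version, and the configuration object the CLI builds internally may differ in detail.

```python
# Hedged sketch: hybrid quantization of a Stable Diffusion pipeline via the
# Python API, mirroring what the new CLI branch does for diffusers models.
# Assumes OVWeightQuantizationConfig is available in the installed version.
from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig

# Passing a dataset enables hybrid mode: the UNet is calibrated on dataset
# samples, while the remaining pipeline components get weight-only INT8.
quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions")

pipeline = OVStableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1",
    export=True,
    quantization_config=quantization_config,
)
pipeline.save_pretrained("ov_model")
```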