Export hybrid StableDiffusion models via optimum-cli #618

Merged · 8 commits · Apr 18, 2024
8 changes: 7 additions & 1 deletion README.md
@@ -78,12 +78,18 @@ It is possible to export your model to the [OpenVINO IR](https://docs.openvino.a
optimum-cli export openvino --model gpt2 ov_model
```

You can also apply 8-bit weight-only quantization when exporting your model: the model's linear and embedding weights will be quantized to INT8, while activations are kept in floating-point precision.
You can also apply 8-bit weight-only quantization when exporting your model: the model's linear, embedding and convolution weights will be quantized to INT8, while activations are kept in floating-point precision.

```plain
optimum-cli export openvino --model gpt2 --weight-format int8 ov_model
```
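
The same 8-bit weight-only compression can also be applied through the Python API. A minimal sketch, assuming the `load_in_8bit` argument of `OVModelForCausalLM.from_pretrained` is available in the installed optimum-intel version:

```python
from optimum.intel import OVModelForCausalLM

# Export gpt2 to OpenVINO IR and compress linear and embedding weights to INT8,
# keeping activations in floating point (mirrors the CLI command above).
model = OVModelForCausalLM.from_pretrained("gpt2", export=True, load_in_8bit=True)
model.save_pretrained("ov_model")
```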

Quantization in hybrid mode can be applied to the Stable Diffusion pipeline during model export. This applies hybrid post-training quantization to the UNet and weight-only quantization to the rest of the pipeline components. In hybrid mode, the weights of MatMul and Embedding layers are quantized, as well as the activations of other layers.

```plain
optimum-cli export openvino --model stabilityai/stable-diffusion-2-1 --dataset conceptual_captions --weight-format int8 ov_model
```
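
Hybrid quantization can also be triggered from Python. A minimal sketch, assuming the `OVStableDiffusionPipeline` and `OVWeightQuantizationConfig` classes this export path relies on (passing a `dataset` is what enables hybrid mode):

```python
from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig

# Providing a calibration dataset switches the export to hybrid mode:
# weight-only INT8 for most components plus activation quantization of the UNet.
quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions")
pipeline = OVStableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1", export=True, quantization_config=quantization_config
)
pipeline.save_pretrained("ov_model")
```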

More information on applying quantization to both weights and activations is available in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov).

#### Inference:
80 changes: 65 additions & 15 deletions optimum/commands/export/openvino.py
@@ -19,6 +19,7 @@
from typing import TYPE_CHECKING, Optional

from ...exporters import TasksManager
from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available
from ..base import BaseOptimumCLICommand, CommandInfo


@@ -104,6 +105,16 @@ def parse_args_openvino(parser: "ArgumentParser"):
default=None,
help=("The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization."),
)
optional_group.add_argument(
"--dataset",
type=str,
default=None,
help=(
"The dataset used for data-aware compression or quantization with NNCF. "
"You can use the one from the list ['wikitext2','c4','c4-new','ptb','ptb-new'] for LLLMs "
"or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models."
),
)
optional_group.add_argument(
"--disable-stateful",
action="store_true",
@@ -195,20 +206,59 @@ def run(self):
)
quantization_config["sym"] = "asym" not in self.args.weight_format
quantization_config["group_size"] = 128 if "128" in self.args.weight_format else 64
quantization_config["dataset"] = self.args.dataset
ov_config = OVConfig(quantization_config=quantization_config)

# TODO : add input shapes
main_export(
model_name_or_path=self.args.model,
output=self.args.output,
task=self.args.task,
framework=self.args.framework,
cache_dir=self.args.cache_dir,
trust_remote_code=self.args.trust_remote_code,
pad_token_id=self.args.pad_token_id,
ov_config=ov_config,
stateful=not self.args.disable_stateful,
convert_tokenizer=self.args.convert_tokenizer,
library_name=self.args.library
# **input_shapes,
)
library_name = TasksManager.infer_library_from_model(self.args.model)

if (
library_name == "diffusers"
and ov_config
and ov_config.quantization_config
and ov_config.quantization_config.dataset is not None
):
if not is_diffusers_available():
raise ValueError(DIFFUSERS_IMPORT_ERROR.format("Export of diffusers models"))

from diffusers import DiffusionPipeline

diffusers_config = DiffusionPipeline.load_config(self.args.model)
class_name = diffusers_config.get("_class_name", None)

if class_name == "LatentConsistencyModelPipeline":
from optimum.intel import OVLatentConsistencyModelPipeline

model_cls = OVLatentConsistencyModelPipeline

elif class_name == "StableDiffusionXLPipeline":
from optimum.intel import OVStableDiffusionXLPipeline

model_cls = OVStableDiffusionXLPipeline
elif class_name == "StableDiffusionPipeline":
from optimum.intel import OVStableDiffusionPipeline

model_cls = OVStableDiffusionPipeline
else:
raise NotImplementedError(f"Quantization in hybrid mode isn't supported for class {class_name}.")

model = model_cls.from_pretrained(
self.args.model, export=True, quantization_config=ov_config.quantization_config
)
model.save_pretrained(self.args.output)

else:
# TODO : add input shapes
main_export(
model_name_or_path=self.args.model,
output=self.args.output,
task=self.args.task,
framework=self.args.framework,
cache_dir=self.args.cache_dir,
trust_remote_code=self.args.trust_remote_code,
pad_token_id=self.args.pad_token_id,
ov_config=ov_config,
stateful=not self.args.disable_stateful,
convert_tokenizer=self.args.convert_tokenizer,
library_name=library_name,
# **input_shapes,
)
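
For reference, a minimal sketch of how a pipeline exported through this code path would then be loaded for inference, assuming standard optimum-intel usage (not shown in this diff):

```python
from optimum.intel import OVStableDiffusionPipeline

# Load the OpenVINO pipeline previously exported to the `ov_model` directory
# and generate an image with it.
pipeline = OVStableDiffusionPipeline.from_pretrained("ov_model")
image = pipeline("sailing ship in storm by Rembrandt").images[0]
image.save("result.png")
```
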
2 changes: 1 addition & 1 deletion optimum/exporters/openvino/__main__.py
@@ -77,7 +77,7 @@ def main_export(
model_name_or_path (`str`):
Model ID on huggingface.co or path on disk to the model repository to export.
output (`Union[str, Path]`):
Path indicating the directory where to store the generated ONNX model.
Path indicating the directory where to store the generated OpenVINO model.

> Optional parameters

2 changes: 1 addition & 1 deletion optimum/intel/openvino/modeling_diffusion.py
@@ -387,7 +387,7 @@ def transform_fn(data_item):
self.__call__(**inputs, height=height, width=width)
else:
self.__call__(*inputs, height=height, width=width)
if len(calibration_data) > num_samples:
if len(calibration_data) >= num_samples:
break

self.unet.request = self.unet.request.request
1 change: 1 addition & 0 deletions optimum/intel/openvino/utils.py
@@ -96,6 +96,7 @@
"stable-diffusion": "OVStableDiffusionPipeline",
"stable-diffusion-xl": "OVStableDiffusionXLPipeline",
"pix2struct": "OVModelForPix2Struct",
"latent-consistency": "OVLatentConsistencyModelPipeline",
}


20 changes: 20 additions & 0 deletions tests/openvino/test_exporters_cli.py
@@ -26,6 +26,7 @@

from optimum.exporters.openvino.__main__ import main_export
from optimum.intel import ( # noqa
OVLatentConsistencyModelPipeline,
OVModelForAudioClassification,
OVModelForCausalLM,
OVModelForFeatureExtraction,
@@ -77,6 +78,12 @@ class OVCLIExportTestCase(unittest.TestCase):
"stable-diffusion-xl": 0, # not supported
}

SUPPORTED_SD_HYBRID_ARCHITECTURES = (
("stable-diffusion", 72, 195),
("stable-diffusion-xl", 84, 331),
("latent-consistency", 50, 135),
)

SUPPORTED_4BIT_ARCHITECTURES = (("text-generation-with-past", "opt125m"),)

SUPPORTED_4BIT_OPTIONS = ["int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"]
@@ -176,6 +183,19 @@ def test_exporters_cli_int8(self, task: str, model_type: str):
_, num_int8, _ = get_num_quantized_nodes(model)
self.assertEqual(expected_int8[i], num_int8)

@parameterized.expand(SUPPORTED_SD_HYBRID_ARCHITECTURES)
def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: int, exp_num_int8: int):
with TemporaryDirectory() as tmpdir:
subprocess.run(
f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --dataset laion/filtered-wit --weight-format int8 {tmpdir}",
shell=True,
check=True,
)
model = eval(_HEAD_TO_AUTOMODELS[model_type]).from_pretrained(tmpdir)
num_fq, num_int8, _ = get_num_quantized_nodes(model.unet)
self.assertEqual(exp_num_int8, num_int8)
self.assertEqual(exp_num_fq, num_fq)

@parameterized.expand(TEST_4BIT_CONFIGURATONS)
def test_exporters_cli_int4(self, task: str, model_type: str, option: str):
with TemporaryDirectory() as tmpdir: