diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 93528e0085..70d2e4885c 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -318,6 +318,10 @@ def run(self): from optimum.intel import OVStableDiffusionPipeline model_cls = OVStableDiffusionPipeline + elif class_name == "StableDiffusion3Pipeline": + from optimum.intel import OVStableDiffusion3Pipeline + + model_cls = OVStableDiffusion3Pipeline else: raise NotImplementedError(f"Quantization in hybrid mode isn't supported for class {class_name}.") diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 69cfec1d96..1cf2ecabe4 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -491,7 +491,7 @@ def maybe_convert_tokenizers(library_name: str, output: Path, model=None, prepro f"models won't be generated. Exception: {exception}" ) elif model: - for tokenizer_name in ("tokenizer", "tokenizer_2"): + for tokenizer_name in ("tokenizer", "tokenizer_2", "tokenizer_3"): tokenizer = getattr(model, tokenizer_name, None) if tokenizer: export_tokenizer(tokenizer, output / tokenizer_name, task=task) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 4e6503b5bd..853d0232df 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import functools import gc import logging @@ -31,7 +32,12 @@ from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed from optimum.exporters.onnx.convert import export_pytorch as export_pytorch_to_onnx from optimum.exporters.onnx.convert import export_tensorflow as export_tensorflow_onnx -from optimum.exporters.utils import _get_submodels_and_export_configs as _default_get_submodels_and_export_configs +from optimum.exporters.utils import ( + _get_submodels_and_export_configs as _default_get_submodels_and_export_configs, +) +from optimum.exporters.utils import ( + get_diffusion_models_for_export, +) from optimum.intel.utils.import_utils import ( _nncf_version, _open_clip_version, @@ -618,23 +624,27 @@ def export_from_model( model, library_name, task, preprocessors, custom_export_configs, fn_get_submodels ) - logging.disable(logging.INFO) - export_config, models_and_export_configs, stateful_submodels = _get_submodels_and_export_configs( - model=model, - task=task, - monolith=False, - custom_export_configs=custom_export_configs if custom_export_configs is not None else {}, - custom_architecture=custom_architecture, - fn_get_submodels=fn_get_submodels, - preprocessors=preprocessors, - library_name=library_name, - model_kwargs=model_kwargs, - _variant="default", - legacy=False, - exporter="openvino", - stateful=stateful, - ) - logging.disable(logging.NOTSET) + if library_name == "diffusers": + export_config, models_and_export_configs = get_diffusion_models_for_export_ext(model, exporter="openvino") + stateful_submodels = False + else: + logging.disable(logging.INFO) + export_config, models_and_export_configs, stateful_submodels = _get_submodels_and_export_configs( + model=model, + task=task, + monolith=False, + custom_export_configs=custom_export_configs if custom_export_configs is not None else {}, + custom_architecture=custom_architecture, + fn_get_submodels=fn_get_submodels, + preprocessors=preprocessors, + library_name=library_name, + 
model_kwargs=model_kwargs, + _variant="default", + legacy=False, + exporter="openvino", + stateful=stateful, + ) + logging.disable(logging.NOTSET) if library_name == "open_clip": if hasattr(model.config, "save_pretrained"): @@ -700,6 +710,10 @@ def export_from_model( if tokenizer_2 is not None: tokenizer_2.save_pretrained(output.joinpath("tokenizer_2")) + tokenizer_3 = getattr(model, "tokenizer_3", None) + if tokenizer_3 is not None: + tokenizer_3.save_pretrained(output.joinpath("tokenizer_3")) + model.save_config(output) export_models( @@ -888,3 +902,218 @@ def _get_submodels_and_export_configs( ) stateful_per_model = [stateful] * len(models_for_export) return export_config, models_for_export, stateful_per_model + + +def get_diffusion_models_for_export_ext( + pipeline: "DiffusionPipeline", int_dtype: str = "int64", float_dtype: str = "fp32", exporter: str = "openvino" +): + try: + from diffusers import ( + StableDiffusion3Img2ImgPipeline, + StableDiffusion3InpaintPipeline, + StableDiffusion3Pipeline, + ) + + is_sd3 = isinstance( + pipeline, (StableDiffusion3Pipeline, StableDiffusion3InpaintPipeline, StableDiffusion3Img2ImgPipeline) + ) + except ImportError: + is_sd3 = False + + try: + from diffusers import FluxPipeline + + is_flux = isinstance(pipeline, FluxPipeline) + except ImportError: + is_flux = False + + if not is_sd3 and not is_flux: + return None, get_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter) + if is_sd3: + models_for_export = get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype) + else: + models_for_export = get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype) + + return None, models_for_export + + +def get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype): + models_for_export = {} + + # Text encoder + text_encoder = getattr(pipeline, "text_encoder", None) + if text_encoder is not None: + text_encoder.config.output_hidden_states = True + text_encoder.text_model.config.output_hidden_states = True + text_encoder_config_constructor = TasksManager.get_exporter_config_constructor( + model=text_encoder, + exporter=exporter, + library_name="diffusers", + task="feature-extraction", + model_type="clip-text-with-projection", + ) + text_encoder_export_config = text_encoder_config_constructor( + pipeline.text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["text_encoder"] = (text_encoder, text_encoder_export_config) + + transformer = pipeline.transformer + transformer.config.text_encoder_projection_dim = transformer.config.joint_attention_dim + transformer.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False) + transformer.config.time_cond_proj_dim = None + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=transformer, + exporter=exporter, + library_name="diffusers", + task="semantic-segmentation", + model_type="sd3-transformer", + ) + transformer_export_config = export_config_constructor( + pipeline.transformer.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["transformer"] = (transformer, transformer_export_config) + + # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565 + vae_encoder = copy.deepcopy(pipeline.vae) + vae_encoder.forward = lambda sample: {"latent_parameters": vae_encoder.encode(x=sample)["latent_dist"].parameters} + vae_config_constructor = TasksManager.get_exporter_config_constructor( + model=vae_encoder, + 
exporter=exporter, + library_name="diffusers", + task="semantic-segmentation", + model_type="vae-encoder", + ) + vae_encoder_export_config = vae_config_constructor( + vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["vae_encoder"] = (vae_encoder, vae_encoder_export_config) + + # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600 + vae_decoder = copy.deepcopy(pipeline.vae) + vae_decoder.forward = lambda latent_sample: vae_decoder.decode(z=latent_sample) + vae_config_constructor = TasksManager.get_exporter_config_constructor( + model=vae_decoder, + exporter=exporter, + library_name="diffusers", + task="semantic-segmentation", + model_type="vae-decoder", + ) + vae_decoder_export_config = vae_config_constructor( + vae_decoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["vae_decoder"] = (vae_decoder, vae_decoder_export_config) + + text_encoder_2 = getattr(pipeline, "text_encoder_2", None) + if text_encoder_2 is not None: + text_encoder_2.config.output_hidden_states = True + text_encoder_2.text_model.config.output_hidden_states = True + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=text_encoder_2, + exporter=exporter, + library_name="diffusers", + task="feature-extraction", + model_type="clip-text-with-projection", + ) + export_config = export_config_constructor(text_encoder_2.config, int_dtype=int_dtype, float_dtype=float_dtype) + models_for_export["text_encoder_2"] = (text_encoder_2, export_config) + + text_encoder_3 = getattr(pipeline, "text_encoder_3", None) + if text_encoder_3 is not None: + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=text_encoder_3, + exporter=exporter, + library_name="diffusers", + task="feature-extraction", + model_type="t5-encoder-model", + ) + export_config = export_config_constructor( + text_encoder_3.config, + int_dtype=int_dtype, + float_dtype=float_dtype, + ) + models_for_export["text_encoder_3"] = (text_encoder_3, export_config) + + return models_for_export + + +def get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype): + models_for_export = {} + + # Text encoder + text_encoder = getattr(pipeline, "text_encoder", None) + if text_encoder is not None: + text_encoder_config_constructor = TasksManager.get_exporter_config_constructor( + model=text_encoder, + exporter=exporter, + library_name="diffusers", + task="feature-extraction", + model_type="clip-text-model", + ) + text_encoder_export_config = text_encoder_config_constructor( + pipeline.text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["text_encoder"] = (text_encoder, text_encoder_export_config) + + transformer = pipeline.transformer + transformer.config.text_encoder_projection_dim = transformer.config.joint_attention_dim + transformer.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False) + transformer.config.time_cond_proj_dim = None + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=transformer, + exporter=exporter, + library_name="diffusers", + task="semantic-segmentation", + model_type="flux-transformer", + ) + transformer_export_config = export_config_constructor( + pipeline.transformer.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["transformer"] = (transformer, transformer_export_config) + + # VAE Encoder 
https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565
+    vae_encoder = copy.deepcopy(pipeline.vae)
+    vae_encoder.forward = lambda sample: {"latent_parameters": vae_encoder.encode(x=sample)["latent_dist"].parameters}
+    vae_config_constructor = TasksManager.get_exporter_config_constructor(
+        model=vae_encoder,
+        exporter=exporter,
+        library_name="diffusers",
+        task="semantic-segmentation",
+        model_type="vae-encoder",
+    )
+    vae_encoder_export_config = vae_config_constructor(
+        vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype
+    )
+    models_for_export["vae_encoder"] = (vae_encoder, vae_encoder_export_config)
+
+    # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600
+    vae_decoder = copy.deepcopy(pipeline.vae)
+    vae_decoder.forward = lambda latent_sample: vae_decoder.decode(z=latent_sample)
+    vae_config_constructor = TasksManager.get_exporter_config_constructor(
+        model=vae_decoder,
+        exporter=exporter,
+        library_name="diffusers",
+        task="semantic-segmentation",
+        model_type="vae-decoder",
+    )
+    vae_decoder_export_config = vae_config_constructor(
+        vae_decoder.config, int_dtype=int_dtype, float_dtype=float_dtype
+    )
+    models_for_export["vae_decoder"] = (vae_decoder, vae_decoder_export_config)
+
+    text_encoder_2 = getattr(pipeline, "text_encoder_2", None)
+    if text_encoder_2 is not None:
+        export_config_constructor = TasksManager.get_exporter_config_constructor(
+            model=text_encoder_2,
+            exporter=exporter,
+            library_name="diffusers",
+            task="feature-extraction",
+            model_type="t5-encoder-model",
+        )
+        export_config = export_config_constructor(
+            text_encoder_2.config,
+            int_dtype=int_dtype,
+            float_dtype=float_dtype,
+        )
+        models_for_export["text_encoder_2"] = (text_encoder_2, export_config)
+
+    return models_for_export
diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 33190e6f1c..ace5c150df 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -35,22 +35,26 @@
     MistralOnnxConfig,
     MPTOnnxConfig,
     PhiOnnxConfig,
+    UNetOnnxConfig,
     VisionOnnxConfig,
 )
 from optimum.exporters.onnx.model_patcher import ModelPatcher
 from optimum.exporters.tasks import TasksManager
 from optimum.utils import DEFAULT_DUMMY_SHAPES
 from optimum.utils.input_generators import (
+    DTYPE_MAPPER,
     DummyInputGenerator,
     DummyPastKeyValuesGenerator,
+    DummySeq2SeqDecoderTextInputGenerator,
     DummyTextInputGenerator,
+    DummyTimestepInputGenerator,
     DummyVisionInputGenerator,
     FalconDummyPastKeyValuesGenerator,
     MistralDummyPastKeyValuesGenerator,
 )
-from optimum.utils.normalized_config import NormalizedTextConfig, NormalizedVisionConfig
+from optimum.utils.normalized_config import NormalizedConfig, NormalizedTextConfig, NormalizedVisionConfig
 
-from ...intel.utils.import_utils import _transformers_version, is_transformers_version
+from ...intel.utils.import_utils import _transformers_version, is_diffusers_version, is_transformers_version
 from .model_patcher import (
     AquilaModelPatcher,
     ArcticModelPatcher,
@@ -60,6 +64,7 @@
     DBRXModelPatcher,
     DeciLMModelPatcher,
     FalconModelPatcher,
+    FluxTransformerModelPatcher,
     Gemma2ModelPatcher,
     GptNeoxJapaneseModelPatcher,
     GptNeoxModelPatcher,
@@ -1570,3 +1575,166 @@ def patch_model_for_export(
         if self._behavior != InternVLChatConfigBehavior.VISION_EMBEDDINGS:
             return super().patch_model_for_export(model, model_kwargs)
         return InternVLChatImageEmbeddingModelPatcher(self, model, model_kwargs)
+
+
+class 
PooledProjectionsDummyInputGenerator(DummyInputGenerator):
+    SUPPORTED_INPUT_NAMES = ["pooled_projections"]
+
+    def __init__(
+        self,
+        task: str,
+        normalized_config: NormalizedConfig,
+        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
+        random_batch_size_range: Optional[Tuple[int, int]] = None,
+        **kwargs,
+    ):
+        self.task = task
+        self.batch_size = batch_size
+        self.pooled_projection_dim = normalized_config.config.pooled_projection_dim
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        shape = [self.batch_size, self.pooled_projection_dim]
+        return self.random_float_tensor(shape, framework=framework, dtype=float_dtype)
+
+
+class DummyTransformerTimestepsInputGenerator(DummyTimestepInputGenerator):
+    SUPPORTED_INPUT_NAMES = ("timestep", "text_embeds", "time_ids", "timestep_cond", "guidance")
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        if input_name in ["timestep", "guidance"]:
+            shape = [self.batch_size]
+            return self.random_float_tensor(shape, max_value=self.vocab_size, framework=framework, dtype=float_dtype)
+        return super().generate(input_name, framework, int_dtype, float_dtype)
+
+
+@register_in_tasks_manager("sd3-transformer", *["semantic-segmentation"], library_name="diffusers")
+class SD3TransformerOpenVINOConfig(UNetOnnxConfig):
+    DUMMY_INPUT_GENERATOR_CLASSES = (
+        (DummyTransformerTimestepsInputGenerator,)
+        + UNetOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES
+        + (PooledProjectionsDummyInputGenerator,)
+    )
+    NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args(
+        image_size="sample_size",
+        num_channels="in_channels",
+        hidden_size="joint_attention_dim",
+        vocab_size="attention_head_dim",
+        allow_new=True,
+    )
+
+    @property
+    def inputs(self):
+        common_inputs = super().inputs
+        common_inputs["pooled_projections"] = {0: "batch_size"}
+        return common_inputs
+
+    def rename_ambiguous_inputs(self, inputs):
+        # The input is named `hidden_states` in the model signature, hence the export input name is updated.
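+        # (`sample` is the canonical input name inherited from `UNetOnnxConfig`,
+        # while SD3Transformer2DModel.forward expects `hidden_states`.)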
+        hidden_states = inputs.pop("sample", None)
+        if hidden_states is not None:
+            inputs["hidden_states"] = hidden_states
+        return inputs
+
+
+@register_in_tasks_manager("t5-encoder-model", *["feature-extraction"], library_name="diffusers")
+class T5EncoderOpenVINOConfig(CLIPTextOpenVINOConfig):
+    pass
+
+
+class DummyFluxTransformerInputGenerator(DummyVisionInputGenerator):
+    SUPPORTED_INPUT_NAMES = (
+        "pixel_values",
+        "pixel_mask",
+        "sample",
+        "latent_sample",
+        "hidden_states",
+        "img_ids",
+    )
+
+    def __init__(
+        self,
+        task: str,
+        normalized_config: NormalizedVisionConfig,
+        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
+        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
+        width: int = DEFAULT_DUMMY_SHAPES["width"],
+        height: int = DEFAULT_DUMMY_SHAPES["height"],
+        **kwargs,
+    ):
+        super().__init__(task, normalized_config, batch_size, num_channels, width, height, **kwargs)
+        if getattr(normalized_config, "in_channels", None):
+            self.num_channels = normalized_config.in_channels // 4
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        if input_name in ["hidden_states", "sample"]:
+            shape = [self.batch_size, (self.height // 2) * (self.width // 2), self.num_channels * 4]
+            return self.random_float_tensor(shape, framework=framework, dtype=float_dtype)
+        if input_name == "img_ids":
+            img_ids_height = self.height // 2
+            img_ids_width = self.width // 2
+            return self.random_int_tensor(
+                [self.batch_size, img_ids_height * img_ids_width, 3]
+                if is_diffusers_version("<", "0.31.0")
+                else [img_ids_height * img_ids_width, 3],
+                min_value=0,
+                max_value=min(img_ids_height, img_ids_width),
+                framework=framework,
+                dtype=float_dtype,
+            )
+
+        return super().generate(input_name, framework, int_dtype, float_dtype)
+
+
+class DummyFluxTextInputGenerator(DummySeq2SeqDecoderTextInputGenerator):
+    SUPPORTED_INPUT_NAMES = (
+        "decoder_input_ids",
+        "decoder_attention_mask",
+        "encoder_outputs",
+        "encoder_hidden_states",
+        "txt_ids",
+    )
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        if input_name == "txt_ids":
+            import torch
+
+            shape = (
+                [self.batch_size, self.sequence_length, 3]
+                if is_diffusers_version("<", "0.31.0")
+                else [self.sequence_length, 3]
+            )
+            dtype = DTYPE_MAPPER.pt(float_dtype)
+            return torch.full(shape, 0, dtype=dtype)
+        return super().generate(input_name, framework, int_dtype, float_dtype)
+
+
+@register_in_tasks_manager("flux-transformer", *["semantic-segmentation"], library_name="diffusers")
+class FluxTransformerOpenVINOConfig(SD3TransformerOpenVINOConfig):
+    DUMMY_INPUT_GENERATOR_CLASSES = (
+        DummyTransformerTimestepsInputGenerator,
+        DummyFluxTransformerInputGenerator,
+        DummyFluxTextInputGenerator,
+        PooledProjectionsDummyInputGenerator,
+    )
+
+    @property
+    def inputs(self):
+        common_inputs = super().inputs
+        common_inputs.pop("sample", None)
+        common_inputs["hidden_states"] = {0: "batch_size", 1: "packed_height_width"}
+        common_inputs["txt_ids"] = (
+            {0: "batch_size", 1: "sequence_length"} if is_diffusers_version("<", "0.31.0") else {0: "sequence_length"}
+        )
+        common_inputs["img_ids"] = (
+            {0: "batch_size", 1: "packed_height_width"}
+            if is_diffusers_version("<", "0.31.0")
+            else {0: "packed_height_width"}
+        )
+        if getattr(self._normalized_config, "guidance_embeds", False):
+            common_inputs["guidance"] = {0: "batch_size"}
+        return common_inputs
+
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> ModelPatcher:
+        return FluxTransformerModelPatcher(self, model, model_kwargs=model_kwargs)
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index eadce6d382..3bc9452ff9 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -29,6 +29,7 @@
     _openvino_version,
     _torch_version,
     _transformers_version,
+    is_diffusers_version,
     is_openvino_version,
     is_torch_version,
     is_transformers_version,
@@ -2705,3 +2706,40 @@ def __init__(
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
         self._model.forward = self._model.__orig_forward
+
+
+def _embednb_forward(self, ids: torch.Tensor) -> torch.Tensor:
+    def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
+        assert dim % 2 == 0, "The dimension must be even."
+
+        scale = torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device) / dim
+        omega = 1.0 / (theta**scale)
+
+        batch_size, seq_length = pos.shape
+        out = pos.unsqueeze(-1) * omega.unsqueeze(0).unsqueeze(0)
+        cos_out = torch.cos(out)
+        sin_out = torch.sin(out)
+
+        stacked_out = torch.stack([cos_out, -sin_out, sin_out, cos_out], dim=-1)
+        out = stacked_out.view(batch_size, -1, dim // 2, 2, 2)
+        return out.float()
+
+    n_axes = ids.shape[-1]
+    emb = torch.cat(
+        [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
+        dim=-3,
+    )
+    return emb.unsqueeze(1)
+
+
+class FluxTransformerModelPatcher(ModelPatcher):
+    def __enter__(self):
+        super().__enter__()
+        if is_diffusers_version("<", "0.31.0"):
+            self._model.pos_embed._orig_forward = self._model.pos_embed.forward
+            self._model.pos_embed.forward = types.MethodType(_embednb_forward, self._model.pos_embed)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        if hasattr(self._model.pos_embed, "_orig_forward"):
+            self._model.pos_embed.forward = self._model.pos_embed._orig_forward
diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py
index 5926f1869c..67a01011a2 100644
--- a/optimum/intel/__init__.py
+++ b/optimum/intel/__init__.py
@@ -100,8 +100,12 @@
         "OVStableDiffusionXLPipeline",
         "OVStableDiffusionXLImg2ImgPipeline",
         "OVStableDiffusionXLInpaintPipeline",
+        "OVStableDiffusion3Pipeline",
+        "OVStableDiffusion3Img2ImgPipeline",
+        "OVStableDiffusion3InpaintPipeline",
         "OVLatentConsistencyModelPipeline",
         "OVLatentConsistencyModelImg2ImgPipeline",
+        "OVFluxPipeline",
         "OVPipelineForImage2Image",
         "OVPipelineForText2Image",
         "OVPipelineForInpainting",
@@ -116,8 +120,12 @@
         "OVStableDiffusionXLPipeline",
         "OVStableDiffusionXLImg2ImgPipeline",
         "OVStableDiffusionXLInpaintPipeline",
+        "OVStableDiffusion3Pipeline",
+        "OVStableDiffusion3Img2ImgPipeline",
+        "OVStableDiffusion3InpaintPipeline",
         "OVLatentConsistencyModelPipeline",
         "OVLatentConsistencyModelImg2ImgPipeline",
+        "OVFluxPipeline",
         "OVPipelineForImage2Image",
         "OVPipelineForText2Image",
         "OVPipelineForInpainting",
@@ -263,10 +271,14 @@
     except OptionalDependencyNotAvailable:
         from .utils.dummy_openvino_and_diffusers_objects import (
             OVDiffusionPipeline,
+            OVFluxPipeline,
             OVLatentConsistencyModelPipeline,
             OVPipelineForImage2Image,
             OVPipelineForInpainting,
             OVPipelineForText2Image,
+            OVStableDiffusion3Img2ImgPipeline,
+            OVStableDiffusion3InpaintPipeline,
+            OVStableDiffusion3Pipeline,
             OVStableDiffusionImg2ImgPipeline,
             OVStableDiffusionInpaintPipeline,
             OVStableDiffusionPipeline,
@@ -276,11 
+288,15 @@ else: from .openvino import ( OVDiffusionPipeline, + OVFluxPipeline, OVLatentConsistencyModelImg2ImgPipeline, OVLatentConsistencyModelPipeline, OVPipelineForImage2Image, OVPipelineForInpainting, OVPipelineForText2Image, + OVStableDiffusion3Img2ImgPipeline, + OVStableDiffusion3InpaintPipeline, + OVStableDiffusion3Pipeline, OVStableDiffusionImg2ImgPipeline, OVStableDiffusionInpaintPipeline, OVStableDiffusionPipeline, diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index 549bf8170d..589a0938e3 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -82,11 +82,15 @@ if is_diffusers_available(): from .modeling_diffusion import ( OVDiffusionPipeline, + OVFluxPipeline, OVLatentConsistencyModelImg2ImgPipeline, OVLatentConsistencyModelPipeline, OVPipelineForImage2Image, OVPipelineForInpainting, OVPipelineForText2Image, + OVStableDiffusion3Img2ImgPipeline, + OVStableDiffusion3InpaintPipeline, + OVStableDiffusion3Pipeline, OVStableDiffusionImg2ImgPipeline, OVStableDiffusionInpaintPipeline, OVStableDiffusionPipeline, diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 68dc31bc90..1bf452efab 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -22,7 +22,7 @@ from copy import deepcopy from pathlib import Path from tempfile import gettempdir -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, List, Optional, Union import numpy as np import openvino @@ -82,6 +82,20 @@ else: from diffusers.models.vae import DiagonalGaussianDistribution +if is_diffusers_version(">=", "0.29.0"): + from diffusers import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline +else: + StableDiffusion3Pipeline, StableDiffusion3Img2ImgPipeline = StableDiffusionPipeline, StableDiffusionImg2ImgPipeline + +if is_diffusers_version(">=", "0.30.0"): + from diffusers import FluxPipeline, StableDiffusion3InpaintPipeline +else: + StableDiffusion3InpaintPipeline = StableDiffusionInpaintPipeline + FluxPipeline = StableDiffusionPipeline + + +DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER = "transformer" +DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER = "text_encoder_3" core = Core() @@ -99,15 +113,18 @@ class OVDiffusionPipeline(OVBaseModel, DiffusionPipeline): def __init__( self, scheduler: SchedulerMixin, - unet: openvino.runtime.Model, - vae_decoder: openvino.runtime.Model, + unet: Optional[openvino.runtime.Model] = None, + vae_decoder: Optional[openvino.runtime.Model] = None, # optional pipeline models vae_encoder: Optional[openvino.runtime.Model] = None, text_encoder: Optional[openvino.runtime.Model] = None, text_encoder_2: Optional[openvino.runtime.Model] = None, + text_encoder_3: Optional[openvino.runtime.Model] = None, + transformer: Optional[openvino.runtime.Model] = None, # optional pipeline submodels tokenizer: Optional[CLIPTokenizer] = None, tokenizer_2: Optional[CLIPTokenizer] = None, + tokenizer_3: Optional[CLIPTokenizer] = None, feature_extractor: Optional[CLIPFeatureExtractor] = None, # stable diffusion xl specific arguments force_zeros_for_empty_prompt: bool = True, @@ -149,7 +166,15 @@ def __init__( f"Please set `compile_only=False` or `dynamic_shapes={model_is_dynamic}`" ) - self.unet = OVModelUnet(unet, self, DIFFUSION_MODEL_UNET_SUBFOLDER) + self.unet = OVModelUnet(unet, self, DIFFUSION_MODEL_UNET_SUBFOLDER) if unet is not None else None + self.transformer = ( + OVModelTransformer(transformer, self, 
DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER) + if transformer is not None + else None + ) + + if unet is None and transformer is None: + raise ValueError("`unet` or `transformer` model should be provided for pipeline work") self.vae_decoder = OVModelVaeDecoder(vae_decoder, self, DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER) self.vae_encoder = ( OVModelVaeEncoder(vae_encoder, self, DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER) @@ -166,12 +191,18 @@ def __init__( if text_encoder_2 is not None else None ) + self.text_encoder_3 = ( + OVModelTextEncoder(text_encoder_3, self, DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER) + if text_encoder_3 is not None + else None + ) # We wrap the VAE Decoder & Encoder in a single object to simulate diffusers API self.vae = OVModelVae(decoder=self.vae_decoder, encoder=self.vae_encoder) self.scheduler = scheduler self.tokenizer = tokenizer self.tokenizer_2 = tokenizer_2 + self.tokenizer_3 = tokenizer_3 self.feature_extractor = feature_extractor # we allow passing these as torch models for now @@ -181,13 +212,16 @@ def __init__( all_pipeline_init_args = { "vae": self.vae, "unet": self.unet, + "transformer": self.transformer, "text_encoder": self.text_encoder, "text_encoder_2": self.text_encoder_2, + "text_encoder_3": self.text_encoder_3, "safety_checker": self.safety_checker, "image_encoder": self.image_encoder, "scheduler": self.scheduler, "tokenizer": self.tokenizer, "tokenizer_2": self.tokenizer_2, + "tokenizer_3": self.tokenizer_3, "feature_extractor": self.feature_extractor, "requires_aesthetics_score": requires_aesthetics_score, "force_zeros_for_empty_prompt": force_zeros_for_empty_prompt, @@ -236,6 +270,8 @@ def _save_pretrained(self, save_directory: Union[str, Path]): (self.vae_encoder, save_directory / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER), (self.text_encoder, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER), (self.text_encoder_2, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER), + (self.text_encoder_3, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER), + (self.transformer, save_directory / DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER), } for model, save_path in models_to_save_paths: if model is not None: @@ -254,6 +290,8 @@ def _save_pretrained(self, save_directory: Union[str, Path]): self.tokenizer.save_pretrained(save_directory / "tokenizer") if self.tokenizer_2 is not None: self.tokenizer_2.save_pretrained(save_directory / "tokenizer_2") + if self.tokenizer_3 is not None: + self.tokenizer_3.save_pretrained(save_directory / "tokenizer_3") if self.feature_extractor is not None: self.feature_extractor.save_pretrained(save_directory / "feature_extractor") @@ -274,6 +312,8 @@ def _from_pretrained( vae_encoder_file_name: Optional[str] = None, text_encoder_file_name: Optional[str] = None, text_encoder_2_file_name: Optional[str] = None, + text_encoder_3_file_name: Optional[str] = None, + transformer_file_name: Optional[str] = None, from_onnx: bool = False, load_in_8bit: bool = False, quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, @@ -294,6 +334,8 @@ def _from_pretrained( vae_decoder_file_name = vae_decoder_file_name or default_file_name text_encoder_file_name = text_encoder_file_name or default_file_name text_encoder_2_file_name = text_encoder_2_file_name or default_file_name + text_encoder_3_file_name = text_encoder_3_file_name or default_file_name + transformer_file_name = transformer_file_name or default_file_name if not os.path.isdir(str(model_id)): all_components = {key for key in config.keys() if not key.startswith("_")} | 
{"vae_encoder", "vae_decoder"} @@ -301,15 +343,19 @@ def _from_pretrained( allow_patterns.update( { unet_file_name, + transformer_file_name, vae_encoder_file_name, vae_decoder_file_name, text_encoder_file_name, text_encoder_2_file_name, + text_encoder_3_file_name, unet_file_name.replace(".xml", ".bin"), + transformer_file_name.replace(".xml", ".bin"), vae_encoder_file_name.replace(".xml", ".bin"), vae_decoder_file_name.replace(".xml", ".bin"), text_encoder_file_name.replace(".xml", ".bin"), text_encoder_2_file_name.replace(".xml", ".bin"), + text_encoder_3_file_name.replace(".xml", ".bin"), SCHEDULER_CONFIG_NAME, cls.config_name, CONFIG_NAME, @@ -337,9 +383,15 @@ def _from_pretrained( if model_save_dir is None: model_save_dir = model_save_path - submodels = {"scheduler": None, "tokenizer": None, "tokenizer_2": None, "feature_extractor": None} + submodels = { + "scheduler": None, + "tokenizer": None, + "tokenizer_2": None, + "tokenizer_3": None, + "feature_extractor": None, + } for name in submodels.keys(): - if kwargs.get(name, None) is not None: + if kwargs.get(name) is not None: submodels[name] = kwargs.pop(name) elif config.get(name, (None, None))[0] is not None: library_name, library_classes = config.get(name) @@ -354,17 +406,19 @@ def _from_pretrained( models = { "unet": model_save_path / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, + "transformer": model_save_path / DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER / transformer_file_name, "vae_decoder": model_save_path / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, "vae_encoder": model_save_path / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, "text_encoder": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, "text_encoder_2": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name, + "text_encoder_3": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER / text_encoder_3_file_name, } compile_only = kwargs.get("compile_only", False) quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) if (quantization_config is None or quantization_config.dataset is None) and not compile_only: for name, path in models.items(): - if kwargs.get(name, None) is not None: + if name in kwargs: models[name] = kwargs.pop(name) else: models[name] = cls.load_model(path, quantization_config) if path.is_file() else None @@ -375,7 +429,7 @@ def _from_pretrained( if "GPU" in device.upper() and "INFERENCE_PRECISION_HINT" not in vae_ov_conifg: vae_ov_conifg["INFERENCE_PRECISION_HINT"] = "f32" for name, path in models.items(): - if kwargs.get(name, None) is not None: + if name in kwargs: models[name] = kwargs.pop(name) else: models[name] = ( @@ -396,7 +450,7 @@ def _from_pretrained( from optimum.intel import OVQuantizer for name, path in models.items(): - if kwargs.get(name, None) is not None: + if name in kwargs: models[name] = kwargs.pop(name) else: models[name] = cls.load_model(path) if path.is_file() else None @@ -411,7 +465,6 @@ def _from_pretrained( quantizer.quantize(ov_config=OVConfig(quantization_config=hybrid_quantization_config)) return ov_pipeline - ov_pipeline = ov_pipeline_class( **models, **submodels, @@ -463,6 +516,7 @@ def _from_transformers( no_post_process=True, revision=revision, cache_dir=cache_dir, + task=cls.export_feature, token=token, local_files_only=local_files_only, force_download=force_download, @@ -495,7 +549,7 @@ def to(self, *args, device: Optional[str] = None, dtype: Optional[torch.dtype] = if 
isinstance(device, str): self._device = device.upper() - self.request = None + self.clear_requests() elif device is not None: raise ValueError( "The `device` argument should be a string representing the device on which the model should be loaded." @@ -511,21 +565,24 @@ def to(self, *args, device: Optional[str] = None, dtype: Optional[torch.dtype] = @property def height(self) -> int: - height = self.unet.model.inputs[0].get_partial_shape()[2] + model = self.vae.decoder.model + height = model.inputs[0].get_partial_shape()[2] if height.is_dynamic: return -1 return height.get_length() * self.vae_scale_factor @property def width(self) -> int: - width = self.unet.model.inputs[0].get_partial_shape()[3] + model = self.vae.decoder.model + width = model.inputs[0].get_partial_shape()[3] if width.is_dynamic: return -1 return width.get_length() * self.vae_scale_factor @property def batch_size(self) -> int: - batch_size = self.unet.model.inputs[0].get_partial_shape()[0] + model = self.unet.model if self.unet is not None else self.transformer.model + batch_size = model.inputs[0].get_partial_shape()[0] if batch_size.is_dynamic: return -1 return batch_size.get_length() @@ -577,6 +634,65 @@ def _reshape_unet( model.reshape(shapes) return model + def _reshape_transformer( + self, + model: openvino.runtime.Model, + batch_size: int = -1, + height: int = -1, + width: int = -1, + num_images_per_prompt: int = -1, + tokenizer_max_length: int = -1, + ): + if batch_size == -1 or num_images_per_prompt == -1: + batch_size = -1 + else: + # The factor of 2 comes from the guidance scale > 1 + batch_size *= num_images_per_prompt + if "img_ids" not in {inputs.get_any_name() for inputs in model.inputs}: + batch_size *= 2 + + height = height // self.vae_scale_factor if height > 0 else height + width = width // self.vae_scale_factor if width > 0 else width + packed_height = height // 2 if height > 0 else height + packed_width = width // 2 if width > 0 else width + packed_height_width = packed_width * packed_height if height > 0 and width > 0 else -1 + shapes = {} + for inputs in model.inputs: + shapes[inputs] = inputs.get_partial_shape() + if inputs.get_any_name() in ["timestep", "guidance"]: + shapes[inputs][0] = batch_size + elif inputs.get_any_name() == "hidden_states": + in_channels = self.transformer.config.get("in_channels", None) + if in_channels is None: + in_channels = ( + shapes[inputs][1] if inputs.get_partial_shape().rank.get_length() == 4 else shapes[inputs][2] + ) + if in_channels.is_dynamic: + logger.warning( + "Could not identify `in_channels` from the unet configuration, to statically reshape the unet please provide a configuration." 
+                        )
+                        self.is_dynamic = True
+                if inputs.get_partial_shape().rank.get_length() == 4:
+                    shapes[inputs] = [batch_size, in_channels, height, width]
+                else:
+                    shapes[inputs] = [batch_size, packed_height_width, in_channels]
+
+            elif inputs.get_any_name() == "pooled_projections":
+                shapes[inputs] = [batch_size, self.transformer.config["pooled_projection_dim"]]
+            elif inputs.get_any_name() == "img_ids":
+                shapes[inputs] = (
+                    [batch_size, packed_height_width, 3]
+                    if is_diffusers_version("<", "0.31.0")
+                    else [packed_height_width, 3]
+                )
+            elif inputs.get_any_name() == "txt_ids":
+                shapes[inputs] = [batch_size, -1, 3] if is_diffusers_version("<", "0.31.0") else [-1, 3]
+            else:
+                shapes[inputs][0] = batch_size
+                shapes[inputs][1] = -1  # text_encoder_3 may have varying input length
+        model.reshape(shapes)
+        return model
+
     def _reshape_text_encoder(
         self, model: openvino.runtime.Model, batch_size: int = -1, tokenizer_max_length: int = -1
     ):
@@ -638,9 +754,14 @@ def reshape(
             self.tokenizer.model_max_length if self.tokenizer is not None else self.tokenizer_2.model_max_length
         )
 
-        self.unet.model = self._reshape_unet(
-            self.unet.model, batch_size, height, width, num_images_per_prompt, tokenizer_max_len
-        )
+        if self.unet is not None:
+            self.unet.model = self._reshape_unet(
+                self.unet.model, batch_size, height, width, num_images_per_prompt, tokenizer_max_len
+            )
+        if self.transformer is not None:
+            self.transformer.model = self._reshape_transformer(
+                self.transformer.model, batch_size, height, width, num_images_per_prompt, tokenizer_max_len
+            )
 
         self.vae_decoder.model = self._reshape_vae_decoder(
             self.vae_decoder.model, height, width, num_images_per_prompt
         )
@@ -658,6 +779,11 @@ def reshape(
                 self.text_encoder_2.model, batch_size, self.tokenizer_2.model_max_length
             )
 
+        if self.text_encoder_3 is not None:
+            self.text_encoder_3.model = self._reshape_text_encoder(
+                self.text_encoder_3.model, batch_size, self.tokenizer_3.model_max_length
+            )
+
         self.clear_requests()
         return self
 
@@ -670,7 +796,15 @@ def half(self):
                 "`half()` is not supported with `compile_only` mode, please intialize model without this option"
             )
 
-        for component in {self.unet, self.vae_encoder, self.vae_decoder, self.text_encoder, self.text_encoder_2}:
+        for component in {
+            self.unet,
+            self.transformer,
+            self.vae_encoder,
+            self.vae_decoder,
+            self.text_encoder,
+            self.text_encoder_2,
+            self.text_encoder_3,
+        }:
             if component is not None:
                 compress_model_transformation(component.model)
 
@@ -684,12 +818,28 @@ def clear_requests(self):
                 "`clear_requests()` is not supported with `compile_only` mode, please intialize model without this option"
             )
 
-        for component in {self.unet, self.vae_encoder, self.vae_decoder, self.text_encoder, self.text_encoder_2}:
+        for component in [
+            self.unet,
+            self.transformer,
+            self.vae_encoder,
+            self.vae_decoder,
+            self.text_encoder,
+            self.text_encoder_2,
+            self.text_encoder_3,
+        ]:
             if component is not None:
                 component.request = None
 
     def compile(self):
-        for component in {self.unet, self.vae_encoder, self.vae_decoder, self.text_encoder, self.text_encoder_2}:
+        for component in [
+            self.unet,
+            self.transformer,
+            self.vae_encoder,
+            self.vae_decoder,
+            self.text_encoder,
+            self.text_encoder_2,
+            self.text_encoder_3,
+        ]:
             if component is not None:
                 component._compile()
 
@@ -705,8 +855,10 @@ def components(self) -> Dict[str, Any]:
         components = {
             "vae": self.vae,
             "unet": self.unet,
+            "transformer": self.transformer,
             "text_encoder": self.text_encoder,
             "text_encoder_2": self.text_encoder_2,
+            "text_encoder_3": 
self.text_encoder_3,
             "safety_checker": self.safety_checker,
             "image_encoder": self.image_encoder,
         }
@@ -821,6 +973,12 @@ def modules(self):
 
 
 class OVModelTextEncoder(OVPipelinePart):
+    def __init__(self, model: openvino.runtime.Model, parent_pipeline: OVDiffusionPipeline, model_name: str = ""):
+        super().__init__(model, parent_pipeline, model_name)
+        self.hidden_states_output_names = sorted(
+            {name for out in self.model.outputs for name in out.names if name.startswith("hidden_states")}
+        )
+
     def forward(
         self,
         input_ids: Union[np.ndarray, torch.Tensor],
@@ -829,24 +987,26 @@ def forward(
         return_dict: bool = False,
     ):
         self._compile()
-
         model_inputs = {"input_ids": input_ids}
-        ov_outputs = self.request(model_inputs, share_inputs=True).to_dict()
-
+        ov_outputs = self.request(model_inputs, share_inputs=True)
+        main_out = ov_outputs[0]
         model_outputs = {}
-        for key, value in ov_outputs.items():
-            model_outputs[next(iter(key.names))] = torch.from_numpy(value)
-
-        if output_hidden_states:
-            model_outputs["hidden_states"] = []
-            for i in range(self.config.num_hidden_layers):
-                model_outputs["hidden_states"].append(model_outputs.pop(f"hidden_states.{i}"))
-            model_outputs["hidden_states"].append(model_outputs.get("last_hidden_state"))
+        model_outputs[self.model.outputs[0].get_any_name()] = torch.from_numpy(main_out)
+        if len(self.model.outputs) > 1 and "pooler_output" in self.model.outputs[1].get_any_name():
+            model_outputs["pooler_output"] = torch.from_numpy(ov_outputs[1])
+        if self.hidden_states_output_names and "last_hidden_state" not in model_outputs:
+            model_outputs["last_hidden_state"] = torch.from_numpy(ov_outputs[self.hidden_states_output_names[-1]])
+        if (
+            self.hidden_states_output_names
+            and (output_hidden_states or getattr(self.config, "output_hidden_states", False))
+        ):
+            hidden_states = [torch.from_numpy(ov_outputs[out_name]) for out_name in self.hidden_states_output_names]
+            model_outputs["hidden_states"] = hidden_states
 
         if return_dict:
             return model_outputs
-
         return ModelOutput(**model_outputs)
@@ -904,6 +1064,48 @@ def forward(
         return ModelOutput(**model_outputs)
 
 
+class OVModelTransformer(OVPipelinePart):
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+        pooled_projections: torch.FloatTensor = None,
+        timestep: torch.LongTensor = None,
+        img_ids: torch.Tensor = None,
+        txt_ids: torch.Tensor = None,
+        guidance: torch.Tensor = None,
+        block_controlnet_hidden_states: List = None,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        return_dict: bool = True,
+    ):
+        self._compile()
+
+        model_inputs = {
+            "hidden_states": hidden_states,
+            "timestep": timestep,
+            "encoder_hidden_states": encoder_hidden_states,
+            "pooled_projections": pooled_projections,
+        }
+
+        if img_ids is not None:
+            model_inputs["img_ids"] = img_ids
+        if txt_ids is not None:
+            model_inputs["txt_ids"] = txt_ids
+        if guidance is not None:
+            model_inputs["guidance"] = guidance
+
+        ov_outputs = self.request(model_inputs, share_inputs=True).to_dict()
+
+        model_outputs = {}
+        for key, value in ov_outputs.items():
+            model_outputs[next(iter(key.names))] = torch.from_numpy(value)
+
+        if return_dict:
+            return model_outputs
+
+        return ModelOutput(**model_outputs)
+
+
 class OVModelVaeEncoder(OVPipelinePart):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -1177,6 +1379,34 @@ class OVLatentConsistencyModelImg2ImgPipeline(
     auto_model_class = LatentConsistencyModelImg2ImgPipeline
 
 
+class OVStableDiffusion3Pipeline(OVDiffusionPipeline, 
OVTextualInversionLoaderMixin, StableDiffusion3Pipeline): + main_input_name = "prompt" + export_feature = "text-to-image" + auto_model_class = StableDiffusion3Pipeline + + +class OVStableDiffusion3Img2ImgPipeline( + OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusion3Img2ImgPipeline +): + main_input_name = "image" + export_feature = "image-to-image" + auto_model_class = StableDiffusion3Img2ImgPipeline + + +class OVStableDiffusion3InpaintPipeline( + OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusion3InpaintPipeline +): + main_input_name = "image" + export_feature = "inpainting" + auto_model_class = StableDiffusion3InpaintPipeline + + +class OVFluxPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, FluxPipeline): + main_input_name = "prompt" + export_feature = "text-to-image" + auto_model_class = FluxPipeline + + SUPPORTED_OV_PIPELINES = [ OVStableDiffusionPipeline, OVStableDiffusionImg2ImgPipeline, @@ -1224,6 +1454,23 @@ def _get_ov_class(pipeline_class_name: str, throw_error_if_not_exist: bool = Tru ] ) +if is_diffusers_version(">=", "0.29.0"): + SUPPORTED_OV_PIPELINES.extend( + [ + OVStableDiffusion3Pipeline, + OVStableDiffusion3Img2ImgPipeline, + ] + ) + + OV_TEXT2IMAGE_PIPELINES_MAPPING["stable-diffusion-3"] = OVStableDiffusion3Pipeline + OV_IMAGE2IMAGE_PIPELINES_MAPPING["stable-diffusion-3"] = OVStableDiffusion3Img2ImgPipeline + +if is_diffusers_version(">=", "0.30.0"): + SUPPORTED_OV_PIPELINES.extend([OVStableDiffusion3InpaintPipeline, OVFluxPipeline]) + OV_INPAINT_PIPELINES_MAPPING["stable-diffusion-3"] = OVStableDiffusion3InpaintPipeline + OV_TEXT2IMAGE_PIPELINES_MAPPING["flux"] = OVFluxPipeline + + SUPPORTED_OV_PIPELINES_MAPPINGS = [ OV_TEXT2IMAGE_PIPELINES_MAPPING, OV_IMAGE2IMAGE_PIPELINES_MAPPING, @@ -1279,13 +1526,16 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): class OVPipelineForText2Image(OVPipelineForTask): auto_model_class = AutoPipelineForText2Image ov_pipelines_mapping = OV_TEXT2IMAGE_PIPELINES_MAPPING + export_feature = "text-to-image" class OVPipelineForImage2Image(OVPipelineForTask): auto_model_class = AutoPipelineForImage2Image ov_pipelines_mapping = OV_IMAGE2IMAGE_PIPELINES_MAPPING + export_feature = "image-to-image" class OVPipelineForInpainting(OVPipelineForTask): auto_model_class = AutoPipelineForInpainting ov_pipelines_mapping = OV_INPAINT_PIPELINES_MAPPING + export_feature = "inpainting" diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 1ad75477cc..c2e880e62a 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -380,15 +380,27 @@ def _quantize_ovbasemodel( quantization_config_copy = copy.deepcopy(quantization_config) quantization_config_copy.dataset = None quantization_config_copy.quant_method = OVQuantizationMethod.DEFAULT - sub_model_names = ["vae_encoder", "vae_decoder", "text_encoder", "text_encoder_2"] + sub_model_names = [ + "vae_encoder", + "vae_decoder", + "text_encoder", + "text_encoder_2", + "text_encoder_3", + ] sub_models = filter(lambda x: x, (getattr(self.model, name) for name in sub_model_names)) for sub_model in sub_models: _weight_only_quantization(sub_model.model, quantization_config_copy) - # Apply hybrid quantization to UNet - self.model.unet.model = _hybrid_quantization( - self.model.unet.model, quantization_config, calibration_dataset - ) + if self.model.unet is not None: + # Apply hybrid quantization to UNet + self.model.unet.model = _hybrid_quantization( + self.model.unet.model, 
quantization_config, calibration_dataset + ) + else: + self.model.transformer.model = _hybrid_quantization( + self.model.transformer.model, quantization_config, calibration_dataset + ) + self.model.clear_requests() else: # The model may be for example OVModelForImageClassification, OVModelForAudioClassification, etc. @@ -396,7 +408,15 @@ def _quantize_ovbasemodel( self.model.request = None else: if is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline): - sub_model_names = ["vae_encoder", "vae_decoder", "text_encoder", "text_encoder_2", "unet"] + sub_model_names = [ + "vae_encoder", + "vae_decoder", + "text_encoder", + "text_encoder_2", + "unet", + "transformer", + "text_encoder_3", + ] sub_models = filter(lambda x: x, (getattr(self.model, name) for name in sub_model_names)) for sub_model in sub_models: _weight_only_quantization(sub_model.model, quantization_config) @@ -743,7 +763,9 @@ def _prepare_unet_dataset( ) -> nncf.Dataset: self.model.compile() - size = self.model.unet.config.get("sample_size", 64) * self.model.vae_scale_factor + diffuser = self.model.unet if self.model.unet is not None else self.model.transformer + + size = diffuser.config.get("sample_size", 64) * self.model.vae_scale_factor height, width = 2 * (min(size, 512),) num_samples = num_samples or 200 @@ -784,7 +806,7 @@ def transform_fn(data_item): calibration_data = [] try: - self.model.unet.request = InferRequestWrapper(self.model.unet.request, calibration_data) + diffuser.request = InferRequestWrapper(diffuser.request, calibration_data) for inputs in dataset: inputs = transform_fn(inputs) @@ -795,7 +817,7 @@ def transform_fn(data_item): if len(calibration_data) >= num_samples: break finally: - self.model.unet.request = self.model.unet.request.request + diffuser.request = diffuser.request.request calibration_dataset = nncf.Dataset(calibration_data[:num_samples]) return calibration_dataset diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index fcc6944e9f..ca7d177201 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -119,6 +119,8 @@ "audio-classification": "OVModelForAudioClassification", "stable-diffusion": "OVStableDiffusionPipeline", "stable-diffusion-xl": "OVStableDiffusionXLPipeline", + "stable-diffusion-3": "OVStableDiffusion3Pipeline", + "flux": "OVFluxPipeline", "pix2struct": "OVModelForPix2Struct", "latent-consistency": "OVLatentConsistencyModelPipeline", "open_clip_text": "OVModelOpenCLIPText", diff --git a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py index 6ded4fd5df..38aea6c1f1 100644 --- a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py +++ b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py @@ -145,3 +145,47 @@ def __init__(self, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["openvino", "diffusers"]) + + +class OVStableDiffusion3Img2ImgPipeline(metaclass=DummyObject): + _backends = ["openvino", "diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "diffusers"]) + + +class OVStableDiffusion3Pipeline(metaclass=DummyObject): + _backends = ["openvino", "diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + 
requires_backends(cls, ["openvino", "diffusers"]) + + +class OVStableDiffusion3InpaintPipeline(metaclass=DummyObject): + _backends = ["openvino", "diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "diffusers"]) + + +class OVFluxPipeline(metaclass=DummyObject): + _backends = ["openvino", "diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "diffusers"]) diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index a05efc46c7..a39957bbf7 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -123,17 +123,20 @@ def _find_files_matching_pattern( str(model_name_or_path), subfolder=subfolder, revision=revision, token=token ) if library_name == "diffusers": - subfolder = os.path.join(subfolder, "unet") + subfolders = [os.path.join(subfolder, "unet"), os.path.join(subfolder, "transformer")] else: - subfolder = subfolder or "." + subfolders = [subfolder or "."] if model_path.is_dir(): - glob_pattern = subfolder + "/*" - files = model_path.glob(glob_pattern) - files = [p for p in files if re.search(pattern, str(p))] + files = [] + for subfolder in subfolders: + glob_pattern = subfolder + "/*" + files_ = model_path.glob(glob_pattern) + files_ = [p for p in files_ if re.search(pattern, str(p))] + files.extend(files_) else: repo_files = map(Path, HfApi().list_repo_files(model_name_or_path, revision=revision, token=token)) - files = [Path(p) for p in repo_files if re.match(pattern, str(p)) and str(p.parent) == subfolder] + files = [Path(p) for p in repo_files if re.match(pattern, str(p)) and str(p.parent) in subfolders] return files diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index 687c1f5c02..1467e5ed1f 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -35,6 +35,7 @@ OVPipelineForInpainting, OVPipelineForText2Image, ) +from optimum.intel.utils.import_utils import is_transformers_version from optimum.utils.testing_utils import require_diffusers @@ -73,6 +74,11 @@ def _generate_images(height=128, width=128, batch_size=1, channel=3, input_type= class OVPipelineForText2ImageTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] + NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] + if is_transformers_version(">=", "4.40.0"): + SUPPORTED_ARCHITECTURES.extend(["stable-diffusion-3", "flux"]) + NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES.append("stable-diffusion-3") + CALLBACK_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] OVMODEL_CLASS = OVPipelineForText2Image AUTOMODEL_CLASS = AutoPipelineForText2Image @@ -126,8 +132,8 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): height, width, batch_size = 128, 128, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], text_encoder_3=None) + diffusers_pipeline = 
self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], text_encoder_3=None) for output_type in ["latent", "np", "pt"]: inputs["output_type"] = output_type @@ -135,9 +141,9 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - np.testing.assert_allclose(ov_output, diffusers_output, atol=1e-4, rtol=1e-2) + np.testing.assert_allclose(ov_output, diffusers_output, atol=6e-3, rtol=1e-2) - @parameterized.expand(SUPPORTED_ARCHITECTURES) + @parameterized.expand(CALLBACK_SUPPORT_ARCHITECTURES) @require_diffusers def test_callback(self, model_arch: str): height, width, batch_size = 64, 128, 1 @@ -184,10 +190,26 @@ def test_shape(self, model_arch: str): elif output_type == "pt": self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: - self.assertEqual( - outputs.shape, - (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), - ) + if model_arch != "flux": + out_channels = ( + pipeline.unet.config.out_channels + if pipeline.unet is not None + else pipeline.transformer.config.out_channels + ) + self.assertEqual( + outputs.shape, + ( + batch_size, + out_channels, + height // pipeline.vae_scale_factor, + width // pipeline.vae_scale_factor, + ), + ) + else: + packed_height = height // pipeline.vae_scale_factor + packed_width = width // pipeline.vae_scale_factor + channels = pipeline.transformer.config.in_channels + self.assertEqual(outputs.shape, (batch_size, packed_height * packed_width, channels)) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -205,7 +227,7 @@ def test_image_reproducibility(self, model_arch: str): self.assertFalse(np.array_equal(ov_outputs_1.images[0], ov_outputs_3.images[0])) np.testing.assert_allclose(ov_outputs_1.images[0], ov_outputs_2.images[0], atol=1e-4, rtol=1e-2) - @parameterized.expand(SUPPORTED_ARCHITECTURES) + @parameterized.expand(NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES) def test_negative_prompt(self, model_arch: str): height, width, batch_size = 64, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) @@ -229,6 +251,22 @@ def test_negative_prompt(self, model_arch: str): do_classifier_free_guidance=True, negative_prompt=negative_prompt, ) + elif model_arch == "stable-diffusion-3": + ( + inputs["prompt_embeds"], + inputs["negative_prompt_embeds"], + inputs["pooled_prompt_embeds"], + inputs["negative_pooled_prompt_embeds"], + ) = pipeline.encode_prompt( + prompt=prompt, + prompt_2=None, + prompt_3=None, + num_images_per_prompt=1, + device=torch.device("cpu"), + do_classifier_free_guidance=True, + negative_prompt=negative_prompt, + ) + else: inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = pipeline.encode_prompt( prompt=prompt, @@ -288,11 +326,18 @@ def test_height_width_properties(self, model_arch: str): ) self.assertFalse(ov_pipeline.is_dynamic) + expected_batch = batch_size * num_images_per_prompt + if ( + ov_pipeline.unet is not None + and "timestep_cond" not in {inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs} + ) or ( + ov_pipeline.transformer is not None + and "txt_ids" not in {inputs.get_any_name() for inputs in ov_pipeline.transformer.model.inputs} + ): + expected_batch *= 2 self.assertEqual( ov_pipeline.batch_size, - batch_size - * num_images_per_prompt - * (2 if "timestep_cond" not in {inputs.get_any_name() for inputs in 
@@ -324,6 +369,8 @@ def test_textual_inversion(self):
 class OVPipelineForImage2ImageTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"]
+    if is_transformers_version(">=", "4.40.0"):
+        SUPPORTED_ARCHITECTURES.append("stable-diffusion-3")
 
     AUTOMODEL_CLASS = AutoPipelineForImage2Image
     OVMODEL_CLASS = OVPipelineForImage2Image
 
@@ -369,7 +416,7 @@ def test_num_images_per_prompt(self, model_arch: str):
             outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images
             self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3))
 
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @parameterized.expand(["stable-diffusion", "stable-diffusion-xl", "latent-consistency"])
     @require_diffusers
     def test_callback(self, model_arch: str):
         height, width, batch_size = 32, 64, 1
@@ -416,9 +463,19 @@ def test_shape(self, model_arch: str):
             elif output_type == "pt":
                 self.assertEqual(outputs.shape, (batch_size, 3, height, width))
             else:
+                out_channels = (
+                    pipeline.unet.config.out_channels
+                    if pipeline.unet is not None
+                    else pipeline.transformer.config.out_channels
+                )
                 self.assertEqual(
                     outputs.shape,
-                    (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor),
+                    (
+                        batch_size,
+                        out_channels,
+                        height // pipeline.vae_scale_factor,
+                        width // pipeline.vae_scale_factor,
+                    ),
                 )
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_diffusers
@@ -427,16 +484,16 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str):
         height, width, batch_size = 128, 128, 1
         inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
 
-        diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
-        ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+        diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], text_encoder_3=None)
+        ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], text_encoder_3=None)
 
         for output_type in ["latent", "np", "pt"]:
             inputs["output_type"] = output_type
 
             ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images
             diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images
 
-            np.testing.assert_allclose(ov_output, diffusers_output, atol=1e-4, rtol=1e-2)
+            np.testing.assert_allclose(ov_output, diffusers_output, atol=6e-3, rtol=1e-2)
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_diffusers
@@ -500,12 +558,12 @@ def test_height_width_properties(self, model_arch: str):
         )
 
         self.assertFalse(ov_pipeline.is_dynamic)
-        self.assertEqual(
-            ov_pipeline.batch_size,
-            batch_size
-            * num_images_per_prompt
-            * (2 if "timestep_cond" not in {inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs} else 1),
-        )
+        expected_batch = batch_size * num_images_per_prompt
+        if ov_pipeline.unet is None or "timestep_cond" not in {
+            inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs
+        }:
+            expected_batch *= 2
+        self.assertEqual(ov_pipeline.batch_size, expected_batch)
         self.assertEqual(ov_pipeline.height, height)
         self.assertEqual(ov_pipeline.width, width)
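All three `test_height_width_properties` variants encode the same rule: a statically reshaped pipeline must reserve a doubled batch for classifier-free guidance, unless the exported graph signals that no guidance branch exists (`timestep_cond` on LCM UNets, `txt_ids` on Flux transformers). A hedged sketch of that rule as a free-standing helper (hypothetical; the tests inline it instead):

```python
def expected_static_batch(batch_size, num_images_per_prompt, input_names):
    """Compute the batch a statically shaped denoiser should expect.

    Classifier-free guidance runs the conditional and unconditional branches
    in one concatenated batch, doubling it. A "timestep_cond" input (LCM) or
    a "txt_ids" input (Flux) marks a guidance-free graph.
    """
    expected = batch_size * num_images_per_prompt
    if not input_names & {"timestep_cond", "txt_ids"}:
        expected *= 2
    return expected


assert expected_static_batch(1, 2, {"sample", "timestep"}) == 4  # CFG doubles
assert expected_static_batch(1, 2, {"sample", "timestep_cond"}) == 2  # LCM: no CFG
```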
is_transformers_version(">=", "4.40.0"): + SUPPORTED_ARCHITECTURES.append("stable-diffusion-3") + AUTOMODEL_CLASS = AutoPipelineForInpainting OVMODEL_CLASS = OVPipelineForInpainting @@ -586,7 +647,7 @@ def test_num_images_per_prompt(self, model_arch: str): outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) - @parameterized.expand(SUPPORTED_ARCHITECTURES) + @parameterized.expand(["stable-diffusion", "stable-diffusion-xl"]) @require_diffusers def test_callback(self, model_arch: str): height, width, batch_size = 32, 64, 1 @@ -633,9 +694,19 @@ def test_shape(self, model_arch: str): elif output_type == "pt": self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: + out_channels = ( + pipeline.unet.config.out_channels + if pipeline.unet is not None + else pipeline.transformer.config.out_channels + ) self.assertEqual( outputs.shape, - (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), + ( + batch_size, + out_channels, + height // pipeline.vae_scale_factor, + width // pipeline.vae_scale_factor, + ), ) @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -653,7 +724,7 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - np.testing.assert_allclose(ov_output, diffusers_output, atol=1e-4, rtol=1e-2) + np.testing.assert_allclose(ov_output, diffusers_output, atol=6e-3, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -717,11 +788,14 @@ def test_height_width_properties(self, model_arch: str): ) self.assertFalse(ov_pipeline.is_dynamic) + expected_batch = batch_size * num_images_per_prompt + if ov_pipeline.unet is None or "timestep_cond" not in { + inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs + }: + expected_batch *= 2 self.assertEqual( ov_pipeline.batch_size, - batch_size - * num_images_per_prompt - * (2 if "timestep_cond" not in {inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs} else 1), + expected_batch, ) self.assertEqual(ov_pipeline.height, height) self.assertEqual(ov_pipeline.width, width) diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 43c535e673..6a42c4a09f 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -27,6 +27,7 @@ from optimum.exporters.openvino import export_from_model, main_export from optimum.exporters.tasks import TasksManager from optimum.intel import ( + OVFluxPipeline, OVLatentConsistencyModelPipeline, OVModelForAudioClassification, OVModelForCausalLM, @@ -40,13 +41,14 @@ OVModelForSequenceClassification, OVModelForSpeechSeq2Seq, OVModelForTokenClassification, + OVStableDiffusion3Pipeline, OVStableDiffusionPipeline, OVStableDiffusionXLImg2ImgPipeline, OVStableDiffusionXLPipeline, ) from optimum.intel.openvino.modeling_base import OVBaseModel from optimum.intel.openvino.utils import TemporaryDirectory -from optimum.intel.utils.import_utils import _transformers_version +from optimum.intel.utils.import_utils import _transformers_version, is_transformers_version from optimum.utils.save_utils import maybe_load_preprocessors @@ -70,6 +72,9 @@ class ExportModelTest(unittest.TestCase): "latent-consistency": OVLatentConsistencyModelPipeline, } + if is_transformers_version(">=", "4.45"): + 
SUPPORTED_ARCHITECTURES.update({"stable-diffusion-3": OVStableDiffusion3Pipeline, "flux": OVFluxPipeline}) + GENERATIVE_MODELS = ("pix2struct", "t5", "bart", "gpt2", "whisper") def _openvino_export( diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 8443f95b31..fd5a25b4f3 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -25,6 +25,7 @@ from optimum.exporters.openvino.__main__ import main_export from optimum.intel import ( # noqa + OVFluxPipeline, OVLatentConsistencyModelPipeline, OVModelForAudioClassification, OVModelForCausalLM, @@ -39,6 +40,7 @@ OVModelOpenCLIPText, OVModelOpenCLIPVisual, OVSentenceTransformer, + OVStableDiffusion3Pipeline, OVStableDiffusionPipeline, OVStableDiffusionXLPipeline, ) @@ -48,6 +50,7 @@ compare_versions, is_openvino_tokenizers_available, is_tokenizers_version, + is_transformers_version, ) @@ -56,7 +59,7 @@ class OVCLIExportTestCase(unittest.TestCase): Integration tests ensuring supported models are correctly exported. """ - SUPPORTED_ARCHITECTURES = ( + SUPPORTED_ARCHITECTURES = [ ("text-generation", "gpt2"), ("text-generation-with-past", "gpt2"), ("text2text-generation", "t5"), @@ -71,7 +74,10 @@ class OVCLIExportTestCase(unittest.TestCase): ("text-to-image", "stable-diffusion"), ("text-to-image", "stable-diffusion-xl"), ("image-to-image", "stable-diffusion-xl-refiner"), - ) + ] + + if is_transformers_version(">=", "4.45"): + SUPPORTED_ARCHITECTURES.extend([("text-to-image", "stable-diffusion-3"), ("text-to-image", "flux")]) EXPECTED_NUMBER_OF_TOKENIZER_MODELS = { "gpt2": 2 if is_tokenizers_version("<", "0.20") else 0, "t5": 0, # no .model file in the repository @@ -84,13 +90,18 @@ class OVCLIExportTestCase(unittest.TestCase): "blenderbot": 2 if is_tokenizers_version("<", "0.20") else 0, "stable-diffusion": 2 if is_tokenizers_version("<", "0.20") else 0, "stable-diffusion-xl": 4 if is_tokenizers_version("<", "0.20") else 0, + "stable-diffusion-3": 6 if is_tokenizers_version("<", "0.20") else 2, + "flux": 4 if is_tokenizers_version("<", "0.20") else 0, } - SUPPORTED_SD_HYBRID_ARCHITECTURES = ( + SUPPORTED_SD_HYBRID_ARCHITECTURES = [ ("stable-diffusion", 72, 195), ("stable-diffusion-xl", 84, 331), ("latent-consistency", 50, 135), - ) + ] + + if is_transformers_version(">=", "4.45"): + SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("stable-diffusion-3", 9, 65)) TEST_4BIT_CONFIGURATONS = [ ("text-generation-with-past", "opt125m", "int4 --sym --group-size 128", {"int8": 4, "int4": 72}), @@ -208,8 +219,8 @@ def test_exporters_cli_int8(self, task: str, model_type: str): models = [model.encoder, model.decoder] if task.endswith("with-past"): models.append(model.decoder_with_past) - elif model_type.startswith("stable-diffusion"): - models = [model.unet, model.vae_encoder, model.vae_decoder] + elif model_type.startswith("stable-diffusion") or model_type.startswith("flux"): + models = [model.unet or model.transformer, model.vae_encoder, model.vae_decoder] models.append(model.text_encoder if model_type == "stable-diffusion" else model.text_encoder_2) else: models = [model] @@ -228,7 +239,9 @@ def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: in check=True, ) model = eval(_HEAD_TO_AUTOMODELS[model_type.replace("-refiner", "")]).from_pretrained(tmpdir) - num_fq, num_weight_nodes = get_num_quantized_nodes(model.unet) + num_fq, num_weight_nodes = get_num_quantized_nodes( + model.unet if model.unet is not None else model.transformer + ) 
             self.assertEqual(exp_num_int8, num_weight_nodes["int8"])
             self.assertEqual(exp_num_fq, num_fq)
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index b294e3e221..f2a4dc723f 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -56,6 +56,8 @@
     OVModelForSpeechSeq2Seq,
     OVStableDiffusionPipeline,
     OVStableDiffusionXLPipeline,
+    OVStableDiffusion3Pipeline,
+    OVFluxPipeline,
     OVQuantizer,
     OVTrainer,
     OVQuantizationConfig,
@@ -300,11 +302,18 @@ class OVWeightCompressionTest(unittest.TestCase):
         (OVModelOpenCLIPForZeroShotImageClassification, "open-clip"),
     )
 
-    SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = (
+    SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = [
         (OVStableDiffusionPipeline, "stable-diffusion", 72, 195),
         (OVStableDiffusionXLPipeline, "stable-diffusion-xl", 84, 331),
         (OVLatentConsistencyModelPipeline, "latent-consistency", 50, 135),
-    )
+    ]
+
+    if is_transformers_version(">=", "4.45.0"):
+        SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION.extend(
+            [
+                (OVStableDiffusion3Pipeline, "stable-diffusion-3", 9, 65),
+            ]
+        )
 
     IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3")
 
@@ -454,7 +463,9 @@ def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_num_f
         with TemporaryDirectory() as tmp_dir:
             model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
 
-            num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(model.unet)
+            num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(
+                model.unet if model.unet is not None else model.transformer
+            )
             self.assertEqual(expected_num_fake_quantize, num_fake_quantize)
             self.assertEqual(expected_ov_int8, num_weight_nodes["int8"])
             self.assertEqual(0, num_weight_nodes["int4"])
@@ -468,7 +479,9 @@ def test_stable_diffusion_with_weight_compression(self):
         quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config))
 
-        num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(int8_pipe.unet)
+        num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(
+            int8_pipe.unet if int8_pipe.unet is not None else int8_pipe.transformer
+        )
         self.assertEqual(0, num_fake_quantize)
         self.assertEqual(242, num_weight_nodes["int8"])
         self.assertEqual(0, num_weight_nodes["int4"])
@@ -487,7 +500,9 @@ def test_ovmodel_hybrid_quantization_with_custom_dataset(
         self.assertEqual(quantization_config.quant_method, OVQuantizationMethod.HYBRID)
 
         quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config), calibration_dataset=dataset)
-        num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(model.unet)
+        num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(
+            model.unet if model.unet is not None else model.transformer
+        )
         self.assertEqual(expected_num_fake_quantize, num_fake_quantize)
         self.assertEqual(expected_ov_int8, num_weight_nodes["int8"])
         self.assertEqual(0, num_weight_nodes["int4"])
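For reference, the hybrid-quantization entry added above, `(OVStableDiffusion3Pipeline, "stable-diffusion-3", 9, 65)`, corresponds to a user flow roughly like the following sketch (model id taken from this PR's test fixtures; dataset choice and sample count are illustrative):

```python
from optimum.intel import OVStableDiffusion3Pipeline, OVWeightQuantizationConfig

# Passing a dataset alongside a weight-only config triggers hybrid
# quantization on diffusion pipelines: int8 weights everywhere plus
# activation quantization in the diffusion network, which is where the
# expected fake-quantize and int8 node counts come from.
quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", num_samples=2)
pipeline = OVStableDiffusion3Pipeline.from_pretrained(
    "yujiepan/stable-diffusion-3-tiny-random", export=True, quantization_config=quantization_config
)
pipeline.save_pretrained("sd3_int8_hybrid")
```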
"stable-diffusion-openvino": "hf-internal-testing/tiny-stable-diffusion-openvino", "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", "stable-diffusion-xl-refiner": "echarlaix/tiny-random-stable-diffusion-xl-refiner", + "stable-diffusion-3": "yujiepan/stable-diffusion-3-tiny-random", "stablelm": "hf-internal-testing/tiny-random-StableLmForCausalLM", "starcoder2": "hf-internal-testing/tiny-random-Starcoder2ForCausalLM", "latent-consistency": "echarlaix/tiny-random-latent-consistency", @@ -170,6 +172,8 @@ "stable-diffusion-xl": (366, 34, 42, 66), "stable-diffusion-xl-refiner": (366, 34, 42, 66), "open-clip": (20, 28), + "stable-diffusion-3": (66, 42, 58, 30), + "flux": (56, 24, 28, 64), }