From c5df4c355b0d0d7977fca7dc3b6d24bfa2b4baac Mon Sep 17 00:00:00 2001 From: eaidova Date: Wed, 25 Sep 2024 17:30:38 +0400 Subject: [PATCH 01/24] WIP: conversion and pipeline base --- notebooks/ipex/text_generation.ipynb | 8 +- .../openvino/optimum_openvino_inference.ipynb | 4 +- .../openvino/quantized_generation_demo.ipynb | 86 +++++----- ...stable_diffusion_hybrid_quantization.ipynb | 25 +-- optimum/exporters/openvino/convert.py | 153 +++++++++++++++--- optimum/exporters/openvino/model_configs.py | 58 ++++++- optimum/intel/openvino/modeling_diffusion.py | 9 +- 7 files changed, 261 insertions(+), 82 deletions(-) diff --git a/notebooks/ipex/text_generation.ipynb b/notebooks/ipex/text_generation.ipynb index d1a62d9201..df46355531 100644 --- a/notebooks/ipex/text_generation.ipynb +++ b/notebooks/ipex/text_generation.ipynb @@ -62,9 +62,13 @@ "source": [ "model = IPEXModelForCausalLM.from_pretrained(\"gpt2\", torch_dtype=torch.bfloat16, export=True)\n", "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n", - "input_sentence = [\"Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?\"]\n", + "input_sentence = [\n", + " \"Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?\"\n", + "]\n", "model_inputs = tokenizer(input_sentence, return_tensors=\"pt\")\n", - "generation_kwargs = dict(max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True)\n", + "generation_kwargs = dict(\n", + " max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True\n", + ")\n", "\n", "generated_ids = model.generate(**model_inputs, **generation_kwargs)\n", "output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]\n", diff --git a/notebooks/openvino/optimum_openvino_inference.ipynb b/notebooks/openvino/optimum_openvino_inference.ipynb index 76c77aec55..7ef14e0635 100644 --- a/notebooks/openvino/optimum_openvino_inference.ipynb +++ b/notebooks/openvino/optimum_openvino_inference.ipynb @@ -466,7 +466,9 @@ "source": [ "# Set the device directly with `.from_pretrained()`\n", "if \"GPU\" in Core().available_devices:\n", - " model = OVModelForQuestionAnswering.from_pretrained(\"distilbert-base-uncased-distilled-squad-ov-fp16\", device=\"GPU\")" + " model = OVModelForQuestionAnswering.from_pretrained(\n", + " \"distilbert-base-uncased-distilled-squad-ov-fp16\", device=\"GPU\"\n", + " )" ] }, { diff --git a/notebooks/openvino/quantized_generation_demo.ipynb b/notebooks/openvino/quantized_generation_demo.ipynb index 5673243cb2..cc5c1ec2b3 100644 --- a/notebooks/openvino/quantized_generation_demo.ipynb +++ b/notebooks/openvino/quantized_generation_demo.ipynb @@ -121,7 +121,7 @@ " \"CACHE_DIR\": os.path.join(save_name, \"model_cache\"), # OpenVINO will use this directory as cache\n", " },\n", " \"compile\": False,\n", - " \"quantization_config\": quantization_config\n", + " \"quantization_config\": quantization_config,\n", "}\n", "\n", "# Check whether the model was already exported\n", @@ -143,8 +143,8 @@ "\n", "# TODO Optional: export to huggingface/hub\n", "\n", - "model_size = os.stat(os.path.join(save_name, \"openvino_model.bin\")).st_size / 1024 ** 3\n", - "print(f'Model size in FP32: ~5.4GB, current model size in 4bit: {model_size:.2f}GB')" + "model_size = os.stat(os.path.join(save_name, \"openvino_model.bin\")).st_size / 1024**3\n", + "print(f\"Model size in FP32: ~5.4GB, current 
model size in 4bit: {model_size:.2f}GB\")" ] }, { @@ -212,7 +212,7 @@ "from transformers import TextStreamer\n", "\n", "# Tokenize the sample\n", - "inputs = tokenizer([sample], return_tensors='pt')\n", + "inputs = tokenizer([sample], return_tensors=\"pt\")\n", "\n", "# Call generate on the inputs\n", "out = model.generate(\n", @@ -294,7 +294,7 @@ "\n", "\n", "# Tokenize the sample\n", - "inputs = tokenizer([sample], return_tensors='pt') \n", + "inputs = tokenizer([sample], return_tensors=\"pt\")\n", "\n", "out = stateless_model.generate(\n", " **inputs,\n", @@ -302,7 +302,7 @@ " streamer=TextStreamer(tokenizer=tokenizer, skip_special_tokens=True),\n", " pad_token_id=tokenizer.eos_token_id,\n", " prompt_lookup_num_tokens=3,\n", - ") " + ")" ] }, { @@ -358,7 +358,7 @@ " \"CACHE_DIR\": os.path.join(save_name, \"model_cache\"), # OpenVINO will use this directory as cache\n", " },\n", " \"compile\": False,\n", - " \"quantization_config\": quantization_config\n", + " \"quantization_config\": quantization_config,\n", "}\n", "\n", "# Check whether the model was already exported\n", @@ -458,15 +458,15 @@ " if len(self.seq_lens) > 0 or len(self.win_sizes) > 0:\n", " raise RuntimeError(\"Always use a new instance, don't reuse!\")\n", " self.model_forward = self.model.forward\n", - " \n", + "\n", " @wraps(self.model_forward)\n", " def forward_wrapper(**kwargs):\n", " self.seq_lens[-1].append(kwargs.get(\"attention_mask\").shape[-1])\n", " self.win_sizes[-1].append(kwargs.get(\"input_ids\").shape[-1] - 1)\n", " return self.model_forward(**kwargs)\n", - " \n", + "\n", " self.model.forward = forward_wrapper\n", - " \n", + "\n", " # wrap generate method\n", " self.model_generate = self.model.generate\n", "\n", @@ -479,10 +479,11 @@ " out = self.model_generate(*args, **kwargs)\n", " self.seq_lens[-1].append(out.shape[-1])\n", " return out\n", + "\n", " self.model.generate = generate_wrapper\n", " return self\n", "\n", - " def __exit__(self, type, value, traceback):\n", + " def __exit__(self, type, value, traceback):\n", " self.model.forward = self.model_forward\n", " self.model.generate = self.model_generate\n", " self.model_forward = None\n", @@ -494,7 +495,7 @@ " self.seq_lens = [sl[1:] for sl in self.seq_lens]\n", " # Add window size for output to ease calculation later\n", " for ws, sl in zip(self.win_sizes, self.seq_lens):\n", - " ws.append(0) \n", + " ws.append(0)\n", "\n", " def acceptance_rate(self, return_mean=True, normalize=False):\n", " # ar_per_win = ((cur_seq_len - cur_win_size) - (prev_seq_len - prev_win_size) - 1) / prev_win_size\n", @@ -503,9 +504,8 @@ " sl = np.array(sl, dtype=np.float64)\n", " ws = np.array(ws, dtype=np.float64)\n", " out_lens = sl - ws\n", - " accepted = (out_lens[1:] - out_lens[:-1] - 1)\n", - " ar_per_win.append(np.divide(accepted, ws[:-1],\n", - " out=np.zeros_like(accepted),where=ws[:-1] != 0))\n", + " accepted = out_lens[1:] - out_lens[:-1] - 1\n", + " ar_per_win.append(np.divide(accepted, ws[:-1], out=np.zeros_like(accepted), where=ws[:-1] != 0))\n", " ar_per_win = np.hstack(ar_per_win)\n", " # Normalized AR doesn't take into account windows with size 0\n", " if normalize:\n", @@ -544,7 +544,7 @@ "samples_number = 30\n", "with AcceptanceRateRecorder(stateless_model) as ar_recorder:\n", " for text in tqdm(dataset[:samples_number]):\n", - " tokenized_prompt = tokenizer([prompt_template.format(text=text)], return_tensors='pt')\n", + " tokenized_prompt = tokenizer([prompt_template.format(text=text)], return_tensors=\"pt\")\n", " stateless_model.generate(\n", " 
**tokenized_prompt,\n", " max_new_tokens=128,\n", @@ -623,7 +623,6 @@ " return False\n", "\n", "\n", - "\n", "# Set the chat template to the tokenizer. The chat template implements the simple template of\n", "# User: content\n", "# Assistant: content\n", @@ -651,11 +650,7 @@ " if model_msg:\n", " messages.append({\"role\": \"Assistant\", \"content\": model_msg})\n", " input_token = tokenizer.apply_chat_template(\n", - " messages,\n", - " add_generation_prompt=True,\n", - " tokenize=True,\n", - " return_tensors=\"pt\",\n", - " return_dict=True\n", + " messages, add_generation_prompt=True, tokenize=True, return_tensors=\"pt\", return_dict=True\n", " )\n", " return input_token\n", "\n", @@ -679,18 +674,18 @@ " # Construct the input message string for the model by concatenating the current system message and conversation history\n", " # Tokenize the messages string\n", " inputs = prepare_history_for_model(history)\n", - " input_length = inputs['input_ids'].shape[1]\n", + " input_length = inputs[\"input_ids\"].shape[1]\n", " # truncate input in case it is too long.\n", " # TODO improve this\n", " if input_length > 2000:\n", " history = [history[-1]]\n", " inputs = prepare_history_for_model(history)\n", - " input_length = inputs['input_ids'].shape[1]\n", + " input_length = inputs[\"input_ids\"].shape[1]\n", "\n", " prompt_char = \"▌\"\n", " history[-1][1] = prompt_char\n", " yield history, \"Status: Generating...\", *([gr.update(interactive=False)] * 4)\n", - " \n", + "\n", " streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n", "\n", " # Create a stopping criteria to prevent the model from playing the role of the user aswell.\n", @@ -706,11 +701,14 @@ " eos_token_id=[tokenizer.eos_token_id],\n", " pad_token_id=tokenizer.eos_token_id,\n", " )\n", - " generate_kwargs = dict(\n", - " streamer=streamer,\n", - " generation_config=generation_config,\n", - " stopping_criteria=stopping_criteria,\n", - " ) | inputs\n", + " generate_kwargs = (\n", + " dict(\n", + " streamer=streamer,\n", + " generation_config=generation_config,\n", + " stopping_criteria=stopping_criteria,\n", + " )\n", + " | inputs\n", + " )\n", "\n", " if assisted:\n", " target_generate = stateless_model.generate\n", @@ -737,7 +735,7 @@ " yield history, \"Status: Generating...\", *([gr.update(interactive=False)] * 4)\n", " history[-1][1] = partial_text\n", " generation_time = time.perf_counter() - start\n", - " yield history, f'Generation time: {generation_time:.2f} sec', *([gr.update(interactive=True)] * 4)" + " yield history, f\"Generation time: {generation_time:.2f} sec\", *([gr.update(interactive=True)] * 4)" ] }, { @@ -781,7 +779,9 @@ " [\"Can you explain to me briefly what is Python programming language?\"],\n", " [\"Explain the plot of Cinderella in a sentence.\"],\n", " [\"Write a Python function to perform binary search over a sorted list. Use markdown to write code\"],\n", - " [\"Lily has a rubber ball that she drops from the top of a wall. The wall is 2 meters tall. How long will it take for the ball to reach the ground?\"],\n", + " [\n", + " \"Lily has a rubber ball that she drops from the top of a wall. The wall is 2 meters tall. 
How long will it take for the ball to reach the ground?\"\n", + " ],\n", "]\n", "\n", "\n", @@ -797,7 +797,7 @@ " \"\"\"\n", " # Append current user message to history with a blank assistant message which will be generated by the model\n", " history.append([message, None])\n", - " return ('', history)\n", + " return (\"\", history)\n", "\n", "\n", "def prepare_for_regenerate(history):\n", @@ -808,7 +808,7 @@ " history: conversation history\n", " Returns:\n", " updated history\n", - " \"\"\" \n", + " \"\"\"\n", " history[-1][1] = None\n", " return history\n", "\n", @@ -821,7 +821,7 @@ " msg = gr.Textbox(placeholder=\"Enter message here...\", show_label=False, autofocus=True, scale=75)\n", " status = gr.Textbox(\"Status: Idle\", show_label=False, max_lines=1, scale=15)\n", " with gr.Row():\n", - " submit = gr.Button(\"Submit\", variant='primary')\n", + " submit = gr.Button(\"Submit\", variant=\"primary\")\n", " regenerate = gr.Button(\"Regenerate\")\n", " clear = gr.Button(\"Clear\")\n", " with gr.Accordion(\"Advanced Options:\", open=False):\n", @@ -860,9 +860,7 @@ " step=0.1,\n", " interactive=True,\n", " )\n", - " gr.Examples(\n", - " EXAMPLES, inputs=msg, label=\"Click on any example and press the 'Submit' button\"\n", - " )\n", + " gr.Examples(EXAMPLES, inputs=msg, label=\"Click on any example and press the 'Submit' button\")\n", "\n", " # Sets generate function to be triggered when the user submit a new message\n", " gr.on(\n", @@ -876,20 +874,14 @@ " inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty, assisted],\n", " outputs=[chatbot, status, msg, submit, regenerate, clear],\n", " concurrency_limit=1,\n", - " queue=True\n", - " )\n", - " regenerate.click(\n", - " fn=prepare_for_regenerate,\n", - " inputs=chatbot,\n", - " outputs=chatbot,\n", " queue=True,\n", - " concurrency_limit=1\n", - " ).then(\n", + " )\n", + " regenerate.click(fn=prepare_for_regenerate, inputs=chatbot, outputs=chatbot, queue=True, concurrency_limit=1).then(\n", " fn=generate,\n", " inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty, assisted],\n", " outputs=[chatbot, status, msg, submit, regenerate, clear],\n", " concurrency_limit=1,\n", - " queue=True\n", + " queue=True,\n", " )\n", " clear.click(fn=lambda: (None, \"Status: Idle\"), inputs=None, outputs=[chatbot, status], queue=False)" ] diff --git a/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb b/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb index 8ef2e8ad6c..d89457bd78 100644 --- a/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb +++ b/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb @@ -167,6 +167,7 @@ "def preprocess_fn(example):\n", " return {\"prompt\": example[\"caption\"]}\n", "\n", + "\n", "NUM_SAMPLES = 200\n", "dataset = dataset.take(NUM_SAMPLES)\n", "calibration_dataset = dataset.map(lambda x: preprocess_fn(x), remove_columns=dataset.column_names)" @@ -1066,12 +1067,14 @@ ], "source": [ "int8_pipe = OVStableDiffusionPipeline.from_pretrained(model_id=MODEL_ID, export=True)\n", - "quantization_config = OVWeightQuantizationConfig(bits=8, num_samples=NUM_SAMPLES, quant_method=OVQuantizationMethod.HYBRID)\n", + "quantization_config = OVWeightQuantizationConfig(\n", + " bits=8, num_samples=NUM_SAMPLES, quant_method=OVQuantizationMethod.HYBRID\n", + ")\n", "quantizer = OVQuantizer(int8_pipe)\n", "quantizer.quantize(\n", " ov_config=OVConfig(quantization_config=quantization_config),\n", " calibration_dataset=calibration_dataset,\n", - " 
save_directory=int8_model_path\n", + " save_directory=int8_model_path,\n", ")" ] }, @@ -1202,8 +1205,10 @@ " im_w, im_h = fp32_img.size\n", " is_horizontal = im_h <= im_w\n", " figsize = (20, 30) if is_horizontal else (30, 20)\n", - " fig, axs = plt.subplots(1 if is_horizontal else 2, 2 if is_horizontal else 1, figsize=figsize, sharex='all', sharey='all')\n", - " fig.patch.set_facecolor('white')\n", + " fig, axs = plt.subplots(\n", + " 1 if is_horizontal else 2, 2 if is_horizontal else 1, figsize=figsize, sharex=\"all\", sharey=\"all\"\n", + " )\n", + " fig.patch.set_facecolor(\"white\")\n", " list_axes = list(axs.flat)\n", " for a in list_axes:\n", " a.set_xticklabels([])\n", @@ -1217,7 +1222,7 @@ " img2_title = \"INT8 result\"\n", " list_axes[0].set_title(img1_title, fontsize=20)\n", " list_axes[1].set_title(img2_title, fontsize=20)\n", - " fig.subplots_adjust(wspace=0.0 if is_horizontal else 0.01 , hspace=0.01 if is_horizontal else 0.0)\n", + " fig.subplots_adjust(wspace=0.0 if is_horizontal else 0.01, hspace=0.01 if is_horizontal else 0.0)\n", " fig.tight_layout()" ] }, @@ -1230,13 +1235,10 @@ "source": [ "prompt = \"Self-portrait oil painting, a beautiful cyborg with golden hair, 8k\"\n", "\n", + "\n", "def generate_image(pipeline, prompt):\n", " transformers.set_seed(1)\n", - " return pipeline(\n", - " prompt=prompt,\n", - " guidance_scale=8.0,\n", - " output_type=\"pil\"\n", - " ).images[0]" + " return pipeline(prompt=prompt, guidance_scale=8.0, output_type=\"pil\").images[0]" ] }, { @@ -1329,7 +1331,7 @@ "def get_model_size(model_folder, framework):\n", " \"\"\"\n", " Return OpenVINO or PyTorch model size in Mb.\n", - " \n", + "\n", " Arguments:\n", " model_folder:\n", " Directory containing a model.\n", @@ -1531,6 +1533,7 @@ "def get_val_dataset(num_items=3):\n", " return [item[\"caption\"] for item in dataset.take(num_items)]\n", "\n", + "\n", "def benchmark(pipeline, dataset):\n", " \"\"\"\n", " Benchmark PyTorch or OpenVINO model. 
This function does inference on `num_items`\n", diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 4e6503b5bd..a62c9687cf 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -18,6 +18,7 @@ import os from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union +import copy import onnx from transformers.generation import GenerationMixin @@ -31,7 +32,7 @@ from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed from optimum.exporters.onnx.convert import export_pytorch as export_pytorch_to_onnx from optimum.exporters.onnx.convert import export_tensorflow as export_tensorflow_onnx -from optimum.exporters.utils import _get_submodels_and_export_configs as _default_get_submodels_and_export_configs +from optimum.exporters.utils import _get_submodels_and_export_configs , get_diffusion_models_for_export, _get_submodels_for_export_diffusion from optimum.intel.utils.import_utils import ( _nncf_version, _open_clip_version, @@ -347,6 +348,7 @@ def export_pytorch( # Check that inputs match, and order them properly dummy_inputs = config.generate_dummy_inputs(framework="pt", **input_shapes) + logger.info(dummy_inputs) device = torch.device(device) if device.type == "cuda" and torch.cuda.is_available(): model.to(device) @@ -618,23 +620,26 @@ def export_from_model( model, library_name, task, preprocessors, custom_export_configs, fn_get_submodels ) - logging.disable(logging.INFO) - export_config, models_and_export_configs, stateful_submodels = _get_submodels_and_export_configs( - model=model, - task=task, - monolith=False, - custom_export_configs=custom_export_configs if custom_export_configs is not None else {}, - custom_architecture=custom_architecture, - fn_get_submodels=fn_get_submodels, - preprocessors=preprocessors, - library_name=library_name, - model_kwargs=model_kwargs, - _variant="default", - legacy=False, - exporter="openvino", - stateful=stateful, - ) - logging.disable(logging.NOTSET) + if library_name == "diffusers": + export_config, models_and_export_configs = get_diffusion_models_for_export_ext(model, exporter="openvino") + else: + logging.disable(logging.INFO) + export_config, models_and_export_configs, stateful_submodels = _get_submodels_and_export_configs( + model=model, + task=task, + monolith=False, + custom_export_configs=custom_export_configs if custom_export_configs is not None else {}, + custom_architecture=custom_architecture, + fn_get_submodels=fn_get_submodels, + preprocessors=preprocessors, + library_name=library_name, + model_kwargs=model_kwargs, + _variant="default", + legacy=False, + exporter="openvino", + stateful=stateful + ) + logging.disable(logging.NOTSET) if library_name == "open_clip": if hasattr(model.config, "save_pretrained"): @@ -699,6 +704,10 @@ def export_from_model( tokenizer_2 = getattr(model, "tokenizer_2", None) if tokenizer_2 is not None: tokenizer_2.save_pretrained(output.joinpath("tokenizer_2")) + + tokenizer_3 = getattr(model, "tokenizer_3", None) + if tokenizer_3 is not None: + tokenizer_3.save_pretrained(output.joinpath("tokenizer_3")) model.save_config(output) @@ -888,3 +897,111 @@ def _get_submodels_and_export_configs( ) stateful_per_model = [stateful] * len(models_for_export) return export_config, models_for_export, stateful_per_model + + +def get_diffusion_models_for_export_ext( + pipeline: "DiffusionPipeline", + int_dtype: str = "int64", + float_dtype: str = "fp32", + exporter: str = "openvino"): + 
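    # --- editorial sketch (illustrative, not part of this commit) -------------
    # Rough shape of how this helper is meant to be consumed, mirroring the
    # `get_diffusion_models_for_export_ext(model, exporter="openvino")` call
    # added to export_from_model() above. The checkpoint id is hypothetical:
    #
    #     from diffusers import StableDiffusion3Pipeline
    #     pipe = StableDiffusion3Pipeline.from_pretrained(
    #         "stabilityai/stable-diffusion-3-medium-diffusers"  # hypothetical id
    #     )
    #     export_config, models_for_export = get_diffusion_models_for_export_ext(pipe)
    #     for name, (submodel, sub_config) in models_for_export.items():
    #         ...  # "transformer", "vae_encoder", ... -> (torch module, export config)
    #
    # Note: in this WIP commit the SD3 branch below returns a third element (a
    # stateful flag) that the two-value unpack in export_from_model() does not
    # yet expect.
    # ---------------------------------------------------------------------------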
+ try: + from diffusers import StableDiffusion3Pipeline, StableDiffusion3Img2ImgPipeline, StableDiffusion3InpaintPipeline + is_sd3 = isinstance(pipeline, (StableDiffusion3Pipeline, StableDiffusion3InpaintPipeline, StableDiffusion3Img2ImgPipeline)) + except ImportError: + is_sd3 = False + + if not is_sd3: + return get_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter) + + models_for_export = {} + + # Text encoder + text_encoder = getattr(pipeline, "text_encoder", None) + if text_encoder is not None: + text_encoder.config.output_hidden_states = True + text_encoder_config_constructor = TasksManager.get_exporter_config_constructor( + model=text_encoder, + exporter=exporter, + library_name="diffusers", + task="feature-extraction", + ) + text_encoder_export_config = text_encoder_config_constructor( + pipeline.text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["text_encoder"] = (text_encoder, text_encoder_export_config) + + + transformer = pipeline.transformer + transformer.config.text_encoder_projection_dim = transformer.config.joint_attention_dim + transformer.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False) + transformer.config.time_cond_proj_dim = None + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=transformer, + exporter=exporter, + library_name="diffusers", + task="semantic-segmentation", + model_type="transformer", + ) + transformer_export_config = export_config_constructor(pipeline.transformer.config, int_dtype=int_dtype, float_dtype=float_dtype) + models_for_export["trasnformer"] = (transformer, transformer_export_config) + + + # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565 + vae_encoder = copy.deepcopy(pipeline.vae) + vae_encoder.forward = lambda sample: {"latent_sample": vae_encoder.encode(x=sample)["latent_dist"].sample()} + vae_config_constructor = TasksManager.get_exporter_config_constructor( + model=vae_encoder, + exporter=exporter, + library_name="diffusers", + task="semantic-segmentation", + model_type="vae-encoder", + ) + vae_encoder_export_config = vae_config_constructor(vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype) + models_for_export["vae_encoder"] = (vae_encoder, vae_encoder_export_config) + + + # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600 + vae_decoder = copy.deepcopy(pipeline.vae) + vae_decoder.forward = lambda latent_sample: vae_decoder.decode(z=latent_sample) + vae_config_constructor = TasksManager.get_exporter_config_constructor( + model=vae_decoder, + exporter=exporter, + library_name="diffusers", + task="semantic-segmentation", + model_type="vae-decoder", + ) + vae_decoder_export_config = vae_config_constructor(vae_decoder.config, int_dtype=int_dtype, float_dtype=float_dtype) + models_for_export["vae_decoder"] = (vae_decoder, vae_decoder_export_config) + + text_encoder_2 = getattr(pipeline, "text_encoder_2", None) + if text_encoder_2 is not None: + text_encoder_2.config.output_hidden_states = True + text_encoder_2.text_model.config.output_hidden_states = True + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=text_encoder_2, + exporter=exporter, + library_name="diffusers", + task="feature-extraction", + model_type="clip-text-with-projection", + ) + export_config = export_config_constructor( + text_encoder_2.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + 
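        # --- editorial note (illustrative, not part of this commit) ------------
        # Setting `output_hidden_states=True` above makes the exported CLIP
        # encoder expose its per-layer hidden states: SD3 builds its prompt
        # embedding from a penultimate hidden state plus the pooled projection,
        # not from the final layer output alone. A quick eager-mode sanity
        # check (sketch, assuming a CLIPTextModelWithProjection-style output):
        #
        #     import torch
        #     out = text_encoder_2(
        #         input_ids=torch.ones((1, 77), dtype=torch.long),
        #         output_hidden_states=True,
        #     )
        #     assert out.hidden_states is not None  # tuple of per-layer states
        #     assert out.text_embeds is not None    # pooled projection
        # ------------------------------------------------------------------------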
models_for_export["text_encoder_2"] = (text_encoder_2, export_config) + + text_encoder_3 = getattr(pipeline, "text_encoder_3", None) + if text_encoder_3 is not None: + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=text_encoder_3, + exporter=exporter, + library_name="diffusers", + task="feature-extraction", + model_type="clip-text-with-projection", + ) + export_config = export_config_constructor( + text_encoder_3.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["text_encoder_3"] = (text_encoder_3, export_config) + + return None, models_for_export, False diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 33190e6f1c..f16a31f8a9 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -15,6 +15,7 @@ import enum from copy import deepcopy from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +import random from packaging import version from transformers import PretrainedConfig, PreTrainedModel, TFPreTrainedModel @@ -36,6 +37,7 @@ MPTOnnxConfig, PhiOnnxConfig, VisionOnnxConfig, + UNetOnnxConfig ) from optimum.exporters.onnx.model_patcher import ModelPatcher from optimum.exporters.tasks import TasksManager @@ -48,7 +50,7 @@ FalconDummyPastKeyValuesGenerator, MistralDummyPastKeyValuesGenerator, ) -from optimum.utils.normalized_config import NormalizedTextConfig, NormalizedVisionConfig +from optimum.utils.normalized_config import NormalizedTextConfig, NormalizedVisionConfig, NormalizedConfig from ...intel.utils.import_utils import _transformers_version, is_transformers_version from .model_patcher import ( @@ -1570,3 +1572,57 @@ def patch_model_for_export( if self._behavior != InternVLChatConfigBehavior.VISION_EMBEDDINGS: return super().patch_model_for_export(model, model_kwargs) return InternVLChatImageEmbeddingModelPatcher(self, model, model_kwargs) + + + +class PooledProjectionsDummyInputGenerator(DummyInputGenerator): + SUPPORTED_INPUT_NAMES = ( + "pooled_projection" + ) + + def __init__( + self, + task: str, + normalized_config: NormalizedConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + random_batch_size_range: Optional[Tuple[int, int]] = None, + **kwargs, + ): + self.task = task + if random_batch_size_range: + low, high = random_batch_size_range + self.batch_size = random.randint(low, high) + else: + self.batch_size = batch_size + self.pooled_projection_dim = normalized_config.config.pooled_projection_dim + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + + shape = [self.batch_size, self.pooled_projection_dim] + return self.random_float_tensor(shape, framework=framework, dtype=float_dtype) + + +@register_in_tasks_manager("transformer", *["semantic-segmentation"], library_name="diffusers") +class TransformerOpenVINOConfig(UNetOnnxConfig): + DUMMY_INPUT_GENERATOR_CLASSES = UNetOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES + (PooledProjectionsDummyInputGenerator,) + NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args( + image_size="sample_size", + num_channels="in_channels", + hidden_size="joint_attention_dim", + vocab_size="attention_head_dim", + allow_new=True, + ) + + @property + def inputs(self): + common_inputs = super().inputs + common_inputs["pooled_projections"] = {0: "batch_size"} + return common_inputs + + def rename_ambiguous_inputs(self, inputs): + # The input name in the model signature is `x, hence the export 
input name is updated. + hidden_states = inputs.pop("sample", None) + if hidden_states is not None: + inputs["hidden_states"] = hidden_states + print(inputs) + return inputs diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 68dc31bc90..5f251e8595 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -61,6 +61,9 @@ DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, ) +DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER = "transformer" +DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER = "text_encoder_3" + from ...exporters.openvino import main_export from ..utils.import_utils import is_diffusers_version from .configuration import OVConfig, OVQuantizationMethod, OVWeightQuantizationConfig @@ -511,14 +514,16 @@ def to(self, *args, device: Optional[str] = None, dtype: Optional[torch.dtype] = @property def height(self) -> int: - height = self.unet.model.inputs[0].get_partial_shape()[2] + model = self.unet.model if self.unet is not None else self.transformer.model + height = model.inputs[0].get_partial_shape()[2] if height.is_dynamic: return -1 return height.get_length() * self.vae_scale_factor @property def width(self) -> int: - width = self.unet.model.inputs[0].get_partial_shape()[3] + model = self.unet.model if self.unet is not None else self.transformer.model + width = model.inputs[0].get_partial_shape()[3] if width.is_dynamic: return -1 return width.get_length() * self.vae_scale_factor From b0fc176b297f2d2360e32ddf31f74524efb6a283 Mon Sep 17 00:00:00 2001 From: eaidova Date: Thu, 26 Sep 2024 14:19:00 +0400 Subject: [PATCH 02/24] Support SD3 --- optimum/exporters/openvino/convert.py | 61 +++++++++++--------- optimum/exporters/openvino/model_configs.py | 15 ++--- optimum/intel/openvino/modeling_diffusion.py | 48 ++++++++++++++- optimum/intel/utils/modeling_utils.py | 15 +++-- 4 files changed, 96 insertions(+), 43 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index a62c9687cf..60fb6c223a 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import copy import functools import gc import logging import os from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union -import copy import onnx from transformers.generation import GenerationMixin @@ -32,7 +32,10 @@ from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed from optimum.exporters.onnx.convert import export_pytorch as export_pytorch_to_onnx from optimum.exporters.onnx.convert import export_tensorflow as export_tensorflow_onnx -from optimum.exporters.utils import _get_submodels_and_export_configs , get_diffusion_models_for_export, _get_submodels_for_export_diffusion +from optimum.exporters.utils import ( + _get_submodels_and_export_configs as _default_get_submodels_and_export_configs, + get_diffusion_models_for_export, +) from optimum.intel.utils.import_utils import ( _nncf_version, _open_clip_version, @@ -348,7 +351,6 @@ def export_pytorch( # Check that inputs match, and order them properly dummy_inputs = config.generate_dummy_inputs(framework="pt", **input_shapes) - logger.info(dummy_inputs) device = torch.device(device) if device.type == "cuda" and torch.cuda.is_available(): model.to(device) @@ -704,7 +706,7 @@ def export_from_model( tokenizer_2 = getattr(model, "tokenizer_2", None) if tokenizer_2 is not None: tokenizer_2.save_pretrained(output.joinpath("tokenizer_2")) - + tokenizer_3 = getattr(model, "tokenizer_3", None) if tokenizer_3 is not None: tokenizer_3.save_pretrained(output.joinpath("tokenizer_3")) @@ -900,38 +902,43 @@ def _get_submodels_and_export_configs( def get_diffusion_models_for_export_ext( - pipeline: "DiffusionPipeline", - int_dtype: str = "int64", - float_dtype: str = "fp32", - exporter: str = "openvino"): - + pipeline: "DiffusionPipeline", int_dtype: str = "int64", float_dtype: str = "fp32", exporter: str = "openvino" +): try: - from diffusers import StableDiffusion3Pipeline, StableDiffusion3Img2ImgPipeline, StableDiffusion3InpaintPipeline - is_sd3 = isinstance(pipeline, (StableDiffusion3Pipeline, StableDiffusion3InpaintPipeline, StableDiffusion3Img2ImgPipeline)) + from diffusers import ( + StableDiffusion3Img2ImgPipeline, + StableDiffusion3InpaintPipeline, + StableDiffusion3Pipeline, + ) + + is_sd3 = isinstance( + pipeline, (StableDiffusion3Pipeline, StableDiffusion3InpaintPipeline, StableDiffusion3Img2ImgPipeline) + ) except ImportError: is_sd3 = False - + if not is_sd3: return get_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter) - + models_for_export = {} # Text encoder text_encoder = getattr(pipeline, "text_encoder", None) if text_encoder is not None: text_encoder.config.output_hidden_states = True + text_encoder.text_model.config.output_hidden_states = True text_encoder_config_constructor = TasksManager.get_exporter_config_constructor( model=text_encoder, exporter=exporter, library_name="diffusers", task="feature-extraction", + model_type="clip-text-with-projection", ) text_encoder_export_config = text_encoder_config_constructor( pipeline.text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype ) models_for_export["text_encoder"] = (text_encoder, text_encoder_export_config) - transformer = pipeline.transformer transformer.config.text_encoder_projection_dim = transformer.config.joint_attention_dim transformer.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False) @@ -943,9 +950,10 @@ def get_diffusion_models_for_export_ext( task="semantic-segmentation", model_type="transformer", ) - 
transformer_export_config = export_config_constructor(pipeline.transformer.config, int_dtype=int_dtype, float_dtype=float_dtype) - models_for_export["trasnformer"] = (transformer, transformer_export_config) - + transformer_export_config = export_config_constructor( + pipeline.transformer.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["transformer"] = (transformer, transformer_export_config) # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565 vae_encoder = copy.deepcopy(pipeline.vae) @@ -957,10 +965,11 @@ def get_diffusion_models_for_export_ext( task="semantic-segmentation", model_type="vae-encoder", ) - vae_encoder_export_config = vae_config_constructor(vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype) + vae_encoder_export_config = vae_config_constructor( + vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) models_for_export["vae_encoder"] = (vae_encoder, vae_encoder_export_config) - # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600 vae_decoder = copy.deepcopy(pipeline.vae) vae_decoder.forward = lambda latent_sample: vae_decoder.decode(z=latent_sample) @@ -971,7 +980,9 @@ def get_diffusion_models_for_export_ext( task="semantic-segmentation", model_type="vae-decoder", ) - vae_decoder_export_config = vae_config_constructor(vae_decoder.config, int_dtype=int_dtype, float_dtype=float_dtype) + vae_decoder_export_config = vae_config_constructor( + vae_decoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) models_for_export["vae_decoder"] = (vae_decoder, vae_decoder_export_config) text_encoder_2 = getattr(pipeline, "text_encoder_2", None) @@ -985,11 +996,9 @@ def get_diffusion_models_for_export_ext( task="feature-extraction", model_type="clip-text-with-projection", ) - export_config = export_config_constructor( - text_encoder_2.config, int_dtype=int_dtype, float_dtype=float_dtype - ) + export_config = export_config_constructor(text_encoder_2.config, int_dtype=int_dtype, float_dtype=float_dtype) models_for_export["text_encoder_2"] = (text_encoder_2, export_config) - + text_encoder_3 = getattr(pipeline, "text_encoder_3", None) if text_encoder_3 is not None: export_config_constructor = TasksManager.get_exporter_config_constructor( @@ -999,9 +1008,7 @@ def get_diffusion_models_for_export_ext( task="feature-extraction", model_type="clip-text-with-projection", ) - export_config = export_config_constructor( - text_encoder_3.config, int_dtype=int_dtype, float_dtype=float_dtype - ) + export_config = export_config_constructor(text_encoder_3.config, int_dtype=int_dtype, float_dtype=float_dtype) models_for_export["text_encoder_3"] = (text_encoder_3, export_config) return None, models_for_export, False diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index f16a31f8a9..b5e19f4c60 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -13,9 +13,9 @@ # limitations under the License. 
import enum +import random from copy import deepcopy from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union -import random from packaging import version from transformers import PretrainedConfig, PreTrainedModel, TFPreTrainedModel @@ -37,7 +37,6 @@ MPTOnnxConfig, PhiOnnxConfig, VisionOnnxConfig, - UNetOnnxConfig ) from optimum.exporters.onnx.model_patcher import ModelPatcher from optimum.exporters.tasks import TasksManager @@ -50,7 +49,7 @@ FalconDummyPastKeyValuesGenerator, MistralDummyPastKeyValuesGenerator, ) -from optimum.utils.normalized_config import NormalizedTextConfig, NormalizedVisionConfig, NormalizedConfig +from optimum.utils.normalized_config import NormalizedConfig, NormalizedTextConfig, NormalizedVisionConfig from ...intel.utils.import_utils import _transformers_version, is_transformers_version from .model_patcher import ( @@ -1576,9 +1575,7 @@ def patch_model_for_export( class PooledProjectionsDummyInputGenerator(DummyInputGenerator): - SUPPORTED_INPUT_NAMES = ( - "pooled_projection" - ) + SUPPORTED_INPUT_NAMES = "pooled_projection" def __init__( self, @@ -1597,14 +1594,15 @@ def __init__( self.pooled_projection_dim = normalized_config.config.pooled_projection_dim def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): - shape = [self.batch_size, self.pooled_projection_dim] return self.random_float_tensor(shape, framework=framework, dtype=float_dtype) @register_in_tasks_manager("transformer", *["semantic-segmentation"], library_name="diffusers") class TransformerOpenVINOConfig(UNetOnnxConfig): - DUMMY_INPUT_GENERATOR_CLASSES = UNetOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES + (PooledProjectionsDummyInputGenerator,) + DUMMY_INPUT_GENERATOR_CLASSES = UNetOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES + ( + PooledProjectionsDummyInputGenerator, + ) NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args( image_size="sample_size", num_channels="in_channels", @@ -1624,5 +1622,4 @@ def rename_ambiguous_inputs(self, inputs): hidden_states = inputs.pop("sample", None) if hidden_states is not None: inputs["hidden_states"] = hidden_states - print(inputs) return inputs diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 5f251e8595..85a581d6e4 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -61,6 +61,7 @@ DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, ) + DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER = "transformer" DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER = "text_encoder_3" @@ -85,6 +86,11 @@ else: from diffusers.models.vae import DiagonalGaussianDistribution +if is_diffusers_version(">=", "0.29.0"): + from diffusers import StableDiffusion3Img2ImgPipeline, StableDiffusion3InpaintPipeline, StableDiffusion3Pipeline +else: + StableDiffusion3Pipeline, StableDiffusion3InpaintPipeline, StableDiffusion3Img2ImgPipeline = None, None, None + core = Core() @@ -515,7 +521,7 @@ def to(self, *args, device: Optional[str] = None, dtype: Optional[torch.dtype] = @property def height(self) -> int: model = self.unet.model if self.unet is not None else self.transformer.model - height = model.inputs[0].get_partial_shape()[2] + height = model.inputs[0].get_partial_shape()[2] if height.is_dynamic: return -1 return height.get_length() * self.vae_scale_factor @@ -582,6 +588,46 @@ def _reshape_unet( model.reshape(shapes) return model + def _reshape_transformer( + self, + model: openvino.runtime.Model, + batch_size: int = -1, + height: int = 
-1, + width: int = -1, + num_images_per_prompt: int = -1, + tokenizer_max_length: int = -1, + ): + if batch_size == -1 or num_images_per_prompt == -1: + batch_size = -1 + else: + batch_size *= num_images_per_prompt + + height = height // self.vae_scale_factor if height > 0 else height + width = width // self.vae_scale_factor if width > 0 else width + shapes = {} + for inputs in model.inputs: + shapes[inputs] = inputs.get_partial_shape() + if inputs.get_any_name() == "timestep": + shapes[inputs][0] = 1 + elif inputs.get_any_name() == "hidden_states": + in_channels = self.transformer.config.get("in_channels", None) + if in_channels is None: + in_channels = shapes[inputs][1] + if in_channels.is_dynamic: + logger.warning( + "Could not identify `in_channels` from the unet configuration, to statically reshape the unet please provide a configuration." + ) + self.is_dynamic = True + + shapes[inputs] = [batch_size, in_channels, height, width] + elif inputs.get_any_name() == "pooled_projections": + shapes[inputs] = [batch_size, self.transformer.config["pooled_projection_dim"]] + else: + shapes[inputs][0] = batch_size + shapes[inputs][1] = tokenizer_max_length * 2 + model.reshape(shapes) + return model + def _reshape_text_encoder( self, model: openvino.runtime.Model, batch_size: int = -1, tokenizer_max_length: int = -1 ): diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index a05efc46c7..a39957bbf7 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -123,17 +123,20 @@ def _find_files_matching_pattern( str(model_name_or_path), subfolder=subfolder, revision=revision, token=token ) if library_name == "diffusers": - subfolder = os.path.join(subfolder, "unet") + subfolders = [os.path.join(subfolder, "unet"), os.path.join(subfolder, "transformer")] else: - subfolder = subfolder or "." 
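# --- editorial note (illustrative, not part of this commit) -------------------
# SD3 checkpoints store their denoiser under "transformer/" rather than "unet/",
# so the pattern search below now scans a list of candidate subfolders instead
# of a single one. A hedged sketch of the intended call, assuming this helper
# accepts a `library_name` argument as its body suggests; the model id and
# regex are illustrative only:
#
#     _find_files_matching_pattern(
#         "stabilityai/stable-diffusion-3-medium-diffusers",  # hypothetical id
#         pattern=r".*openvino.*\.xml",
#         library_name="diffusers",
#     )
# -------------------------------------------------------------------------------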
+ subfolders = [subfolder or "."] if model_path.is_dir(): - glob_pattern = subfolder + "/*" - files = model_path.glob(glob_pattern) - files = [p for p in files if re.search(pattern, str(p))] + files = [] + for subfolder in subfolders: + glob_pattern = subfolder + "/*" + files_ = model_path.glob(glob_pattern) + files_ = [p for p in files_ if re.search(pattern, str(p))] + files.extend(files_) else: repo_files = map(Path, HfApi().list_repo_files(model_name_or_path, revision=revision, token=token)) - files = [Path(p) for p in repo_files if re.match(pattern, str(p)) and str(p.parent) == subfolder] + files = [Path(p) for p in repo_files if re.match(pattern, str(p)) and str(p.parent) in subfolders] return files From d2e7d04ec2e95a8b2f9d992bc113caa2a0e857b8 Mon Sep 17 00:00:00 2001 From: eaidova Date: Thu, 3 Oct 2024 10:53:21 +0400 Subject: [PATCH 03/24] img2img pipeline --- optimum/intel/openvino/modeling_diffusion.py | 17 +++++++++++++++-- .../dummy_openvino_and_diffusers_objects.py | 11 +++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 85a581d6e4..18b574a24b 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -87,10 +87,23 @@ from diffusers.models.vae import DiagonalGaussianDistribution if is_diffusers_version(">=", "0.29.0"): - from diffusers import StableDiffusion3Img2ImgPipeline, StableDiffusion3InpaintPipeline, StableDiffusion3Pipeline + from diffusers import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline else: - StableDiffusion3Pipeline, StableDiffusion3InpaintPipeline, StableDiffusion3Img2ImgPipeline = None, None, None + StableDiffusion3Pipeline, StableDiffusion3Img2ImgPipeline = None, None +if is_diffusers_version(">=", "0.30.0"): + from diffusers import StableDiffusion3InpaintPipeline +else: + StableDiffusion3InpaintPipeline = None + +PipelineImageInput = Union[ + PIL.Image.Image, + np.ndarray, + torch.Tensor, + List[PIL.Image.Image], + List[np.ndarray], + List[torch.Tensor], +] core = Core() diff --git a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py index 6ded4fd5df..2f7fbb8b06 100644 --- a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py +++ b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py @@ -145,3 +145,14 @@ def __init__(self, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["openvino", "diffusers"]) + + +class OVStableDiffusion3Img2ImgPipeline(metaclass=DummyObject): + _backends = ["openvino", "diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "diffusers"]) From ba4eca4fe4b67ea5b474dbaa5af512dc516a751c Mon Sep 17 00:00:00 2001 From: eaidova Date: Mon, 7 Oct 2024 15:48:21 +0400 Subject: [PATCH 04/24] fix model export --- notebooks/ipex/text_generation.ipynb | 8 +- .../openvino/optimum_openvino_inference.ipynb | 4 +- .../openvino/quantized_generation_demo.ipynb | 86 ++++++++++--------- ...stable_diffusion_hybrid_quantization.ipynb | 25 +++--- optimum/exporters/openvino/convert.py | 2 +- optimum/intel/openvino/modeling_diffusion.py | 9 +- 6 files changed, 67 insertions(+), 67 deletions(-) diff --git a/notebooks/ipex/text_generation.ipynb b/notebooks/ipex/text_generation.ipynb index 
df46355531..d1a62d9201 100644 --- a/notebooks/ipex/text_generation.ipynb +++ b/notebooks/ipex/text_generation.ipynb @@ -62,13 +62,9 @@ "source": [ "model = IPEXModelForCausalLM.from_pretrained(\"gpt2\", torch_dtype=torch.bfloat16, export=True)\n", "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n", - "input_sentence = [\n", - " \"Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?\"\n", - "]\n", + "input_sentence = [\"Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?\"]\n", "model_inputs = tokenizer(input_sentence, return_tensors=\"pt\")\n", - "generation_kwargs = dict(\n", - " max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True\n", - ")\n", + "generation_kwargs = dict(max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True)\n", "\n", "generated_ids = model.generate(**model_inputs, **generation_kwargs)\n", "output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]\n", diff --git a/notebooks/openvino/optimum_openvino_inference.ipynb b/notebooks/openvino/optimum_openvino_inference.ipynb index 7ef14e0635..76c77aec55 100644 --- a/notebooks/openvino/optimum_openvino_inference.ipynb +++ b/notebooks/openvino/optimum_openvino_inference.ipynb @@ -466,9 +466,7 @@ "source": [ "# Set the device directly with `.from_pretrained()`\n", "if \"GPU\" in Core().available_devices:\n", - " model = OVModelForQuestionAnswering.from_pretrained(\n", - " \"distilbert-base-uncased-distilled-squad-ov-fp16\", device=\"GPU\"\n", - " )" + " model = OVModelForQuestionAnswering.from_pretrained(\"distilbert-base-uncased-distilled-squad-ov-fp16\", device=\"GPU\")" ] }, { diff --git a/notebooks/openvino/quantized_generation_demo.ipynb b/notebooks/openvino/quantized_generation_demo.ipynb index cc5c1ec2b3..5673243cb2 100644 --- a/notebooks/openvino/quantized_generation_demo.ipynb +++ b/notebooks/openvino/quantized_generation_demo.ipynb @@ -121,7 +121,7 @@ " \"CACHE_DIR\": os.path.join(save_name, \"model_cache\"), # OpenVINO will use this directory as cache\n", " },\n", " \"compile\": False,\n", - " \"quantization_config\": quantization_config,\n", + " \"quantization_config\": quantization_config\n", "}\n", "\n", "# Check whether the model was already exported\n", @@ -143,8 +143,8 @@ "\n", "# TODO Optional: export to huggingface/hub\n", "\n", - "model_size = os.stat(os.path.join(save_name, \"openvino_model.bin\")).st_size / 1024**3\n", - "print(f\"Model size in FP32: ~5.4GB, current model size in 4bit: {model_size:.2f}GB\")" + "model_size = os.stat(os.path.join(save_name, \"openvino_model.bin\")).st_size / 1024 ** 3\n", + "print(f'Model size in FP32: ~5.4GB, current model size in 4bit: {model_size:.2f}GB')" ] }, { @@ -212,7 +212,7 @@ "from transformers import TextStreamer\n", "\n", "# Tokenize the sample\n", - "inputs = tokenizer([sample], return_tensors=\"pt\")\n", + "inputs = tokenizer([sample], return_tensors='pt')\n", "\n", "# Call generate on the inputs\n", "out = model.generate(\n", @@ -294,7 +294,7 @@ "\n", "\n", "# Tokenize the sample\n", - "inputs = tokenizer([sample], return_tensors=\"pt\")\n", + "inputs = tokenizer([sample], return_tensors='pt') \n", "\n", "out = stateless_model.generate(\n", " **inputs,\n", @@ -302,7 +302,7 @@ " streamer=TextStreamer(tokenizer=tokenizer, skip_special_tokens=True),\n", " pad_token_id=tokenizer.eos_token_id,\n", " 
prompt_lookup_num_tokens=3,\n", - ")" + ") " ] }, { @@ -358,7 +358,7 @@ " \"CACHE_DIR\": os.path.join(save_name, \"model_cache\"), # OpenVINO will use this directory as cache\n", " },\n", " \"compile\": False,\n", - " \"quantization_config\": quantization_config,\n", + " \"quantization_config\": quantization_config\n", "}\n", "\n", "# Check whether the model was already exported\n", @@ -458,15 +458,15 @@ " if len(self.seq_lens) > 0 or len(self.win_sizes) > 0:\n", " raise RuntimeError(\"Always use a new instance, don't reuse!\")\n", " self.model_forward = self.model.forward\n", - "\n", + " \n", " @wraps(self.model_forward)\n", " def forward_wrapper(**kwargs):\n", " self.seq_lens[-1].append(kwargs.get(\"attention_mask\").shape[-1])\n", " self.win_sizes[-1].append(kwargs.get(\"input_ids\").shape[-1] - 1)\n", " return self.model_forward(**kwargs)\n", - "\n", + " \n", " self.model.forward = forward_wrapper\n", - "\n", + " \n", " # wrap generate method\n", " self.model_generate = self.model.generate\n", "\n", @@ -479,11 +479,10 @@ " out = self.model_generate(*args, **kwargs)\n", " self.seq_lens[-1].append(out.shape[-1])\n", " return out\n", - "\n", " self.model.generate = generate_wrapper\n", " return self\n", "\n", - " def __exit__(self, type, value, traceback):\n", + " def __exit__(self, type, value, traceback):\n", " self.model.forward = self.model_forward\n", " self.model.generate = self.model_generate\n", " self.model_forward = None\n", @@ -495,7 +494,7 @@ " self.seq_lens = [sl[1:] for sl in self.seq_lens]\n", " # Add window size for output to ease calculation later\n", " for ws, sl in zip(self.win_sizes, self.seq_lens):\n", - " ws.append(0)\n", + " ws.append(0) \n", "\n", " def acceptance_rate(self, return_mean=True, normalize=False):\n", " # ar_per_win = ((cur_seq_len - cur_win_size) - (prev_seq_len - prev_win_size) - 1) / prev_win_size\n", @@ -504,8 +503,9 @@ " sl = np.array(sl, dtype=np.float64)\n", " ws = np.array(ws, dtype=np.float64)\n", " out_lens = sl - ws\n", - " accepted = out_lens[1:] - out_lens[:-1] - 1\n", - " ar_per_win.append(np.divide(accepted, ws[:-1], out=np.zeros_like(accepted), where=ws[:-1] != 0))\n", + " accepted = (out_lens[1:] - out_lens[:-1] - 1)\n", + " ar_per_win.append(np.divide(accepted, ws[:-1],\n", + " out=np.zeros_like(accepted),where=ws[:-1] != 0))\n", " ar_per_win = np.hstack(ar_per_win)\n", " # Normalized AR doesn't take into account windows with size 0\n", " if normalize:\n", @@ -544,7 +544,7 @@ "samples_number = 30\n", "with AcceptanceRateRecorder(stateless_model) as ar_recorder:\n", " for text in tqdm(dataset[:samples_number]):\n", - " tokenized_prompt = tokenizer([prompt_template.format(text=text)], return_tensors=\"pt\")\n", + " tokenized_prompt = tokenizer([prompt_template.format(text=text)], return_tensors='pt')\n", " stateless_model.generate(\n", " **tokenized_prompt,\n", " max_new_tokens=128,\n", @@ -623,6 +623,7 @@ " return False\n", "\n", "\n", + "\n", "# Set the chat template to the tokenizer. 
The chat template implements the simple template of\n", "# User: content\n", "# Assistant: content\n", @@ -650,7 +651,11 @@ " if model_msg:\n", " messages.append({\"role\": \"Assistant\", \"content\": model_msg})\n", " input_token = tokenizer.apply_chat_template(\n", - " messages, add_generation_prompt=True, tokenize=True, return_tensors=\"pt\", return_dict=True\n", + " messages,\n", + " add_generation_prompt=True,\n", + " tokenize=True,\n", + " return_tensors=\"pt\",\n", + " return_dict=True\n", " )\n", " return input_token\n", "\n", @@ -674,18 +679,18 @@ " # Construct the input message string for the model by concatenating the current system message and conversation history\n", " # Tokenize the messages string\n", " inputs = prepare_history_for_model(history)\n", - " input_length = inputs[\"input_ids\"].shape[1]\n", + " input_length = inputs['input_ids'].shape[1]\n", " # truncate input in case it is too long.\n", " # TODO improve this\n", " if input_length > 2000:\n", " history = [history[-1]]\n", " inputs = prepare_history_for_model(history)\n", - " input_length = inputs[\"input_ids\"].shape[1]\n", + " input_length = inputs['input_ids'].shape[1]\n", "\n", " prompt_char = \"▌\"\n", " history[-1][1] = prompt_char\n", " yield history, \"Status: Generating...\", *([gr.update(interactive=False)] * 4)\n", - "\n", + " \n", " streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n", "\n", " # Create a stopping criteria to prevent the model from playing the role of the user aswell.\n", @@ -701,14 +706,11 @@ " eos_token_id=[tokenizer.eos_token_id],\n", " pad_token_id=tokenizer.eos_token_id,\n", " )\n", - " generate_kwargs = (\n", - " dict(\n", - " streamer=streamer,\n", - " generation_config=generation_config,\n", - " stopping_criteria=stopping_criteria,\n", - " )\n", - " | inputs\n", - " )\n", + " generate_kwargs = dict(\n", + " streamer=streamer,\n", + " generation_config=generation_config,\n", + " stopping_criteria=stopping_criteria,\n", + " ) | inputs\n", "\n", " if assisted:\n", " target_generate = stateless_model.generate\n", @@ -735,7 +737,7 @@ " yield history, \"Status: Generating...\", *([gr.update(interactive=False)] * 4)\n", " history[-1][1] = partial_text\n", " generation_time = time.perf_counter() - start\n", - " yield history, f\"Generation time: {generation_time:.2f} sec\", *([gr.update(interactive=True)] * 4)" + " yield history, f'Generation time: {generation_time:.2f} sec', *([gr.update(interactive=True)] * 4)" ] }, { @@ -779,9 +781,7 @@ " [\"Can you explain to me briefly what is Python programming language?\"],\n", " [\"Explain the plot of Cinderella in a sentence.\"],\n", " [\"Write a Python function to perform binary search over a sorted list. Use markdown to write code\"],\n", - " [\n", - " \"Lily has a rubber ball that she drops from the top of a wall. The wall is 2 meters tall. How long will it take for the ball to reach the ground?\"\n", - " ],\n", + " [\"Lily has a rubber ball that she drops from the top of a wall. The wall is 2 meters tall. 
How long will it take for the ball to reach the ground?\"],\n", "]\n", "\n", "\n", @@ -797,7 +797,7 @@ " \"\"\"\n", " # Append current user message to history with a blank assistant message which will be generated by the model\n", " history.append([message, None])\n", - " return (\"\", history)\n", + " return ('', history)\n", "\n", "\n", "def prepare_for_regenerate(history):\n", @@ -808,7 +808,7 @@ " history: conversation history\n", " Returns:\n", " updated history\n", - " \"\"\"\n", + " \"\"\" \n", " history[-1][1] = None\n", " return history\n", "\n", @@ -821,7 +821,7 @@ " msg = gr.Textbox(placeholder=\"Enter message here...\", show_label=False, autofocus=True, scale=75)\n", " status = gr.Textbox(\"Status: Idle\", show_label=False, max_lines=1, scale=15)\n", " with gr.Row():\n", - " submit = gr.Button(\"Submit\", variant=\"primary\")\n", + " submit = gr.Button(\"Submit\", variant='primary')\n", " regenerate = gr.Button(\"Regenerate\")\n", " clear = gr.Button(\"Clear\")\n", " with gr.Accordion(\"Advanced Options:\", open=False):\n", @@ -860,7 +860,9 @@ " step=0.1,\n", " interactive=True,\n", " )\n", - " gr.Examples(EXAMPLES, inputs=msg, label=\"Click on any example and press the 'Submit' button\")\n", + " gr.Examples(\n", + " EXAMPLES, inputs=msg, label=\"Click on any example and press the 'Submit' button\"\n", + " )\n", "\n", " # Sets generate function to be triggered when the user submit a new message\n", " gr.on(\n", @@ -874,14 +876,20 @@ " inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty, assisted],\n", " outputs=[chatbot, status, msg, submit, regenerate, clear],\n", " concurrency_limit=1,\n", - " queue=True,\n", + " queue=True\n", " )\n", - " regenerate.click(fn=prepare_for_regenerate, inputs=chatbot, outputs=chatbot, queue=True, concurrency_limit=1).then(\n", + " regenerate.click(\n", + " fn=prepare_for_regenerate,\n", + " inputs=chatbot,\n", + " outputs=chatbot,\n", + " queue=True,\n", + " concurrency_limit=1\n", + " ).then(\n", " fn=generate,\n", " inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty, assisted],\n", " outputs=[chatbot, status, msg, submit, regenerate, clear],\n", " concurrency_limit=1,\n", - " queue=True,\n", + " queue=True\n", " )\n", " clear.click(fn=lambda: (None, \"Status: Idle\"), inputs=None, outputs=[chatbot, status], queue=False)" ] diff --git a/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb b/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb index d89457bd78..8ef2e8ad6c 100644 --- a/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb +++ b/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb @@ -167,7 +167,6 @@ "def preprocess_fn(example):\n", " return {\"prompt\": example[\"caption\"]}\n", "\n", - "\n", "NUM_SAMPLES = 200\n", "dataset = dataset.take(NUM_SAMPLES)\n", "calibration_dataset = dataset.map(lambda x: preprocess_fn(x), remove_columns=dataset.column_names)" @@ -1067,14 +1066,12 @@ ], "source": [ "int8_pipe = OVStableDiffusionPipeline.from_pretrained(model_id=MODEL_ID, export=True)\n", - "quantization_config = OVWeightQuantizationConfig(\n", - " bits=8, num_samples=NUM_SAMPLES, quant_method=OVQuantizationMethod.HYBRID\n", - ")\n", + "quantization_config = OVWeightQuantizationConfig(bits=8, num_samples=NUM_SAMPLES, quant_method=OVQuantizationMethod.HYBRID)\n", "quantizer = OVQuantizer(int8_pipe)\n", "quantizer.quantize(\n", " ov_config=OVConfig(quantization_config=quantization_config),\n", " calibration_dataset=calibration_dataset,\n", - " 
save_directory=int8_model_path,\n", + " save_directory=int8_model_path\n", ")" ] }, @@ -1205,10 +1202,8 @@ " im_w, im_h = fp32_img.size\n", " is_horizontal = im_h <= im_w\n", " figsize = (20, 30) if is_horizontal else (30, 20)\n", - " fig, axs = plt.subplots(\n", - " 1 if is_horizontal else 2, 2 if is_horizontal else 1, figsize=figsize, sharex=\"all\", sharey=\"all\"\n", - " )\n", - " fig.patch.set_facecolor(\"white\")\n", + " fig, axs = plt.subplots(1 if is_horizontal else 2, 2 if is_horizontal else 1, figsize=figsize, sharex='all', sharey='all')\n", + " fig.patch.set_facecolor('white')\n", " list_axes = list(axs.flat)\n", " for a in list_axes:\n", " a.set_xticklabels([])\n", @@ -1222,7 +1217,7 @@ " img2_title = \"INT8 result\"\n", " list_axes[0].set_title(img1_title, fontsize=20)\n", " list_axes[1].set_title(img2_title, fontsize=20)\n", - " fig.subplots_adjust(wspace=0.0 if is_horizontal else 0.01, hspace=0.01 if is_horizontal else 0.0)\n", + " fig.subplots_adjust(wspace=0.0 if is_horizontal else 0.01 , hspace=0.01 if is_horizontal else 0.0)\n", " fig.tight_layout()" ] }, @@ -1235,10 +1230,13 @@ "source": [ "prompt = \"Self-portrait oil painting, a beautiful cyborg with golden hair, 8k\"\n", "\n", - "\n", "def generate_image(pipeline, prompt):\n", " transformers.set_seed(1)\n", - " return pipeline(prompt=prompt, guidance_scale=8.0, output_type=\"pil\").images[0]" + " return pipeline(\n", + " prompt=prompt,\n", + " guidance_scale=8.0,\n", + " output_type=\"pil\"\n", + " ).images[0]" ] }, { @@ -1331,7 +1329,7 @@ "def get_model_size(model_folder, framework):\n", " \"\"\"\n", " Return OpenVINO or PyTorch model size in Mb.\n", - "\n", + " \n", " Arguments:\n", " model_folder:\n", " Directory containing a model.\n", @@ -1533,7 +1531,6 @@ "def get_val_dataset(num_items=3):\n", " return [item[\"caption\"] for item in dataset.take(num_items)]\n", "\n", - "\n", "def benchmark(pipeline, dataset):\n", " \"\"\"\n", " Benchmark PyTorch or OpenVINO model. 
This function does inference on `num_items`\n", diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 60fb6c223a..c684cad256 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -918,7 +918,7 @@ def get_diffusion_models_for_export_ext( is_sd3 = False if not is_sd3: - return get_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter) + return None, get_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter) models_for_export = {} diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 18b574a24b..4f69644b1c 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -61,10 +61,6 @@ DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, ) - -DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER = "transformer" -DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER = "text_encoder_3" - from ...exporters.openvino import main_export from ..utils.import_utils import is_diffusers_version from .configuration import OVConfig, OVQuantizationMethod, OVWeightQuantizationConfig @@ -96,6 +92,11 @@ else: StableDiffusion3InpaintPipeline = None + +DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER = "transformer" +DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER = "text_encoder_3" + + PipelineImageInput = Union[ PIL.Image.Image, np.ndarray, From b219211f246c26d1e2d2ad53fb3de80db94911db Mon Sep 17 00:00:00 2001 From: eaidova Date: Fri, 11 Oct 2024 18:27:17 +0400 Subject: [PATCH 05/24] update after migration on new pipeline style --- optimum/exporters/openvino/convert.py | 9 +- optimum/exporters/openvino/model_configs.py | 2 +- optimum/intel/openvino/modeling_diffusion.py | 171 ++++++++++++++++--- 3 files changed, 155 insertions(+), 27 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index c684cad256..74b5667647 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -34,6 +34,8 @@ from optimum.exporters.onnx.convert import export_tensorflow as export_tensorflow_onnx from optimum.exporters.utils import ( _get_submodels_and_export_configs as _default_get_submodels_and_export_configs, +) +from optimum.exporters.utils import ( get_diffusion_models_for_export, ) from optimum.intel.utils.import_utils import ( @@ -624,6 +626,7 @@ def export_from_model( if library_name == "diffusers": export_config, models_and_export_configs = get_diffusion_models_for_export_ext(model, exporter="openvino") + stateful_submodels = False else: logging.disable(logging.INFO) export_config, models_and_export_configs, stateful_submodels = _get_submodels_and_export_configs( @@ -639,7 +642,7 @@ def export_from_model( _variant="default", legacy=False, exporter="openvino", - stateful=stateful + stateful=stateful, ) logging.disable(logging.NOTSET) @@ -957,7 +960,7 @@ def get_diffusion_models_for_export_ext( # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565 vae_encoder = copy.deepcopy(pipeline.vae) - vae_encoder.forward = lambda sample: {"latent_sample": vae_encoder.encode(x=sample)["latent_dist"].sample()} + vae_encoder.forward = lambda sample: {"latent_parameters": vae_encoder.encode(x=sample)["latent_dist"].parameters} vae_config_constructor = TasksManager.get_exporter_config_constructor( model=vae_encoder, exporter=exporter, @@ -1011,4 +1014,4 @@ def get_diffusion_models_for_export_ext( export_config = 
export_config_constructor(text_encoder_3.config, int_dtype=int_dtype, float_dtype=float_dtype) models_for_export["text_encoder_3"] = (text_encoder_3, export_config) - return None, models_for_export, False + return None, models_for_export diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index b5e19f4c60..3fc3c07c7e 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -36,6 +36,7 @@ MistralOnnxConfig, MPTOnnxConfig, PhiOnnxConfig, + UNetOnnxConfig, VisionOnnxConfig, ) from optimum.exporters.onnx.model_patcher import ModelPatcher @@ -1573,7 +1574,6 @@ def patch_model_for_export( return InternVLChatImageEmbeddingModelPatcher(self, model, model_kwargs) - class PooledProjectionsDummyInputGenerator(DummyInputGenerator): SUPPORTED_INPUT_NAMES = "pooled_projection" diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 4f69644b1c..85d4c0b3b0 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -22,7 +22,7 @@ from copy import deepcopy from pathlib import Path from tempfile import gettempdir -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, List, Optional, Union import numpy as np import openvino @@ -85,27 +85,17 @@ if is_diffusers_version(">=", "0.29.0"): from diffusers import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline else: - StableDiffusion3Pipeline, StableDiffusion3Img2ImgPipeline = None, None + StableDiffusion3Pipeline, StableDiffusion3Img2ImgPipeline = StableDiffusionPipeline, StableDiffusionImg2ImgPipeline if is_diffusers_version(">=", "0.30.0"): from diffusers import StableDiffusion3InpaintPipeline else: - StableDiffusion3InpaintPipeline = None + StableDiffusion3InpaintPipeline = StableDiffusion3Pipeline DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER = "transformer" DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER = "text_encoder_3" - -PipelineImageInput = Union[ - PIL.Image.Image, - np.ndarray, - torch.Tensor, - List[PIL.Image.Image], - List[np.ndarray], - List[torch.Tensor], -] - core = Core() logger = logging.getLogger(__name__) @@ -122,15 +112,18 @@ class OVDiffusionPipeline(OVBaseModel, DiffusionPipeline): def __init__( self, scheduler: SchedulerMixin, - unet: openvino.runtime.Model, - vae_decoder: openvino.runtime.Model, + unet: Optional[openvino.runtime.Model] = None, + vae_decoder: Optional[openvino.runtime.Model] = None, # optional pipeline models vae_encoder: Optional[openvino.runtime.Model] = None, text_encoder: Optional[openvino.runtime.Model] = None, text_encoder_2: Optional[openvino.runtime.Model] = None, + text_encoder_3: Optional[openvino.runtime.Model] = None, + transofrmer: Optional[openvino.runtime.Model] = None, # optional pipeline submodels tokenizer: Optional[CLIPTokenizer] = None, tokenizer_2: Optional[CLIPTokenizer] = None, + tokenizer_3: Optional[CLIPTokenizer] = None, feature_extractor: Optional[CLIPFeatureExtractor] = None, # stable diffusion xl specific arguments force_zeros_for_empty_prompt: bool = True, @@ -172,7 +165,15 @@ def __init__( f"Please set `compile_only=False` or `dynamic_shapes={model_is_dynamic}`" ) - self.unet = OVModelUnet(unet, self, DIFFUSION_MODEL_UNET_SUBFOLDER) + self.unet = OVModelUnet(unet, self, DIFFUSION_MODEL_UNET_SUBFOLDER) if unet is not None else None + self.transformer = ( + OVModelTransformer(transofrmer, self, DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER) + if transofrmer is not None + else 
None + ) + + if unet is None and transofrmer is None: + raise ValueError("`unet` or `transformer` model should be provided for pipeline work") self.vae_decoder = OVModelVaeDecoder(vae_decoder, self, DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER) self.vae_encoder = ( OVModelVaeEncoder(vae_encoder, self, DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER) @@ -189,12 +190,18 @@ def __init__( if text_encoder_2 is not None else None ) + self.text_encoder_3 = ( + OVModelTextEncoder(text_encoder_2, self, DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER) + if text_encoder_3 is not None + else None + ) # We wrap the VAE Decoder & Encoder in a single object to simulate diffusers API self.vae = OVModelVae(decoder=self.vae_decoder, encoder=self.vae_encoder) self.scheduler = scheduler self.tokenizer = tokenizer self.tokenizer_2 = tokenizer_2 + self.tokenizer_3 = tokenizer_3 self.feature_extractor = feature_extractor # we allow passing these as torch models for now @@ -204,8 +211,10 @@ def __init__( all_pipeline_init_args = { "vae": self.vae, "unet": self.unet, + "transformer": self.transformer, "text_encoder": self.text_encoder, "text_encoder_2": self.text_encoder_2, + "text_encoder_3": self.text_encoder_3, "safety_checker": self.safety_checker, "image_encoder": self.image_encoder, "scheduler": self.scheduler, @@ -259,6 +268,8 @@ def _save_pretrained(self, save_directory: Union[str, Path]): (self.vae_encoder, save_directory / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER), (self.text_encoder, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER), (self.text_encoder_2, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER), + (self.text_encoder_3, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER), + (self.transformer, save_directory / DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER), } for model, save_path in models_to_save_paths: if model is not None: @@ -277,6 +288,8 @@ def _save_pretrained(self, save_directory: Union[str, Path]): self.tokenizer.save_pretrained(save_directory / "tokenizer") if self.tokenizer_2 is not None: self.tokenizer_2.save_pretrained(save_directory / "tokenizer_2") + if self.tokenizer_3 is not None: + self.tokenizer_3.save_pretrained(save_directory / "tokenizer_3") if self.feature_extractor is not None: self.feature_extractor.save_pretrained(save_directory / "feature_extractor") @@ -297,6 +310,8 @@ def _from_pretrained( vae_encoder_file_name: Optional[str] = None, text_encoder_file_name: Optional[str] = None, text_encoder_2_file_name: Optional[str] = None, + text_encoder_3_file_name: Optional[str] = None, + transformer_file_name: Optional[str] = None, from_onnx: bool = False, load_in_8bit: bool = False, quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, @@ -317,6 +332,8 @@ def _from_pretrained( vae_decoder_file_name = vae_decoder_file_name or default_file_name text_encoder_file_name = text_encoder_file_name or default_file_name text_encoder_2_file_name = text_encoder_2_file_name or default_file_name + text_encoder_3_file_name = text_encoder_3_file_name or default_file_name + transformer_file_name = transformer_file_name or default_file_name if not os.path.isdir(str(model_id)): all_components = {key for key in config.keys() if not key.startswith("_")} | {"vae_encoder", "vae_decoder"} @@ -324,15 +341,19 @@ def _from_pretrained( allow_patterns.update( { unet_file_name, + transformer_file_name, vae_encoder_file_name, vae_decoder_file_name, text_encoder_file_name, text_encoder_2_file_name, + text_encoder_3_file_name, unet_file_name.replace(".xml", ".bin"), + transformer_file_name.replace(".xml", 
".bin"), vae_encoder_file_name.replace(".xml", ".bin"), vae_decoder_file_name.replace(".xml", ".bin"), text_encoder_file_name.replace(".xml", ".bin"), text_encoder_2_file_name.replace(".xml", ".bin"), + text_encoder_3_file_name.replace(".xml", ".bin"), SCHEDULER_CONFIG_NAME, cls.config_name, CONFIG_NAME, @@ -360,7 +381,13 @@ def _from_pretrained( if model_save_dir is None: model_save_dir = model_save_path - submodels = {"scheduler": None, "tokenizer": None, "tokenizer_2": None, "feature_extractor": None} + submodels = { + "scheduler": None, + "tokenizer": None, + "tokenizer_2": None, + "tokenizer_3": None, + "feature_extractor": None, + } for name in submodels.keys(): if kwargs.get(name, None) is not None: submodels[name] = kwargs.pop(name) @@ -377,10 +404,12 @@ def _from_pretrained( models = { "unet": model_save_path / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, + "transformer": model_save_path / DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER / unet_file_name, "vae_decoder": model_save_path / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, "vae_encoder": model_save_path / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, "text_encoder": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, "text_encoder_2": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name, + "text_encoder_3": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER / text_encoder_3_file_name, } compile_only = kwargs.get("compile_only", False) @@ -518,7 +547,7 @@ def to(self, *args, device: Optional[str] = None, dtype: Optional[torch.dtype] = if isinstance(device, str): self._device = device.upper() - self.request = None + self.clear_requests() elif device is not None: raise ValueError( "The `device` argument should be a string representing the device on which the model should be loaded." 
@@ -703,9 +732,14 @@ def reshape( self.tokenizer.model_max_length if self.tokenizer is not None else self.tokenizer_2.model_max_length ) - self.unet.model = self._reshape_unet( - self.unet.model, batch_size, height, width, num_images_per_prompt, tokenizer_max_len - ) + if self.unet is not None: + self.unet.model = self._reshape_unet( + self.unet.model, batch_size, height, width, num_images_per_prompt, tokenizer_max_len + ) + if self.transformer is not None: + self.transformer.model = self._reshape_transformer( + self.unet.model, batch_size, height, width, num_images_per_prompt, tokenizer_max_len + ) self.vae_decoder.model = self._reshape_vae_decoder( self.vae_decoder.model, height, width, num_images_per_prompt ) @@ -723,6 +757,11 @@ def reshape( self.text_encoder_2.model, batch_size, self.tokenizer_2.model_max_length ) + if self.text_encoder_3 is not None: + self.text_encoder_3.model = self._reshape_text_encoder( + self.text_encoder_3.model, batch_size, self.tokenizer_3.model_max_length + ) + self.clear_requests() return self @@ -749,12 +788,26 @@ def clear_requests(self): "`clear_requests()` is not supported with `compile_only` mode, please intialize model without this option" ) - for component in {self.unet, self.vae_encoder, self.vae_decoder, self.text_encoder, self.text_encoder_2}: + for component in { + self.unet, + self.transformer, + self.vae_encoder, + self.vae_decoder, + self.text_encoder, + self.text_encoder_2, + }: if component is not None: component.request = None def compile(self): - for component in {self.unet, self.vae_encoder, self.vae_decoder, self.text_encoder, self.text_encoder_2}: + for component in { + self.unet, + self.transformer, + self.vae_encoder, + self.vae_decoder, + self.text_encoder, + self.text_encoder_2, + }: if component is not None: component._compile() @@ -770,8 +823,10 @@ def components(self) -> Dict[str, Any]: components = { "vae": self.vae, "unet": self.unet, + "transfomer": self.transformer, "text_encoder": self.text_encoder, "text_encoder_2": self.text_encoder_2, + "text_encoder_3": self.text_encoder_2, "safety_checker": self.safety_checker, "image_encoder": self.image_encoder, } @@ -969,6 +1024,38 @@ def forward( return ModelOutput(**model_outputs) +class OVModelTransformer(OVPipelinePart): + def forward( + self, + hidden_states: torch.FloatTensor, + encoder_hidden_states: torch.FloatTensor = None, + pooled_projections: torch.FloatTensor = None, + timestep: torch.LongTensor = None, + block_controlnet_hidden_states: List = None, + joint_attention_kwargs: Optional[Dict[str, Any]] = None, + return_dict: bool = True, + ): + self._compile() + + model_inputs = { + "hidden_states": hidden_states, + "timestep": timestep, + "encoder_hidden_states": encoder_hidden_states, + "pooled_projections": pooled_projections, + } + + ov_outputs = self.request(model_inputs, share_inputs=True).to_dict() + + model_outputs = {} + for key, value in ov_outputs.items(): + model_outputs[next(iter(key.names))] = torch.from_numpy(value) + + if return_dict: + return model_outputs + + return ModelOutput(**model_outputs) + + class OVModelVaeEncoder(OVPipelinePart): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -1242,6 +1329,28 @@ class OVLatentConsistencyModelImg2ImgPipeline( auto_model_class = LatentConsistencyModelImg2ImgPipeline +class OVStableDiffusion3Pipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusion3Pipeline): + main_input_name = "prompt" + export_feature = "text-to-image" + auto_model_class = StableDiffusion3Pipeline + + 
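Like the SD/SDXL wrappers earlier in this file, each SD3 class only binds the diffusers pipeline class to its OpenVINO submodels; task dispatch goes through the pipeline mappings registered at the bottom of the module. A short sketch of the auto-class route, assuming a checkpoint id ("tiny-sd3" is a placeholder):

    from optimum.intel import OVPipelineForText2Image

    # Resolves to OVStableDiffusion3Pipeline via
    # OV_TEXT2IMAGE_PIPELINES_MAPPING["stable-diffusion-3"]
    pipe = OVPipelineForText2Image.from_pretrained("tiny-sd3", export=True)  # placeholder id
    image = pipe("a corgi wearing a party hat").images[0]
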
+class OVStableDiffusion3Img2ImgPipeline( + OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusion3Img2ImgPipeline +): + main_input_name = "prompt" + export_feature = "text-to-image" + auto_model_class = StableDiffusion3Img2ImgPipeline + + +class OVStableDiffusion3InpaintPipeline( + OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusion3InpaintPipeline +): + main_input_name = "prompt" + export_feature = "text-to-image" + auto_model_class = StableDiffusion3InpaintPipeline + + SUPPORTED_OV_PIPELINES = [ OVStableDiffusionPipeline, OVStableDiffusionImg2ImgPipeline, @@ -1289,6 +1398,22 @@ def _get_ov_class(pipeline_class_name: str, throw_error_if_not_exist: bool = Tru ] ) +if is_diffusers_version(">=", "0.29.0"): + SUPPORTED_OV_PIPELINES.extend( + [ + OVStableDiffusion3Pipeline, + OVStableDiffusion3Img2ImgPipeline, + ] + ) + + OV_TEXT2IMAGE_PIPELINES_MAPPING["stable-diffusion-3"] = OVStableDiffusion3Pipeline + OV_IMAGE2IMAGE_PIPELINES_MAPPING["stable-diffusion-3"] = OVStableDiffusion3Img2ImgPipeline + +if is_diffusers_version(">=", "0.30.0"): + SUPPORTED_OV_PIPELINES.append(OVStableDiffusion3InpaintPipeline) + OV_INPAINT_PIPELINES_MAPPING["stable-diffusion-3"] = OVStableDiffusion3InpaintPipeline + + SUPPORTED_OV_PIPELINES_MAPPINGS = [ OV_TEXT2IMAGE_PIPELINES_MAPPING, OV_IMAGE2IMAGE_PIPELINES_MAPPING, From b84da62b868620da2d91e9bb6d1de0b5b51011af Mon Sep 17 00:00:00 2001 From: eaidova Date: Fri, 11 Oct 2024 19:17:57 +0400 Subject: [PATCH 06/24] fix inference issues --- optimum/intel/openvino/modeling_diffusion.py | 26 +++++++++++--------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 85d4c0b3b0..14d44a6992 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -119,7 +119,7 @@ def __init__( text_encoder: Optional[openvino.runtime.Model] = None, text_encoder_2: Optional[openvino.runtime.Model] = None, text_encoder_3: Optional[openvino.runtime.Model] = None, - transofrmer: Optional[openvino.runtime.Model] = None, + transformer: Optional[openvino.runtime.Model] = None, # optional pipeline submodels tokenizer: Optional[CLIPTokenizer] = None, tokenizer_2: Optional[CLIPTokenizer] = None, @@ -167,12 +167,12 @@ def __init__( self.unet = OVModelUnet(unet, self, DIFFUSION_MODEL_UNET_SUBFOLDER) if unet is not None else None self.transformer = ( - OVModelTransformer(transofrmer, self, DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER) - if transofrmer is not None + OVModelTransformer(transformer, self, DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER) + if transformer is not None else None ) - if unet is None and transofrmer is None: + if unet is None and transformer is None: raise ValueError("`unet` or `transformer` model should be provided for pipeline work") self.vae_decoder = OVModelVaeDecoder(vae_decoder, self, DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER) self.vae_encoder = ( @@ -191,7 +191,7 @@ def __init__( else None ) self.text_encoder_3 = ( - OVModelTextEncoder(text_encoder_2, self, DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER) + OVModelTextEncoder(text_encoder_3, self, DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER) if text_encoder_3 is not None else None ) @@ -220,6 +220,7 @@ def __init__( "scheduler": self.scheduler, "tokenizer": self.tokenizer, "tokenizer_2": self.tokenizer_2, + "tokenizer_3": self.tokenizer_3, "feature_extractor": self.feature_extractor, "requires_aesthetics_score": requires_aesthetics_score, 
"force_zeros_for_empty_prompt": force_zeros_for_empty_prompt, @@ -404,7 +405,7 @@ def _from_pretrained( models = { "unet": model_save_path / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, - "transformer": model_save_path / DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER / unet_file_name, + "transformer": model_save_path / DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER / transformer_file_name, "vae_decoder": model_save_path / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, "vae_encoder": model_save_path / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, "text_encoder": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, @@ -579,7 +580,8 @@ def width(self) -> int: @property def batch_size(self) -> int: - batch_size = self.unet.model.inputs[0].get_partial_shape()[0] + model = self.unet.model if self.unet is not None else self.transformer + batch_size = model.inputs[0].get_partial_shape()[0] if batch_size.is_dynamic: return -1 return batch_size.get_length() @@ -651,7 +653,7 @@ def _reshape_transformer( for inputs in model.inputs: shapes[inputs] = inputs.get_partial_shape() if inputs.get_any_name() == "timestep": - shapes[inputs][0] = 1 + shapes[inputs][0] = batch_size elif inputs.get_any_name() == "hidden_states": in_channels = self.transformer.config.get("in_channels", None) if in_channels is None: @@ -667,7 +669,7 @@ def _reshape_transformer( shapes[inputs] = [batch_size, self.transformer.config["pooled_projection_dim"]] else: shapes[inputs][0] = batch_size - shapes[inputs][1] = tokenizer_max_length * 2 + shapes[inputs][1] = -1 # text_encoder_3 may have vary input length model.reshape(shapes) return model @@ -738,7 +740,7 @@ def reshape( ) if self.transformer is not None: self.transformer.model = self._reshape_transformer( - self.unet.model, batch_size, height, width, num_images_per_prompt, tokenizer_max_len + self.transformer.model, batch_size, height, width, num_images_per_prompt, tokenizer_max_len ) self.vae_decoder.model = self._reshape_vae_decoder( self.vae_decoder.model, height, width, num_images_per_prompt @@ -774,7 +776,7 @@ def half(self): "`half()` is not supported with `compile_only` mode, please intialize model without this option" ) - for component in {self.unet, self.vae_encoder, self.vae_decoder, self.text_encoder, self.text_encoder_2}: + for component in {self.unet, self.transformer, self.vae_encoder, self.vae_decoder, self.text_encoder, self.text_encoder_2, self.text_encoder_3}: if component is not None: compress_model_transformation(component.model) @@ -795,6 +797,7 @@ def clear_requests(self): self.vae_decoder, self.text_encoder, self.text_encoder_2, + self.text_encoder_3 }: if component is not None: component.request = None @@ -807,6 +810,7 @@ def compile(self): self.vae_decoder, self.text_encoder, self.text_encoder_2, + self.text_encoder_3 }: if component is not None: component._compile() From 7759353034573453f58eaa740a8f52a8f9a75707 Mon Sep 17 00:00:00 2001 From: eaidova Date: Fri, 11 Oct 2024 19:41:37 +0400 Subject: [PATCH 07/24] fix missed tokenizer export --- optimum/exporters/openvino/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 69cfec1d96..1cf2ecabe4 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -491,7 +491,7 @@ def maybe_convert_tokenizers(library_name: str, output: Path, model=None, prepro f"models won't be generated. 
Exception: {exception}" ) elif model: - for tokenizer_name in ("tokenizer", "tokenizer_2"): + for tokenizer_name in ("tokenizer", "tokenizer_2", "tokenizer_3"): tokenizer = getattr(model, tokenizer_name, None) if tokenizer: export_tokenizer(tokenizer, output / tokenizer_name, task=task) From dfec4b41b0af4611b536131fdd1d419a8c77039d Mon Sep 17 00:00:00 2001 From: eaidova Date: Fri, 11 Oct 2024 20:46:40 +0400 Subject: [PATCH 08/24] add support in quantization --- optimum/intel/__init__.py | 3 ++ optimum/intel/openvino/__init__.py | 3 ++ optimum/intel/openvino/modeling_diffusion.py | 18 ++++++--- optimum/intel/openvino/quantization.py | 40 ++++++++++++++----- .../dummy_openvino_and_diffusers_objects.py | 22 ++++++++++ 5 files changed, 72 insertions(+), 14 deletions(-) diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index 5926f1869c..548b373afc 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -267,6 +267,9 @@ OVPipelineForImage2Image, OVPipelineForInpainting, OVPipelineForText2Image, + OVStableDiffusion3Img2ImgPipeline, + OVStableDiffusion3InpaintPipeline, + OVStableDiffusion3Pipeline, OVStableDiffusionImg2ImgPipeline, OVStableDiffusionInpaintPipeline, OVStableDiffusionPipeline, diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index 549bf8170d..da8b95499b 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -87,6 +87,9 @@ OVPipelineForImage2Image, OVPipelineForInpainting, OVPipelineForText2Image, + OVStableDiffusion3Img2ImgPipeline, + OVStableDiffusion3InpaintPipeline, + OVStableDiffusion3Pipeline, OVStableDiffusionImg2ImgPipeline, OVStableDiffusionInpaintPipeline, OVStableDiffusionPipeline, diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 14d44a6992..46f12abca0 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -581,7 +581,7 @@ def width(self) -> int: @property def batch_size(self) -> int: model = self.unet.model if self.unet is not None else self.transformer - batch_size = model.inputs[0].get_partial_shape()[0] + batch_size = model.inputs[0].get_partial_shape()[0] if batch_size.is_dynamic: return -1 return batch_size.get_length() @@ -669,7 +669,7 @@ def _reshape_transformer( shapes[inputs] = [batch_size, self.transformer.config["pooled_projection_dim"]] else: shapes[inputs][0] = batch_size - shapes[inputs][1] = -1 # text_encoder_3 may have vary input length + shapes[inputs][1] = -1 # text_encoder_3 may have vary input length model.reshape(shapes) return model @@ -776,7 +776,15 @@ def half(self): "`half()` is not supported with `compile_only` mode, please intialize model without this option" ) - for component in {self.unet, self.transformer, self.vae_encoder, self.vae_decoder, self.text_encoder, self.text_encoder_2, self.text_encoder_3}: + for component in { + self.unet, + self.transformer, + self.vae_encoder, + self.vae_decoder, + self.text_encoder, + self.text_encoder_2, + self.text_encoder_3, + }: if component is not None: compress_model_transformation(component.model) @@ -797,7 +805,7 @@ def clear_requests(self): self.vae_decoder, self.text_encoder, self.text_encoder_2, - self.text_encoder_3 + self.text_encoder_3, }: if component is not None: component.request = None @@ -810,7 +818,7 @@ def compile(self): self.vae_decoder, self.text_encoder, self.text_encoder_2, - self.text_encoder_3 + self.text_encoder_3, }: if component is not None: 
component._compile() diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 1ad75477cc..c2e880e62a 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -380,15 +380,27 @@ def _quantize_ovbasemodel( quantization_config_copy = copy.deepcopy(quantization_config) quantization_config_copy.dataset = None quantization_config_copy.quant_method = OVQuantizationMethod.DEFAULT - sub_model_names = ["vae_encoder", "vae_decoder", "text_encoder", "text_encoder_2"] + sub_model_names = [ + "vae_encoder", + "vae_decoder", + "text_encoder", + "text_encoder_2", + "text_encoder_3", + ] sub_models = filter(lambda x: x, (getattr(self.model, name) for name in sub_model_names)) for sub_model in sub_models: _weight_only_quantization(sub_model.model, quantization_config_copy) - # Apply hybrid quantization to UNet - self.model.unet.model = _hybrid_quantization( - self.model.unet.model, quantization_config, calibration_dataset - ) + if self.model.unet is not None: + # Apply hybrid quantization to UNet + self.model.unet.model = _hybrid_quantization( + self.model.unet.model, quantization_config, calibration_dataset + ) + else: + self.model.transformer.model = _hybrid_quantization( + self.model.transformer.model, quantization_config, calibration_dataset + ) + self.model.clear_requests() else: # The model may be for example OVModelForImageClassification, OVModelForAudioClassification, etc. @@ -396,7 +408,15 @@ def _quantize_ovbasemodel( self.model.request = None else: if is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline): - sub_model_names = ["vae_encoder", "vae_decoder", "text_encoder", "text_encoder_2", "unet"] + sub_model_names = [ + "vae_encoder", + "vae_decoder", + "text_encoder", + "text_encoder_2", + "unet", + "transformer", + "text_encoder_3", + ] sub_models = filter(lambda x: x, (getattr(self.model, name) for name in sub_model_names)) for sub_model in sub_models: _weight_only_quantization(sub_model.model, quantization_config) @@ -743,7 +763,9 @@ def _prepare_unet_dataset( ) -> nncf.Dataset: self.model.compile() - size = self.model.unet.config.get("sample_size", 64) * self.model.vae_scale_factor + diffuser = self.model.unet if self.model.unet is not None else self.model.transformer + + size = diffuser.config.get("sample_size", 64) * self.model.vae_scale_factor height, width = 2 * (min(size, 512),) num_samples = num_samples or 200 @@ -784,7 +806,7 @@ def transform_fn(data_item): calibration_data = [] try: - self.model.unet.request = InferRequestWrapper(self.model.unet.request, calibration_data) + diffuser.request = InferRequestWrapper(diffuser.request, calibration_data) for inputs in dataset: inputs = transform_fn(inputs) @@ -795,7 +817,7 @@ def transform_fn(data_item): if len(calibration_data) >= num_samples: break finally: - self.model.unet.request = self.model.unet.request.request + diffuser.request = diffuser.request.request calibration_dataset = nncf.Dataset(calibration_data[:num_samples]) return calibration_dataset diff --git a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py index 2f7fbb8b06..6d6f6262e7 100644 --- a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py +++ b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py @@ -156,3 +156,25 @@ def __init__(self, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["openvino", "diffusers"]) + + +class 
OVStableDiffusion3Pipeline(metaclass=DummyObject): + _backends = ["openvino", "diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "diffusers"]) + + +class OVStableDiffusion3InpaintPipeline(metaclass=DummyObject): + _backends = ["openvino", "diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "diffusers"]) From 9b055ad71f110b762e1a5b092fc2a74e2f3babc8 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Fri, 11 Oct 2024 20:32:30 +0400 Subject: [PATCH 09/24] Update optimum/intel/openvino/modeling_diffusion.py Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --- optimum/intel/openvino/modeling_diffusion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 46f12abca0..3cbd8d3414 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -580,8 +580,8 @@ def width(self) -> int: @property def batch_size(self) -> int: - model = self.unet.model if self.unet is not None else self.transformer - batch_size = model.inputs[0].get_partial_shape()[0] + model = self.unet.model if self.unet is not None else self.transformer.model + batch_size = model.inputs[0].get_partial_shape()[0] if batch_size.is_dynamic: return -1 return batch_size.get_length() From 1af01851f28f3314ad3fc2c8fed6c1deae0b2d5a Mon Sep 17 00:00:00 2001 From: eaidova Date: Fri, 11 Oct 2024 21:27:10 +0400 Subject: [PATCH 10/24] add tests --- optimum/intel/openvino/modeling_diffusion.py | 4 ++++ tests/openvino/test_diffusion.py | 6 +++--- tests/openvino/utils_tests.py | 1 + 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 3cbd8d3414..23968f50b0 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -516,6 +516,7 @@ def _from_transformers( no_post_process=True, revision=revision, cache_dir=cache_dir, + task=cls.export_feature, token=token, local_files_only=local_files_only, force_download=force_download, @@ -1481,13 +1482,16 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): class OVPipelineForText2Image(OVPipelineForTask): auto_model_class = AutoPipelineForText2Image ov_pipelines_mapping = OV_TEXT2IMAGE_PIPELINES_MAPPING + export_feature = "text-to-image" class OVPipelineForImage2Image(OVPipelineForTask): auto_model_class = AutoPipelineForImage2Image ov_pipelines_mapping = OV_IMAGE2IMAGE_PIPELINES_MAPPING + export_feature = "image-to-image" class OVPipelineForInpainting(OVPipelineForTask): auto_model_class = AutoPipelineForInpainting ov_pipelines_mapping = OV_INPAINT_PIPELINES_MAPPING + export_feature = "inpainting" diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index 687c1f5c02..88de26fa85 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -72,7 +72,7 @@ def _generate_images(height=128, width=128, batch_size=1, channel=3, input_type= class OVPipelineForText2ImageTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] + 
SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency", "stable-diffusion-3"] OVMODEL_CLASS = OVPipelineForText2Image AUTOMODEL_CLASS = AutoPipelineForText2Image @@ -323,7 +323,7 @@ def test_textual_inversion(self): class OVPipelineForImage2ImageTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency", "stable-diffusion-3"] AUTOMODEL_CLASS = AutoPipelineForImage2Image OVMODEL_CLASS = OVPipelineForImage2Image @@ -535,7 +535,7 @@ def test_textual_inversion(self): class OVPipelineForInpaintingTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl"] + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "stable-diffusion-3"] AUTOMODEL_CLASS = AutoPipelineForInpainting OVMODEL_CLASS = OVPipelineForInpainting diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index d7eea01dba..bd17134d8b 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -118,6 +118,7 @@ "stable-diffusion-openvino": "hf-internal-testing/tiny-stable-diffusion-openvino", "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", "stable-diffusion-xl-refiner": "echarlaix/tiny-random-stable-diffusion-xl-refiner", + "stable-diffusion-3": "yujiepan/stable-diffusion-3-tiny-random", "stablelm": "hf-internal-testing/tiny-random-StableLmForCausalLM", "starcoder2": "hf-internal-testing/tiny-random-Starcoder2ForCausalLM", "latent-consistency": "echarlaix/tiny-random-latent-consistency", From ddab249f4e3e1429b3a3d1833d871bd2c69be943 Mon Sep 17 00:00:00 2001 From: eaidova Date: Mon, 14 Oct 2024 08:50:45 +0400 Subject: [PATCH 11/24] fix tests --- optimum/intel/openvino/modeling_diffusion.py | 2 + tests/openvino/test_diffusion.py | 54 ++++++++++++++------ 2 files changed, 40 insertions(+), 16 deletions(-) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 23968f50b0..4f9b626022 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -647,6 +647,8 @@ def _reshape_transformer( batch_size = -1 else: batch_size *= num_images_per_prompt + # The factor of 2 comes from the guidance scale > 1 + batch_size *= 2 height = height // self.vae_scale_factor if height > 0 else height width = width // self.vae_scale_factor if width > 0 else width diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index 88de26fa85..dcbe03bd11 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -135,9 +135,9 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - np.testing.assert_allclose(ov_output, diffusers_output, atol=1e-4, rtol=1e-2) + np.testing.assert_allclose(ov_output, diffusers_output, atol=6e-3, rtol=1e-2) - @parameterized.expand(SUPPORTED_ARCHITECTURES) + @parameterized.expand(["stable-diffusion", "stable-diffusion-xl", "latent-consistency"]) @require_diffusers def test_callback(self, model_arch: str): height, width, batch_size = 64, 128, 1 @@ -184,9 +184,10 @@ def test_shape(self, model_arch: str): elif output_type == "pt": self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: + 
out_channels = pipeline.unet.config.out_channels if pipeline.unet is not None else pipeline.transformer.config.out_channels self.assertEqual( outputs.shape, - (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), + (batch_size, out_channels, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), ) @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -229,6 +230,22 @@ def test_negative_prompt(self, model_arch: str): do_classifier_free_guidance=True, negative_prompt=negative_prompt, ) + elif model_arch == "stable-diffusion-3": + ( + inputs["prompt_embeds"], + inputs["negative_prompt_embeds"], + inputs["pooled_prompt_embeds"], + inputs["negative_pooled_prompt_embeds"], + ) = pipeline.encode_prompt( + prompt=prompt, + prompt_2=None, + prompt_3=None, + num_images_per_prompt=1, + device=torch.device("cpu"), + do_classifier_free_guidance=True, + negative_prompt=negative_prompt, + ) + else: inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = pipeline.encode_prompt( prompt=prompt, @@ -288,11 +305,12 @@ def test_height_width_properties(self, model_arch: str): ) self.assertFalse(ov_pipeline.is_dynamic) + expected_batch = batch_size * num_images_per_prompt + if ov_pipeline.unet is not None and "timestep_cond" not in {inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs}: + expected_batch *= 2 self.assertEqual( ov_pipeline.batch_size, - batch_size - * num_images_per_prompt - * (2 if "timestep_cond" not in {inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs} else 1), + expected_batch, ) self.assertEqual(ov_pipeline.height, height) self.assertEqual(ov_pipeline.width, width) @@ -369,7 +387,7 @@ def test_num_images_per_prompt(self, model_arch: str): outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) - @parameterized.expand(SUPPORTED_ARCHITECTURES) + @parameterized.expand(["stable-diffusion", "stable-diffusion-xl", "latent-consistency"]) @require_diffusers def test_callback(self, model_arch: str): height, width, batch_size = 32, 64, 1 @@ -416,9 +434,10 @@ def test_shape(self, model_arch: str): elif output_type == "pt": self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: + out_channels = pipeline.unet.config.out_channels if pipeline.unet is not None else pipeline.transformer.config.out_channels self.assertEqual( outputs.shape, - (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), + (batch_size, out_channels, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), ) @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -500,11 +519,12 @@ def test_height_width_properties(self, model_arch: str): ) self.assertFalse(ov_pipeline.is_dynamic) + expected_batch = batch_size * num_images_per_prompt + if ov_pipeline.unet is not None and "timestep_cond" not in {inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs}: + expected_batch *= 2 self.assertEqual( ov_pipeline.batch_size, - batch_size - * num_images_per_prompt - * (2 if "timestep_cond" not in {inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs} else 1), + expected_batch ) self.assertEqual(ov_pipeline.height, height) self.assertEqual(ov_pipeline.width, width) @@ -586,7 +606,7 @@ def test_num_images_per_prompt(self, model_arch: str): outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images self.assertEqual(outputs.shape, (batch_size * 
num_images_per_prompt, height, width, 3)) - @parameterized.expand(SUPPORTED_ARCHITECTURES) + @parameterized.expand(["stable-diffusion", "stable-diffusion-xl"]) @require_diffusers def test_callback(self, model_arch: str): height, width, batch_size = 32, 64, 1 @@ -633,9 +653,10 @@ def test_shape(self, model_arch: str): elif output_type == "pt": self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: + out_channels = pipeline.unet.config.out_channels if pipeline.unet is not None else pipeline.transformer.config.out_channels self.assertEqual( outputs.shape, - (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), + (batch_size, out_channels, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), ) @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -717,11 +738,12 @@ def test_height_width_properties(self, model_arch: str): ) self.assertFalse(ov_pipeline.is_dynamic) + expected_batch = batch_size * num_images_per_prompt + if ov_pipeline.unet is not None and "timestep_cond" not in {inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs}: + expected_batch *= 2 self.assertEqual( ov_pipeline.batch_size, - batch_size - * num_images_per_prompt - * (2 if "timestep_cond" not in {inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs} else 1), + expected_batch, ) self.assertEqual(ov_pipeline.height, height) self.assertEqual(ov_pipeline.width, width) From 96ba9854eef46a38cfc9a89e913c3e64887a3d03 Mon Sep 17 00:00:00 2001 From: eaidova Date: Mon, 14 Oct 2024 13:32:25 +0400 Subject: [PATCH 12/24] update tests --- optimum/exporters/openvino/convert.py | 4 +- optimum/exporters/openvino/model_configs.py | 10 +++++ optimum/intel/openvino/modeling_diffusion.py | 46 ++++++++++---------- tests/openvino/test_diffusion.py | 17 +++++--- tests/openvino/utils_tests.py | 2 +- 5 files changed, 46 insertions(+), 33 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 74b5667647..c4345281dc 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -1009,9 +1009,9 @@ def get_diffusion_models_for_export_ext( exporter=exporter, library_name="diffusers", task="feature-extraction", - model_type="clip-text-with-projection", + model_type="t5-encoder-model", ) - export_config = export_config_constructor(text_encoder_3.config, int_dtype=int_dtype, float_dtype=float_dtype) + export_config = export_config_constructor(text_encoder_3.config, int_dtype=int_dtype, float_dtype=float_dtype, ) models_for_export["text_encoder_3"] = (text_encoder_3, export_config) return None, models_for_export diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 3fc3c07c7e..8e3b87cc57 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -20,6 +20,7 @@ from packaging import version from transformers import PretrainedConfig, PreTrainedModel, TFPreTrainedModel from transformers.utils import is_tf_available +from optimum.exporters.onnx.model_patcher import ModelPatcher from optimum.exporters.onnx.config import OnnxConfig, TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig from optimum.exporters.onnx.model_configs import ( @@ -1623,3 +1624,12 @@ def rename_ambiguous_inputs(self, inputs): if hidden_states is not None: inputs["hidden_states"] = hidden_states return inputs + + +@register_in_tasks_manager("t5-encoder-model", *["feature-extraction"], 
library_name="diffusers") +class T5EncoderOpenVINOConfig(CLIPTextOnnxConfig): + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> ModelPatcher: + return ModelPatcher(self, model, model_kwargs=model_kwargs) + diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 4f9b626022..eaf8c86eb4 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -390,7 +390,7 @@ def _from_pretrained( "feature_extractor": None, } for name in submodels.keys(): - if kwargs.get(name, None) is not None: + if kwargs.get(name) is not None: submodels[name] = kwargs.pop(name) elif config.get(name, (None, None))[0] is not None: library_name, library_classes = config.get(name) @@ -417,7 +417,7 @@ def _from_pretrained( quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) if (quantization_config is None or quantization_config.dataset is None) and not compile_only: for name, path in models.items(): - if kwargs.get(name, None) is not None: + if name in kwargs: models[name] = kwargs.pop(name) else: models[name] = cls.load_model(path, quantization_config) if path.is_file() else None @@ -428,7 +428,7 @@ def _from_pretrained( if "GPU" in device.upper() and "INFERENCE_PRECISION_HINT" not in vae_ov_conifg: vae_ov_conifg["INFERENCE_PRECISION_HINT"] = "f32" for name, path in models.items(): - if kwargs.get(name, None) is not None: + if name in kwargs: models[name] = kwargs.pop(name) else: models[name] = ( @@ -449,7 +449,7 @@ def _from_pretrained( from optimum.intel import OVQuantizer for name, path in models.items(): - if kwargs.get(name, None) is not None: + if name in kwargs: models[name] = kwargs.pop(name) else: models[name] = cls.load_model(path) if path.is_file() else None @@ -464,7 +464,6 @@ def _from_pretrained( quantizer.quantize(ov_config=OVConfig(quantization_config=hybrid_quantization_config)) return ov_pipeline - ov_pipeline = ov_pipeline_class( **models, **submodels, @@ -646,10 +645,9 @@ def _reshape_transformer( if batch_size == -1 or num_images_per_prompt == -1: batch_size = -1 else: - batch_size *= num_images_per_prompt - # The factor of 2 comes from the guidance scale > 1 - batch_size *= 2 - + # The factor of 2 comes from the guidance scale > 1 + batch_size *= 2 * num_images_per_prompt + height = height // self.vae_scale_factor if height > 0 else height width = width // self.vae_scale_factor if width > 0 else width shapes = {} @@ -801,7 +799,7 @@ def clear_requests(self): "`clear_requests()` is not supported with `compile_only` mode, please intialize model without this option" ) - for component in { + for component in [ self.unet, self.transformer, self.vae_encoder, @@ -809,12 +807,12 @@ def clear_requests(self): self.text_encoder, self.text_encoder_2, self.text_encoder_3, - }: + ]: if component is not None: component.request = None def compile(self): - for component in { + for component in [ self.unet, self.transformer, self.vae_encoder, @@ -822,7 +820,7 @@ def compile(self): self.text_encoder, self.text_encoder_2, self.text_encoder_3, - }: + ]: if component is not None: component._compile() @@ -956,6 +954,10 @@ def modules(self): class OVModelTextEncoder(OVPipelinePart): + def __init__(self, model: openvino.runtime.Model, parent_pipeline: OVDiffusionPipeline, model_name: str = ""): + super().__init__(model, parent_pipeline, model_name) + self.hidden_states_output_names = 
sorted({name for out in self.model.outputs for name in out.names if name.startswith("hidden_states")}) + def forward( self, input_ids: Union[np.ndarray, torch.Tensor], @@ -967,17 +969,15 @@ def forward( model_inputs = {"input_ids": input_ids} - ov_outputs = self.request(model_inputs, share_inputs=True).to_dict() - + ov_outputs = self.request(model_inputs, share_inputs=True) + main_out = ov_outputs[0] model_outputs = {} - for key, value in ov_outputs.items(): - model_outputs[next(iter(key.names))] = torch.from_numpy(value) - - if output_hidden_states: - model_outputs["hidden_states"] = [] - for i in range(self.config.num_hidden_layers): - model_outputs["hidden_states"].append(model_outputs.pop(f"hidden_states.{i}")) - model_outputs["hidden_states"].append(model_outputs.get("last_hidden_state")) + model_outputs[self.model.outputs[0].get_any_name()] = torch.from_numpy(main_out) + if self.hidden_states_output_names and not "last_hidden_state" in model_outputs: + model_outputs["last_hidden_state"] = torch.from_numpy(ov_outputs[self.hidden_states_output_names[-1]]) + if self.hidden_states_output_names and output_hidden_states or self.config.output_hidden_states: + hidden_states = [torch.from_numpy(ov_outputs[out_name]) for out_name in self.hidden_states_output_names] + model_outputs["hidden_states"] = hidden_states if return_dict: return model_outputs diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index dcbe03bd11..74d48064c6 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -126,10 +126,11 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): height, width, batch_size = 128, 128, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], text_encoder_3=None) + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], text_encoder_3=None) for output_type in ["latent", "np", "pt"]: + print(output_type) inputs["output_type"] = output_type ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images @@ -446,16 +447,17 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): height, width, batch_size = 128, 128, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], text_encoder_3=None) + ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], text_encoder_3=None) for output_type in ["latent", "np", "pt"]: + print(output_type) inputs["output_type"] = output_type ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - np.testing.assert_allclose(ov_output, diffusers_output, atol=1e-4, rtol=1e-2) + np.testing.assert_allclose(ov_output, diffusers_output, atol=6e-3, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -669,12 +671,13 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) for 
output_type in ["latent", "np", "pt"]: + print(output_type) inputs["output_type"] = output_type ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - - np.testing.assert_allclose(ov_output, diffusers_output, atol=1e-4, rtol=1e-2) + + np.testing.assert_allclose(ov_output, diffusers_output, atol=6e-3, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index bd17134d8b..efccb698e7 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -118,7 +118,7 @@ "stable-diffusion-openvino": "hf-internal-testing/tiny-stable-diffusion-openvino", "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", "stable-diffusion-xl-refiner": "echarlaix/tiny-random-stable-diffusion-xl-refiner", - "stable-diffusion-3": "yujiepan/stable-diffusion-3-tiny-random", + "stable-diffusion-3": "katuni4ka/tiny-random-stabdle-diffusion-3", "stablelm": "hf-internal-testing/tiny-random-StableLmForCausalLM", "starcoder2": "hf-internal-testing/tiny-random-Starcoder2ForCausalLM", "latent-consistency": "echarlaix/tiny-random-latent-consistency", From d07ba0b27f33b67f455914e1f4f7346a84378cc4 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Mon, 14 Oct 2024 15:38:21 +0400 Subject: [PATCH 13/24] Update tests/openvino/utils_tests.py Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --- tests/openvino/utils_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index efccb698e7..eaa4a09836 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -118,7 +118,7 @@ "stable-diffusion-openvino": "hf-internal-testing/tiny-stable-diffusion-openvino", "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", "stable-diffusion-xl-refiner": "echarlaix/tiny-random-stable-diffusion-xl-refiner", - "stable-diffusion-3": "katuni4ka/tiny-random-stabdle-diffusion-3", + "stable-diffusion-3": "katuni4ka/tiny-random-stable-diffusion-3", "stablelm": "hf-internal-testing/tiny-random-StableLmForCausalLM", "starcoder2": "hf-internal-testing/tiny-random-Starcoder2ForCausalLM", "latent-consistency": "echarlaix/tiny-random-latent-consistency", From 41afd031accb9a2b0d71f15cc4dc079ee03128bc Mon Sep 17 00:00:00 2001 From: eaidova Date: Mon, 14 Oct 2024 20:38:50 +0400 Subject: [PATCH 14/24] fix tests --- optimum/exporters/openvino/convert.py | 6 +- optimum/exporters/openvino/model_configs.py | 2 - optimum/intel/openvino/modeling_diffusion.py | 16 +++-- tests/openvino/test_diffusion.py | 66 ++++++++++++++------ 4 files changed, 64 insertions(+), 26 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index c4345281dc..3aaa36aa0a 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -1011,7 +1011,11 @@ def get_diffusion_models_for_export_ext( task="feature-extraction", model_type="t5-encoder-model", ) - export_config = export_config_constructor(text_encoder_3.config, int_dtype=int_dtype, float_dtype=float_dtype, ) + export_config = export_config_constructor( + text_encoder_3.config, + int_dtype=int_dtype, + float_dtype=float_dtype, + ) models_for_export["text_encoder_3"] = (text_encoder_3, export_config) return None, models_for_export diff --git 
a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 8e3b87cc57..6933f0042c 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -20,7 +20,6 @@ from packaging import version from transformers import PretrainedConfig, PreTrainedModel, TFPreTrainedModel from transformers.utils import is_tf_available -from optimum.exporters.onnx.model_patcher import ModelPatcher from optimum.exporters.onnx.config import OnnxConfig, TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig from optimum.exporters.onnx.model_configs import ( @@ -1632,4 +1631,3 @@ def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> ModelPatcher: return ModelPatcher(self, model, model_kwargs=model_kwargs) - diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index eaf8c86eb4..ca5f37ca57 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -581,7 +581,7 @@ def width(self) -> int: @property def batch_size(self) -> int: model = self.unet.model if self.unet is not None else self.transformer.model - batch_size = model.inputs[0].get_partial_shape()[0] + batch_size = model.inputs[0].get_partial_shape()[0] if batch_size.is_dynamic: return -1 return batch_size.get_length() @@ -647,7 +647,7 @@ def _reshape_transformer( else: # The factor of 2 comes from the guidance scale > 1 batch_size *= 2 * num_images_per_prompt - + height = height // self.vae_scale_factor if height > 0 else height width = width // self.vae_scale_factor if width > 0 else width shapes = {} @@ -956,7 +956,9 @@ def modules(self): class OVModelTextEncoder(OVPipelinePart): def __init__(self, model: openvino.runtime.Model, parent_pipeline: OVDiffusionPipeline, model_name: str = ""): super().__init__(model, parent_pipeline, model_name) - self.hidden_states_output_names = sorted({name for out in self.model.outputs for name in out.names if name.startswith("hidden_states")}) + self.hidden_states_output_names = sorted( + {name for out in self.model.outputs for name in out.names if name.startswith("hidden_states")} + ) def forward( self, @@ -973,9 +975,13 @@ def forward( main_out = ov_outputs[0] model_outputs = {} model_outputs[self.model.outputs[0].get_any_name()] = torch.from_numpy(main_out) - if self.hidden_states_output_names and not "last_hidden_state" in model_outputs: + if self.hidden_states_output_names and "last_hidden_state" not in model_outputs: model_outputs["last_hidden_state"] = torch.from_numpy(ov_outputs[self.hidden_states_output_names[-1]]) - if self.hidden_states_output_names and output_hidden_states or self.config.output_hidden_states: + if ( + self.hidden_states_output_names + and output_hidden_states + or getattr(self.config, "output_hidden_states", False) + ): hidden_states = [torch.from_numpy(ov_outputs[out_name]) for out_name in self.hidden_states_output_names] model_outputs["hidden_states"] = hidden_states diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index 74d48064c6..951066a811 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -185,10 +185,19 @@ def test_shape(self, model_arch: str): elif output_type == "pt": self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: - out_channels = pipeline.unet.config.out_channels if pipeline.unet is not None else 
pipeline.transformer.config.out_channels + out_channels = ( + pipeline.unet.config.out_channels + if pipeline.unet is not None + else pipeline.transformer.config.out_channels + ) self.assertEqual( outputs.shape, - (batch_size, out_channels, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), + ( + batch_size, + out_channels, + height // pipeline.vae_scale_factor, + width // pipeline.vae_scale_factor, + ), ) @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -246,7 +255,7 @@ def test_negative_prompt(self, model_arch: str): do_classifier_free_guidance=True, negative_prompt=negative_prompt, ) - + else: inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = pipeline.encode_prompt( prompt=prompt, @@ -306,8 +315,10 @@ def test_height_width_properties(self, model_arch: str): ) self.assertFalse(ov_pipeline.is_dynamic) - expected_batch = batch_size * num_images_per_prompt - if ov_pipeline.unet is not None and "timestep_cond" not in {inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs}: + expected_batch = batch_size * num_images_per_prompt + if ov_pipeline.unet is not None and "timestep_cond" not in { + inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs + }: expected_batch *= 2 self.assertEqual( ov_pipeline.batch_size, @@ -435,10 +446,19 @@ def test_shape(self, model_arch: str): elif output_type == "pt": self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: - out_channels = pipeline.unet.config.out_channels if pipeline.unet is not None else pipeline.transformer.config.out_channels + out_channels = ( + pipeline.unet.config.out_channels + if pipeline.unet is not None + else pipeline.transformer.config.out_channels + ) self.assertEqual( outputs.shape, - (batch_size, out_channels, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), + ( + batch_size, + out_channels, + height // pipeline.vae_scale_factor, + width // pipeline.vae_scale_factor, + ), ) @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -521,13 +541,12 @@ def test_height_width_properties(self, model_arch: str): ) self.assertFalse(ov_pipeline.is_dynamic) - expected_batch = batch_size * num_images_per_prompt - if ov_pipeline.unet is not None and "timestep_cond" not in {inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs}: + expected_batch = batch_size * num_images_per_prompt + if ov_pipeline.unet is not None and "timestep_cond" not in { + inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs + }: expected_batch *= 2 - self.assertEqual( - ov_pipeline.batch_size, - expected_batch - ) + self.assertEqual(ov_pipeline.batch_size, expected_batch) self.assertEqual(ov_pipeline.height, height) self.assertEqual(ov_pipeline.width, width) @@ -655,10 +674,19 @@ def test_shape(self, model_arch: str): elif output_type == "pt": self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: - out_channels = pipeline.unet.config.out_channels if pipeline.unet is not None else pipeline.transformer.config.out_channels + out_channels = ( + pipeline.unet.config.out_channels + if pipeline.unet is not None + else pipeline.transformer.config.out_channels + ) self.assertEqual( outputs.shape, - (batch_size, out_channels, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), + ( + batch_size, + out_channels, + height // pipeline.vae_scale_factor, + width // pipeline.vae_scale_factor, + ), ) @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -676,7 +704,7 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): ov_output = 
ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - + np.testing.assert_allclose(ov_output, diffusers_output, atol=6e-3, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -741,8 +769,10 @@ def test_height_width_properties(self, model_arch: str): ) self.assertFalse(ov_pipeline.is_dynamic) - expected_batch = batch_size * num_images_per_prompt - if ov_pipeline.unet is not None and "timestep_cond" not in {inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs}: + expected_batch = batch_size * num_images_per_prompt + if ov_pipeline.unet is not None and "timestep_cond" not in { + inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs + }: expected_batch *= 2 self.assertEqual( ov_pipeline.batch_size, From 09b5d407c70ca2be6aaf76bda184395e018166c4 Mon Sep 17 00:00:00 2001 From: eaidova Date: Mon, 14 Oct 2024 20:50:16 +0400 Subject: [PATCH 15/24] add export tests --- optimum/exporters/openvino/convert.py | 9 +++++++-- optimum/exporters/openvino/model_configs.py | 2 +- optimum/intel/__init__.py | 6 ++++++ tests/openvino/test_export.py | 2 ++ tests/openvino/test_exporters_cli.py | 2 ++ 5 files changed, 18 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 3aaa36aa0a..80613e8658 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -922,7 +922,12 @@ def get_diffusion_models_for_export_ext( if not is_sd3: return None, get_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter) + models_for_export = get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype) + return None, models_for_export + + +def get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype): models_for_export = {} # Text encoder @@ -951,7 +956,7 @@ def get_diffusion_models_for_export_ext( exporter=exporter, library_name="diffusers", task="semantic-segmentation", - model_type="transformer", + model_type="sd3-transformer", ) transformer_export_config = export_config_constructor( pipeline.transformer.config, int_dtype=int_dtype, float_dtype=float_dtype @@ -1018,4 +1023,4 @@ def get_diffusion_models_for_export_ext( ) models_for_export["text_encoder_3"] = (text_encoder_3, export_config) - return None, models_for_export + return models_for_export diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 6933f0042c..ea5d8d553f 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1598,7 +1598,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int return self.random_float_tensor(shape, framework=framework, dtype=float_dtype) -@register_in_tasks_manager("transformer", *["semantic-segmentation"], library_name="diffusers") +@register_in_tasks_manager("sd3-transformer", *["semantic-segmentation"], library_name="diffusers") class TransformerOpenVINOConfig(UNetOnnxConfig): DUMMY_INPUT_GENERATOR_CLASSES = UNetOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES + ( PooledProjectionsDummyInputGenerator, diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index 548b373afc..0bbf2cedb6 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -100,6 +100,9 @@ "OVStableDiffusionXLPipeline", "OVStableDiffusionXLImg2ImgPipeline", "OVStableDiffusionXLInpaintPipeline", + "OVStableDiffusion3Pipeline", + 
"OVStableDiffusion3Image2ImagePipeline", + "OVStableDiffusion3InpaintPipeline", "OVLatentConsistencyModelPipeline", "OVLatentConsistencyModelImg2ImgPipeline", "OVPipelineForImage2Image", @@ -116,6 +119,9 @@ "OVStableDiffusionXLPipeline", "OVStableDiffusionXLImg2ImgPipeline", "OVStableDiffusionXLInpaintPipeline", + "OVStableDiffusion3Pipeline", + "OVStableDiffusion3Image2ImagePipeline", + "OVStableDiffusion3InpaintPipeline", "OVLatentConsistencyModelPipeline", "OVLatentConsistencyModelImg2ImgPipeline", "OVPipelineForImage2Image", diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 43c535e673..e0a5fbb0e5 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -40,6 +40,7 @@ OVModelForSequenceClassification, OVModelForSpeechSeq2Seq, OVModelForTokenClassification, + OVStableDiffusion3Pipeline, OVStableDiffusionPipeline, OVStableDiffusionXLImg2ImgPipeline, OVStableDiffusionXLPipeline, @@ -68,6 +69,7 @@ class ExportModelTest(unittest.TestCase): "stable-diffusion-xl": OVStableDiffusionXLPipeline, "stable-diffusion-xl-refiner": OVStableDiffusionXLImg2ImgPipeline, "latent-consistency": OVLatentConsistencyModelPipeline, + "stable-diffusion-3": OVStableDiffusion3Pipeline, } GENERATIVE_MODELS = ("pix2struct", "t5", "bart", "gpt2", "whisper") diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 8443f95b31..287a7351fe 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -70,6 +70,7 @@ class OVCLIExportTestCase(unittest.TestCase): ("feature-extraction", "blenderbot"), ("text-to-image", "stable-diffusion"), ("text-to-image", "stable-diffusion-xl"), + ("text-to-image", "stable-diffusion-3"), ("image-to-image", "stable-diffusion-xl-refiner"), ) EXPECTED_NUMBER_OF_TOKENIZER_MODELS = { @@ -84,6 +85,7 @@ class OVCLIExportTestCase(unittest.TestCase): "blenderbot": 2 if is_tokenizers_version("<", "0.20") else 0, "stable-diffusion": 2 if is_tokenizers_version("<", "0.20") else 0, "stable-diffusion-xl": 4 if is_tokenizers_version("<", "0.20") else 0, + "stable-diffusion-3": 6 if is_tokenizers_version("<", "0.20") else 0, } SUPPORTED_SD_HYBRID_ARCHITECTURES = ( From 6ce4b5086fdcbe28366bc3f922aaf53417feddec Mon Sep 17 00:00:00 2001 From: eaidova Date: Tue, 15 Oct 2024 10:42:04 +0400 Subject: [PATCH 16/24] fix cli tests --- optimum/intel/openvino/utils.py | 1 + tests/openvino/test_diffusion.py | 6 +++--- tests/openvino/test_exporters_cli.py | 1 + tests/openvino/utils_tests.py | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index fcc6944e9f..a8aa0643c0 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -119,6 +119,7 @@ "audio-classification": "OVModelForAudioClassification", "stable-diffusion": "OVStableDiffusionPipeline", "stable-diffusion-xl": "OVStableDiffusionXLPipeline", + "stable-diffusion-3": "OVStableDiffusion3Pipeline", "pix2struct": "OVModelForPix2Struct", "latent-consistency": "OVLatentConsistencyModelPipeline", "open_clip_text": "OVModelOpenCLIPText", diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index 951066a811..5e2d998c18 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -316,7 +316,7 @@ def test_height_width_properties(self, model_arch: str): self.assertFalse(ov_pipeline.is_dynamic) expected_batch = batch_size * num_images_per_prompt - if ov_pipeline.unet is not None and 
"timestep_cond" not in { + if ov_pipeline.unet is None or "timestep_cond" not in { inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs }: expected_batch *= 2 @@ -542,7 +542,7 @@ def test_height_width_properties(self, model_arch: str): self.assertFalse(ov_pipeline.is_dynamic) expected_batch = batch_size * num_images_per_prompt - if ov_pipeline.unet is not None and "timestep_cond" not in { + if ov_pipeline.unet is None or "timestep_cond" not in { inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs }: expected_batch *= 2 @@ -770,7 +770,7 @@ def test_height_width_properties(self, model_arch: str): self.assertFalse(ov_pipeline.is_dynamic) expected_batch = batch_size * num_images_per_prompt - if ov_pipeline.unet is not None and "timestep_cond" not in { + if ov_pipeline.unet is None or "timestep_cond" not in { inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs }: expected_batch *= 2 diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 287a7351fe..07bcdd897e 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -39,6 +39,7 @@ OVModelOpenCLIPText, OVModelOpenCLIPVisual, OVSentenceTransformer, + OVStableDiffusion3Pipeline, OVStableDiffusionPipeline, OVStableDiffusionXLPipeline, ) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index eaa4a09836..bd17134d8b 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -118,7 +118,7 @@ "stable-diffusion-openvino": "hf-internal-testing/tiny-stable-diffusion-openvino", "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", "stable-diffusion-xl-refiner": "echarlaix/tiny-random-stable-diffusion-xl-refiner", - "stable-diffusion-3": "katuni4ka/tiny-random-stable-diffusion-3", + "stable-diffusion-3": "yujiepan/stable-diffusion-3-tiny-random", "stablelm": "hf-internal-testing/tiny-random-StableLmForCausalLM", "starcoder2": "hf-internal-testing/tiny-random-Starcoder2ForCausalLM", "latent-consistency": "echarlaix/tiny-random-latent-consistency", From 22232d226c00813aefebf980520f7f0a6ea683e1 Mon Sep 17 00:00:00 2001 From: eaidova Date: Mon, 21 Oct 2024 11:35:30 +0400 Subject: [PATCH 17/24] use fp32 timesteps --- optimum/exporters/openvino/model_configs.py | 19 +++++++++++++++---- tests/openvino/test_diffusion.py | 17 +++++++++++++---- tests/openvino/test_exporters_cli.py | 1 + tests/openvino/utils_tests.py | 1 + 4 files changed, 30 insertions(+), 8 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index ea5d8d553f..70aef3f901 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -46,6 +46,7 @@ DummyInputGenerator, DummyPastKeyValuesGenerator, DummyTextInputGenerator, + DummyTimestepInputGenerator, DummyVisionInputGenerator, FalconDummyPastKeyValuesGenerator, MistralDummyPastKeyValuesGenerator, @@ -1575,7 +1576,7 @@ def patch_model_for_export( class PooledProjectionsDummyInputGenerator(DummyInputGenerator): - SUPPORTED_INPUT_NAMES = "pooled_projection" + SUPPORTED_INPUT_NAMES = "pooled_projections" def __init__( self, @@ -1598,10 +1599,20 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int return self.random_float_tensor(shape, framework=framework, dtype=float_dtype) +class DummyTransformerTimestpsInputGenerator(DummyTimestepInputGenerator): + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", 
float_dtype: str = "fp32"): + if input_name == "timestep": + shape = [self.batch_size] + return self.random_float_tensor(shape, max_value=self.vocab_size, framework=framework, dtype=float_dtype) + return super().generate(input_name, framework, int_dtype, float_dtype) + + @register_in_tasks_manager("sd3-transformer", *["semantic-segmentation"], library_name="diffusers") -class TransformerOpenVINOConfig(UNetOnnxConfig): - DUMMY_INPUT_GENERATOR_CLASSES = UNetOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES + ( - PooledProjectionsDummyInputGenerator, +class SD3TransformerOpenVINOConfig(UNetOnnxConfig): + DUMMY_INPUT_GENERATOR_CLASSES = ( + (DummyTransformerTimestpsInputGenerator,) + + UNetOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES + + (PooledProjectionsDummyInputGenerator,) ) NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args( image_size="sample_size", diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index 5e2d998c18..ad9f18c1ab 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -35,6 +35,7 @@ OVPipelineForInpainting, OVPipelineForText2Image, ) +from optimum.intel.utils.import_utils import is_transformers_version from optimum.utils.testing_utils import require_diffusers @@ -72,7 +73,10 @@ def _generate_images(height=128, width=128, batch_size=1, channel=3, input_type= class OVPipelineForText2ImageTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency", "stable-diffusion-3"] + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] + if is_transformers_version(">=", "4.40.0"): + SUPPORTED_ARCHITECTURES.append("stable-diffusion-3") + CALLBACK_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] OVMODEL_CLASS = OVPipelineForText2Image AUTOMODEL_CLASS = AutoPipelineForText2Image @@ -138,7 +142,7 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): np.testing.assert_allclose(ov_output, diffusers_output, atol=6e-3, rtol=1e-2) - @parameterized.expand(["stable-diffusion", "stable-diffusion-xl", "latent-consistency"]) + @parameterized.expand(CALLBACK_SUPPORT_ARCHITECTURES) @require_diffusers def test_callback(self, model_arch: str): height, width, batch_size = 64, 128, 1 @@ -353,7 +357,9 @@ def test_textual_inversion(self): class OVPipelineForImage2ImageTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency", "stable-diffusion-3"] + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] + if is_transformers_version(">=", "4.40.0"): + SUPPORTED_ARCHITECTURES.append("stable-diffusion-3") AUTOMODEL_CLASS = AutoPipelineForImage2Image OVMODEL_CLASS = OVPipelineForImage2Image @@ -576,7 +582,10 @@ def test_textual_inversion(self): class OVPipelineForInpaintingTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "stable-diffusion-3"] + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl"] + + if is_transformers_version(">=", "4.40.0"): + SUPPORTED_ARCHITECTURES.append("stable-diffusion-3") AUTOMODEL_CLASS = AutoPipelineForInpainting OVMODEL_CLASS = OVPipelineForInpainting diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 07bcdd897e..a697665994 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -93,6 +93,7 @@ class OVCLIExportTestCase(unittest.TestCase): 
("stable-diffusion", 72, 195), ("stable-diffusion-xl", 84, 331), ("latent-consistency", 50, 135), + ("stable-diffusion-3", 84, 331), ) TEST_4BIT_CONFIGURATONS = [ diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index bd17134d8b..114de633c2 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -171,6 +171,7 @@ "stable-diffusion-xl": (366, 34, 42, 66), "stable-diffusion-xl-refiner": (366, 34, 42, 66), "open-clip": (20, 28), + "stable-diffusion-3": (366, 34, 42, 66), } From 8da6765599d7968243a4958eac4dcca40868bb81 Mon Sep 17 00:00:00 2001 From: eaidova Date: Mon, 21 Oct 2024 20:13:22 +0400 Subject: [PATCH 18/24] add flux --- optimum/commands/export/openvino.py | 8 ++ optimum/exporters/openvino/convert.py | 97 +++++++++++++- optimum/exporters/openvino/model_configs.py | 125 +++++++++++++++++- optimum/exporters/openvino/model_patcher.py | 48 ++++++- optimum/intel/__init__.py | 2 + optimum/intel/openvino/__init__.py | 1 + optimum/intel/openvino/modeling_base.py | 12 +- optimum/intel/openvino/modeling_diffusion.py | 64 ++++++--- .../dummy_openvino_and_diffusers_objects.py | 11 ++ tests/openvino/test_diffusion.py | 50 ++++--- tests/openvino/test_exporters_cli.py | 23 ++-- tests/openvino/test_quantization.py | 22 ++- tests/openvino/utils_tests.py | 1 + 13 files changed, 398 insertions(+), 66 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 93528e0085..c90258078f 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -318,6 +318,14 @@ def run(self): from optimum.intel import OVStableDiffusionPipeline model_cls = OVStableDiffusionPipeline + elif class_name == "StableDiffusion3Pipeline": + from optimum.intel import OVStableDiffusion3Pipeline + + model_cls = OVStableDiffusion3Pipeline + elif class_name == "FluxPipeline": + from optimum.intel import OVFluxPipeline + + model_cls = OVFluxPipeline else: raise NotImplementedError(f"Quantization in hybrid mode isn't supported for class {class_name}.") diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 80613e8658..853d0232df 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -920,9 +920,19 @@ def get_diffusion_models_for_export_ext( except ImportError: is_sd3 = False - if not is_sd3: + try: + from diffusers import FluxPipeline + + is_flux = isinstance(pipeline, FluxPipeline) + except ImportError: + is_flux = False + + if not is_sd3 and not is_flux: return None, get_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter) - models_for_export = get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype) + if is_sd3: + models_for_export = get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype) + else: + models_for_export = get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype) return None, models_for_export @@ -1024,3 +1034,86 @@ def get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype): models_for_export["text_encoder_3"] = (text_encoder_3, export_config) return models_for_export + + +def get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype): + models_for_export = {} + + # Text encoder + text_encoder = getattr(pipeline, "text_encoder", None) + if text_encoder is not None: + text_encoder_config_constructor = TasksManager.get_exporter_config_constructor( + model=text_encoder, + exporter=exporter, + library_name="diffusers", + 
task="feature-extraction", + model_type="clip-text-model", + ) + text_encoder_export_config = text_encoder_config_constructor( + pipeline.text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["text_encoder"] = (text_encoder, text_encoder_export_config) + + transformer = pipeline.transformer + transformer.config.text_encoder_projection_dim = transformer.config.joint_attention_dim + transformer.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False) + transformer.config.time_cond_proj_dim = None + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=transformer, + exporter=exporter, + library_name="diffusers", + task="semantic-segmentation", + model_type="flux-transformer", + ) + transformer_export_config = export_config_constructor( + pipeline.transformer.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["transformer"] = (transformer, transformer_export_config) + + # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565 + vae_encoder = copy.deepcopy(pipeline.vae) + vae_encoder.forward = lambda sample: {"latent_parameters": vae_encoder.encode(x=sample)["latent_dist"].parameters} + vae_config_constructor = TasksManager.get_exporter_config_constructor( + model=vae_encoder, + exporter=exporter, + library_name="diffusers", + task="semantic-segmentation", + model_type="vae-encoder", + ) + vae_encoder_export_config = vae_config_constructor( + vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["vae_encoder"] = (vae_encoder, vae_encoder_export_config) + + # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600 + vae_decoder = copy.deepcopy(pipeline.vae) + vae_decoder.forward = lambda latent_sample: vae_decoder.decode(z=latent_sample) + vae_config_constructor = TasksManager.get_exporter_config_constructor( + model=vae_decoder, + exporter=exporter, + library_name="diffusers", + task="semantic-segmentation", + model_type="vae-decoder", + ) + vae_decoder_export_config = vae_config_constructor( + vae_decoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["vae_decoder"] = (vae_decoder, vae_decoder_export_config) + + text_encoder_2 = getattr(pipeline, "text_encoder_2", None) + if text_encoder_2 is not None: + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=text_encoder_2, + exporter=exporter, + library_name="diffusers", + task="feature-extraction", + model_type="t5-encoder-model", + ) + export_config = export_config_constructor( + text_encoder_2.config, + int_dtype=int_dtype, + float_dtype=float_dtype, + ) + models_for_export["text_encoder_2"] = (text_encoder_2, export_config) + + return models_for_export diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 70aef3f901..49dbd1e5c7 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -43,8 +43,10 @@ from optimum.exporters.tasks import TasksManager from optimum.utils import DEFAULT_DUMMY_SHAPES from optimum.utils.input_generators import ( + DTYPE_MAPPER, DummyInputGenerator, DummyPastKeyValuesGenerator, + DummySeq2SeqDecoderTextInputGenerator, DummyTextInputGenerator, DummyTimestepInputGenerator, DummyVisionInputGenerator, @@ -63,6 +65,7 @@ DBRXModelPatcher, DeciLMModelPatcher, FalconModelPatcher, + FluxTransfromerModelPatcher, 
Gemma2ModelPatcher, GptNeoxJapaneseModelPatcher, GptNeoxModelPatcher, @@ -96,9 +99,9 @@ def init_model_configs(): "transformers", "LlavaNextForConditionalGeneration", ) - TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS[ - "image-text-to-text" - ] = TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["text-generation"] + TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["image-text-to-text"] = ( + TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["text-generation"] + ) supported_model_types = [ "_SUPPORTED_MODEL_TYPE", @@ -1576,7 +1579,7 @@ def patch_model_for_export( class PooledProjectionsDummyInputGenerator(DummyInputGenerator): - SUPPORTED_INPUT_NAMES = "pooled_projections" + SUPPORTED_INPUT_NAMES = ["pooled_projections"] def __init__( self, @@ -1600,8 +1603,10 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int class DummyTransformerTimestpsInputGenerator(DummyTimestepInputGenerator): + SUPPORTED_INPUT_NAMES = ("timestep", "text_embeds", "time_ids", "timestep_cond", "guidance") + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): - if input_name == "timestep": + if input_name in ["timestep", "guidance"]: shape = [self.batch_size] return self.random_float_tensor(shape, max_value=self.vocab_size, framework=framework, dtype=float_dtype) return super().generate(input_name, framework, int_dtype, float_dtype) @@ -1642,3 +1647,113 @@ def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> ModelPatcher: return ModelPatcher(self, model, model_kwargs=model_kwargs) + + +class DummyFluxTransformerInputGenerator(DummyVisionInputGenerator): + SUPPORTED_INPUT_NAMES = ( + "pixel_values", + "pixel_mask", + "sample", + "latent_sample", + "hidden_states", + "img_ids", + ) + + def __init__( + self, + task: str, + normalized_config: NormalizedVisionConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], + width: int = DEFAULT_DUMMY_SHAPES["width"], + height: int = DEFAULT_DUMMY_SHAPES["height"], + **kwargs, + ): + + super().__init__(task, normalized_config, batch_size, num_channels, width, height, **kwargs) + if getattr(normalized_config, "in_channels", None): + self.num_channels = normalized_config.in_channels // 4 + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name in ["hidden_states", "sample"]: + shape = [self.batch_size, (self.height // 2) * (self.width // 2), self.num_channels * 4] + return self.random_float_tensor(shape, framework=framework, dtype=float_dtype) + if input_name == "img_ids": + return self.prepare_image_ids(framework, int_dtype, float_dtype) + + return super().generate(input_name, framework, int_dtype, float_dtype) + + def prepare_image_ids(self, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + img_ids_height = self.height // 2 + img_ids_width = self.width // 2 + if framework == "pt": + import torch + + latent_image_ids = torch.zeros(img_ids_height, img_ids_width, 3) + latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(img_ids_height)[:, None] + latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(img_ids_width)[None, :] + + latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape + + latent_image_ids = latent_image_ids[None, :].repeat(self.batch_size, 1, 1, 1) + 
latent_image_ids = latent_image_ids.reshape( + self.batch_size, latent_image_id_height * latent_image_id_width, latent_image_id_channels + ) + latent_image_ids.to(DTYPE_MAPPER.pt(float_dtype)) + return latent_image_ids + if framework == "np": + import numpy as np + + latent_image_ids = np.zeros(img_ids_height, img_ids_width, 3) + latent_image_ids[..., 1] = latent_image_ids[..., 1] + np.arange(img_ids_height)[:, None] + latent_image_ids[..., 2] = latent_image_ids[..., 2] + np.arange(img_ids_width)[None, :] + + latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape + + latent_image_ids = np.tile(latent_image_ids[None, :], (self.batch_size, 1, 1, 1)) + latent_image_ids = latent_image_ids.reshape( + self.batch_size, latent_image_id_height * latent_image_id_width, latent_image_id_channels + ) + latent_image_ids.astype(DTYPE_MAPPER.np[float_dtype]) + return latent_image_ids + + +class DummyFluxTextInputGenerator(DummySeq2SeqDecoderTextInputGenerator): + SUPPORTED_INPUT_NAMES = ( + "decoder_input_ids", + "decoder_attention_mask", + "encoder_outputs", + "encoder_hidden_states", + "txt_ids", + ) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "txt_ids": + return self.constant_tensor([self.batch_size, self.sequence_length, 3], 0, DTYPE_MAPPER.pt(float_dtype)) + return super().generate(input_name, framework, int_dtype, float_dtype) + + +@register_in_tasks_manager("flux-transformer", *["semantic-segmentation"], library_name="diffusers") +class FluxTransformerOpenVINOConfig(SD3TransformerOpenVINOConfig): + DUMMY_INPUT_GENERATOR_CLASSES = ( + DummyTransformerTimestpsInputGenerator, + DummyFluxTransformerInputGenerator, + DummyFluxTextInputGenerator, + PooledProjectionsDummyInputGenerator, + ) + + @property + def inputs(self): + common_inputs = super().inputs + common_inputs.pop("sample", None) + common_inputs["hidden_states"] = {0: "batch_size", 1: "packed_height_width"} + common_inputs["txt_ids"] = {0: "batch_size", 1: "sequence_length"} + common_inputs["img_ids"] = {0: "batch_size", 1: "packed_height_width"} + if getattr(self._normalized_config, "guidance_embeds", False): + common_inputs["guidance"] = {0: "batch_size"} + return common_inputs + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> ModelPatcher: + return FluxTransfromerModelPatcher(self, model, model_kwargs=model_kwargs) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index eadce6d382..faafdda430 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -411,9 +411,9 @@ def _llama_gemma_update_causal_mask_legacy(self, attention_mask, input_tensor, c offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[ - : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] - ] = mask_slice + causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = ( + mask_slice + ) if ( self.config._attn_implementation == "sdpa" @@ -1979,9 +1979,9 @@ def _dbrx_update_causal_mask_legacy( offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[ - : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] - ] = 
mask_slice + causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = ( + mask_slice + ) if ( self.config._attn_implementation == "sdpa" @@ -2705,3 +2705,39 @@ def __init__( def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) self._model.forward = self._model.__orig_forward + + +def _embednb_forward(self, ids: torch.Tensor) -> torch.Tensor: + def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor: + assert dim % 2 == 0, "The dimension must be even." + + scale = torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device) / dim + omega = 1.0 / (theta**scale) + + batch_size, seq_length = pos.shape + out = pos.unsqueeze(-1) * omega.unsqueeze(0).unsqueeze(0) + cos_out = torch.cos(out) + sin_out = torch.sin(out) + + stacked_out = torch.stack([cos_out, -sin_out, sin_out, cos_out], dim=-1) + out = stacked_out.view(batch_size, -1, dim // 2, 2, 2) + return out.float() + + n_axes = ids.shape[-1] + emb = torch.cat( + [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)], + dim=-3, + ) + return emb.unsqueeze(1) + + +class FluxTransfromerModelPatcher(ModelPatcher): + def __enter__(self): + super().__enter__() + self._model.pos_embed._orig_forward = self._model.pos_embed.forward + self._model.pos_embed.forward = types.MethodType(_embednb_forward, self._model.pos_embed) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + + self._model.pos_embed.forward = self._model.pos_embed._orig_forward diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index 0bbf2cedb6..31526f1aa8 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -105,6 +105,7 @@ "OVStableDiffusion3InpaintPipeline", "OVLatentConsistencyModelPipeline", "OVLatentConsistencyModelImg2ImgPipeline", + "OVFluxPipeline", "OVPipelineForImage2Image", "OVPipelineForText2Image", "OVPipelineForInpainting", @@ -124,6 +125,7 @@ "OVStableDiffusion3InpaintPipeline", "OVLatentConsistencyModelPipeline", "OVLatentConsistencyModelImg2ImgPipeline", + "OVFluxPipeline", "OVPipelineForImage2Image", "OVPipelineForText2Image", "OVPipelineForInpainting", diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index da8b95499b..589a0938e3 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -82,6 +82,7 @@ if is_diffusers_available(): from .modeling_diffusion import ( OVDiffusionPipeline, + OVFluxPipeline, OVLatentConsistencyModelImg2ImgPipeline, OVLatentConsistencyModelPipeline, OVPipelineForImage2Image, diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index ed3cdadb51..d4123c0bd5 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -111,9 +111,9 @@ def __init__( for idx, key in enumerate(model.inputs): names = tuple(key.get_names()) input_names[next((name for name in names if "/" not in name), names[0])] = idx - input_dtypes[ - next((name for name in names if "/" not in name), names[0]) - ] = key.get_element_type().get_type_name() + input_dtypes[next((name for name in names if "/" not in name), names[0])] = ( + key.get_element_type().get_type_name() + ) self.input_names = input_names self.input_dtypes = input_dtypes @@ -122,9 +122,9 @@ def __init__( for idx, key in enumerate(model.outputs): names = tuple(key.get_names()) output_names[next((name for name in names if "/" not in name), names[0])] = idx - 
output_dtypes[ - next((name for name in names if "/" not in name), names[0]) - ] = key.get_element_type().get_type_name() + output_dtypes[next((name for name in names if "/" not in name), names[0])] = ( + key.get_element_type().get_type_name() + ) self.output_names = output_names self.output_dtypes = output_dtypes diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index ca5f37ca57..7cf064af13 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -88,9 +88,10 @@ StableDiffusion3Pipeline, StableDiffusion3Img2ImgPipeline = StableDiffusionPipeline, StableDiffusionImg2ImgPipeline if is_diffusers_version(">=", "0.30.0"): - from diffusers import StableDiffusion3InpaintPipeline + from diffusers import FluxPipeline, StableDiffusion3InpaintPipeline else: - StableDiffusion3InpaintPipeline = StableDiffusion3Pipeline + StableDiffusion3InpaintPipeline = StableDiffusionInpaintPipeline + FluxPipeline = StableDiffusionPipeline DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER = "transformer" @@ -564,7 +565,8 @@ def to(self, *args, device: Optional[str] = None, dtype: Optional[torch.dtype] = @property def height(self) -> int: - model = self.unet.model if self.unet is not None else self.transformer.model + # flux transformer does not preserve info about height/width, they are known in vae_decoder + model = self.unet.model if self.unet is not None else self.vae.decoder.model height = model.inputs[0].get_partial_shape()[2] if height.is_dynamic: return -1 @@ -572,7 +574,8 @@ def height(self) -> int: @property def width(self) -> int: - model = self.unet.model if self.unet is not None else self.transformer.model + # flux transformer does not preserve info about height/width, they are known in vae_decoder + model = self.unet.model if self.unet is not None else self.vae.decoder.model width = model.inputs[0].get_partial_shape()[3] if width.is_dynamic: return -1 @@ -646,28 +649,42 @@ def _reshape_transformer( batch_size = -1 else: # The factor of 2 comes from the guidance scale > 1 - batch_size *= 2 * num_images_per_prompt + batch_size *= num_images_per_prompt + if "img_ids" not in {inputs.get_any_name() for inputs in model.inputs}: + batch_size *= 2 height = height // self.vae_scale_factor if height > 0 else height width = width // self.vae_scale_factor if width > 0 else width + packed_height = height // 2 if height > 0 else height + packed_width = width // 2 if width > 0 else width + packed_height_width = packed_width * packed_height if height > 0 and width > 0 else -1 shapes = {} for inputs in model.inputs: shapes[inputs] = inputs.get_partial_shape() - if inputs.get_any_name() == "timestep": + if inputs.get_any_name() in ["timestep", "guidance"]: shapes[inputs][0] = batch_size elif inputs.get_any_name() == "hidden_states": in_channels = self.transformer.config.get("in_channels", None) if in_channels is None: - in_channels = shapes[inputs][1] + in_channels = ( + shapes[inputs][1] if inputs.get_partial_shape().rank.get_length() == 4 else shapes[inputs][2] + ) if in_channels.is_dynamic: logger.warning( "Could not identify `in_channels` from the unet configuration, to statically reshape the unet please provide a configuration."
) self.is_dynamic = True + if inputs.get_partial_shape().rank.get_length() == 4: + shapes[inputs] = [batch_size, in_channels, height, width] + else: + shapes[inputs] = [batch_size, packed_height_width, in_channels] - shapes[inputs] = [batch_size, in_channels, height, width] elif inputs.get_any_name() == "pooled_projections": shapes[inputs] = [batch_size, self.transformer.config["pooled_projection_dim"]] + elif inputs.get_any_name() == "img_ids": + shapes[inputs] = [batch_size, packed_height_width, 3] + elif inputs.get_any_name() == "txt_ids": + shapes[inputs] = [batch_size, -1, 3] else: shapes[inputs][0] = batch_size shapes[inputs][1] = -1 # text_encoder_3 may have vary input length @@ -836,7 +853,7 @@ def components(self) -> Dict[str, Any]: components = { "vae": self.vae, "unet": self.unet, - "transfomer": self.transformer, + "transformer": self.transformer, "text_encoder": self.text_encoder, "text_encoder_2": self.text_encoder_2, "text_encoder_3": self.text_encoder_2, @@ -968,13 +985,14 @@ def forward( return_dict: bool = False, ): self._compile() - model_inputs = {"input_ids": input_ids} ov_outputs = self.request(model_inputs, share_inputs=True) main_out = ov_outputs[0] model_outputs = {} model_outputs[self.model.outputs[0].get_any_name()] = torch.from_numpy(main_out) + if len(self.model.outputs) > 1 and "pooler_output" in self.model.outputs[1].get_any_name(): + model_outputs["pooler_output"] = torch.from_numpy(ov_outputs[1]) if self.hidden_states_output_names and "last_hidden_state" not in model_outputs: model_outputs["last_hidden_state"] = torch.from_numpy(ov_outputs[self.hidden_states_output_names[-1]]) if ( @@ -987,7 +1005,6 @@ def forward( if return_dict: return model_outputs - return ModelOutput(**model_outputs) @@ -1052,6 +1069,9 @@ def forward( encoder_hidden_states: torch.FloatTensor = None, pooled_projections: torch.FloatTensor = None, timestep: torch.LongTensor = None, + img_ids: torch.Tensor = None, + txt_ids: torch.Tensor = None, + guidance: torch.Tensor = None, block_controlnet_hidden_states: List = None, joint_attention_kwargs: Optional[Dict[str, Any]] = None, return_dict: bool = True, @@ -1065,6 +1085,13 @@ def forward( "pooled_projections": pooled_projections, } + if img_ids is not None: + model_inputs["img_ids"] = img_ids + if txt_ids is not None: + model_inputs["txt_ids"] = txt_ids + if guidance is not None: + model_inputs["guidance"] = guidance + ov_outputs = self.request(model_inputs, share_inputs=True).to_dict() model_outputs = {} @@ -1359,17 +1386,23 @@ class OVStableDiffusion3Pipeline(OVDiffusionPipeline, OVTextualInversionLoaderMi class OVStableDiffusion3Img2ImgPipeline( OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusion3Img2ImgPipeline ): - main_input_name = "prompt" - export_feature = "text-to-image" + main_input_name = "image" + export_feature = "image-to-image" auto_model_class = StableDiffusion3Img2ImgPipeline class OVStableDiffusion3InpaintPipeline( OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusion3InpaintPipeline ): + main_input_name = "image" + export_feature = "inpainting" + auto_model_class = StableDiffusion3InpaintPipeline + + +class OVFluxPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, FluxPipeline): main_input_name = "prompt" export_feature = "text-to-image" - auto_model_class = StableDiffusion3InpaintPipeline + auto_model_class = FluxPipeline SUPPORTED_OV_PIPELINES = [ @@ -1431,8 +1464,9 @@ def _get_ov_class(pipeline_class_name: str, throw_error_if_not_exist: bool = Tru 
OV_IMAGE2IMAGE_PIPELINES_MAPPING["stable-diffusion-3"] = OVStableDiffusion3Img2ImgPipeline if is_diffusers_version(">=", "0.30.0"): - SUPPORTED_OV_PIPELINES.append(OVStableDiffusion3InpaintPipeline) + SUPPORTED_OV_PIPELINES.extend([OVStableDiffusion3InpaintPipeline, OVFluxPipeline]) OV_INPAINT_PIPELINES_MAPPING["stable-diffusion-3"] = OVStableDiffusion3InpaintPipeline + OV_TEXT2IMAGE_PIPELINES_MAPPING["flux"] = OVFluxPipeline SUPPORTED_OV_PIPELINES_MAPPINGS = [ diff --git a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py index 6d6f6262e7..38aea6c1f1 100644 --- a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py +++ b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py @@ -178,3 +178,14 @@ def __init__(self, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["openvino", "diffusers"]) + + +class OVFluxPipeline(metaclass=DummyObject): + _backends = ["openvino", "diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "diffusers"]) diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index ad9f18c1ab..fb5d75c570 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -74,8 +74,10 @@ def _generate_images(height=128, width=128, batch_size=1, channel=3, input_type= class OVPipelineForText2ImageTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] + NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] if is_transformers_version(">=", "4.40.0"): - SUPPORTED_ARCHITECTURES.append("stable-diffusion-3") + SUPPORTED_ARCHITECTURES.extend(["stable-diffusion-3", "flux"]) + NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES.append("stable-diffusion-3") CALLBACK_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] OVMODEL_CLASS = OVPipelineForText2Image @@ -189,20 +191,26 @@ def test_shape(self, model_arch: str): elif output_type == "pt": self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: - out_channels = ( - pipeline.unet.config.out_channels - if pipeline.unet is not None - else pipeline.transformer.config.out_channels - ) - self.assertEqual( - outputs.shape, - ( - batch_size, - out_channels, - height // pipeline.vae_scale_factor, - width // pipeline.vae_scale_factor, - ), - ) + if model_arch != "flux": + out_channels = ( + pipeline.unet.config.out_channels + if pipeline.unet is not None + else pipeline.transformer.config.out_channels + ) + self.assertEqual( + outputs.shape, + ( + batch_size, + out_channels, + height // pipeline.vae_scale_factor, + width // pipeline.vae_scale_factor, + ), + ) + else: + packed_height = height // pipeline.vae_scale_factor + packed_width = width // pipeline.vae_scale_factor + channels = pipeline.transformer.config.in_channels + self.assertEqual(outputs.shape, (batch_size, packed_height * packed_width, channels)) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -220,7 +228,7 @@ def test_image_reproducibility(self, model_arch: str): self.assertFalse(np.array_equal(ov_outputs_1.images[0], ov_outputs_3.images[0])) np.testing.assert_allclose(ov_outputs_1.images[0], ov_outputs_2.images[0], atol=1e-4, rtol=1e-2) - @parameterized.expand(SUPPORTED_ARCHITECTURES) + 
@parameterized.expand(NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES) def test_negative_prompt(self, model_arch: str): height, width, batch_size = 64, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) @@ -320,9 +328,13 @@ def test_height_width_properties(self, model_arch: str): self.assertFalse(ov_pipeline.is_dynamic) expected_batch = batch_size * num_images_per_prompt - if ov_pipeline.unet is None or "timestep_cond" not in { - inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs - }: + if ( + ov_pipeline.unet is not None + and "timestep_cond" not in {inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs} + ) or ( + ov_pipeline.transformer is not None + and "txt_ids" not in {inputs.get_any_name() for inputs in ov_pipeline.transformer.model.inputs} + ): expected_batch *= 2 self.assertEqual( ov_pipeline.batch_size, diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index a697665994..4148d51e9c 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -49,6 +49,7 @@ compare_versions, is_openvino_tokenizers_available, is_tokenizers_version, + is_transformers_version, ) @@ -57,7 +58,7 @@ class OVCLIExportTestCase(unittest.TestCase): Integration tests ensuring supported models are correctly exported. """ - SUPPORTED_ARCHITECTURES = ( + SUPPORTED_ARCHITECTURES = [ ("text-generation", "gpt2"), ("text-generation-with-past", "gpt2"), ("text2text-generation", "t5"), @@ -71,9 +72,11 @@ class OVCLIExportTestCase(unittest.TestCase): ("feature-extraction", "blenderbot"), ("text-to-image", "stable-diffusion"), ("text-to-image", "stable-diffusion-xl"), - ("text-to-image", "stable-diffusion-3"), ("image-to-image", "stable-diffusion-xl-refiner"), - ) + ] + + if is_transformers_version(">=", "4.45"): + SUPPORTED_ARCHITECTURES.append(("text-to-image", "stable-diffusion-3")) EXPECTED_NUMBER_OF_TOKENIZER_MODELS = { "gpt2": 2 if is_tokenizers_version("<", "0.20") else 0, "t5": 0, # no .model file in the repository @@ -89,12 +92,14 @@ class OVCLIExportTestCase(unittest.TestCase): "stable-diffusion-3": 6 if is_tokenizers_version("<", "0.20") else 0, } - SUPPORTED_SD_HYBRID_ARCHITECTURES = ( + SUPPORTED_SD_HYBRID_ARCHITECTURES = [ ("stable-diffusion", 72, 195), ("stable-diffusion-xl", 84, 331), ("latent-consistency", 50, 135), - ("stable-diffusion-3", 84, 331), - ) + ] + + if is_transformers_version(">=", "4.45"): + SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("stable-diffusion-3", 9, 65)) TEST_4BIT_CONFIGURATONS = [ ("text-generation-with-past", "opt125m", "int4 --sym --group-size 128", {"int8": 4, "int4": 72}), @@ -213,7 +218,7 @@ def test_exporters_cli_int8(self, task: str, model_type: str): if task.endswith("with-past"): models.append(model.decoder_with_past) elif model_type.startswith("stable-diffusion"): - models = [model.unet, model.vae_encoder, model.vae_decoder] + models = [model.unet or model.transformer, model.vae_encoder, model.vae_decoder] models.append(model.text_encoder if model_type == "stable-diffusion" else model.text_encoder_2) else: models = [model] @@ -232,7 +237,9 @@ def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: in check=True, ) model = eval(_HEAD_TO_AUTOMODELS[model_type.replace("-refiner", "")]).from_pretrained(tmpdir) - num_fq, num_weight_nodes = get_num_quantized_nodes(model.unet) + num_fq, num_weight_nodes = get_num_quantized_nodes( + model.unet if model.unet is not None else model.transformer + ) self.assertEqual(exp_num_int8, 
num_weight_nodes["int8"]) self.assertEqual(exp_num_fq, num_fq) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index b294e3e221..43f87af60c 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -56,6 +56,7 @@ OVModelForSpeechSeq2Seq, OVStableDiffusionPipeline, OVStableDiffusionXLPipeline, + OVStableDiffusion3Pipeline, OVQuantizer, OVTrainer, OVQuantizationConfig, @@ -300,11 +301,16 @@ class OVWeightCompressionTest(unittest.TestCase): (OVModelOpenCLIPForZeroShotImageClassification, "open-clip"), ) - SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = ( + SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = [ (OVStableDiffusionPipeline, "stable-diffusion", 72, 195), (OVStableDiffusionXLPipeline, "stable-diffusion-xl", 84, 331), (OVLatentConsistencyModelPipeline, "latent-consistency", 50, 135), - ) + ] + + if is_transformers_version(">=", "4.45.0"): + SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION.append( + (OVStableDiffusion3Pipeline, "stable-diffusion-3", 9, 65) + ) IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3") @@ -454,7 +460,9 @@ def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_num_f with TemporaryDirectory() as tmp_dir: model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config) - num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(model.unet) + num_fake_quantize, num_weight_nodes = get_num_quantized_nodes( + model.unet if model.unet is not None else model.transformer + ) self.assertEqual(expected_num_fake_quantize, num_fake_quantize) self.assertEqual(expected_ov_int8, num_weight_nodes["int8"]) self.assertEqual(0, num_weight_nodes["int4"]) @@ -468,7 +476,9 @@ def test_stable_diffusion_with_weight_compression(self): quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config)) - num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(int8_pipe.unet) + num_fake_quantize, num_weight_nodes = get_num_quantized_nodes( + int8_pipe.unet if int8_pipe.unet is not None else int8_pipe.transformer + ) self.assertEqual(0, num_fake_quantize) self.assertEqual(242, num_weight_nodes["int8"]) self.assertEqual(0, num_weight_nodes["int4"]) @@ -487,7 +497,9 @@ def test_ovmodel_hybrid_quantization_with_custom_dataset( self.assertEqual(quantization_config.quant_method, OVQuantizationMethod.HYBRID) quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config), calibration_dataset=dataset) - num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(model.unet) + num_fake_quantize, num_weight_nodes = get_num_quantized_nodes( + model.unet if model.unet is not None else model.transformer + ) self.assertEqual(expected_num_fake_quantize, num_fake_quantize) self.assertEqual(expected_ov_int8, num_weight_nodes["int8"]) self.assertEqual(0, num_weight_nodes["int4"]) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 114de633c2..30aa8703e1 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -59,6 +59,7 @@ "falcon": "fxmarty/really-tiny-falcon-testing", "falcon-40b": "katuni4ka/tiny-random-falcon-40b", "flaubert": "hf-internal-testing/tiny-random-flaubert", + "flux": "katuni4ka/tiny-random-flux", "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", "gpt2": "hf-internal-testing/tiny-random-gpt2", "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", From 45976413cd88d6c9c261877de12fff567985a69a Mon Sep 17 00:00:00 2001 From: eaidova Date: Tue, 22 
Oct 2024 09:13:24 +0400
Subject: [PATCH 19/24] fix after black update

---
 optimum/exporters/openvino/model_configs.py |  7 +++----
 optimum/exporters/openvino/model_patcher.py | 12 ++++++------
 optimum/intel/openvino/modeling_base.py     | 12 ++++++------
 3 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 49dbd1e5c7..b7bbd55d15 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -99,9 +99,9 @@ def init_model_configs():
         "transformers",
         "LlavaNextForConditionalGeneration",
     )
-    TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["image-text-to-text"] = (
-        TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["text-generation"]
-    )
+    TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS[
+        "image-text-to-text"
+    ] = TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["text-generation"]

     supported_model_types = [
         "_SUPPORTED_MODEL_TYPE",
@@ -1669,7 +1669,6 @@ def __init__(
         height: int = DEFAULT_DUMMY_SHAPES["height"],
         **kwargs,
     ):
-
         super().__init__(task, normalized_config, batch_size, num_channels, width, height, **kwargs)
         if getattr(normalized_config, "in_channels", None):
             self.num_channels = normalized_config.in_channels // 4
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index faafdda430..ed8cb3b488 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -411,9 +411,9 @@ def _llama_gemma_update_causal_mask_legacy(self, attention_mask, input_tensor, c
                 offset = 0
             mask_shape = attention_mask.shape
             mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype
-            causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = (
-                mask_slice
-            )
+            causal_mask[
+                : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]
+            ] = mask_slice

         if (
             self.config._attn_implementation == "sdpa"
@@ -1979,9 +1979,9 @@ def _dbrx_update_causal_mask_legacy(
                 offset = 0
             mask_shape = attention_mask.shape
             mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype
-            causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = (
-                mask_slice
-            )
+            causal_mask[
+                : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]
+            ] = mask_slice

         if (
             self.config._attn_implementation == "sdpa"
diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py
index d4123c0bd5..ed3cdadb51 100644
--- a/optimum/intel/openvino/modeling_base.py
+++ b/optimum/intel/openvino/modeling_base.py
@@ -111,9 +111,9 @@ def __init__(
         for idx, key in enumerate(model.inputs):
             names = tuple(key.get_names())
             input_names[next((name for name in names if "/" not in name), names[0])] = idx
-            input_dtypes[next((name for name in names if "/" not in name), names[0])] = (
-                key.get_element_type().get_type_name()
-            )
+            input_dtypes[
+                next((name for name in names if "/" not in name), names[0])
+            ] = key.get_element_type().get_type_name()
         self.input_names = input_names
         self.input_dtypes = input_dtypes

@@ -122,9 +122,9 @@ def __init__(
         for idx, key in enumerate(model.outputs):
             names = tuple(key.get_names())
             output_names[next((name for name in names if "/" not in name), names[0])] = idx
-            output_dtypes[next((name for name in names if "/" not in name), names[0])] = (
-                key.get_element_type().get_type_name()
-            )
+            output_dtypes[
+                next((name for name in names if "/" not in name), names[0])
+            ] = key.get_element_type().get_type_name()
         self.output_names = output_names
         self.output_dtypes = output_dtypes

From 4d9249d53f4473e9d0119138b42f7c9f0bcda831 Mon Sep 17 00:00:00 2001
From: eaidova
Date: Tue, 22 Oct 2024 17:10:22 +0400
Subject: [PATCH 20/24] apply review comments

---
 optimum/commands/export/openvino.py         |  4 --
 optimum/exporters/openvino/model_configs.py | 50 ++++++---------------
 optimum/intel/openvino/utils.py             |  1 +
 tests/openvino/test_export.py               |  7 ++-
 tests/openvino/test_exporters_cli.py        |  8 ++--
 tests/openvino/test_quantization.py         |  7 ++-
 tests/openvino/utils_tests.py               |  3 +-
 7 files changed, 32 insertions(+), 48 deletions(-)

diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index c90258078f..70d2e4885c 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -322,10 +322,6 @@ def run(self):
                 from optimum.intel import OVStableDiffusion3Pipeline

                 model_cls = OVStableDiffusion3Pipeline
-            elif class_name == "FluxPipeline":
-                from optimum.intel import OVFluxPipeline
-
-                model_cls = OVFluxPipeline
             else:
                 raise NotImplementedError(f"Quantization in hybrid mode isn't supported for class {class_name}.")
diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index b7bbd55d15..85360e9a57 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -1678,43 +1678,17 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
             shape = [self.batch_size, (self.height // 2) * (self.width // 2), self.num_channels * 4]
             return self.random_float_tensor(shape, framework=framework, dtype=float_dtype)
         if input_name == "img_ids":
-            return self.prepare_image_ids(framework, int_dtype, float_dtype)
-
-        return super().generate(input_name, framework, int_dtype, float_dtype)
-
-    def prepare_image_ids(self, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
-        img_ids_height = self.height // 2
-        img_ids_width = self.width // 2
-        if framework == "pt":
-            import torch
-
-            latent_image_ids = torch.zeros(img_ids_height, img_ids_width, 3)
-            latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(img_ids_height)[:, None]
-            latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(img_ids_width)[None, :]
-
-            latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
-
-            latent_image_ids = latent_image_ids[None, :].repeat(self.batch_size, 1, 1, 1)
-            latent_image_ids = latent_image_ids.reshape(
-                self.batch_size, latent_image_id_height * latent_image_id_width, latent_image_id_channels
+            img_ids_height = self.height // 2
+            img_ids_width = self.width // 2
+            return self.random_int_tensor(
+                [self.batch_size, img_ids_height * img_ids_width, 3],
+                min_value=0,
+                max_value=min(img_ids_height, img_ids_width),
+                framework=framework,
+                dtype=float_dtype,
             )
-            latent_image_ids.to(DTYPE_MAPPER.pt(float_dtype))
-            return latent_image_ids
-        if framework == "np":
-            import numpy as np
-            latent_image_ids = np.zeros(img_ids_height, img_ids_width, 3)
-            latent_image_ids[..., 1] = latent_image_ids[..., 1] + np.arange(img_ids_height)[:, None]
-            latent_image_ids[..., 2] = latent_image_ids[..., 2] + np.arange(img_ids_width)[None, :]
-
-            latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
-
-            latent_image_ids = np.tile(latent_image_ids[None, :], (self.batch_size, 1, 1, 1))
-            latent_image_ids = latent_image_ids.reshape(
-                self.batch_size, latent_image_id_height * latent_image_id_width, latent_image_id_channels
-            )
-            latent_image_ids.astype(DTYPE_MAPPER.np[float_dtype])
-            return latent_image_ids
+        return super().generate(input_name, framework, int_dtype, float_dtype)


 class DummyFluxTextInputGenerator(DummySeq2SeqDecoderTextInputGenerator):
@@ -1728,7 +1702,11 @@ class DummyFluxTextInputGenerator(DummySeq2SeqDecoderTextInputGenerator):

     def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
         if input_name == "txt_ids":
-            return self.constant_tensor([self.batch_size, self.sequence_length, 3], 0, DTYPE_MAPPER.pt(float_dtype))
+            import torch
+
+            shape = [self.batch_size, self.sequence_length, 3]
+            dtype = DTYPE_MAPPER.pt(float_dtype)
+            return torch.full(shape, 0, dtype=dtype)
         return super().generate(input_name, framework, int_dtype, float_dtype)

diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py
index a8aa0643c0..ca7d177201 100644
--- a/optimum/intel/openvino/utils.py
+++ b/optimum/intel/openvino/utils.py
@@ -120,6 +120,7 @@
     "stable-diffusion": "OVStableDiffusionPipeline",
     "stable-diffusion-xl": "OVStableDiffusionXLPipeline",
     "stable-diffusion-3": "OVStableDiffusion3Pipeline",
+    "flux": "OVFluxPipeline",
     "pix2struct": "OVModelForPix2Struct",
     "latent-consistency": "OVLatentConsistencyModelPipeline",
     "open_clip_text": "OVModelOpenCLIPText",
diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py
index e0a5fbb0e5..6a42c4a09f 100644
--- a/tests/openvino/test_export.py
+++ b/tests/openvino/test_export.py
@@ -27,6 +27,7 @@
 from optimum.exporters.openvino import export_from_model, main_export
 from optimum.exporters.tasks import TasksManager
 from optimum.intel import (
+    OVFluxPipeline,
     OVLatentConsistencyModelPipeline,
     OVModelForAudioClassification,
     OVModelForCausalLM,
@@ -47,7 +48,7 @@
 )
 from optimum.intel.openvino.modeling_base import OVBaseModel
 from optimum.intel.openvino.utils import TemporaryDirectory
-from optimum.intel.utils.import_utils import _transformers_version
+from optimum.intel.utils.import_utils import _transformers_version, is_transformers_version
 from optimum.utils.save_utils import maybe_load_preprocessors

@@ -69,9 +70,11 @@ class ExportModelTest(unittest.TestCase):
         "stable-diffusion-xl": OVStableDiffusionXLPipeline,
         "stable-diffusion-xl-refiner": OVStableDiffusionXLImg2ImgPipeline,
         "latent-consistency": OVLatentConsistencyModelPipeline,
-        "stable-diffusion-3": OVStableDiffusion3Pipeline,
     }

+    if is_transformers_version(">=", "4.45"):
+        SUPPORTED_ARCHITECTURES.update({"stable-diffusion-3": OVStableDiffusion3Pipeline, "flux": OVFluxPipeline})
+
     GENERATIVE_MODELS = ("pix2struct", "t5", "bart", "gpt2", "whisper")

     def _openvino_export(
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index 4148d51e9c..fd5a25b4f3 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -25,6 +25,7 @@
 from optimum.exporters.openvino.__main__ import main_export
 from optimum.intel import (  # noqa
+    OVFluxPipeline,
     OVLatentConsistencyModelPipeline,
     OVModelForAudioClassification,
     OVModelForCausalLM,
@@ -76,7 +77,7 @@ class OVCLIExportTestCase(unittest.TestCase):
     ]

     if is_transformers_version(">=", "4.45"):
-        SUPPORTED_ARCHITECTURES.append(("text-to-image", "stable-diffusion-3"))
+        SUPPORTED_ARCHITECTURES.extend([("text-to-image", "stable-diffusion-3"), ("text-to-image", "flux")])

     EXPECTED_NUMBER_OF_TOKENIZER_MODELS = {
         "gpt2": 2 if is_tokenizers_version("<", "0.20") else 0,
         "t5": 0,  # no .model file in the repository
@@ -89,7 +90,8 @@ class OVCLIExportTestCase(unittest.TestCase):
         "blenderbot": 2 if is_tokenizers_version("<", "0.20") else 0,
         "stable-diffusion": 2 if is_tokenizers_version("<", "0.20") else 0,
         "stable-diffusion-xl": 4 if is_tokenizers_version("<", "0.20") else 0,
-        "stable-diffusion-3": 6 if is_tokenizers_version("<", "0.20") else 0,
+        "stable-diffusion-3": 6 if is_tokenizers_version("<", "0.20") else 2,
+        "flux": 4 if is_tokenizers_version("<", "0.20") else 0,
     }

     SUPPORTED_SD_HYBRID_ARCHITECTURES = [
@@ -217,7 +219,7 @@ def test_exporters_cli_int8(self, task: str, model_type: str):
             models = [model.encoder, model.decoder]
             if task.endswith("with-past"):
                 models.append(model.decoder_with_past)
-        elif model_type.startswith("stable-diffusion"):
+        elif model_type.startswith("stable-diffusion") or model_type.startswith("flux"):
             models = [model.unet or model.transformer, model.vae_encoder, model.vae_decoder]
             models.append(model.text_encoder if model_type == "stable-diffusion" else model.text_encoder_2)
         else:
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 43f87af60c..f2a4dc723f 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -57,6 +57,7 @@
     OVStableDiffusionPipeline,
     OVStableDiffusionXLPipeline,
     OVStableDiffusion3Pipeline,
+    OVFluxPipeline,
     OVQuantizer,
     OVTrainer,
     OVQuantizationConfig,
@@ -308,8 +309,10 @@ class OVWeightCompressionTest(unittest.TestCase):
     ]

     if is_transformers_version(">=", "4.45.0"):
-        SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION.append(
-            (OVStableDiffusion3Pipeline, "stable-diffusion-3", 9, 65)
+        SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION.extend(
+            [
+                (OVStableDiffusion3Pipeline, "stable-diffusion-3", 9, 65),
+            ]
         )

     IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3")
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index 30aa8703e1..e5a9f73a64 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -172,7 +172,8 @@
     "stable-diffusion-xl": (366, 34, 42, 66),
     "stable-diffusion-xl-refiner": (366, 34, 42, 66),
     "open-clip": (20, 28),
-    "stable-diffusion-3": (366, 34, 42, 66),
+    "stable-diffusion-3": (66, 42, 58, 30),
+    "flux": (56, 24, 28, 64),
 }

From b3a8726b56b8c6613f0f005de73847ad92c831e3 Mon Sep 17 00:00:00 2001
From: eaidova
Date: Tue, 22 Oct 2024 20:20:27 +0400
Subject: [PATCH 21/24] compatibility with diffusers 0.31.0

---
 optimum/exporters/openvino/model_configs.py  | 22 +++++++++++++++-----
 optimum/exporters/openvino/model_patcher.py  | 10 +++++----
 optimum/intel/openvino/modeling_diffusion.py |  8 +++++--
 3 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 85360e9a57..43e3885a67 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -55,7 +55,7 @@
 )
 from optimum.utils.normalized_config import NormalizedConfig, NormalizedTextConfig, NormalizedVisionConfig

-from ...intel.utils.import_utils import _transformers_version, is_transformers_version
+from ...intel.utils.import_utils import _transformers_version, is_diffusers_version, is_transformers_version
 from .model_patcher import (
     AquilaModelPatcher,
     ArcticModelPatcher,
@@ -1681,7 +1681,9 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
             img_ids_height = self.height // 2
             img_ids_width = self.width // 2
             return self.random_int_tensor(
-                [self.batch_size, img_ids_height * img_ids_width, 3],
+                [self.batch_size, img_ids_height * img_ids_width, 3]
+                if is_diffusers_version("<", "0.31.0")
+                else [img_ids_height * img_ids_width, 3],
                 min_value=0,
                 max_value=min(img_ids_height, img_ids_width),
                 framework=framework,
@@ -1704,7 +1706,11 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
         if input_name == "txt_ids":
             import torch

-            shape = [self.batch_size, self.sequence_length, 3]
+            shape = (
+                [self.batch_size, self.sequence_length, 3]
+                if is_diffusers_version("<", "0.31.0")
+                else [self.sequence_length, 3]
+            )
             dtype = DTYPE_MAPPER.pt(float_dtype)
             return torch.full(shape, 0, dtype=dtype)
         return super().generate(input_name, framework, int_dtype, float_dtype)
@@ -1724,8 +1730,14 @@ def inputs(self):
         common_inputs = super().inputs
         common_inputs.pop("sample", None)
         common_inputs["hidden_states"] = {0: "batch_size", 1: "packed_height_width"}
-        common_inputs["txt_ids"] = {0: "batch_size", 1: "sequence_length"}
-        common_inputs["img_ids"] = {0: "batch_size", 1: "packed_height_width"}
+        common_inputs["txt_ids"] = (
+            {0: "batch_size", 1: "sequence_length"} if is_diffusers_version("<", "0.31.0") else {0: "sequence_length"}
+        )
+        common_inputs["img_ids"] = (
+            {0: "batch_size", 1: "packed_height_width"}
+            if is_diffusers_version("<", "0.31.0")
+            else {0: "packed_height_width"}
+        )
         if getattr(self._normalized_config, "guidance_embeds", False):
             common_inputs["guidance"] = {0: "batch_size"}
         return common_inputs
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index ed8cb3b488..3bc9452ff9 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -29,6 +29,7 @@
     _openvino_version,
     _torch_version,
     _transformers_version,
+    is_diffusers_version,
     is_openvino_version,
     is_torch_version,
     is_transformers_version,
@@ -2734,10 +2735,11 @@ def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
 class FluxTransfromerModelPatcher(ModelPatcher):
     def __enter__(self):
         super().__enter__()
-        self._model.pos_embed._orig_forward = self._model.pos_embed.forward
-        self._model.pos_embed.forward = types.MethodType(_embednb_forward, self._model.pos_embed)
+        if is_diffusers_version("<", "0.31.0"):
+            self._model.pos_embed._orig_forward = self._model.pos_embed.forward
+            self._model.pos_embed.forward = types.MethodType(_embednb_forward, self._model.pos_embed)

     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
-
-        self._model.pos_embed.forward = self._model.pos_embed._orig_forward
+        if hasattr(self._model.pos_embed, "_orig_forward"):
+            self._model.pos_embed.forward = self._model.pos_embed._orig_forward
diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py
index 7cf064af13..2096784efa 100644
--- a/optimum/intel/openvino/modeling_diffusion.py
+++ b/optimum/intel/openvino/modeling_diffusion.py
@@ -682,9 +682,13 @@ def _reshape_transformer(
         elif inputs.get_any_name() == "pooled_projections":
             shapes[inputs] = [batch_size, self.transformer.config["pooled_projection_dim"]]
         elif inputs.get_any_name() == "img_ids":
-            shapes[inputs] = [batch_size, packed_height_width, 3]
+            shapes[inputs] = (
+                [batch_size, packed_height_width, 3]
+                if is_diffusers_version("<", "0.31.0")
+                else [packed_height_width, 3]
+            )
         elif inputs.get_any_name() == "txt_ids":
-            shapes[inputs] = [batch_size, -1, 3]
+            shapes[inputs] = [batch_size, -1, 3] if is_diffusers_version("<", "0.31.0") else [-1, 3]
         else:
             shapes[inputs][0] = batch_size
             shapes[inputs][1] = -1  # text_encoder_3 may have vary input length

From 06673207c069aaa34c79f54c0efdfb2f9b3cbba7 Mon Sep 17 00:00:00 2001
From: eaidova
Date: Tue, 22 Oct 2024 23:11:04 +0400
Subject: [PATCH 22/24] apply review comments

---
 optimum/exporters/openvino/model_configs.py  | 14 +++-----------
 optimum/intel/__init__.py                    |  5 +++++
 optimum/intel/openvino/modeling_diffusion.py |  6 ++----
 3 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 43e3885a67..ace5c150df 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -13,7 +13,6 @@
 # limitations under the License.

 import enum
-import random
 from copy import deepcopy
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union

@@ -1590,11 +1589,7 @@ def __init__(
         **kwargs,
     ):
         self.task = task
-        if random_batch_size_range:
-            low, high = random_batch_size_range
-            self.batch_size = random.randint(low, high)
-        else:
-            self.batch_size = batch_size
+        self.batch_size = batch_size
         self.pooled_projection_dim = normalized_config.config.pooled_projection_dim

     def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
@@ -1642,11 +1637,8 @@ def rename_ambiguous_inputs(self, inputs):


 @register_in_tasks_manager("t5-encoder-model", *["feature-extraction"], library_name="diffusers")
-class T5EncoderOpenVINOConfig(CLIPTextOnnxConfig):
-    def patch_model_for_export(
-        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
-    ) -> ModelPatcher:
-        return ModelPatcher(self, model, model_kwargs=model_kwargs)
+class T5EncoderOpenVINOConfig(CLIPTextOpenVINOConfig):
+    pass


 class DummyFluxTransformerInputGenerator(DummyVisionInputGenerator):
diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py
index 31526f1aa8..67a01011a2 100644
--- a/optimum/intel/__init__.py
+++ b/optimum/intel/__init__.py
@@ -271,6 +271,7 @@
 except OptionalDependencyNotAvailable:
     from .utils.dummy_openvino_and_diffusers_objects import (
         OVDiffusionPipeline,
+        OVFluxPipeline,
         OVLatentConsistencyModelPipeline,
         OVPipelineForImage2Image,
         OVPipelineForInpainting,
@@ -287,11 +288,15 @@
 else:
     from .openvino import (
         OVDiffusionPipeline,
+        OVFluxPipeline,
         OVLatentConsistencyModelImg2ImgPipeline,
         OVLatentConsistencyModelPipeline,
         OVPipelineForImage2Image,
         OVPipelineForInpainting,
         OVPipelineForText2Image,
+        OVStableDiffusion3Img2ImgPipeline,
+        OVStableDiffusion3InpaintPipeline,
+        OVStableDiffusion3Pipeline,
         OVStableDiffusionImg2ImgPipeline,
         OVStableDiffusionInpaintPipeline,
         OVStableDiffusionPipeline,
diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py
index 2096784efa..1bf452efab 100644
--- a/optimum/intel/openvino/modeling_diffusion.py
+++ b/optimum/intel/openvino/modeling_diffusion.py
@@ -565,8 +565,7 @@ def to(self, *args, device: Optional[str] = None, dtype: Optional[torch.dtype] =

     @property
     def height(self) -> int:
-        # flux transformer does not preserve info about height/width, they are knwon in vae_decoder
-        model = self.unet.model if self.unet is not None else self.vae.decoder.model
+        model = self.vae.decoder.model
         height = model.inputs[0].get_partial_shape()[2]
         if height.is_dynamic:
             return -1
@@ -574,8 +573,7 @@ def height(self) -> int:

     @property
     def width(self) -> int:
-        # flux transformer does not preserve info about height/width, they are known in vae_decoder
-        model = self.unet.model if self.unet is not None else self.vae.decoder.model
+        model = self.vae.decoder.model
         width = model.inputs[0].get_partial_shape()[3]
         if width.is_dynamic:
             return -1

From 163a466f002ebadb3c16bd42fcf045934d5b9afe Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova
Date: Tue, 22 Oct 2024 23:18:46 +0400
Subject: [PATCH 23/24] Update tests/openvino/test_diffusion.py

Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
---
 tests/openvino/test_diffusion.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py
index fb5d75c570..98d72cee55 100644
--- a/tests/openvino/test_diffusion.py
+++ b/tests/openvino/test_diffusion.py
@@ -136,7 +136,6 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str):
         diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], text_encoder_3=None)

         for output_type in ["latent", "np", "pt"]:
-            print(output_type)
             inputs["output_type"] = output_type

             ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images

From e52d9f20c1fc23f97afc421799a6e54c66a69ef5 Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova
Date: Wed, 23 Oct 2024 08:10:26 +0400
Subject: [PATCH 24/24] Update tests/openvino/test_diffusion.py

Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
---
 tests/openvino/test_diffusion.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py
index 98d72cee55..1467e5ed1f 100644
--- a/tests/openvino/test_diffusion.py
+++ b/tests/openvino/test_diffusion.py
@@ -719,7 +719,6 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str):
         inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)

         for output_type in ["latent", "np", "pt"]:
-            print(output_type)
             inputs["output_type"] = output_type

             ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images
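
Illustrative sketch (editor's addition, not part of the patch series above): patches 20 and 21 change how the dummy img_ids/txt_ids inputs for the Flux transformer export are generated. With diffusers < 0.31.0 the ids are batched 3D tensors of shape [batch_size, packed_height_width, 3]; from 0.31.0 the batch dimension is dropped and they become [packed_height_width, 3]. The standalone snippet below mimics that version gate; the helper name dummy_flux_ids and its signature are hypothetical, not optimum-intel API.

    import torch

    def dummy_flux_ids(batch_size: int, height: int, width: int, diffusers_pre_031: bool) -> torch.Tensor:
        # Flux packs 2x2 latent patches, so the ids index a (height/2) x (width/2) grid.
        packed_height_width = (height // 2) * (width // 2)
        shape = [batch_size, packed_height_width, 3] if diffusers_pre_031 else [packed_height_width, 3]
        # Random ids are enough for shape tracing during export.
        return torch.randint(0, min(height // 2, width // 2), shape, dtype=torch.int64)

    print(dummy_flux_ids(1, 64, 64, diffusers_pre_031=True).shape)   # torch.Size([1, 1024, 3])
    print(dummy_flux_ids(1, 64, 64, diffusers_pre_031=False).shape)  # torch.Size([1024, 3])

The same gate explains why _reshape_transformer in patch 21 switches the img_ids static shape between [batch_size, packed_height_width, 3] and [packed_height_width, 3] depending on the installed diffusers version.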