     MPTOnnxConfig,
     PhiOnnxConfig,
     UNetOnnxConfig,
+    VaeEncoderOnnxConfig,
     VisionOnnxConfig,
 )
 from optimum.exporters.onnx.model_patcher import ModelPatcher

     DummyVisionInputGenerator,
     FalconDummyPastKeyValuesGenerator,
     MistralDummyPastKeyValuesGenerator,
-    DummySeq2SeqDecoderTextInputGenerator
 )
 from optimum.utils.normalized_config import NormalizedConfig, NormalizedTextConfig, NormalizedVisionConfig

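For context on the generator classes touched below: in optimum's export pipeline, a config's DUMMY_INPUT_GENERATOR_CLASSES supplies the tracing inputs, and each export input name is served by the first generator that lists it in SUPPORTED_INPUT_NAMES. A minimal sketch of that dispatch, with simplified names and signatures assumed for illustration:

```python
# Minimal sketch (not optimum's actual code) of how dummy input generators
# are dispatched: for each export input name, the first generator that lists
# the name in SUPPORTED_INPUT_NAMES produces the dummy tensor.
def generate_dummy_inputs(export_config, generators, framework="pt"):
    dummy_inputs = {}
    for name in export_config.inputs:
        for generator in generators:
            if name in generator.SUPPORTED_INPUT_NAMES:
                dummy_inputs[name] = generator.generate(name, framework=framework)
                break
    return dummy_inputs
```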
@@ -1889,52 +1889,78 @@ def rename_ambiguous_inputs(self, inputs):
 class T5EncoderOpenVINOConfig(CLIPTextOpenVINOConfig):
     pass

+
 @register_in_tasks_manager("gemma2-text-encoder", *["feature-extraction"], library_name="diffusers")
 class Gemma2TextEncoderOpenVINOConfig(CLIPTextOpenVINOConfig):
     @property
     def inputs(self) -> Dict[str, Dict[int, str]]:
         return {
             "input_ids": {0: "batch_size", 1: "sequence_length"},
-            "attention_mask": {0: "batch_size", 1: "sequence_length"}
+            "attention_mask": {0: "batch_size", 1: "sequence_length"},
         }


-class DummySeq2SeqDecoderTextWithEncMaskInputGenerator(DummySeq2SeqDecoderTextInputGenerator):
+class DummySanaSeq2SeqDecoderTextWithEncMaskInputGenerator(DummySeq2SeqDecoderTextInputGenerator):
     SUPPORTED_INPUT_NAMES = (
         "decoder_input_ids",
         "decoder_attention_mask",
         "encoder_outputs",
         "encoder_hidden_states",
-        "encoder_attention_mask"
+        "encoder_attention_mask",
     )


-class DummySanaTransformerVisionInputGenerator(DummyVisionInputGenerator):
-    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
-        if input_name not in ["sample", "latent_sample"]:
-            return super().generate(input_name, framework, int_dtype, float_dtype)
-        return self.random_float_tensor(
-            shape=[self.batch_size, self.num_channels, self.height, self.width],
-            framework=framework,
-            dtype=float_dtype,
-        )
+class DummySanaTransformerVisionInputGenerator(DummyUnetVisionInputGenerator):
+    def __init__(
+        self,
+        task: str,
+        normalized_config: NormalizedVisionConfig,
+        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
+        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
+        # Reduce the dummy image shape by a factor of 8 to lower memory usage during conversion
+        width: int = DEFAULT_DUMMY_SHAPES["width"] // 8,
+        height: int = DEFAULT_DUMMY_SHAPES["height"] // 8,
+        **kwargs,
+    ):
+        super().__init__(task, normalized_config, batch_size, num_channels, width=width, height=height, **kwargs)
+

 @register_in_tasks_manager("sana-transformer", *["semantic-segmentation"], library_name="diffusers")
 class SanaTransformerOpenVINOConfig(UNetOpenVINOConfig):
     NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args(
         image_size="sample_size",
         num_channels="in_channels",
-        hidden_size="cross_attention_dim",
+        hidden_size="caption_channels",
         vocab_size="attention_head_dim",
         allow_new=True,
     )
-    DUMMY_INPUT_GENERATOR_CLASSES = (DummySanaTransformerVisionInputGenerator, DummySeq2SeqDecoderTextWithEncMaskInputGenerator) + UNetOpenVINOConfig.DUMMY_INPUT_GENERATOR_CLASSES[1:-1]
+    DUMMY_INPUT_GENERATOR_CLASSES = (
+        DummySanaTransformerVisionInputGenerator,
+        DummySanaSeq2SeqDecoderTextWithEncMaskInputGenerator,
+    ) + UNetOpenVINOConfig.DUMMY_INPUT_GENERATOR_CLASSES[1:-1]
+
     @property
     def inputs(self):
         common_inputs = super().inputs
         common_inputs["encoder_attention_mask"] = {0: "batch_size", 1: "sequence_length"}
         return common_inputs

+    def rename_ambiguous_inputs(self, inputs):
+        # The model's forward signature expects `hidden_states` rather than `sample`, hence the export input name is updated.
+        hidden_states = inputs.pop("sample", None)
+        if hidden_states is not None:
+            inputs["hidden_states"] = hidden_states
+        return inputs
+
+
+@register_in_tasks_manager("dcae-encoder", *["semantic-segmentation"], library_name="diffusers")
+class DcaeEncoderOpenVINOConfig(VaeEncoderOnnxConfig):
+    @property
+    def outputs(self) -> Dict[str, Dict[int, str]]:
+        return {
+            "latent": {0: "batch_size", 2: "height_latent", 3: "width_latent"},
+        }
+

 class DummyFluxTransformerInputGenerator(DummyVisionInputGenerator):
     SUPPORTED_INPUT_NAMES = (
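A quick illustration of the `rename_ambiguous_inputs` hook added above, reproduced standalone so its effect is easy to verify (values are placeholders):

```python
def rename_ambiguous_inputs(inputs):
    # Same logic as the hook in SanaTransformerOpenVINOConfig: remap the
    # default "sample" export input to "hidden_states" to match the model's
    # forward signature.
    hidden_states = inputs.pop("sample", None)
    if hidden_states is not None:
        inputs["hidden_states"] = hidden_states
    return inputs

renamed = rename_ambiguous_inputs({"sample": 0, "timestep": 1})
assert renamed == {"timestep": 1, "hidden_states": 0}
```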
|