Skip to content

Commit 714b8af

Browse files
committed
add pipeline
1 parent 4aadc6a commit 714b8af

11 files changed

+162
-50
lines changed

optimum/commands/export/openvino.py

+7
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,12 @@ def parse_args_openvino(parser: "ArgumentParser"):
106106
"This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it."
107107
),
108108
)
109+
optional_group.add_argument(
110+
"--variant",
111+
type=str,
112+
default=None,
113+
help=("Select a variant of the model to export."),
114+
)
109115
optional_group.add_argument(
110116
"--ratio",
111117
type=float,
@@ -467,5 +473,6 @@ def run(self):
467473
stateful=not self.args.disable_stateful,
468474
convert_tokenizer=not self.args.disable_convert_tokenizer,
469475
library_name=library_name,
476+
model_variant=self.args.variant,
470477
# **input_shapes,
471478
)

optimum/exporters/openvino/__main__.py

+6
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ def main_export(
121121
convert_tokenizer: bool = False,
122122
library_name: Optional[str] = None,
123123
model_loading_kwargs: Optional[Dict[str, Any]] = None,
124+
model_variant: Optional[str] = None,
124125
**kwargs_shapes,
125126
):
126127
"""
@@ -236,6 +237,8 @@ def main_export(
236237
custom_architecture = False
237238
patch_16bit = False
238239
loading_kwargs = model_loading_kwargs or {}
240+
if model_variant is not None:
241+
loading_kwargs["variant"] = model_variant
239242
if library_name == "transformers":
240243
config = AutoConfig.from_pretrained(
241244
model_name_or_path,
@@ -342,6 +345,7 @@ class StoreAttr(object):
342345

343346
GPTQQuantizer.post_init_model = post_init_model
344347
elif library_name == "diffusers" and is_openvino_version(">=", "2024.6"):
348+
_loading_kwargs = {} if model_variant is None else {"variant": model_variant}
345349
dtype = deduce_diffusers_dtype(
346350
model_name_or_path,
347351
revision=revision,
@@ -350,6 +354,7 @@ class StoreAttr(object):
350354
local_files_only=local_files_only,
351355
force_download=force_download,
352356
trust_remote_code=trust_remote_code,
357+
**_loading_kwargs,
353358
)
354359
if dtype in [torch.float16, torch.bfloat16]:
355360
loading_kwargs["torch_dtype"] = dtype
@@ -359,6 +364,7 @@ class StoreAttr(object):
359364
if library_name == "open_clip":
360365
model = _OpenClipForZeroShotImageClassification.from_pretrained(model_name_or_path, cache_dir=cache_dir)
361366
else:
367+
logger.warn(loading_kwargs)
362368
model = TasksManager.get_model_from_task(
363369
task,
364370
model_name_or_path,

optimum/exporters/openvino/convert.py

+15-14
Original file line numberDiff line numberDiff line change
@@ -1002,6 +1002,7 @@ def get_diffusion_models_for_export_ext(
10021002
sd3_pipes.append(StableDiffusion3InpaintPipeline)
10031003

10041004
is_sd3 = isinstance(pipeline, tuple(sd3_pipes))
1005+
logger.warn(f"IS SD3 {pipeline} {is_sd3}")
10051006
else:
10061007
is_sd3 = False
10071008

@@ -1023,18 +1024,19 @@ def get_diffusion_models_for_export_ext(
10231024
is_flux = isinstance(pipeline, tuple(flux_pipes))
10241025
else:
10251026
is_flux = False
1026-
1027-
try:
1027+
1028+
if is_diffusers_version(">=", "0.32.0"):
10281029
from diffusers import SanaPipeline
1030+
10291031
is_sana = isinstance(pipeline, SanaPipeline)
1030-
except ImportError:
1032+
else:
10311033
is_sana = False
10321034

10331035
if not any([is_sana, is_flux, is_sd3]):
10341036
return None, get_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter)
10351037
if is_sd3:
10361038
models_for_export = get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype)
1037-
if is_sana:
1039+
elif is_sana:
10381040
models_for_export = get_sana_models_for_export(pipeline, exporter, int_dtype, float_dtype)
10391041
else:
10401042
models_for_export = get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype)
@@ -1043,17 +1045,15 @@ def get_diffusion_models_for_export_ext(
10431045

10441046

10451047
def get_sana_models_for_export(pipeline, exporter, int_dtype, float_dtype):
1046-
DEFAULT_DUMMY_SHAPES["heigh"] = DEFAULT_DUMMY_SHAPES["height"] // 4
1047-
DEFAULT_DUMMY_SHAPES["width"] = DEFAULT_DUMMY_SHAPES["width"] // 4
10481048
models_for_export = {}
10491049
text_encoder = pipeline.text_encoder
10501050
text_encoder_config_constructor = TasksManager.get_exporter_config_constructor(
1051-
model=text_encoder,
1052-
exporter=exporter,
1053-
library_name="diffusers",
1054-
task="feature-extraction",
1055-
model_type="gemma2-text-encoder",
1056-
)
1051+
model=text_encoder,
1052+
exporter=exporter,
1053+
library_name="diffusers",
1054+
task="feature-extraction",
1055+
model_type="gemma2-text-encoder",
1056+
)
10571057
text_encoder_export_config = text_encoder_config_constructor(
10581058
pipeline.text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype
10591059
)
@@ -1075,13 +1075,13 @@ def get_sana_models_for_export(pipeline, exporter, int_dtype, float_dtype):
10751075
models_for_export["transformer"] = (transformer, transformer_export_config)
10761076
# VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565
10771077
vae_encoder = copy.deepcopy(pipeline.vae)
1078-
vae_encoder.forward = lambda sample: {"latent_parameters": vae_encoder.encode(x=sample)["latent_dist"].parameters}
1078+
vae_encoder.forward = lambda sample: {"latent": vae_encoder.encode(x=sample)["latent"]}
10791079
vae_config_constructor = TasksManager.get_exporter_config_constructor(
10801080
model=vae_encoder,
10811081
exporter=exporter,
10821082
library_name="diffusers",
10831083
task="semantic-segmentation",
1084-
model_type="vae-encoder",
1084+
model_type="dcae-encoder",
10851085
)
10861086
vae_encoder_export_config = vae_config_constructor(
10871087
vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype
@@ -1137,6 +1137,7 @@ def get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype):
11371137
task="semantic-segmentation",
11381138
model_type="sd3-transformer",
11391139
)
1140+
logger.warn(f"TRANSFORMER COFG {export_config_constructor}")
11401141
transformer_export_config = export_config_constructor(
11411142
pipeline.transformer.config, int_dtype=int_dtype, float_dtype=float_dtype
11421143
)

optimum/exporters/openvino/model_configs.py

+41-15
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
MPTOnnxConfig,
4040
PhiOnnxConfig,
4141
UNetOnnxConfig,
42+
VaeEncoderOnnxConfig,
4243
VisionOnnxConfig,
4344
)
4445
from optimum.exporters.onnx.model_patcher import ModelPatcher
@@ -54,7 +55,6 @@
5455
DummyVisionInputGenerator,
5556
FalconDummyPastKeyValuesGenerator,
5657
MistralDummyPastKeyValuesGenerator,
57-
DummySeq2SeqDecoderTextInputGenerator
5858
)
5959
from optimum.utils.normalized_config import NormalizedConfig, NormalizedTextConfig, NormalizedVisionConfig
6060

@@ -1889,52 +1889,78 @@ def rename_ambiguous_inputs(self, inputs):
18891889
class T5EncoderOpenVINOConfig(CLIPTextOpenVINOConfig):
18901890
pass
18911891

1892+
18921893
@register_in_tasks_manager("gemma2-text-encoder", *["feature-extraction"], library_name="diffusers")
18931894
class Gemma2TextEncoderOpenVINOConfig(CLIPTextOpenVINOConfig):
18941895
@property
18951896
def inputs(self) -> Dict[str, Dict[int, str]]:
18961897
return {
18971898
"input_ids": {0: "batch_size", 1: "sequence_length"},
1898-
"attention_mask": {0: "batch_size", 1: "sequence_length"}
1899+
"attention_mask": {0: "batch_size", 1: "sequence_length"},
18991900
}
19001901

19011902

1902-
class DummySeq2SeqDecoderTextWithEncMaskInputGenerator(DummySeq2SeqDecoderTextInputGenerator):
1903+
class DummySanaSeq2SeqDecoderTextWithEncMaskInputGenerator(DummySeq2SeqDecoderTextInputGenerator):
19031904
SUPPORTED_INPUT_NAMES = (
19041905
"decoder_input_ids",
19051906
"decoder_attention_mask",
19061907
"encoder_outputs",
19071908
"encoder_hidden_states",
1908-
"encoder_attention_mask"
1909+
"encoder_attention_mask",
19091910
)
19101911

19111912

1912-
class DummySanaTransformerVisionInputGenerator(DummyVisionInputGenerator):
1913-
def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
1914-
if input_name not in ["sample", "latent_sample"]:
1915-
return super().generate(input_name, framework, int_dtype, float_dtype)
1916-
return self.random_float_tensor(
1917-
shape=[self.batch_size, self.num_channels, self.height, self.width],
1918-
framework=framework,
1919-
dtype=float_dtype,
1920-
)
1913+
class DummySanaTransformerVisionInputGenerator(DummyUnetVisionInputGenerator):
1914+
def __init__(
1915+
self,
1916+
task: str,
1917+
normalized_config: NormalizedVisionConfig,
1918+
batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
1919+
num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
1920+
width: int = DEFAULT_DUMMY_SHAPES["width"] // 8,
1921+
height: int = DEFAULT_DUMMY_SHAPES["height"] // 8,
1922+
# Reduce img shape by 8 for Sana to reduce memory usage on conversion
1923+
**kwargs,
1924+
):
1925+
super().__init__(task, normalized_config, batch_size, num_channels, width=width, height=height, **kwargs)
1926+
19211927

19221928
@register_in_tasks_manager("sana-transformer", *["semantic-segmentation"], library_name="diffusers")
19231929
class SanaTransformerOpenVINOConfig(UNetOpenVINOConfig):
19241930
NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args(
19251931
image_size="sample_size",
19261932
num_channels="in_channels",
1927-
hidden_size="cross_attention_dim",
1933+
hidden_size="caption_channels",
19281934
vocab_size="attention_head_dim",
19291935
allow_new=True,
19301936
)
1931-
DUMMY_INPUT_GENERATOR_CLASSES = (DummySanaTransformerVisionInputGenerator, DummySeq2SeqDecoderTextWithEncMaskInputGenerator) + UNetOpenVINOConfig.DUMMY_INPUT_GENERATOR_CLASSES[1:-1]
1937+
DUMMY_INPUT_GENERATOR_CLASSES = (
1938+
DummySanaTransformerVisionInputGenerator,
1939+
DummySanaSeq2SeqDecoderTextWithEncMaskInputGenerator,
1940+
) + UNetOpenVINOConfig.DUMMY_INPUT_GENERATOR_CLASSES[1:-1]
1941+
19321942
@property
19331943
def inputs(self):
19341944
common_inputs = super().inputs
19351945
common_inputs["encoder_attention_mask"] = {0: "batch_size", 1: "sequence_length"}
19361946
return common_inputs
19371947

1948+
def rename_ambiguous_inputs(self, inputs):
1949+
# The input name in the model signature is `hidden_states`, hence the export input name `sample` is renamed.
1950+
hidden_states = inputs.pop("sample", None)
1951+
if hidden_states is not None:
1952+
inputs["hidden_states"] = hidden_states
1953+
return inputs
1954+
1955+
1956+
@register_in_tasks_manager("dcae-encoder", *["semantic-segmentation"], library_name="diffusers")
1957+
class DcaeEncoderOpenVINOConfig(VaeEncoderOnnxConfig):
1958+
@property
1959+
def outputs(self) -> Dict[str, Dict[int, str]]:
1960+
return {
1961+
"latent": {0: "batch_size", 2: "height_latent", 3: "width_latent"},
1962+
}
1963+
19381964

19391965
class DummyFluxTransformerInputGenerator(DummyVisionInputGenerator):
19401966
SUPPORTED_INPUT_NAMES = (

optimum/intel/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@
127127
"OVFluxImg2ImgPipeline",
128128
"OVFluxInpaintPipeline",
129129
"OVFluxFillPipeline",
130+
"OVSanaPipeline",
130131
"OVPipelineForImage2Image",
131132
"OVPipelineForText2Image",
132133
"OVPipelineForInpainting",
@@ -150,6 +151,7 @@
150151
"OVFluxImg2ImgPipeline",
151152
"OVFluxInpaintPipeline",
152153
"OVFluxFillPipeline",
154+
"OVSanaPipeline",
153155
"OVPipelineForImage2Image",
154156
"OVPipelineForText2Image",
155157
"OVPipelineForInpainting",

optimum/intel/openvino/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@
9191
OVPipelineForImage2Image,
9292
OVPipelineForInpainting,
9393
OVPipelineForText2Image,
94+
OVSanaPipeline,
9495
OVStableDiffusion3Img2ImgPipeline,
9596
OVStableDiffusion3InpaintPipeline,
9697
OVStableDiffusion3Pipeline,

optimum/intel/openvino/modeling_diffusion.py

+31-5
Original file line numberDiff line numberDiff line change
@@ -102,9 +102,10 @@
102102
FluxInpaintPipeline = object
103103

104104
if is_diffusers_version(">=", "0.32.0"):
105-
from diffusers import FluxFillPipeline
105+
from diffusers import FluxFillPipeline, SanaPipeline
106106
else:
107107
FluxFillPipeline = object
108+
SanaPipeline = object
108109

109110

110111
DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER = "transformer"
@@ -809,9 +810,14 @@ def reshape(
809810
if self.tokenizer is None and self.tokenizer_2 is None:
810811
tokenizer_max_len = -1
811812
else:
812-
tokenizer_max_len = (
813-
self.tokenizer.model_max_length if self.tokenizer is not None else self.tokenizer_2.model_max_length
814-
)
813+
if self.tokenizer is not None and "Gemma" in self.tokenizer.__class__.__name__:
814+
tokenizer_max_len = -1
815+
else:
816+
tokenizer_max_len = (
817+
self.tokenizer.model_max_length
818+
if self.tokenizer is not None
819+
else self.tokenizer_2.model_max_length
820+
)
815821

816822
if self.unet is not None:
817823
self.unet.model = self._reshape_unet(
@@ -1033,6 +1039,7 @@ def __init__(self, model: openvino.runtime.Model, parent_pipeline: OVDiffusionPi
10331039
self.hidden_states_output_names = [
10341040
name for out in self.model.outputs for name in out.names if name.startswith("hidden_states")
10351041
]
1042+
self.input_names = [inp.get_any_name() for inp in self.model.inputs]
10361043

10371044
def forward(
10381045
self,
@@ -1044,6 +1051,11 @@ def forward(
10441051
self._compile()
10451052
model_inputs = {"input_ids": input_ids}
10461053

1054+
if "attention_mask" in self.input_names:
1055+
model_inputs["attention_mask"] = (
1056+
attention_mask if attention_mask is not None else torch.ones(input_ids.shape, dtype=torch.long)
1057+
)
1058+
10471059
ov_outputs = self.request(model_inputs, share_inputs=True)
10481060
main_out = ov_outputs[0]
10491061
model_outputs = {}
@@ -1131,6 +1143,8 @@ def forward(
11311143
guidance: torch.Tensor = None,
11321144
block_controlnet_hidden_states: List = None,
11331145
joint_attention_kwargs: Optional[Dict[str, Any]] = None,
1146+
encoder_attention_mask: torch.LongTensor = None,
1147+
attention_kwargs: Optional[Dict[str, Any]] = None,
11341148
return_dict: bool = True,
11351149
):
11361150
self._compile()
@@ -1139,16 +1153,20 @@ def forward(
11391153
"hidden_states": hidden_states,
11401154
"timestep": timestep,
11411155
"encoder_hidden_states": encoder_hidden_states,
1142-
"pooled_projections": pooled_projections,
11431156
}
11441157

1158+
if pooled_projections is not None:
1159+
model_inputs["pooled_projections"] = pooled_projections
11451160
if img_ids is not None:
11461161
model_inputs["img_ids"] = img_ids
11471162
if txt_ids is not None:
11481163
model_inputs["txt_ids"] = txt_ids
11491164
if guidance is not None:
11501165
model_inputs["guidance"] = guidance
11511166

1167+
if encoder_attention_mask is not None:
1168+
model_inputs["encoder_attention_mask"] = encoder_attention_mask
1169+
11521170
ov_outputs = self.request(model_inputs, share_inputs=True).to_dict()
11531171

11541172
model_outputs = {}
@@ -1480,6 +1498,12 @@ class OVFluxFillPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, Flu
14801498
auto_model_class = FluxFillPipeline
14811499

14821500

1501+
class OVSanaPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, SanaPipeline):
1502+
main_input_name = "prompt"
1503+
export_feature = "text-to-image"
1504+
auto_model_class = SanaPipeline
1505+
1506+
14831507
SUPPORTED_OV_PIPELINES = [
14841508
OVStableDiffusionPipeline,
14851509
OVStableDiffusionImg2ImgPipeline,
@@ -1551,6 +1575,8 @@ def _get_ov_class(pipeline_class_name: str, throw_error_if_not_exist: bool = Tru
15511575
if is_diffusers_version(">=", "0.32.0"):
15521576
OV_INPAINT_PIPELINES_MAPPING["flux-fill"] = OVFluxFillPipeline
15531577
SUPPORTED_OV_PIPELINES.append(OVFluxFillPipeline)
1578+
OV_TEXT2IMAGE_PIPELINES_MAPPING["sana"] = OVSanaPipeline
1579+
SUPPORTED_OV_PIPELINES.append(OVSanaPipeline)
15541580

15551581
SUPPORTED_OV_PIPELINES_MAPPINGS = [
15561582
OV_TEXT2IMAGE_PIPELINES_MAPPING,

optimum/intel/utils/dummy_openvino_and_diffusers_objects.py

+11
Original file line numberDiff line numberDiff line change
@@ -222,3 +222,14 @@ def __init__(self, *args, **kwargs):
222222
@classmethod
223223
def from_pretrained(cls, *args, **kwargs):
224224
requires_backends(cls, ["openvino", "diffusers"])
225+
226+
227+
class OVSanaPipeline(metaclass=DummyObject):
228+
_backends = ["openvino", "diffusers"]
229+
230+
def __init__(self, *args, **kwargs):
231+
requires_backends(self, ["openvino", "diffusers"])
232+
233+
@classmethod
234+
def from_pretrained(cls, *args, **kwargs):
235+
requires_backends(cls, ["openvino", "diffusers"])

0 commit comments

Comments
 (0)