
Commit 6a31129

Merge branch 'main' into ns/q-config-kwargs

2 parents: 1462d11 + 727b6ce

19 files changed: +825 -177 lines

.github/workflows/test_ipex.yml (+2 -2)

@@ -18,8 +18,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        transformers-version: ["4.47.0", "4.47.1"]
-        torch-version: ["2.4.0", "2.5.*"]
+        transformers-version: ["4.47.*"]
+        torch-version: ["2.6.0"]
 
     runs-on: ubuntu-22.04

docs/source/openvino/models.mdx (+1)

@@ -106,6 +106,7 @@ Here is the list of the supported architectures :
 - Qwen2(Qwen1.5, Qwen2.5)
 - Qwen2MoE
 - Qwen2VL
+- Qwen2.5VL
 - ResNet
 - Roberta
 - Roformer

optimum/commands/export/openvino.py (+1 -1)

@@ -429,7 +429,7 @@ def run(self):
             maybe_convert_tokenizers(library_name, self.args.output, model, task=task)
         elif (
             quantize_with_dataset
-            and (task.startswith("text-generation") or task == "automatic-speech-recognition")
+            and (task.startswith("text-generation") or "automatic-speech-recognition" in task)
             or (task == "image-text-to-text" and quantization_config is not None)
         ):
             if task.startswith("text-generation"):
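A quick illustration of the relaxed task check above, as a minimal sketch that is not part of the commit; the suffixed task name is an assumed example of the kind of variant the substring test now accepts while the old equality check did not.

# Minimal sketch (not part of the commit): the substring check also matches suffixed task
# names; "automatic-speech-recognition-with-past" is an assumed example of such a variant.
task = "automatic-speech-recognition-with-past"
old_match = task == "automatic-speech-recognition"    # False with the old equality check
new_match = "automatic-speech-recognition" in task    # True with the new substring check
print(old_match, new_match)  # False True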

optimum/exporters/openvino/model_configs.py (+57 -5)

@@ -105,6 +105,7 @@
     PersimmonModelPatcher,
     Phi3ModelPatcher,
     Phi3VisionImageEmbeddingsPatcher,
+    Qwen2_5_VLVisionEmbMergerPatcher,
     Qwen2VLLanguageModelPatcher,
     Qwen2VLVisionEmbMergerPatcher,
     QwenModelPatcher,
@@ -131,10 +132,14 @@ def init_model_configs():
         "transformers",
         "Qwen2VLForConditionalGeneration",
     )
+    TasksManager._CUSTOM_CLASSES[("pt", "qwen2-5-vl", "image-text-to-text")] = (
+        "transformers",
+        "AutoModelForImageTextToText",
+    )
+
     TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS[
         "image-text-to-text"
     ] = TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["text-generation"]
-
     if is_diffusers_available() and "fill" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS:
         TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["fill"] = "FluxFillPipeline"
         TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["fill"] = {"flux": "FluxFillPipeline"}
@@ -2510,7 +2515,13 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
 
 
 class DummyQwen2VLVisionEmbedInputGenerator(DummyVisionInputGenerator):
-    SUPPORTED_INPUT_NAMES = ("hidden_states", "attention_mask", "rotary_pos_emb")
+    SUPPORTED_INPUT_NAMES = (
+        "hidden_states",
+        "attention_mask",
+        "window_attention_mask",
+        "window_index",
+        "rotary_pos_emb",
+    )
 
     def __init__(
         self,
@@ -2529,10 +2540,17 @@ def __init__(
         self.temporal_patch_size = normalized_config.config.temporal_patch_size
         self.patch_size = normalized_config.config.patch_size
         if normalized_config.use_embed_dim:
-            self.embed_dim = normalized_config.config.embed_dim
+            self.embed_dim = (
+                normalized_config.config.embed_dim
+                if hasattr(normalized_config.config, "embed_dim")
+                else normalized_config.hidden_size
+            )
         else:
             self.embed_dim = self.num_channels * self.temporal_patch_size * self.patch_size * self.patch_size
         self.num_heads = normalized_config.config.num_heads
+        self.spatial_merge_size = None
+        if hasattr(normalized_config.config, "spatial_merge_size"):
+            self.spatial_merge_size = normalized_config.config.spatial_merge_size
 
     def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
         grid_h, grid_w = self.height // self.patch_size, self.width // self.patch_size
@@ -2543,7 +2561,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
                 [grid_t * grid_h * grid_w, self.embed_dim], framework=framework, dtype=float_dtype
             )
 
-        if input_name == "attention_mask":
+        if input_name in ["attention_mask", "window_attention_mask"]:
             return self.random_mask_tensor(
                 [1, grid_t * grid_h * grid_w, grid_t * grid_h * grid_w], framework=framework, dtype=float_dtype
             )
@@ -2552,6 +2570,15 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
             dim = self.embed_dim // self.num_heads // 2
             return self.random_float_tensor([grid_h * grid_t * grid_w, dim], framework=framework, dtype=float_dtype)
 
+        if input_name == "window_index":
+            if self.spatial_merge_size is None:
+                raise ValueError(
+                    "`spatial_merge_size` parameter is not found in model config. Can not generate dummy input data for `window_index` input"
+                )
+            spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size
+            hidden_size = (grid_t * grid_h * grid_w) // spatial_merge_unit
+            return self.random_int_tensor([hidden_size], max_value=hidden_size)
+
 
 class Qwen2VLConfigBehavior(str, enum.Enum):
     LANGUAGE = "language"
@@ -2674,7 +2701,7 @@ def patch_model_for_export(
 
     @property
     def inputs(self) -> Dict[str, Dict[int, str]]:
-        if self._behavior == Phi3VisionConfigBehavior.VISION_EMBEDDINGS:
+        if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS:
             return {"hidden_states": {0: "patch_thw_grid", 1: "patch_temporal_channels"}}
         if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER:
             return {
@@ -2690,6 +2717,31 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
         return {}
 
 
+@register_in_tasks_manager("qwen2-5-vl", *["image-text-to-text"], library_name="transformers")
+class Qwen2_5_VLOpenVINOConfig(Qwen2VLOpenVINOConfig):
+    MIN_TRANSFORMERS_VERSION = version.parse("4.49.0")
+
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER:
+            return {
+                "hidden_states": {0: "sequence_length"},
+                "attention_mask": {1: "sequence_length", 2: "sequence_length"},
+                "window_attention_mask": {1: "sequence_length", 2: "sequence_length"},
+                "window_index": {0: "unit_sequence_length"},
+                "rotary_pos_emb": {0: "sequence_length"},
+            }
+        return super().inputs
+
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ):
+        model_kwargs = model_kwargs or {}
+        if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER:
+            return Qwen2_5_VLVisionEmbMergerPatcher(self, model, model_kwargs)
+        return super().patch_model_for_export(model, model_kwargs)
+
+
 @register_in_tasks_manager(
     "glm",
     *[
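For the new "window_index" dummy input added above, here is a minimal sketch, not part of the commit, of how its length follows from the patch grid and spatial_merge_size in DummyQwen2VLVisionEmbedInputGenerator.generate(); the grid and merge values are assumed for illustration.

# Minimal sketch (not part of the commit); grid and merge sizes are assumed example values.
grid_t, grid_h, grid_w = 1, 16, 16                            # temporal x spatial patch grid
spatial_merge_size = 2                                        # assumed Qwen2.5-VL config value
spatial_merge_unit = spatial_merge_size * spatial_merge_size
window_index_len = (grid_t * grid_h * grid_w) // spatial_merge_unit
print(window_index_len)                                       # 64 indices, one per merged patch group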

optimum/exporters/openvino/model_patcher.py (+160 -27)

@@ -3909,29 +3909,8 @@ def __exit__(self, exc_type, exc_value, traceback):
         self._model.forward = self._model.__orig_forward
 
 
-class Qwen2VLVisionEmbMergerPatcher(ModelPatcher):
-    def __init__(
-        self,
-        config: "OnnxConfig",
-        model: Union["PreTrainedModel", "TFPreTrainedModel"],
-        model_kwargs: Dict[str, Any] = None,
-    ):
-        model.__orig_forward = model.forward
-
-        # Modified from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1118
-        # added attention_mask input instead cu_lens for its internal calculation model (unsupported by tracing due to cycle with dynamic len)
-        # separated patch_embed and rot_pos_emb calls for performing as part of another model
-        def image_embed_forward(
-            self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, rotary_pos_emb: torch.Tensor
-        ) -> torch.Tensor:
-            for blk in self.blocks:
-                hidden_states = blk(hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb)
-            return self.merger(hidden_states)
-
-        model.forward = types.MethodType(image_embed_forward, model)
-        super().__init__(config, model, model_kwargs)
-
-    def __enter__(self):
+def patch_qwen2vl_vision_blocks(model, force_new_behaviour=False):
+    if not force_new_behaviour and is_transformers_version("<=", "4.48.99"):
         # Modified from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L390
         # added attention_mask input instead of internal calculation (unsupported by tracing due to cycle with dynamic len)
         def sdpa_attn_forward(
@@ -3976,11 +3955,165 @@ def block_forward(self, hidden_states, attention_mask, rotary_pos_emb) -> torch.Tensor:
             hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
             return hidden_states
 
+    else:
+        # Modified from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L391
+        # added attention_mask input instead of internal calculation (unsupported by tracing due to cycle with dynamic len)
+        def sdpa_attn_forward(
+            self,
+            hidden_states: torch.Tensor,
+            attention_mask: torch.Tensor,
+            rotary_pos_emb: torch.Tensor = None,
+            position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        ):
+            def rotate_half(x):
+                """Rotates half the hidden dims of the input."""
+                x1 = x[..., : x.shape[-1] // 2]
+                x2 = x[..., x.shape[-1] // 2 :]
+                return torch.cat((-x2, x1), dim=-1)
+
+            def apply_rotary_pos_emb_vision(
+                q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
+            ) -> Tuple[torch.Tensor, torch.Tensor]:
+                orig_q_dtype = q.dtype
+                orig_k_dtype = k.dtype
+                q, k = q.float(), k.float()
+                cos, sin = cos.unsqueeze(-2), sin.unsqueeze(-2)
+                q_embed = (q * cos) + (rotate_half(q) * sin)
+                k_embed = (k * cos) + (rotate_half(k) * sin)
+                q_embed = q_embed.to(orig_q_dtype)
+                k_embed = k_embed.to(orig_k_dtype)
+                return q_embed, k_embed
+
+            seq_length = hidden_states.shape[0]
+            q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
+            if position_embeddings is None:
+                emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+                cos = emb.cos().float()
+                sin = emb.sin().float()
+            else:
+                cos, sin = position_embeddings
+            q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)
+            q = q.transpose(0, 1)
+            k = k.transpose(0, 1)
+            v = v.transpose(0, 1)
+            attn_output = torch.nn.functional.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0)
+            attn_output = attn_output.transpose(0, 1)
+            attn_output = attn_output.reshape(seq_length, -1)
+            attn_output = self.proj(attn_output)
+            return attn_output
+
+        # Modified from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L446
+        # added attention_mask input propagation to self.attn
+        def block_forward(
+            self,
+            hidden_states,
+            attention_mask,
+            rotary_pos_emb: Optional[torch.Tensor] = None,
+            position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        ) -> torch.Tensor:
+            hidden_states = hidden_states + self.attn(
+                self.norm1(hidden_states),
+                attention_mask=attention_mask,
+                rotary_pos_emb=rotary_pos_emb,
+                position_embeddings=position_embeddings,
+            )
+            hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
+            return hidden_states
+
+    for block in model.blocks:
+        block._orig_forward = block.forward
+        block.forward = types.MethodType(block_forward, block)
+        block.attn._orig_forward = block.attn.forward
+        block.attn.forward = types.MethodType(sdpa_attn_forward, block.attn)
+
+
+class Qwen2VLVisionEmbMergerPatcher(ModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Dict[str, Any] = None,
+    ):
+        model.__orig_forward = model.forward
+
+        # Modified from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1118
+        # added attention_mask input instead cu_lens for its internal calculation model (unsupported by tracing due to cycle with dynamic len)
+        # separated patch_embed and rot_pos_emb calls for performing as part of another model
+        def image_embed_forward(
+            self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, rotary_pos_emb: torch.Tensor
+        ) -> torch.Tensor:
+            for blk in self.blocks:
+                hidden_states = blk(hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb)
+            return self.merger(hidden_states)
+
+        model.forward = types.MethodType(image_embed_forward, model)
+        super().__init__(config, model, model_kwargs)
+
+    def __enter__(self):
+        patch_qwen2vl_vision_blocks(self._model)
+        super().__enter__()
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        self._model.forward = self._model.__orig_forward
         for block in self._model.blocks:
-            block._orig_forward = block.forward
-            block.forward = types.MethodType(block_forward, block)
-            block.attn._orig_forward = block.attn.forward
-            block.attn.forward = types.MethodType(sdpa_attn_forward, block.attn)
+            block.forward = block._orig_forward
+            block.attn.forward = block.attn._orig_forward
+
+
+class Qwen2_5_VLVisionEmbMergerPatcher(ModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Dict[str, Any] = None,
+    ):
+        super().__init__(config, model, model_kwargs)
+
+        model.__orig_forward = model.forward
+
+        # Modified from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py#L405
+        # added attention_mask and window_attention_mask inputs instead cu_lens and window_cu_lens processing for its internal calculation model
+        # (unsupported by tracing due to cycle with dynamic len)
+        # separated patch_embed and rot_pos_emb calls for performing as part of another model
+        def image_embed_forward(
+            self,
+            hidden_states: torch.Tensor,
+            attention_mask: torch.Tensor,
+            window_attention_mask: torch.Tensor,
+            window_index: torch.Tensor,
+            rotary_pos_emb: torch.Tensor,
+        ) -> torch.Tensor:
+            seq_len = hidden_states.shape[0]
+            hidden_states = hidden_states.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+            hidden_states = hidden_states[window_index, :, :]
+            hidden_states = hidden_states.reshape(seq_len, -1)
+            rotary_pos_emb = rotary_pos_emb.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+            rotary_pos_emb = rotary_pos_emb[window_index, :, :]
+            rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
+            emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+            position_embeddings = (emb.cos(), emb.sin())
+            for layer_num, blk in enumerate(self.blocks):
+                if layer_num in self.fullatt_block_indexes:
+                    attention_mask_now = attention_mask
+                else:
+                    attention_mask_now = window_attention_mask
+                hidden_states = blk(
+                    hidden_states, attention_mask=attention_mask_now, position_embeddings=position_embeddings
+                )
+
+            hidden_states = self.merger(hidden_states)
+            reverse_indices = torch.argsort(window_index)
+            hidden_states = hidden_states[reverse_indices, :]
+
+            return hidden_states
+
+        model.forward = types.MethodType(image_embed_forward, model)
+        super().__init__(config, model, model_kwargs)
+
+    def __enter__(self):
+        patch_qwen2vl_vision_blocks(self._model, force_new_behaviour=True)
+        super().__enter__()
 
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
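The window reordering in the new Qwen2_5_VLVisionEmbMergerPatcher.image_embed_forward can be seen in isolation with a minimal sketch that is not part of the commit; the tensor sizes and the window_index permutation are assumed for illustration, and unlike the real code the reverse index is applied before any merging so the round trip can be checked directly.

# Minimal sketch (not part of the commit): rows are gathered group-wise by window_index and
# the original order is restored with torch.argsort(window_index); in the real forward the
# reverse index is applied after self.merger, which has collapsed each group to a single row.
import torch

seq_len, spatial_merge_unit, dim = 8, 2, 4
hidden_states = torch.arange(seq_len * dim, dtype=torch.float32).reshape(seq_len, dim)
window_index = torch.tensor([1, 3, 0, 2])  # assumed permutation of merged-patch groups

grouped = hidden_states.reshape(seq_len // spatial_merge_unit, spatial_merge_unit, -1)
reordered = grouped[window_index, :, :].reshape(seq_len, -1)

restored = reordered.reshape(seq_len // spatial_merge_unit, spatial_merge_unit, -1)[
    torch.argsort(window_index), :, :
].reshape(seq_len, -1)
assert torch.equal(restored, hidden_states)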

optimum/exporters/openvino/utils.py (+1)

@@ -226,6 +226,7 @@ def get_submodels(model):
     "minicpmv",
     "phi3-v",
     "qwen2-vl",
+    "qwen2-5-vl",
 ]
 
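With "qwen2-5-vl" added to the multimodal model list, a hypothetical end-to-end usage sketch, not part of the commit; the OVModelForVisualCausalLM class and the checkpoint id are assumptions based on how other image-text-to-text models are loaded in optimum-intel.

# Hypothetical usage sketch (not part of the commit); class name and Hub id are assumptions.
from optimum.intel import OVModelForVisualCausalLM

model = OVModelForVisualCausalLM.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct",  # assumed checkpoint id
    export=True,                    # convert to OpenVINO IR on the fly
)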