Skip to content

Commit 908a5a2

Browse files
committed
apply review comments
1 parent ecaa78b commit 908a5a2

File tree

3 files changed

+19
-11
lines changed

3 files changed

+19
-11
lines changed

optimum/exporters/openvino/model_configs.py

+12-6
Original file line numberDiff line numberDiff line change
@@ -132,13 +132,15 @@ def init_model_configs():
132132
"transformers",
133133
"Qwen2VLForConditionalGeneration",
134134
)
135-
TasksManager._CUSTOM_CLASSES[("pt", "qwen2-5-vl", "image-text-to-text")] = (
136-
"transformers",
137-
"Qwen2_5_VLForConditionalGeneration",
135+
136+
TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["image-text-to-text"] = (
137+
(
138+
"AutoModelForImageTextToText",
139+
"AutoModelForCausalLM",
140+
)
141+
if is_transformers_version(">=", "4.46")
142+
else TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["text-generation"]
138143
)
139-
TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS[
140-
"image-text-to-text"
141-
] = TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["text-generation"]
142144

143145
if is_diffusers_available() and "fill" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS:
144146
TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["fill"] = "FluxFillPipeline"
@@ -2571,6 +2573,10 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
25712573
return self.random_float_tensor([grid_h * grid_t * grid_w, dim], framework=framework, dtype=float_dtype)
25722574

25732575
if input_name == "window_index":
2576+
if self.spatial_merge_size is None:
2577+
raise ValueError(
2578+
"`spatial_merge_size` parameter is not found in model config. Can not generate dummy input data for `window_index` input"
2579+
)
25742580
spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size
25752581
hidden_size = (grid_t * grid_h * grid_w) // spatial_merge_unit
25762582
return self.random_int_tensor([hidden_size], max_value=hidden_size)

optimum/intel/openvino/modeling_visual_language.py

+2
Original file line numberDiff line numberDiff line change
@@ -2386,6 +2386,7 @@ def get_rope_index(
23862386
second_per_grid_ts: Optional[torch.Tensor] = None,
23872387
attention_mask: Optional[torch.Tensor] = None,
23882388
) -> Tuple[torch.Tensor, torch.Tensor]:
2389+
# modified from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py#L1546
23892390
"""
23902391
Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
23912392
"""
@@ -2597,6 +2598,7 @@ def get_multimodal_embeddings(
25972598
second_per_grid_ts: Optional[torch.Tensor] = None,
25982599
**kwargs,
25992600
):
2601+
# Adopted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py#L1791-L1861
26002602
inputs_embeds = torch.from_numpy(self.get_text_embeddings(input_ids))
26012603
if pixel_values is not None and input_ids.shape[1] != 1:
26022604
image_embeds = torch.from_numpy(self.get_vision_embeddings(pixel_values, image_grid_thw))

tests/openvino/test_modeling.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -2148,7 +2148,11 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
21482148
)
21492149

21502150
def get_transformer_model_class(self, model_arch):
2151-
if model_arch == "llava":
2151+
if is_transformers_version(">=", "4.46") and model_arch in ["llava", "llava_next", "qwen2_vl", "qwen2_5_vl"]:
2152+
from transformers import AutoModelForImageTextToText
2153+
2154+
return AutoModelForImageTextToText
2155+
if model_arch == "llava":
21522156
from transformers import LlavaForConditionalGeneration
21532157

21542158
return LlavaForConditionalGeneration
@@ -2160,10 +2164,6 @@ def get_transformer_model_class(self, model_arch):
21602164
from transformers import Qwen2VLForConditionalGeneration
21612165

21622166
return Qwen2VLForConditionalGeneration
2163-
if model_arch == "qwen2_5_vl":
2164-
from transformers import Qwen2_5_VLForConditionalGeneration
2165-
2166-
return Qwen2_5_VLForConditionalGeneration
21672167
return AutoModelForCausalLM
21682168

21692169
def _check_device_and_request(self, ov_model, expected_device, has_request):

0 commit comments

Comments
 (0)