Commit 3fbd0b1

apply review comments
1 parent 8416414 commit 3fbd0b1

File tree: 3 files changed, +11 -21 lines changed


optimum/exporters/openvino/model_configs.py (+2 -2)

@@ -2708,7 +2708,7 @@ class Qwen2VLConfigBehavior(str, enum.Enum):
    TEXT_EMBEDDINGS = "text_embeddings"


-@register_in_tasks_manager("qwen2-vl", *["image-text-to-text"], library_name="transformers")
+@register_in_tasks_manager("qwen2-vl", *["image-text-to-text", "video-text-to-text"], library_name="transformers")
class Qwen2VLOpenVINOConfig(OnnxConfig):
    SUPPORTED_BEHAVIORS = [model_type.value for model_type in Qwen2VLConfigBehavior]
    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig

@@ -2838,7 +2838,7 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
        return {}


-@register_in_tasks_manager("qwen2-5-vl", *["image-text-to-text"], library_name="transformers")
+@register_in_tasks_manager("qwen2-5-vl", *["image-text-to-text", "video-text-to-text"], library_name="transformers")
class Qwen2_5_VLOpenVINOConfig(Qwen2VLOpenVINOConfig):
    MIN_TRANSFORMERS_VERSION = version.parse("4.49.0")
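Both config classes are now registered for the "video-text-to-text" task in addition to "image-text-to-text", so the same OpenVINO export config serves both tasks. A minimal sketch of this decorator-registration pattern, with a toy registry standing in for optimum's actual TasksManager internals (all names below are illustrative):

from collections import defaultdict

# task name -> {(model type, library name): export config class}
_TASK_REGISTRY = defaultdict(dict)

def register_for_tasks(model_type, *tasks, library_name="transformers"):
    # Decorator factory: map every listed task to the decorated config class.
    def wrapper(config_cls):
        for task in tasks:
            _TASK_REGISTRY[task][(model_type, library_name)] = config_cls
        return config_cls
    return wrapper

@register_for_tasks("qwen2-vl", "image-text-to-text", "video-text-to-text")
class DummyQwen2VLConfig:
    pass

# Both tasks now resolve to the same export config class.
assert _TASK_REGISTRY["image-text-to-text"][("qwen2-vl", "transformers")] is DummyQwen2VLConfig
assert _TASK_REGISTRY["video-text-to-text"][("qwen2-vl", "transformers")] is DummyQwen2VLConfig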

optimum/intel/openvino/modeling_visual_language.py (+8 -17)
@@ -1121,7 +1121,6 @@ def add_image_features(
        pixel_values,
        attention_mask,
        position_ids,
-        past_key_values,
        image_sizes,
        legacy_processing,
        **kwargs,
@@ -1193,7 +1192,6 @@ def get_multimodal_embeddings(
                pixel_values,
                attention_mask,
                position_ids,
-                past_key_values,
                image_sizes,
                legacy_processing,
                **kwargs,
@@ -1353,7 +1351,6 @@ def get_text_embeddings(self, input_ids, **kwargs):

class _OVLlavaNextVideoForCausalLM(_OVLlavaNextForCausalLM):
    additional_parts = ["vision_resampler", "multi_modal_projector"]
-    export_feature = "video-text-to-text"
    auto_model_class = AutoModelForVision2Seq

    def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
@@ -1382,14 +1379,11 @@ def preprocess_inputs(
                chat_prompt[0]["content"].append({"type": "video"})
            prompt = processor.apply_chat_template(chat_prompt, add_generation_prompt=True, tokenize=False)
        else:
-            if image is not None and "<image>" not in text:
-                prompt = "<image>\n" + text
-            else:
-                prompt = text
-            if video is not None and "<video>" not in text:
-                prompt = "<video>\n" + text
-            else:
-                prompt = text
+            prompt = text
+            if image is not None and "<image>" not in prompt:
+                prompt = "<image>\n" + prompt
+            if video is not None and "<video>" not in prompt:
+                prompt = "<video>\n" + prompt

        if is_transformers_version(">", "4.47.99") and getattr(processor, "patch_size", None) is None:
            if (
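The rewritten else-branch builds the prompt incrementally, so an input that carries both an image and a video keeps both tags; previously the second if/else rebuilt prompt from the raw text and dropped the image tag. A standalone sketch of the fixed logic (the helper name and arguments are illustrative, not part of the library API):

def build_prompt(text, image=None, video=None):
    # Prepend each modality tag only when that modality is present and the
    # tag is not already in the prompt; the two tags can therefore coexist.
    prompt = text
    if image is not None and "<image>" not in prompt:
        prompt = "<image>\n" + prompt
    if video is not None and "<video>" not in prompt:
        prompt = "<video>\n" + prompt
    return prompt

# With both inputs set, the prompt keeps the image tag and gains the video tag.
assert build_prompt("describe the scene", image=object(), video=object()) == "<video>\n<image>\ndescribe the scene"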
@@ -1402,7 +1396,7 @@ def preprocess_inputs(
                    "Processor does not have `patch_size` attribute. Please fix the processor or provide `patch_size` in the config."
                )

-        inputs = processor(images=image, text=prompt, videos=[video], return_tensors="pt")
+        inputs = processor(images=image, text=prompt, videos=video, return_tensors="pt")
        return inputs

    def get_multimodal_embeddings(
@@ -1450,7 +1444,6 @@ def get_multimodal_embeddings(
                pixel_values,
                attention_mask,
                position_ids,
-                past_key_values,
                image_sizes,
                legacy_processing,
                **kwargs,
@@ -1463,7 +1456,6 @@ def get_multimodal_embeddings(
                pixel_values_videos,
                attention_mask,
                position_ids,
-                past_key_values,
                legacy_processing=legacy_processing,
                **kwargs,
            )
@@ -1480,7 +1472,6 @@ def add_video_features(
        pixel_values_videos,
        attention_mask,
        position_ids,
-        past_key_values,
        legacy_processing,
        **kwargs,
    ):
@@ -2556,7 +2547,7 @@ def preprocess_inputs(

        text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

-        inputs = processor(images=image, text=text_prompt, videos=[video], return_tensors="pt")
+        inputs = processor(images=image, text=text_prompt, videos=video, return_tensors="pt")
        return inputs

@@ -2992,7 +2983,7 @@ def preprocess_inputs(

        text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

-        inputs = processor(images=image, text=text_prompt, videos=[video], return_tensors="pt")
+        inputs = processor(images=image, text=text_prompt, videos=video, return_tensors="pt")
        return inputs

    # Copied from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1602

tests/openvino/test_modeling.py (+1 -2)
@@ -2156,7 +2156,6 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
    )

    def get_transformer_model_class(self, model_arch):
-        print(model_arch)
        if is_transformers_version(">=", "4.46") and model_arch in ["llava", "llava_next", "qwen2_vl", "qwen2_5_vl"]:
            from transformers import AutoModelForImageTextToText
@@ -2355,7 +2354,7 @@ def test_generate_utils(self, model_arch):
        outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        self.assertIsInstance(outputs[0], str)

-        # video laoder helper only available for transformers >= 4.49
+        # video loader helper only available for transformers >= 4.49
        if model_arch in self.SUPPORT_VIDEO and is_transformers_version(">=", "4.49"):
            from transformers.image_utils import load_video
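For context, a short sketch of the version gate the test relies on: the load_video helper only exists in transformers >= 4.49, so the video path is skipped on older releases. The import path for is_transformers_version below assumes optimum.intel's usual utils module; treat the snippet as illustrative.

# Guard the video branch the same way the test does: only exercise it when
# the installed transformers release ships the load_video helper (>= 4.49).
from optimum.intel.utils.import_utils import is_transformers_version

if is_transformers_version(">=", "4.49"):
    from transformers.image_utils import load_video  # shipped from 4.49 on
else:
    load_video = None  # older releases: video inputs are simply not tested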
