Skip to content

Commit 6017322

Browse files
committed
apply review comments
1 parent 8416414 commit 6017322

File tree

3 files changed

+10
-15
lines changed

3 files changed

+10
-15
lines changed

optimum/exporters/openvino/model_configs.py

+2 -2
Original file line number | Diff line number | Diff line change
@@ -2708,7 +2708,7 @@ class Qwen2VLConfigBehavior(str, enum.Enum):
27082708
TEXT_EMBEDDINGS = "text_embeddings"
27092709

27102710

2711-
@register_in_tasks_manager("qwen2-vl", *["image-text-to-text"], library_name="transformers")
2711+
@register_in_tasks_manager("qwen2-vl", *["image-text-to-text", "video-text-to-text"], library_name="transformers")
27122712
class Qwen2VLOpenVINOConfig(OnnxConfig):
27132713
SUPPORTED_BEHAVIORS = [model_type.value for model_type in Qwen2VLConfigBehavior]
27142714
NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
@@ -2838,7 +2838,7 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
28382838
return {}
28392839

28402840

2841-
@register_in_tasks_manager("qwen2-5-vl", *["image-text-to-text"], library_name="transformers")
2841+
@register_in_tasks_manager("qwen2-5-vl", *["image-text-to-text", "video-text-to-text"], library_name="transformers")
28422842
class Qwen2_5_VLOpenVINOConfig(Qwen2VLOpenVINOConfig):
28432843
MIN_TRANSFORMERS_VERSION = version.parse("4.49.0")
28442844

optimum/intel/openvino/modeling_visual_language.py

+8 -12
Original file line number | Diff line number | Diff line change
@@ -1353,7 +1353,6 @@ def get_text_embeddings(self, input_ids, **kwargs):
13531353

13541354
class _OVLlavaNextVideoForCausalLM(_OVLlavaNextForCausalLM):
13551355
additional_parts = ["vision_resampler", "multi_modal_projector"]
1356-
export_feature = "video-text-to-text"
13571356
auto_model_class = AutoModelForVision2Seq
13581357

13591358
def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
@@ -1382,14 +1381,11 @@ def preprocess_inputs(
13821381
chat_prompt[0]["content"].append({"type": "video"})
13831382
prompt = processor.apply_chat_template(chat_prompt, add_generation_prompt=True, tokenize=False)
13841383
else:
1385-
if image is not None and "<image>" not in text:
1386-
prompt = "<image>\n" + text
1387-
else:
1388-
prompt = text
1389-
if video is not None and "<video>" not in text:
1390-
prompt = "<video>\n" + text
1391-
else:
1392-
prompt = text
1384+
prompt = text
1385+
if image is not None and "<image>" not in prompt:
1386+
prompt = "<image>\n" + prompt
1387+
if video is not None and "<video>" not in prompt:
1388+
prompt = "<video>\n" + prompt
13931389

13941390
if is_transformers_version(">", "4.47.99") and getattr(processor, "patch_size", None) is None:
13951391
if (
@@ -1402,7 +1398,7 @@ def preprocess_inputs(
14021398
"Processor does not have `patch_size` attribute. Please fix the processor or provide `patch_size` in the config."
14031399
)
14041400

1405-
inputs = processor(images=image, text=prompt, videos=[video], return_tensors="pt")
1401+
inputs = processor(images=image, text=prompt, videos=video, return_tensors="pt")
14061402
return inputs
14071403

14081404
def get_multimodal_embeddings(
@@ -2556,7 +2552,7 @@ def preprocess_inputs(
25562552

25572553
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
25582554

2559-
inputs = processor(images=image, text=text_prompt, videos=[video], return_tensors="pt")
2555+
inputs = processor(images=image, text=text_prompt, videos=video, return_tensors="pt")
25602556
return inputs
25612557

25622558

@@ -2992,7 +2988,7 @@ def preprocess_inputs(
29922988

29932989
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
29942990

2995-
inputs = processor(images=image, text=text_prompt, videos=[video], return_tensors="pt")
2991+
inputs = processor(images=image, text=text_prompt, videos=video, return_tensors="pt")
29962992
return inputs
29972993

29982994
# Copied from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1602

tests/openvino/test_modeling.py

-1
Original file line number | Diff line number | Diff line change
@@ -2156,7 +2156,6 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
21562156
)
21572157

21582158
def get_transformer_model_class(self, model_arch):
2159-
print(model_arch)
21602159
if is_transformers_version(">=", "4.46") and model_arch in ["llava", "llava_next", "qwen2_vl", "qwen2_5_vl"]:
21612160
from transformers import AutoModelForImageTextToText
21622161

0 commit comments

Comments
 (0)