@@ -1353,7 +1353,6 @@ def get_text_embeddings(self, input_ids, **kwargs):
 
 class _OVLlavaNextVideoForCausalLM(_OVLlavaNextForCausalLM):
     additional_parts = ["vision_resampler", "multi_modal_projector"]
-    export_feature = "video-text-to-text"
     auto_model_class = AutoModelForVision2Seq
 
     def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
@@ -1382,14 +1381,11 @@ def preprocess_inputs(
                 chat_prompt[0]["content"].append({"type": "video"})
             prompt = processor.apply_chat_template(chat_prompt, add_generation_prompt=True, tokenize=False)
         else:
-            if image is not None and "<image>" not in text:
-                prompt = "<image>\n" + text
-            else:
-                prompt = text
-            if video is not None and "<video>" not in text:
-                prompt = "<video>\n" + text
-            else:
-                prompt = text
+            prompt = text
+            if image is not None and "<image>" not in prompt:
+                prompt = "<image>\n" + prompt
+            if video is not None and "<video>" not in prompt:
+                prompt = "<video>\n" + prompt
 
         if is_transformers_version(">", "4.47.99") and getattr(processor, "patch_size", None) is None:
             if (
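The rewritten branch fixes an overwrite bug: in the old code each `if`/`else` pair assigned `prompt` from `text` independently, so whichever pair ran second discarded the tag the first one added (and when a `<video>` tag was already present, the final `else` reset `prompt` to the bare text). Threading `prompt` through both checks makes the tags accumulate. A minimal standalone sketch of the corrected logic, with an assert illustrating the case the old code got wrong (the function name and stub inputs are illustrative, not from the diff):

```python
def tag_prompt(text, image=None, video=None):
    # Start from the raw text and prepend a media placeholder only when the
    # corresponding input is present and the tag is not already in the prompt.
    prompt = text
    if image is not None and "<image>" not in prompt:
        prompt = "<image>\n" + prompt
    if video is not None and "<video>" not in prompt:
        prompt = "<video>\n" + prompt
    return prompt

# Both tags survive when both modalities are passed; the old version lost the
# "<image>" tag here because its second pair rebuilt the prompt from the bare text.
assert tag_prompt("Describe this.", image=object(), video=object()) == "<video>\n<image>\nDescribe this."
```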
@@ -1402,7 +1398,7 @@ def preprocess_inputs(
                     "Processor does not have `patch_size` attribute. Please fix the processor or provide `patch_size` in the config."
                 )
 
-        inputs = processor(images=image, text=prompt, videos=[video], return_tensors="pt")
+        inputs = processor(images=image, text=prompt, videos=video, return_tensors="pt")
         return inputs
 
     def get_multimodal_embeddings(
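`videos=[video]` wrapped the caller's value in an extra list before handing it to the processor. Processors already accept a list of videos (or a single video) for `videos`, so the wrapper added a spurious nesting level, and in the image-only case it turned `video=None` into `[None]`, which is not treated as "no video". Passing `video` through unchanged defers batching and `None` handling to the processor. A toy illustration of the `None` case, using a stand-in for the processor (assumed behavior, not the real `transformers` API):

```python
def fake_processor(videos=None):
    # Stand-in: real processors skip video features entirely when videos is None.
    return "no video inputs" if videos is None else f"{len(videos)} video(s) received"

video = None  # image-only request reaching preprocess_inputs

print(fake_processor(videos=[video]))  # old call: "1 video(s) received" -- [None] looks like one video
print(fake_processor(videos=video))    # new call: "no video inputs"
```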
@@ -2556,7 +2552,7 @@ def preprocess_inputs(
 
         text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
 
-        inputs = processor(images=image, text=text_prompt, videos=[video], return_tensors="pt")
+        inputs = processor(images=image, text=text_prompt, videos=video, return_tensors="pt")
         return inputs
 
@@ -2992,7 +2988,7 @@ def preprocess_inputs(
 
         text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
 
-        inputs = processor(images=image, text=text_prompt, videos=[video], return_tensors="pt")
+        inputs = processor(images=image, text=text_prompt, videos=video, return_tensors="pt")
         return inputs
 
     # Copied from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1602