@@ -1121,7 +1121,6 @@ def add_image_features(
         pixel_values,
         attention_mask,
         position_ids,
-        past_key_values,
         image_sizes,
         legacy_processing,
         **kwargs,
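This and several later hunks apply the same refactor: `past_key_values` was accepted by the feature-merging helpers and their call sites but never used, so it is dropped from both ends in lockstep. A minimal sketch of the pattern, with simplified names and signatures (the real methods take more arguments):

```python
# Sketch of the refactor, not the real API: the helper no longer accepts an
# unused `past_key_values` argument, and the caller no longer passes one.
class VLMSketch:
    def add_image_features(self, inputs_embeds, pixel_values, **kwargs):
        # Before the change this signature also carried `past_key_values`;
        # the output is identical without it.
        return inputs_embeds + [f"img({len(pixel_values)})"]

    def get_multimodal_embeddings(self, inputs_embeds, pixel_values=None, **kwargs):
        if pixel_values is not None:
            # Call site updated together with the signature.
            inputs_embeds = self.add_image_features(inputs_embeds, pixel_values, **kwargs)
        return inputs_embeds


print(VLMSketch().get_multimodal_embeddings(["txt"], pixel_values=[0, 1]))  # ['txt', 'img(2)']
```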
@@ -1193,7 +1192,6 @@ def get_multimodal_embeddings(
                 pixel_values,
                 attention_mask,
                 position_ids,
-                past_key_values,
                 image_sizes,
                 legacy_processing,
                 **kwargs,
@@ -1353,7 +1351,6 @@ def get_text_embeddings(self, input_ids, **kwargs):
1353
1351
1354
1352
class _OVLlavaNextVideoForCausalLM (_OVLlavaNextForCausalLM ):
1355
1353
additional_parts = ["vision_resampler" , "multi_modal_projector" ]
1356
- export_feature = "video-text-to-text"
1357
1354
auto_model_class = AutoModelForVision2Seq
1358
1355
1359
1356
def get_vision_embeddings (self , pixel_values , input_ids = None , ** kwargs ):
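Deleting the `export_feature` override means the attribute now resolves through the parent chain at lookup time; a toy illustration of that mechanism (stand-in class names, and the inherited value shown is an assumption, not read from the parent class here):

```python
# Toy illustration: removing a class attribute from a subclass makes Python
# fall back to the parent's definition via the MRO. "image-text-to-text" is
# an assumed stand-in for whatever the parent chain actually defines.
class Parent:
    export_feature = "image-text-to-text"

class Child(Parent):
    pass  # no longer overrides export_feature

print(Child.export_feature)  # "image-text-to-text", resolved on Parent
```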
@@ -1382,14 +1379,11 @@ def preprocess_inputs(
             chat_prompt[0]["content"].append({"type": "video"})
             prompt = processor.apply_chat_template(chat_prompt, add_generation_prompt=True, tokenize=False)
         else:
-            if image is not None and "<image>" not in text:
-                prompt = "<image>\n" + text
-            else:
-                prompt = text
-            if video is not None and "<video>" not in text:
-                prompt = "<video>\n" + text
-            else:
-                prompt = text
+            prompt = text
+            if image is not None and "<image>" not in prompt:
+                prompt = "<image>\n" + prompt
+            if video is not None and "<video>" not in prompt:
+                prompt = "<video>\n" + prompt
 
         if is_transformers_version(">", "4.47.99") and getattr(processor, "patch_size", None) is None:
             if (
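The hunk above fixes a real bug in the pre-change branch: the second `if/else` re-derived `prompt` from `text`, so its `else` arm clobbered any `"<image>\n"` prefix the first branch had added. The rewrite chains both checks on `prompt`, letting the tags accumulate. A self-contained sketch of the two behaviors, lifted from the hunk:

```python
def old_prompt(text, image=None, video=None):
    # Pre-fix logic: the video branch rebuilds prompt from `text`.
    if image is not None and "<image>" not in text:
        prompt = "<image>\n" + text
    else:
        prompt = text
    if video is not None and "<video>" not in text:
        prompt = "<video>\n" + text
    else:
        prompt = text  # silently discards the image-tagged prompt
    return prompt

def new_prompt(text, image=None, video=None):
    # Post-fix logic: each check extends the running `prompt`.
    prompt = text
    if image is not None and "<image>" not in prompt:
        prompt = "<image>\n" + prompt
    if video is not None and "<video>" not in prompt:
        prompt = "<video>\n" + prompt
    return prompt

print(old_prompt("hi", image=object()))  # "hi" -- the <image> tag is lost
print(new_prompt("hi", image=object()))  # "<image>\nhi"
```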
@@ -1402,7 +1396,7 @@ def preprocess_inputs(
                 "Processor does not have `patch_size` attribute. Please fix the processor or provide `patch_size` in the config."
             )
 
-        inputs = processor(images=image, text=prompt, videos=[video], return_tensors="pt")
+        inputs = processor(images=image, text=prompt, videos=video, return_tensors="pt")
         return inputs
 
     def get_multimodal_embeddings(
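The `videos=[video]` to `videos=video` change here (repeated in the two later `preprocess_inputs` hunks) matters when no video is supplied: wrapping the argument unconditionally hands the processor a `[None]` list, which is not the same as `None`. A hedged sketch of the difference, with `fake_processor` standing in for a transformers processor (an assumption, not the real API surface):

```python
# `[None]` takes the "one video" path; `None` takes the "no video" path.
def fake_processor(videos=None):
    if videos is None:
        return "no-video branch"
    return f"processing {len(videos)} video(s)"

video = None  # caller provided no video
print(fake_processor(videos=[video]))  # "processing 1 video(s)" -- wrong path
print(fake_processor(videos=video))    # "no-video branch"
```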
@@ -1450,7 +1444,6 @@ def get_multimodal_embeddings(
                 pixel_values,
                 attention_mask,
                 position_ids,
-                past_key_values,
                 image_sizes,
                 legacy_processing,
                 **kwargs,
@@ -1463,7 +1456,6 @@ def get_multimodal_embeddings(
                 pixel_values_videos,
                 attention_mask,
                 position_ids,
-                past_key_values,
                 legacy_processing=legacy_processing,
                 **kwargs,
             )
@@ -1480,7 +1472,6 @@ def add_video_features(
         pixel_values_videos,
         attention_mask,
         position_ids,
-        past_key_values,
         legacy_processing,
         **kwargs,
     ):
@@ -2556,7 +2547,7 @@ def preprocess_inputs(
 
         text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
 
-        inputs = processor(images=image, text=text_prompt, videos=[video], return_tensors="pt")
+        inputs = processor(images=image, text=text_prompt, videos=video, return_tensors="pt")
         return inputs
 
 
@@ -2992,7 +2983,7 @@ def preprocess_inputs(
 
         text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
 
-        inputs = processor(images=image, text=text_prompt, videos=[video], return_tensors="pt")
+        inputs = processor(images=image, text=text_prompt, videos=video, return_tensors="pt")
         return inputs
 
     # Copied from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1602