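Fix video test: pass the video to the processor wrapped in a list, unpack the tuple returned by load_video, and reduce the test to 2 frames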

eaidova · eaidova · commit 841641444b17 · 2025-03-03T13:30:28.000+04:00
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
@@ -1402,7 +1402,7 @@ def preprocess_inputs(
                     "Processor does not have `patch_size` attribute. Please fix the processor or provide `patch_size` in the config."
                 )
 
-        inputs = processor(images=image, text=prompt, videos=video, return_tensors="pt")
+        inputs = processor(images=image, text=prompt, videos=[video], return_tensors="pt")
         return inputs
 
     def get_multimodal_embeddings(
@@ -2556,7 +2556,7 @@ def preprocess_inputs(
 
         text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
 
-        inputs = processor(images=image, text=text_prompt, videos=video, return_tensors="pt")
+        inputs = processor(images=image, text=text_prompt, videos=[video], return_tensors="pt")
         return inputs
 
 
@@ -2992,7 +2992,7 @@ def preprocess_inputs(
 
         text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
 
-        inputs = processor(images=image, text=text_prompt, videos=video, return_tensors="pt")
+        inputs = processor(images=image, text=text_prompt, videos=[video], return_tensors="pt")
         return inputs
 
     # Copied from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1602
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
@@ -2362,7 +2362,7 @@ def test_generate_utils(self, model_arch):
             video_path = hf_hub_download(
                 repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
             )
-            input_video = load_video(video_path, num_frames=4)
+            input_video, _ = load_video(video_path, num_frames=2)
             question = "Why is this video funny?"
             inputs = model.preprocess_inputs(**preprocessors, text=question, video=input_video)
             outputs = model.generate(**inputs, max_new_tokens=10)

Original file line number	Diff line number	Diff line change
`@@ -2362,7 +2362,7 @@ def test_generate_utils(self, model_arch):`
`2362`	`2362`	`video_path = hf_hub_download(`
`2363`	`2363`	`repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"`
`2364`	`2364`	`)`
`2365`		`- input_video = load_video(video_path, num_frames=4)`
	`2365`	`+ input_video, _ = load_video(video_path, num_frames=2)`
`2366`	`2366`	`question = "Why is this video funny?"`
`2367`	`2367`	`inputs = model.preprocess_inputs(**preprocessors, text=question, video=input_video)`
`2368`	`2368`	`outputs = model.generate(**inputs, max_new_tokens=10)`