@@ -31,7 +31,7 @@
 import torch
 from datasets import load_dataset
 from evaluate import evaluator
-from huggingface_hub import HfApi
+from huggingface_hub import HfApi, hf_hub_download
 from parameterized import parameterized
 from PIL import Image
 from sentence_transformers import SentenceTransformer
@@ -2126,21 +2126,25 @@ def test_compare_with_and_without_past_key_values(self):
 
 class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES = ["llava"]
+    SUPPORT_VIDEO = []
 
-    # if is_transformers_version(">=", "4.40.0"):
-    #     SUPPORTED_ARCHITECTURES += ["llava_next", "nanollava"]
+    if is_transformers_version(">=", "4.40.0"):
+        SUPPORTED_ARCHITECTURES += ["llava_next", "nanollava"]
 
     if is_transformers_version(">=", "4.42.0"):
         SUPPORTED_ARCHITECTURES += ["llava_next_video"]
+        SUPPORT_VIDEO.append("llava_next_video")
 
-    # if is_transformers_version(">=", "4.45.0"):
-    #     SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2", "phi3_v", "qwen2_vl"]
+    if is_transformers_version(">=", "4.45.0"):
+        SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2", "phi3_v", "qwen2_vl"]
+        SUPPORT_VIDEO.append("qwen2_vl")
 
-    # if is_transformers_version(">=", "4.46.0"):
-    #     SUPPORTED_ARCHITECTURES += ["maira2"]
+    if is_transformers_version(">=", "4.46.0"):
+        SUPPORTED_ARCHITECTURES += ["maira2"]
 
-    # if is_transformers_version(">=", "4.49.0"):
-    #     SUPPORTED_ARCHITECTURES += ["qwen2_5_vl"]
+    if is_transformers_version(">=", "4.49.0"):
+        SUPPORTED_ARCHITECTURES += ["qwen2_5_vl"]
+        SUPPORT_VIDEO.append("qwen2_5_vl")
     TASK = "image-text-to-text"
     REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v", "maira2"]
 
@@ -2350,6 +2354,22 @@ def test_generate_utils(self, model_arch):
         outputs = outputs[:, inputs["input_ids"].shape[1] :]
         outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
         self.assertIsInstance(outputs[0], str)
+
+        # the video loader helper is only available for transformers >= 4.49
+        if model_arch in self.SUPPORT_VIDEO and is_transformers_version(">=", "4.49"):
+            from transformers.image_utils import load_video
+
+            video_path = hf_hub_download(
+                repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
+            )
+            input_video = load_video(video_path, num_frames=4)
+            question = "Why is this video funny?"
+            inputs = model.preprocess_inputs(**preprocessors, text=question, video=input_video)
+            outputs = model.generate(**inputs, max_new_tokens=10)
+            # filter out the original prompt because it may contain tokens outside the tokenizer vocabulary, e.g. the nanollava text separator is -200
+            outputs = outputs[:, inputs["input_ids"].shape[1] :]
+            outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+            self.assertIsInstance(outputs[0], str)
         del model
 
         gc.collect()
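For reference, the input-preparation half of the new video branch can be exercised outside the test harness. The sketch below is a minimal standalone example, assuming transformers >= 4.49 (for load_video), huggingface_hub installed, and network access to the Hub; it stops short of the model call, since the preprocessors and OVModelForVisualCausalLM checkpoint used by the test are set up elsewhere in the suite.

# Minimal sketch of the video input preparation mirrored from the new test branch.
# Assumption: transformers >= 4.49 provides load_video in transformers.image_utils;
# the test additionally passes the frames to model.preprocess_inputs() with a prompt.
from huggingface_hub import hf_hub_download
from transformers.image_utils import load_video

# Download the same short sample clip the test uses from the Hub dataset repo.
video_path = hf_hub_download(
    repo_id="raushan-testing-hf/videos-test",
    filename="sample_demo_1.mp4",
    repo_type="dataset",
)

# Decode four frames; depending on the transformers version, load_video returns
# either a numpy array of frames or a (frames, metadata) tuple.
input_video = load_video(video_path, num_frames=4)
question = "Why is this video funny?"
print(question, getattr(input_video, "shape", type(input_video)))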