@@ -1121,7 +1121,6 @@ def add_image_features(
         pixel_values,
         attention_mask,
         position_ids,
-        past_key_values,
         image_sizes,
         legacy_processing,
         **kwargs,
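This and several later hunks apply the same refactor: `past_key_values` was accepted by the feature-merging helpers and their call sites but never used, so it is dropped from both ends in lockstep. A minimal sketch of the pattern, with simplified names and signatures (the real methods take more arguments):

```python
# Sketch of the refactor, not the real API: the helper no longer accepts an
# unused `past_key_values` argument, and the caller no longer passes one.
class VLMSketch:
    def add_image_features(self, inputs_embeds, pixel_values, **kwargs):
        # Before the change this signature also carried `past_key_values`;
        # the output is identical without it.
        return inputs_embeds + [f"img({len(pixel_values)})"]

    def get_multimodal_embeddings(self, inputs_embeds, pixel_values=None, **kwargs):
        if pixel_values is not None:
            # Call site updated together with the signature.
            inputs_embeds = self.add_image_features(inputs_embeds, pixel_values, **kwargs)
        return inputs_embeds


print(VLMSketch().get_multimodal_embeddings(["txt"], pixel_values=[0, 1]))  # ['txt', 'img(2)']
```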
@@ -1193,7 +1192,6 @@ def get_multimodal_embeddings(
                 pixel_values,
                 attention_mask,
                 position_ids,
-                past_key_values,
                 image_sizes,
                 legacy_processing,
                 **kwargs,
@@ -1353,7 +1351,6 @@ def get_text_embeddings(self, input_ids, **kwargs):
1353
1351
1354
1352
class _OVLlavaNextVideoForCausalLM (_OVLlavaNextForCausalLM ):
1355
1353
additional_parts = ["vision_resampler" , "multi_modal_projector" ]
1356
- export_feature = "video-text-to-text"
1357
1354
auto_model_class = AutoModelForVision2Seq
1358
1355
1359
1356
def get_vision_embeddings (self , pixel_values , input_ids = None , ** kwargs ):
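Deleting the `export_feature` override means the attribute now resolves through the parent chain at lookup time; a toy illustration of that mechanism (stand-in class names, and the inherited value shown is an assumption, not read from the parent class here):

```python
# Toy illustration: removing a class attribute from a subclass makes Python
# fall back to the parent's definition via the MRO. "image-text-to-text" is
# an assumed stand-in for whatever the parent chain actually defines.
class Parent:
    export_feature = "image-text-to-text"

class Child(Parent):
    pass  # no longer overrides export_feature

print(Child.export_feature)  # "image-text-to-text", resolved on Parent
```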
@@ -1382,14 +1379,11 @@ def preprocess_inputs(
             chat_prompt[0]["content"].append({"type": "video"})
             prompt = processor.apply_chat_template(chat_prompt, add_generation_prompt=True, tokenize=False)
         else:
-            if image is not None and "<image>" not in text:
-                prompt = "<image>\n" + text
-            else:
-                prompt = text
-            if video is not None and "<video>" not in text:
-                prompt = "<video>\n" + text
-            else:
-                prompt = text
+            prompt = text
+            if image is not None and "<image>" not in prompt:
+                prompt = "<image>\n" + prompt
+            if video is not None and "<video>" not in prompt:
+                prompt = "<video>\n" + prompt
 
         if is_transformers_version(">", "4.47.99") and getattr(processor, "patch_size", None) is None:
             if (
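The hunk above fixes a real bug in the pre-change branch: the second `if/else` re-derived `prompt` from `text`, so its `else` arm clobbered any `"<image>\n"` prefix the first branch had added. The rewrite chains both checks on `prompt`, letting the tags accumulate. A self-contained sketch of the two behaviors, lifted from the hunk:

```python
def old_prompt(text, image=None, video=None):
    # Pre-fix logic: the video branch rebuilds prompt from `text`.
    if image is not None and "<image>" not in text:
        prompt = "<image>\n" + text
    else:
        prompt = text
    if video is not None and "<video>" not in text:
        prompt = "<video>\n" + text
    else:
        prompt = text  # silently discards the image-tagged prompt
    return prompt

def new_prompt(text, image=None, video=None):
    # Post-fix logic: each check extends the running `prompt`.
    prompt = text
    if image is not None and "<image>" not in prompt:
        prompt = "<image>\n" + prompt
    if video is not None and "<video>" not in prompt:
        prompt = "<video>\n" + prompt
    return prompt

print(old_prompt("hi", image=object()))  # "hi" -- the <image> tag is lost
print(new_prompt("hi", image=object()))  # "<image>\nhi"
```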
@@ -1402,7 +1396,7 @@ def preprocess_inputs(
                 "Processor does not have `patch_size` attribute. Please fix the processor or provide `patch_size` in the config."
             )
 
-        inputs = processor(images=image, text=prompt, videos=[video], return_tensors="pt")
+        inputs = processor(images=image, text=prompt, videos=video, return_tensors="pt")
         return inputs
 
     def get_multimodal_embeddings(
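The `videos=[video]` to `videos=video` change here (repeated in the two later `preprocess_inputs` hunks) matters when no video is supplied: wrapping the argument unconditionally hands the processor a `[None]` list, which is not the same as `None`. A hedged sketch of the difference, with `fake_processor` standing in for a transformers processor (an assumption, not the real API surface):

```python
# `[None]` takes the "one video" path; `None` takes the "no video" path.
def fake_processor(videos=None):
    if videos is None:
        return "no-video branch"
    return f"processing {len(videos)} video(s)"

video = None  # caller provided no video
print(fake_processor(videos=[video]))  # "processing 1 video(s)" -- wrong path
print(fake_processor(videos=video))    # "no-video branch"
```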
@@ -1450,7 +1444,6 @@ def get_multimodal_embeddings(
                 pixel_values,
                 attention_mask,
                 position_ids,
-                past_key_values,
                 image_sizes,
                 legacy_processing,
                 **kwargs,
@@ -1463,7 +1456,6 @@ def get_multimodal_embeddings(
                 pixel_values_videos,
                 attention_mask,
                 position_ids,
-                past_key_values,
                 legacy_processing=legacy_processing,
                 **kwargs,
             )
@@ -1480,7 +1472,6 @@ def add_video_features(
         pixel_values_videos,
         attention_mask,
         position_ids,
-        past_key_values,
         legacy_processing,
         **kwargs,
     ):
@@ -2556,7 +2547,7 @@ def preprocess_inputs(
 
         text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
 
-        inputs = processor(images=image, text=text_prompt, videos=[video], return_tensors="pt")
+        inputs = processor(images=image, text=text_prompt, videos=video, return_tensors="pt")
         return inputs
 
 
@@ -2992,7 +2983,7 @@ def preprocess_inputs(
 
         text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
 
-        inputs = processor(images=image, text=text_prompt, videos=[video], return_tensors="pt")
+        inputs = processor(images=image, text=text_prompt, videos=video, return_tensors="pt")
         return inputs
 
     # Copied from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1602