@@ -1353,7 +1353,6 @@ def get_text_embeddings(self, input_ids, **kwargs):
 
 class _OVLlavaNextVideoForCausalLM(_OVLlavaNextForCausalLM):
     additional_parts = ["vision_resampler", "multi_modal_projector"]
-    export_feature = "video-text-to-text"
     auto_model_class = AutoModelForVision2Seq
 
     def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
@@ -1382,14 +1381,11 @@ def preprocess_inputs(
                 chat_prompt[0]["content"].append({"type": "video"})
             prompt = processor.apply_chat_template(chat_prompt, add_generation_prompt=True, tokenize=False)
         else:
-            if image is not None and "<image>" not in text:
-                prompt = "<image>\n" + text
-            else:
-                prompt = text
-            if video is not None and "<video>" not in text:
-                prompt = "<video>\n" + text
-            else:
-                prompt = text
+            prompt = text
+            if image is not None and "<image>" not in prompt:
+                prompt = "<image>\n" + prompt
+            if video is not None and "<video>" not in prompt:
+                prompt = "<video>\n" + prompt
 
         if is_transformers_version(">", "4.47.99") and getattr(processor, "patch_size", None) is None:
             if (
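The rewritten branch fixes an overwrite bug: in the old code each `if`/`else` pair assigned `prompt` from `text` independently, so whichever pair ran second discarded the tag the first one added (and when a `<video>` tag was already present, the final `else` reset `prompt` to the bare text). Threading `prompt` through both checks makes the tags accumulate. A minimal standalone sketch of the corrected logic, with an assert illustrating the case the old code got wrong (the function name and stub inputs are illustrative, not from the diff):

```python
def tag_prompt(text, image=None, video=None):
    # Start from the raw text and prepend a media placeholder only when the
    # corresponding input is present and the tag is not already in the prompt.
    prompt = text
    if image is not None and "<image>" not in prompt:
        prompt = "<image>\n" + prompt
    if video is not None and "<video>" not in prompt:
        prompt = "<video>\n" + prompt
    return prompt

# Both tags survive when both modalities are passed; the old version lost the
# "<image>" tag here because its second pair rebuilt the prompt from the bare text.
assert tag_prompt("Describe this.", image=object(), video=object()) == "<video>\n<image>\nDescribe this."
```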
@@ -1402,7 +1398,7 @@ def preprocess_inputs(
                     "Processor does not have `patch_size` attribute. Please fix the processor or provide `patch_size` in the config."
                 )
 
-        inputs = processor(images=image, text=prompt, videos=[video], return_tensors="pt")
+        inputs = processor(images=image, text=prompt, videos=video, return_tensors="pt")
         return inputs
 
     def get_multimodal_embeddings(
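`videos=[video]` wrapped the caller's value in an extra list before handing it to the processor. Processors already accept a list of videos (or a single video) for `videos`, so the wrapper added a spurious nesting level, and in the image-only case it turned `video=None` into `[None]`, which is not treated as "no video". Passing `video` through unchanged defers batching and `None` handling to the processor. A toy illustration of the `None` case, using a stand-in for the processor (assumed behavior, not the real `transformers` API):

```python
def fake_processor(videos=None):
    # Stand-in: real processors skip video features entirely when videos is None.
    return "no video inputs" if videos is None else f"{len(videos)} video(s) received"

video = None  # image-only request reaching preprocess_inputs

print(fake_processor(videos=[video]))  # old call: "1 video(s) received" -- [None] looks like one video
print(fake_processor(videos=video))    # new call: "no video inputs"
```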
@@ -2556,7 +2552,7 @@ def preprocess_inputs(
 
         text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
 
-        inputs = processor(images=image, text=text_prompt, videos=[video], return_tensors="pt")
+        inputs = processor(images=image, text=text_prompt, videos=video, return_tensors="pt")
         return inputs
 
@@ -2992,7 +2988,7 @@ def preprocess_inputs(
 
         text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
 
-        inputs = processor(images=image, text=text_prompt, videos=[video], return_tensors="pt")
+        inputs = processor(images=image, text=text_prompt, videos=video, return_tensors="pt")
         return inputs
 
     # Copied from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1602