Skip to content

Commit f2e3135

Browse files
committed
add video input support in preprocess_inputs
1 parent 080c874 commit f2e3135

File tree

1 file changed

+90
-42
lines changed

1 file changed

+90
-42
lines changed

optimum/intel/openvino/modeling_visual_language.py

+90-42
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555

5656
if TYPE_CHECKING:
5757
from PIL.Image import Image
58+
from transformers.image_utils import VideoInput
5859

5960

6061
logger = logging.getLogger(__name__)
@@ -839,6 +840,7 @@ def preprocess_inputs(
839840
processor: Optional[AutoImageProcessor] = None,
840841
tokenizer: Optional[PreTrainedTokenizer] = None,
841842
config: Optional[PretrainedConfig] = None,
843+
video: Optional["VideoInput"] = None,
842844
):
843845
"""
844846
Preprocess input instruction and an image.
@@ -1016,9 +1018,12 @@ def preprocess_inputs(
10161018
processor: Optional[AutoImageProcessor] = None,
10171019
tokenizer: Optional[PreTrainedTokenizer] = None,
10181020
config: Optional[PretrainedConfig] = None,
1021+
video: Optional["VideoInput"] = None,
10191022
):
10201023
if processor is None:
10211024
raise ValueError("Processor is required.")
1025+
if video is not None:
1026+
raise ValueError("Video input is not supported")
10221027
if getattr(processor, "chat_template", None) is not None:
10231028
chat_prompt = [{"role": "user", "content": [{"type": "text", "text": text}]}]
10241029
if image is not None:
@@ -1354,6 +1359,48 @@ def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
13541359
image_features = self.multi_modal_projector(image_features)
13551360
return image_features
13561361

1362+
@staticmethod
def preprocess_inputs(
    text: str,
    image: Optional["Image"] = None,
    processor: Optional[AutoImageProcessor] = None,
    tokenizer: Optional[PreTrainedTokenizer] = None,
    config: Optional[PretrainedConfig] = None,
    video: Optional["VideoInput"] = None,
):
    """
    Build model-ready inputs from a text instruction plus an optional image and/or video.

    Args:
        text: user instruction to embed in the prompt.
        image: optional PIL image to attach to the prompt.
        processor: multimodal processor; required — it tokenizes the prompt and
            preprocesses the visual inputs.
        tokenizer: unused here; kept for signature parity with the other
            `preprocess_inputs` implementations in this file.
        config: model config, used as a fallback source for `patch_size` on
            transformers > 4.47.99.
        video: optional video input forwarded to the processor.

    Returns:
        The `BatchFeature`-like mapping produced by `processor(...)` with
        `return_tensors="pt"`.

    Raises:
        ValueError: if `processor` is None, or if `patch_size` can be found on
            neither the processor nor `config.vision_config`.
    """
    if processor is None:
        raise ValueError("Processor is required.")
    if getattr(processor, "chat_template", None) is not None:
        # Let the processor's chat template place the media placeholders.
        chat_prompt = [{"role": "user", "content": [{"type": "text", "text": text}]}]
        if image is not None:
            chat_prompt[0]["content"].append({"type": "image"})
        if video is not None:
            chat_prompt[0]["content"].append({"type": "video"})
        prompt = processor.apply_chat_template(chat_prompt, add_generation_prompt=True, tokenize=False)
    else:
        # No chat template: prepend the raw media tags ourselves. Build the
        # prompt incrementally so that adding the <video> tag (or its absence)
        # never discards a previously added <image> tag — the previous
        # if/else pairs reset `prompt` from `text` and could drop the image tag.
        prompt = text
        if image is not None and "<image>" not in prompt:
            prompt = "<image>\n" + prompt
        if video is not None and "<video>" not in prompt:
            prompt = "<video>\n" + prompt

    # transformers > 4.47.99 requires the processor to know the vision patch
    # size; backfill it from the model config when the processor lacks it.
    if is_transformers_version(">", "4.47.99") and getattr(processor, "patch_size", None) is None:
        if (
            getattr(config, "vision_config", None) is not None
            and getattr(config.vision_config, "patch_size", None) is not None
        ):
            processor.patch_size = config.vision_config.patch_size
        else:
            raise ValueError(
                "Processor does not have `patch_size` attribute. Please fix the processor or provide `patch_size` in the config."
            )

    inputs = processor(images=image, text=prompt, videos=video, return_tensors="pt")
    return inputs
1403+
13571404
def get_multimodal_embeddings(
13581405
self,
13591406
input_ids,
@@ -1511,9 +1558,12 @@ def preprocess_inputs(
15111558
processor: Optional[AutoImageProcessor] = None,
15121559
tokenizer: Optional[PreTrainedTokenizer] = None,
15131560
config: Optional[PretrainedConfig] = None,
1561+
video: Optional["VideoInput"] = None,
15141562
):
15151563
if tokenizer is None:
15161564
raise ValueError("Tokenizer is required.")
1565+
if video is not None:
1566+
raise ValueError("Video input is not supported")
15171567
import torchvision.transforms as T
15181568
from torchvision.transforms.functional import InterpolationMode
15191569

@@ -1886,9 +1936,12 @@ def preprocess_inputs(
18861936
processor: Optional[AutoImageProcessor] = None,
18871937
tokenizer: Optional[PreTrainedTokenizer] = None,
18881938
config: Optional[PretrainedConfig] = None,
1939+
video: Optional["VideoInput"] = None,
18891940
):
18901941
if processor is None:
18911942
raise ValueError("Processor is required.")
1943+
if video is not None:
1944+
raise ValueError("Video input is not supported")
18921945
if getattr(processor, "chat_template", None) is not None:
18931946
messages = [{"role": "user", "content": text if image is None else "(<image>./</image>)\n" + text}]
18941947
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
@@ -2083,9 +2136,12 @@ def preprocess_inputs(
20832136
processor: Optional[AutoImageProcessor] = None,
20842137
tokenizer: Optional[PreTrainedTokenizer] = None,
20852138
config: Optional[PretrainedConfig] = None,
2139+
video: Optional["VideoInput"] = None,
20862140
):
20872141
if tokenizer is None:
20882142
raise ValueError("Tokenizer is required.")
2143+
if video is not None:
2144+
raise ValueError("Video input is not supported")
20892145
if image is not None and processor is None:
20902146
raise ValueError("Processor is required.")
20912147
text = f"<image>\n{text}" if image is not None else text
@@ -2244,9 +2300,12 @@ def preprocess_inputs(
22442300
processor: Optional[AutoImageProcessor] = None,
22452301
tokenizer: Optional[PreTrainedTokenizer] = None,
22462302
config: Optional[PretrainedConfig] = None,
2303+
video: Optional["VideoInput"] = None,
22472304
):
22482305
if processor is None:
22492306
raise ValueError("Processor is required.")
2307+
if video is not None:
2308+
raise ValueError("Video input is not supported")
22502309
if image is not None and "<|image_1|>" not in text:
22512310
text = "<|image_1|>\n" + text
22522311
if getattr(processor.tokenizer, "chat_template", None) is not None:
@@ -2474,33 +2533,26 @@ def preprocess_inputs(
24742533
processor: Optional[AutoImageProcessor] = None,
24752534
tokenizer: Optional[PreTrainedTokenizer] = None,
24762535
config: Optional[PretrainedConfig] = None,
2536+
video: Optional["VideoInput"] = None,
24772537
):
24782538
if processor is None:
24792539
raise ValueError("Processor is required.")
2540+
conversation = [
2541+
{
2542+
"role": "user",
2543+
"content": [
2544+
{"type": "text", "text": text},
2545+
],
2546+
}
2547+
]
24802548
if image is not None:
2481-
conversation = [
2482-
{
2483-
"role": "user",
2484-
"content": [
2485-
{
2486-
"type": "image",
2487-
},
2488-
{"type": "text", "text": text},
2489-
],
2490-
}
2491-
]
2492-
else:
2493-
conversation = [
2494-
{
2495-
"role": "user",
2496-
"content": [
2497-
{"type": "text", "text": text},
2498-
],
2499-
}
2500-
]
2549+
conversation[0]["content"].insert(0, {"type": "image"})
2550+
if video is not None:
2551+
conversation[0]["content"].insert(0, {"type": "video"})
2552+
25012553
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
25022554

2503-
inputs = processor(images=image, text=text_prompt, return_tensors="pt")
2555+
inputs = processor(images=image, text=text_prompt, videos=video, return_tensors="pt")
25042556
return inputs
25052557

25062558

@@ -2917,33 +2969,26 @@ def preprocess_inputs(
29172969
processor: Optional[AutoImageProcessor] = None,
29182970
tokenizer: Optional[PreTrainedTokenizer] = None,
29192971
config: Optional[PretrainedConfig] = None,
2972+
video: Optional["VideoInput"] = None,
29202973
):
29212974
if processor is None:
29222975
raise ValueError("Processor is required.")
2976+
conversation = [
2977+
{
2978+
"role": "user",
2979+
"content": [
2980+
{"type": "text", "text": text},
2981+
],
2982+
}
2983+
]
29232984
if image is not None:
2924-
conversation = [
2925-
{
2926-
"role": "user",
2927-
"content": [
2928-
{
2929-
"type": "image",
2930-
},
2931-
{"type": "text", "text": text},
2932-
],
2933-
}
2934-
]
2935-
else:
2936-
conversation = [
2937-
{
2938-
"role": "user",
2939-
"content": [
2940-
{"type": "text", "text": text},
2941-
],
2942-
}
2943-
]
2985+
conversation[0]["content"].insert(0, {"type": "image"})
2986+
if video is not None:
2987+
conversation[0]["content"].insert(0, {"type": "video"})
2988+
29442989
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
29452990

2946-
inputs = processor(images=image, text=text_prompt, return_tensors="pt")
2991+
inputs = processor(images=image, text=text_prompt, videos=video, return_tensors="pt")
29472992
return inputs
29482993

29492994
# Copied from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1602
@@ -2975,9 +3020,12 @@ def preprocess_inputs(
29753020
processor: Optional[AutoImageProcessor] = None,
29763021
tokenizer: Optional[PreTrainedTokenizer] = None,
29773022
config: Optional[PretrainedConfig] = None,
3023+
video: Optional["VideoInput"] = None,
29783024
):
29793025
if processor is None:
29803026
raise ValueError("processor is required")
3027+
if video is not None:
3028+
raise ValueError("Video input is not supported")
29813029
if image is None:
29823030
return processor(text=text, return_tensors="pt")
29833031
processed_inputs = processor.format_and_preprocess_phrase_grounding_input(

0 commit comments

Comments
 (0)