55 | 55 |
56 | 56 | if TYPE_CHECKING:
57 | 57 |     from PIL.Image import Image
| 58 | +     from transformers.image_utils import VideoInput
58 | 59 |
59 | 60 |
60 | 61 | logger = logging.getLogger(__name__)
@@ -839,6 +840,7 @@ def preprocess_inputs(
839 | 840 |         processor: Optional[AutoImageProcessor] = None,
840 | 841 |         tokenizer: Optional[PreTrainedTokenizer] = None,
841 | 842 |         config: Optional[PretrainedConfig] = None,
| 843 | +         video: Optional["VideoInput"] = None,
842 | 844 |     ):
843 | 845 |         """
844 | 846 |         Preprocess input instruction and an image.
@@ -1016,9 +1018,12 @@ def preprocess_inputs(
1016 | 1018 |         processor: Optional[AutoImageProcessor] = None,
1017 | 1019 |         tokenizer: Optional[PreTrainedTokenizer] = None,
1018 | 1020 |         config: Optional[PretrainedConfig] = None,
| 1021 | +         video: Optional["VideoInput"] = None,
1019 | 1022 |     ):
1020 | 1023 |         if processor is None:
1021 | 1024 |             raise ValueError("Processor is required.")
| 1025 | +         if video is not None:
| 1026 | +             raise ValueError("Video input is not supported")
1022 | 1027 |         if getattr(processor, "chat_template", None) is not None:
1023 | 1028 |             chat_prompt = [{"role": "user", "content": [{"type": "text", "text": text}]}]
1024 | 1029 |             if image is not None:
@@ -1354,6 +1359,48 @@ def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
1354 | 1359 |         image_features = self.multi_modal_projector(image_features)
1355 | 1360 |         return image_features
1356 | 1361 |
| 1362 | +     @staticmethod
| 1363 | +     def preprocess_inputs(
| 1364 | +         text: str,
| 1365 | +         image: Optional["Image"] = None,
| 1366 | +         processor: Optional[AutoImageProcessor] = None,
| 1367 | +         tokenizer: Optional[PreTrainedTokenizer] = None,
| 1368 | +         config: Optional[PretrainedConfig] = None,
| 1369 | +         video: Optional["VideoInput"] = None,
| 1370 | +     ):
| 1371 | +         if processor is None:
| 1372 | +             raise ValueError("Processor is required.")
| 1373 | +         if getattr(processor, "chat_template", None) is not None:
| 1374 | +             chat_prompt = [{"role": "user", "content": [{"type": "text", "text": text}]}]
| 1375 | +             if image is not None:
| 1376 | +                 chat_prompt[0]["content"].append({"type": "image"})
| 1377 | +             if video is not None:
| 1378 | +                 chat_prompt[0]["content"].append({"type": "video"})
| 1379 | +             prompt = processor.apply_chat_template(chat_prompt, add_generation_prompt=True, tokenize=False)
| 1380 | +         else:
| 1381 | +             if image is not None and "<image>" not in text:
| 1382 | +                 prompt = "<image>\n" + text
| 1383 | +             else:
| 1384 | +                 prompt = text
| 1385 | +             if video is not None and "<video>" not in text:
| 1386 | +                 # prepend the video tag to the prompt built above so an
| 1387 | +                 # image tag, if present, is not discarded
| 1388 | +                 prompt = "<video>\n" + prompt
| 1389 | +
| 1390 | +         if is_transformers_version(">", "4.47.99") and getattr(processor, "patch_size", None) is None:
| 1391 | +             if (
| 1392 | +                 getattr(config, "vision_config", None) is not None
| 1393 | +                 and getattr(config.vision_config, "patch_size", None) is not None
| 1394 | +             ):
| 1395 | +                 processor.patch_size = config.vision_config.patch_size
| 1396 | +             else:
| 1397 | +                 raise ValueError(
| 1398 | +                     "Processor does not have `patch_size` attribute. Please fix the processor or provide `patch_size` in the config."
| 1399 | +                 )
| 1400 | +
| 1401 | +         inputs = processor(images=image, text=prompt, videos=video, return_tensors="pt")
| 1402 | +         return inputs
| 1403 | +
1357 | 1404 |     def get_multimodal_embeddings(
1358 | 1405 |         self,
1359 | 1406 |         input_ids,
@@ -1511,9 +1558,12 @@ def preprocess_inputs(
1511 | 1558 |         processor: Optional[AutoImageProcessor] = None,
1512 | 1559 |         tokenizer: Optional[PreTrainedTokenizer] = None,
1513 | 1560 |         config: Optional[PretrainedConfig] = None,
| 1561 | +         video: Optional["VideoInput"] = None,
1514 | 1562 |     ):
1515 | 1563 |         if tokenizer is None:
1516 | 1564 |             raise ValueError("Tokenizer is required.")
| 1565 | +         if video is not None:
| 1566 | +             raise ValueError("Video input is not supported")
1517 | 1567 |         import torchvision.transforms as T
1518 | 1568 |         from torchvision.transforms.functional import InterpolationMode
1519 | 1569 |
@@ -1886,9 +1936,12 @@ def preprocess_inputs(
1886 | 1936 |         processor: Optional[AutoImageProcessor] = None,
1887 | 1937 |         tokenizer: Optional[PreTrainedTokenizer] = None,
1888 | 1938 |         config: Optional[PretrainedConfig] = None,
| 1939 | +         video: Optional["VideoInput"] = None,
1889 | 1940 |     ):
1890 | 1941 |         if processor is None:
1891 | 1942 |             raise ValueError("Processor is required.")
| 1943 | +         if video is not None:
| 1944 | +             raise ValueError("Video input is not supported")
1892 | 1945 |         if getattr(processor, "chat_template", None) is not None:
1893 | 1946 |             messages = [{"role": "user", "content": text if image is None else "(<image>./</image>)\n" + text}]
1894 | 1947 |             prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
@@ -2083,9 +2136,12 @@ def preprocess_inputs(
2083 | 2136 |         processor: Optional[AutoImageProcessor] = None,
2084 | 2137 |         tokenizer: Optional[PreTrainedTokenizer] = None,
2085 | 2138 |         config: Optional[PretrainedConfig] = None,
| 2139 | +         video: Optional["VideoInput"] = None,
2086 | 2140 |     ):
2087 | 2141 |         if tokenizer is None:
2088 | 2142 |             raise ValueError("Tokenizer is required.")
| 2143 | +         if video is not None:
| 2144 | +             raise ValueError("Video input is not supported")
2089 | 2145 |         if image is not None and processor is None:
2090 | 2146 |             raise ValueError("Processor is required.")
2091 | 2147 |         text = f"<image>\n{text}" if image is not None else text
@@ -2244,9 +2300,12 @@ def preprocess_inputs(
2244 | 2300 |         processor: Optional[AutoImageProcessor] = None,
2245 | 2301 |         tokenizer: Optional[PreTrainedTokenizer] = None,
2246 | 2302 |         config: Optional[PretrainedConfig] = None,
| 2303 | +         video: Optional["VideoInput"] = None,
2247 | 2304 |     ):
2248 | 2305 |         if processor is None:
2249 | 2306 |             raise ValueError("Processor is required.")
| 2307 | +         if video is not None:
| 2308 | +             raise ValueError("Video input is not supported")
2250 | 2309 |         if image is not None and "<|image_1|>" not in text:
2251 | 2310 |             text = "<|image_1|>\n" + text
2252 | 2311 |         if getattr(processor.tokenizer, "chat_template", None) is not None:
@@ -2474,33 +2533,26 @@ def preprocess_inputs(
2474 | 2533 |         processor: Optional[AutoImageProcessor] = None,
2475 | 2534 |         tokenizer: Optional[PreTrainedTokenizer] = None,
2476 | 2535 |         config: Optional[PretrainedConfig] = None,
| 2536 | +         video: Optional["VideoInput"] = None,
2477 | 2537 |     ):
2478 | 2538 |         if processor is None:
2479 | 2539 |             raise ValueError("Processor is required.")
| 2540 | +         conversation = [
| 2541 | +             {
| 2542 | +                 "role": "user",
| 2543 | +                 "content": [
| 2544 | +                     {"type": "text", "text": text},
| 2545 | +                 ],
| 2546 | +             }
| 2547 | +         ]
2480 | 2548 |         if image is not None:
2481 | | -             conversation = [
2482 | | -                 {
2483 | | -                     "role": "user",
2484 | | -                     "content": [
2485 | | -                         {
2486 | | -                             "type": "image",
2487 | | -                         },
2488 | | -                         {"type": "text", "text": text},
2489 | | -                     ],
2490 | | -                 }
2491 | | -             ]
2492 | | -         else:
2493 | | -             conversation = [
2494 | | -                 {
2495 | | -                     "role": "user",
2496 | | -                     "content": [
2497 | | -                         {"type": "text", "text": text},
2498 | | -                     ],
2499 | | -                 }
2500 | | -             ]
| 2549 | +             conversation[0]["content"].insert(0, {"type": "image"})
| 2550 | +         if video is not None:
| 2551 | +             conversation[0]["content"].insert(0, {"type": "video"})
| 2552 | +
2501 | 2553 |         text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
2502 | 2554 |
2503 | | -         inputs = processor(images=image, text=text_prompt, return_tensors="pt")
| 2555 | +         inputs = processor(images=image, text=text_prompt, videos=video, return_tensors="pt")
2504 | 2556 |         return inputs
2505 | 2557 |
2506 | 2558 |
@@ -2917,33 +2969,26 @@ def preprocess_inputs(
2917 | 2969 |         processor: Optional[AutoImageProcessor] = None,
2918 | 2970 |         tokenizer: Optional[PreTrainedTokenizer] = None,
2919 | 2971 |         config: Optional[PretrainedConfig] = None,
| 2972 | +         video: Optional["VideoInput"] = None,
2920 | 2973 |     ):
2921 | 2974 |         if processor is None:
2922 | 2975 |             raise ValueError("Processor is required.")
| 2976 | +         conversation = [
| 2977 | +             {
| 2978 | +                 "role": "user",
| 2979 | +                 "content": [
| 2980 | +                     {"type": "text", "text": text},
| 2981 | +                 ],
| 2982 | +             }
| 2983 | +         ]
2923 | 2984 |         if image is not None:
2924 | | -             conversation = [
2925 | | -                 {
2926 | | -                     "role": "user",
2927 | | -                     "content": [
2928 | | -                         {
2929 | | -                             "type": "image",
2930 | | -                         },
2931 | | -                         {"type": "text", "text": text},
2932 | | -                     ],
2933 | | -                 }
2934 | | -             ]
2935 | | -         else:
2936 | | -             conversation = [
2937 | | -                 {
2938 | | -                     "role": "user",
2939 | | -                     "content": [
2940 | | -                         {"type": "text", "text": text},
2941 | | -                     ],
2942 | | -                 }
2943 | | -             ]
| 2985 | +             conversation[0]["content"].insert(0, {"type": "image"})
| 2986 | +         if video is not None:
| 2987 | +             conversation[0]["content"].insert(0, {"type": "video"})
| 2988 | +
2944 | 2989 |         text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
2945 | 2990 |
2946 | | -         inputs = processor(images=image, text=text_prompt, return_tensors="pt")
| 2991 | +         inputs = processor(images=image, text=text_prompt, videos=video, return_tensors="pt")
2947 | 2992 |         return inputs
2948 | 2993 |
2949 | 2994 |     # Copied from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1602
@@ -2975,9 +3020,12 @@ def preprocess_inputs(
2975 | 3020 |         processor: Optional[AutoImageProcessor] = None,
2976 | 3021 |         tokenizer: Optional[PreTrainedTokenizer] = None,
2977 | 3022 |         config: Optional[PretrainedConfig] = None,
| 3023 | +         video: Optional["VideoInput"] = None,
2978 | 3024 |     ):
2979 | 3025 |         if processor is None:
2980 | 3026 |             raise ValueError("processor is required")
| 3027 | +         if video is not None:
| 3028 | +             raise ValueError("Video input is not supported")
2981 | 3029 |         if image is None:
2982 | 3030 |             return processor(text=text, return_tensors="pt")
2983 | 3031 |         processed_inputs = processor.format_and_preprocess_phrase_grounding_input(