
Commit 080c874

add docs and tests

1 parent 7ca2a02 · commit 080c874


6 files changed: +61 −30 lines changed


docs/source/openvino/models.mdx (+1)

@@ -74,6 +74,7 @@ Here is the list of the supported architectures :
 - Llama
 - Llava
 - Llava-Next
+- Llava-Next-Video
 - M2-M100
 - MAIRA-2
 - MBart
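
Once a build containing this commit is installed, the newly documented architecture can be exercised end to end. A minimal sketch, assuming the OpenVINO extra of optimum-intel is available; the checkpoint id is the tiny test model registered in tests/openvino/utils_tests.py below and is used here purely for illustration:

# Hedged sketch: load a Llava-Next-Video checkpoint through the OpenVINO
# visual-language wrapper. export=True converts the transformers weights to
# OpenVINO IR on the fly; swap in a full-size model for real workloads.
from optimum.intel import OVModelForVisualCausalLM

model_id = "katuni4ka/tiny-random-llava-next-video"  # tiny test checkpoint from utils_tests.py
ov_model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True)
ov_model.save_pretrained("llava-next-video-ov")  # reusable OpenVINO IR artifacts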

optimum/exporters/openvino/model_configs.py (+5)

@@ -138,6 +138,11 @@ def init_model_configs():
         "AutoModelForImageTextToText",
     )
 
+    TasksManager._CUSTOM_CLASSES[("pt", "llava-next-video", "image-text-to-text")] = (
+        "transformers",
+        "AutoModelForVision2Seq",
+    )
+
     TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS[
         "image-text-to-text"
     ] = TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["text-generation"]
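
For reference, the registration above is a plain dictionary entry keyed by (framework, model type, task). A minimal sketch of how the mapping could be inspected after init_model_configs() has run; note that _CUSTOM_CLASSES is an internal attribute, so this is a debugging aid rather than a supported API:

# Hedged sketch: confirm which loader class the exporter will use for
# llava-next-video after the registration above has been applied.
from optimum.exporters.openvino.model_configs import init_model_configs
from optimum.exporters.tasks import TasksManager

init_model_configs()  # populates the task registry, including the entry added in this commit
print(TasksManager._CUSTOM_CLASSES[("pt", "llava-next-video", "image-text-to-text")])
# expected: ("transformers", "AutoModelForVision2Seq")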

optimum/exporters/openvino/model_patcher.py (+1 −2)

@@ -3112,11 +3112,10 @@ def llava_vision_embed_forward(self, pixel_values):
 
 
 def llava_next_video_vision_embed_forward(self, pixel_values):
-    # copied from https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/models/llava/modeling_llava.py#L428-L441
+    # copied from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/llava_next_video/modeling_llava_next_video.py#L519
     # these changes does not bring any difference from original, it only packs model subcomponent inference together
     # that allow us avoid memory overheads and their inference results handling on code-level
     image_features = self.vision_tower(pixel_values, output_hidden_states=True)
-    # this is not memory efficient at all (output_hidden_states=True) will save all the hidden stated.
     vision_feature_layer = self.config.vision_feature_layer
     if isinstance(vision_feature_layer, int):
         selected_image_feature = image_features.hidden_states[vision_feature_layer]

optimum/intel/openvino/modeling_visual_language.py (+34 −18)

@@ -1204,9 +1204,10 @@ def merge_vision_text_embeddings(
         attention_mask,
         position_ids=None,
         legacy_processing=False,
+        image_token_index=None,
         **kwargs,
     ):
-        image_token_index = self.config.image_token_index
+        image_token_index = self.config.image_token_index if image_token_index is None else image_token_index
         image_features = torch.from_numpy(vision_embeds) if isinstance(vision_embeds, np.ndarray) else vision_embeds
         inputs_embeds = torch.from_numpy(inputs_embeds) if isinstance(inputs_embeds, np.ndarray) else inputs_embeds
 
@@ -1235,7 +1236,7 @@ def merge_vision_text_embeddings(
 
             # Whether to turn off right padding
             # 1. Create a mask to know where special image tokens are
-            special_image_token_mask = input_ids == image_token_index
+            special_image_token_mask = torch.tensor(input_ids == image_token_index)
             # special_image_token_mask: [bsz, seqlen]
             num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)
             # num_special_image_tokens: [bsz]
@@ -1328,7 +1329,7 @@ def merge_vision_text_embeddings(
             final_attention_mask |= image_to_overwrite
             position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1)
         else:
-            special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
+            special_image_mask = torch.tensor((input_ids == image_token_index)).unsqueeze(-1).expand_as(inputs_embeds)
             image_features = image_features.to(inputs_embeds.dtype)
             final_embedding = inputs_embeds.masked_scatter(special_image_mask, image_features)
             final_attention_mask = attention_mask
@@ -1432,28 +1433,43 @@ def add_video_features(
         legacy_processing,
         **kwargs,
     ):
+        # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/llava_next_video/modeling_llava_next_video.py#L732-L751
         video_features = self.get_video_features(pixel_values_videos, input_ids)
         if video_features is not None:
             if legacy_processing:
-                raise ValueError("Video processing supported only for transformers>=4.45 preprocessing.")
-            inputs_embeds = torch.from_numpy(inputs_embeds) if isinstance(inputs_embeds, np.ndarray) else inputs_embeds
-            video_features = [feature.flatten(0, 1) for feature in video_features]
-            video_feature_lens = [feature.size(0) for feature in video_features]
-            video_features = torch.cat(video_features, dim=0)
-            video_feature_lens = torch.tensor(video_feature_lens, dtype=torch.long, device=video_features.device)
-
-            special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1)
-            special_image_mask = special_image_mask.expand_as(inputs_embeds)
-            if inputs_embeds[special_image_mask].numel() != video_features.numel():
-                n_video_tokens = (input_ids == self.config.video_token_index).sum().item()
-                n_video_features = video_features.shape[0]
-                raise ValueError(
-                    f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
+                video_feature_lens = [feature.size(0) for feature in video_features]
+                inputs_embeds, attention_mask, position_ids = self.merge_vision_text_embeddings(
+                    video_features,
+                    inputs_embeds,
+                    video_feature_lens,
+                    input_ids,
+                    attention_mask,
+                    position_ids,
+                    legacy_processing,
+                    self.config.video_token_index,
                 )
-            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features)
+            else:
+                inputs_embeds = (
+                    torch.from_numpy(inputs_embeds) if isinstance(inputs_embeds, np.ndarray) else inputs_embeds
+                )
+                video_features = [feature.flatten(0, 1) for feature in video_features]
+                video_feature_lens = [feature.size(0) for feature in video_features]
+                video_features = torch.cat(video_features, dim=0)
+                video_feature_lens = torch.tensor(video_feature_lens, dtype=torch.long, device=video_features.device)
+
+                special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1)
+                special_image_mask = special_image_mask.expand_as(inputs_embeds)
+                if inputs_embeds[special_image_mask].numel() != video_features.numel():
+                    n_video_tokens = (input_ids == self.config.video_token_index).sum().item()
+                    n_video_features = video_features.shape[0]
+                    raise ValueError(
+                        f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
+                    )
+                inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features)
         return inputs_embeds, attention_mask, position_ids
 
     def get_video_features(self, pixel_values, input_ids=None, **kwargs):
+        # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/llava_next_video/modeling_llava_next_video.py#L835
         if input_ids is not None and input_ids.shape[1] == 1:
             return None
         batch_size, frames, channels, height, width = pixel_values.shape
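
To make the non-legacy merge above easier to follow in isolation, here is a self-contained sketch of the masked_scatter step with toy tensors; the shapes, the placeholder video_token_index value, and the zero-valued text embeddings are invented for illustration:

# Hedged sketch of the video-token merge: expand a boolean mask over the video
# placeholder tokens to the embedding width, then scatter the flattened video
# features into those positions.
import torch

hidden_size = 4
video_token_index = 32000  # arbitrary placeholder id, not a real config value
input_ids = torch.tensor([[1, 32000, 32000, 2]])  # two video placeholder tokens
inputs_embeds = torch.zeros(1, 4, hidden_size)  # text embeddings, zeroed for clarity
video_features = torch.arange(2 * hidden_size, dtype=torch.float32).reshape(2, hidden_size)

special_image_mask = (input_ids == video_token_index).unsqueeze(-1).expand_as(inputs_embeds)
assert inputs_embeds[special_image_mask].numel() == video_features.numel()
merged = inputs_embeds.masked_scatter(special_image_mask, video_features)
print(merged[0, 1])  # tensor([0., 1., 2., 3.]) -> first video feature row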

tests/openvino/test_modeling.py (+19 −10)

@@ -2127,16 +2127,20 @@ def test_compare_with_and_without_past_key_values(self):
 class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES = ["llava"]
 
-    if is_transformers_version(">=", "4.40.0"):
-        SUPPORTED_ARCHITECTURES += ["llava_next", "nanollava"]
-    if is_transformers_version(">=", "4.45.0"):
-        SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2", "phi3_v", "qwen2_vl"]
+    # if is_transformers_version(">=", "4.40.0"):
+    #     SUPPORTED_ARCHITECTURES += ["llava_next", "nanollava"]
 
-    if is_transformers_version(">=", "4.46.0"):
-        SUPPORTED_ARCHITECTURES += ["maira2"]
+    if is_transformers_version(">=", "4.42.0"):
+        SUPPORTED_ARCHITECTURES += ["llava_next_video"]
+
+    # if is_transformers_version(">=", "4.45.0"):
+    #     SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2", "phi3_v", "qwen2_vl"]
 
-    if is_transformers_version(">=", "4.49.0"):
-        SUPPORTED_ARCHITECTURES += ["qwen2_5_vl"]
+    # if is_transformers_version(">=", "4.46.0"):
+    #     SUPPORTED_ARCHITECTURES += ["maira2"]
+
+    # if is_transformers_version(">=", "4.49.0"):
+    #     SUPPORTED_ARCHITECTURES += ["qwen2_5_vl"]
     TASK = "image-text-to-text"
     REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v", "maira2"]
 
@@ -2148,11 +2152,16 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
     )
 
     def get_transformer_model_class(self, model_arch):
+        print(model_arch)
         if is_transformers_version(">=", "4.46") and model_arch in ["llava", "llava_next", "qwen2_vl", "qwen2_5_vl"]:
             from transformers import AutoModelForImageTextToText
 
             return AutoModelForImageTextToText
-        if model_arch in "llava":
+        if model_arch == "llava_next_video":
+            from transformers import AutoModelForVision2Seq
+
+            return AutoModelForVision2Seq
+        if model_arch == "llava":
             from transformers import LlavaForConditionalGeneration
 
             return LlavaForConditionalGeneration
@@ -2259,7 +2268,7 @@ def test_compare_to_transformers(self, model_arch):
 
         gc.collect()
 
-    @parameterized.expand(["llava", "llava_next"])
+    @parameterized.expand(["llava", "llava_next", "llava_next_video"])
     @unittest.skipIf(
         is_transformers_version("<", "4.45.0"), reason="New preprocessing available only in transformers >= 4.45"
     )
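
To run just the new llava_next_video cases locally, the tests can be selected by keyword; a hedged sketch using pytest's Python entry point (the -k expression simply matches the parameterized test ids):

# Hedged sketch: run only the llava_next_video integration tests added above.
import pytest

pytest.main(["tests/openvino/test_modeling.py", "-k", "llava_next_video"])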

tests/openvino/utils_tests.py (+1)

@@ -88,6 +88,7 @@
     "llama_awq": "HuggingFaceH4/tiny-random-LlamaForCausalLM",
     "llava": "katuni4ka/tiny-random-llava",
     "llava_next": "katuni4ka/tiny-random-llava-next",
+    "llava_next_video": "katuni4ka/tiny-random-llava-next-video",
     "m2m_100": "hf-internal-testing/tiny-random-m2m_100",
     "opt": "hf-internal-testing/tiny-random-OPTModel",
     "opt125m": "facebook/opt-125m",
