@@ -1204,9 +1204,10 @@ def merge_vision_text_embeddings(
         attention_mask,
         position_ids=None,
         legacy_processing=False,
+        image_token_index=None,
         **kwargs,
     ):
-        image_token_index = self.config.image_token_index
+        image_token_index = self.config.image_token_index if image_token_index is None else image_token_index
         image_features = torch.from_numpy(vision_embeds) if isinstance(vision_embeds, np.ndarray) else vision_embeds
         inputs_embeds = torch.from_numpy(inputs_embeds) if isinstance(inputs_embeds, np.ndarray) else inputs_embeds
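Note: the hunk above makes the placeholder token index an optional argument that falls back to the previous config-based behaviour, so the same merge routine can also be driven with `config.video_token_index`. A minimal sketch of the fallback pattern, written as a hypothetical standalone helper (`resolve_token_index` does not exist in the codebase; the real method does this inline):

    def resolve_token_index(config, token_index=None):
        # Default to the image placeholder id from the model config; callers that
        # merge video embeddings can pass config.video_token_index instead.
        return config.image_token_index if token_index is None else token_index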
@@ -1235,7 +1236,7 @@ def merge_vision_text_embeddings(
             # Whether to turn off right padding
             # 1. Create a mask to know where special image tokens are
-            special_image_token_mask = input_ids == image_token_index
+            special_image_token_mask = torch.tensor(input_ids == image_token_index)
             # special_image_token_mask: [bsz, seqlen]
             num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)
             # num_special_image_tokens: [bsz]
@@ -1328,7 +1329,7 @@ def merge_vision_text_embeddings(
             final_attention_mask |= image_to_overwrite
             position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1)
         else:
-            special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
+            special_image_mask = torch.tensor((input_ids == image_token_index)).unsqueeze(-1).expand_as(inputs_embeds)
             image_features = image_features.to(inputs_embeds.dtype)
             final_embedding = inputs_embeds.masked_scatter(special_image_mask, image_features)
             final_attention_mask = attention_mask
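Note: both hunks above wrap the token comparison in `torch.tensor(...)`, and the second one also switches from `self.config.image_token_index` to the resolved `image_token_index`. The wrapping is presumably needed because `input_ids` can arrive as a NumPy array on this inference path, in which case the comparison yields a NumPy boolean array that the downstream tensor ops (`torch.sum`, `unsqueeze`, `masked_scatter`) would reject. A minimal sketch, assuming NumPy `input_ids` and an illustrative placeholder id of 32000:

    import numpy as np
    import torch

    input_ids = np.array([[1, 32000, 32000, 5]])
    image_token_index = 32000  # illustrative id; real models take it from the config

    # NumPy comparison -> NumPy bool array; torch.tensor() turns it into a mask tensor
    special_image_token_mask = torch.tensor(input_ids == image_token_index)
    num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)  # tensor([2])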
@@ -1432,28 +1433,43 @@ def add_video_features(
         legacy_processing,
         **kwargs,
     ):
+        # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/llava_next_video/modeling_llava_next_video.py#L732-L751
         video_features = self.get_video_features(pixel_values_videos, input_ids)
         if video_features is not None:
             if legacy_processing:
-                raise ValueError("Video processing supported only for transformers>=4.45 preprocessing.")
-            inputs_embeds = torch.from_numpy(inputs_embeds) if isinstance(inputs_embeds, np.ndarray) else inputs_embeds
-            video_features = [feature.flatten(0, 1) for feature in video_features]
-            video_feature_lens = [feature.size(0) for feature in video_features]
-            video_features = torch.cat(video_features, dim=0)
-            video_feature_lens = torch.tensor(video_feature_lens, dtype=torch.long, device=video_features.device)
-
-            special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1)
-            special_image_mask = special_image_mask.expand_as(inputs_embeds)
-            if inputs_embeds[special_image_mask].numel() != video_features.numel():
-                n_video_tokens = (input_ids == self.config.video_token_index).sum().item()
-                n_video_features = video_features.shape[0]
-                raise ValueError(
-                    f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
+                video_feature_lens = [feature.size(0) for feature in video_features]
+                inputs_embeds, attention_mask, position_ids = self.merge_vision_text_embeddings(
+                    video_features,
+                    inputs_embeds,
+                    video_feature_lens,
+                    input_ids,
+                    attention_mask,
+                    position_ids,
+                    legacy_processing,
+                    self.config.video_token_index,
                 )
-            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features)
+            else:
+                inputs_embeds = (
+                    torch.from_numpy(inputs_embeds) if isinstance(inputs_embeds, np.ndarray) else inputs_embeds
+                )
+                video_features = [feature.flatten(0, 1) for feature in video_features]
+                video_feature_lens = [feature.size(0) for feature in video_features]
+                video_features = torch.cat(video_features, dim=0)
+                video_feature_lens = torch.tensor(video_feature_lens, dtype=torch.long, device=video_features.device)
+
+                special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1)
+                special_image_mask = special_image_mask.expand_as(inputs_embeds)
+                if inputs_embeds[special_image_mask].numel() != video_features.numel():
+                    n_video_tokens = (input_ids == self.config.video_token_index).sum().item()
+                    n_video_features = video_features.shape[0]
+                    raise ValueError(
+                        f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
+                    )
+                inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features)
         return inputs_embeds, attention_mask, position_ids
 
     def get_video_features(self, pixel_values, input_ids=None, **kwargs):
+        # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/llava_next_video/modeling_llava_next_video.py#L835
         if input_ids is not None and input_ids.shape[1] == 1:
             return None
         batch_size, frames, channels, height, width = pixel_values.shape
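Note: after this change, the legacy branch of `add_video_features` delegates to `merge_vision_text_embeddings` with `self.config.video_token_index` as the new trailing argument instead of raising, while the transformers>=4.45 path keeps the masked-scatter merge. A toy illustration of that masked-scatter mechanism (shapes and the token id 7 are invented for brevity):

    import torch

    hidden = 4
    inputs_embeds = torch.zeros(1, 6, hidden)        # [bsz, seqlen, hidden]
    input_ids = torch.tensor([[1, 7, 7, 7, 2, 3]])   # 7 = hypothetical video placeholder id
    video_features = torch.ones(3, hidden)           # one embedding row per placeholder

    mask = (input_ids == 7).unsqueeze(-1).expand_as(inputs_embeds)
    merged = inputs_embeds.masked_scatter(mask, video_features)
    # positions 1..3 of `merged` now hold the video features; all other rows are unchanged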