Commit c08b95d

fix video processing, add quantization tests
Parent: 3131396
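
In short: in `add_video_features`, the video-feature flattening and concatenation are hoisted above the `legacy_processing` branch and guarded against empty feature lists, so `video_features` and `video_feature_lens` are available to both code paths; llava_next_video is also added to the 4-bit weight-compression and auto-compression test scopes for transformers >= 4.42.0.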

4 files changed (+44, -7 lines)

optimum/intel/openvino/modeling_visual_language.py (+6 -7)

@@ -1551,9 +1551,13 @@ def add_video_features(
     ):
         # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/llava_next_video/modeling_llava_next_video.py#L732-L751
         video_features = self.get_video_features(pixel_values_videos, input_ids)
-        if video_features is not None:
+        if video_features is not None and len(video_features) != 0:
+            video_features = [feature.flatten(0, 1) for feature in video_features]
+            video_feature_lens = [feature.size(0) for feature in video_features]
+            video_features = torch.cat(video_features, dim=0)
+            video_feature_lens = torch.tensor(video_feature_lens, dtype=torch.long, device=video_features.device)
+
             if legacy_processing:
-                video_feature_lens = [feature.size(0) for feature in video_features]
                 inputs_embeds, attention_mask, position_ids = self.merge_vision_text_embeddings(
                     video_features,
                     inputs_embeds,
@@ -1568,11 +1572,6 @@ def add_video_features(
                 inputs_embeds = (
                     torch.from_numpy(inputs_embeds) if isinstance(inputs_embeds, np.ndarray) else inputs_embeds
                 )
-                video_features = [feature.flatten(0, 1) for feature in video_features]
-                video_feature_lens = [feature.size(0) for feature in video_features]
-                video_features = torch.cat(video_features, dim=0)
-                video_feature_lens = torch.tensor(video_feature_lens, dtype=torch.long, device=video_features.device)
-
                 special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1)
                 special_image_mask = special_image_mask.expand_as(inputs_embeds)
                 if inputs_embeds[special_image_mask].numel() != video_features.numel():
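
The hoisted block now runs before the `legacy_processing` split, so both branches receive the already flattened and concatenated features, and the added `len(video_features) != 0` check skips the merge when no features come back. A minimal sketch (standalone, with hypothetical shapes, not repository code) of what the reshaping does to a list of per-video features:

import torch

# Two videos' worth of features, each shaped (num_frames, tokens_per_frame, hidden_size).
video_features = [torch.randn(8, 144, 32), torch.randn(4, 144, 32)]

# flatten(0, 1) merges the frame and token axes: (frames, tokens, hidden) -> (frames * tokens, hidden)
video_features = [feature.flatten(0, 1) for feature in video_features]
video_feature_lens = [feature.size(0) for feature in video_features]  # tokens contributed per video
video_features = torch.cat(video_features, dim=0)  # single (total_tokens, hidden) tensor

print(video_feature_lens)    # [1152, 576]
print(video_features.shape)  # torch.Size([1728, 32])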

tests/openvino/test_exporters_cli.py (+13 -0)

@@ -341,6 +341,19 @@ class OVCLIExportTestCase(unittest.TestCase):
             ]
         )
 
+    if is_transformers_version(">=", "4.42.0"):
+        TEST_4BIT_CONFIGURATIONS.extend(
+            [
+                (
+                    "image-text-to-text",
+                    "llava_next_video",
+                    'int4 --group-size 16 --ratio 0.8 --sensitivity-metric "hessian_input_activation" '
+                    "--dataset contextual --num-samples 1",
+                    [{"int8": 6, "int4": 24}, {"int8": 1}, {"int8": 7}, {}, {"int8": 2}],
+                ),
+            ]
+        )
+
     if is_transformers_version(">=", "4.45.0"):
         TEST_4BIT_CONFIGURATIONS.extend(
             [
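
The third tuple element is the quantization-option string that the test harness appears to splice into the export command; a hypothetical standalone invocation along the same lines (model id and output directory are placeholders, and `--weight-format` is assumed to be prepended by the harness):

optimum-cli export openvino \
    --model <llava-next-video-model-id> \
    --task image-text-to-text \
    --weight-format int4 --group-size 16 --ratio 0.8 \
    --sensitivity-metric hessian_input_activation \
    --dataset contextual --num-samples 1 \
    <output-dir>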

tests/openvino/test_quantization.py (+24 -0)

@@ -581,6 +581,27 @@ class OVWeightCompressionTest(unittest.TestCase):
             ]
         )
 
+    if is_transformers_version(">=", "4.42.0"):
+        LOAD_IN_4_BITS_SCOPE.extend(
+            [
+                (
+                    OVModelForVisualCausalLM,
+                    "llava_next_video",
+                    False,
+                    dict(
+                        bits=4,
+                        group_size=16,
+                        dataset="contextual",
+                        ratio=0.8,
+                        sensitivity_metric="hessian_input_activation",
+                        num_samples=1,
+                        processor=MODEL_NAMES["llava_next_video"],
+                    ),
+                    [{"int8": 6, "int4": 24}, {"int8": 1}, {"int8": 7}, {}, {"int8": 2}],
+                ),
+            ]
+        )
+
     if is_transformers_version(">=", "4.45.0"):
         LOAD_IN_4_BITS_SCOPE.extend(
             [
@@ -668,6 +689,9 @@ class OVWeightCompressionTest(unittest.TestCase):
     if is_transformers_version(">=", "4.40.0"):
         SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "nanollava", True))
 
+    if is_transformers_version(">=", "4.42.0"):
+        SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "llava_next_video", False))
+
     if is_transformers_version(">=", "4.45.0"):
         SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "minicpmv", True))
         SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "qwen2_vl", False))
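
The dict in the new `LOAD_IN_4_BITS_SCOPE` entry maps directly onto optimum-intel's weight-quantization config; a sketch of the equivalent Python usage (model and processor ids are placeholders, not values from this commit):

from optimum.intel import OVModelForVisualCausalLM, OVWeightQuantizationConfig

quantization_config = OVWeightQuantizationConfig(
    bits=4,
    group_size=16,
    dataset="contextual",
    ratio=0.8,  # roughly 80% of eligible weights go to int4, the rest stay int8
    sensitivity_metric="hessian_input_activation",
    num_samples=1,
    processor="<processor-id>",  # placeholder; used to build the calibration dataset
)

model = OVModelForVisualCausalLM.from_pretrained(
    "<llava-next-video-model-id>",  # placeholder
    export=True,
    quantization_config=quantization_config,
)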

tests/openvino/utils_tests.py (+1 -0)

@@ -203,6 +203,7 @@
     "llava": (30, 1, 9),
     "llava_next": (30, 1, 9),
     "minicpmv": (30, 1, 26, 6),
+    "llava_next_video": (30, 1, 7, 0, 2),
     "nanollava": (30, 1, 15),
     "qwen2_vl": (30, 1, 1, 10),
     "sana": (58, 28, 28, 18),
