Commit c08b95d

fix video processing, add quantization tests
Parent: 3131396
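
In short: in `add_video_features`, the video-feature flattening and concatenation are hoisted above the `legacy_processing` branch and guarded against empty feature lists, so `video_features` and `video_feature_lens` are available to both code paths; llava_next_video is also added to the 4-bit weight-compression and auto-compression test scopes for transformers >= 4.42.0.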

4 files changed (+44, -7 lines)

optimum/intel/openvino/modeling_visual_language.py (+6 -7)

@@ -1551,9 +1551,13 @@ def add_video_features(
     ):
         # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/llava_next_video/modeling_llava_next_video.py#L732-L751
         video_features = self.get_video_features(pixel_values_videos, input_ids)
-        if video_features is not None:
+        if video_features is not None and len(video_features) != 0:
+            video_features = [feature.flatten(0, 1) for feature in video_features]
+            video_feature_lens = [feature.size(0) for feature in video_features]
+            video_features = torch.cat(video_features, dim=0)
+            video_feature_lens = torch.tensor(video_feature_lens, dtype=torch.long, device=video_features.device)
+
             if legacy_processing:
-                video_feature_lens = [feature.size(0) for feature in video_features]
                 inputs_embeds, attention_mask, position_ids = self.merge_vision_text_embeddings(
                     video_features,
                     inputs_embeds,
@@ -1568,11 +1572,6 @@ def add_video_features(
                 inputs_embeds = (
                     torch.from_numpy(inputs_embeds) if isinstance(inputs_embeds, np.ndarray) else inputs_embeds
                 )
-                video_features = [feature.flatten(0, 1) for feature in video_features]
-                video_feature_lens = [feature.size(0) for feature in video_features]
-                video_features = torch.cat(video_features, dim=0)
-                video_feature_lens = torch.tensor(video_feature_lens, dtype=torch.long, device=video_features.device)
-
                 special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1)
                 special_image_mask = special_image_mask.expand_as(inputs_embeds)
                 if inputs_embeds[special_image_mask].numel() != video_features.numel():
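
The hoisted block now runs before the `legacy_processing` split, so both branches receive the already flattened and concatenated features, and the added `len(video_features) != 0` check skips the merge when no features come back. A minimal sketch (standalone, with hypothetical shapes, not repository code) of what the reshaping does to a list of per-video features:

import torch

# Two videos' worth of features, each shaped (num_frames, tokens_per_frame, hidden_size).
video_features = [torch.randn(8, 144, 32), torch.randn(4, 144, 32)]

# flatten(0, 1) merges the frame and token axes: (frames, tokens, hidden) -> (frames * tokens, hidden)
video_features = [feature.flatten(0, 1) for feature in video_features]
video_feature_lens = [feature.size(0) for feature in video_features]  # tokens contributed per video
video_features = torch.cat(video_features, dim=0)  # single (total_tokens, hidden) tensor

print(video_feature_lens)    # [1152, 576]
print(video_features.shape)  # torch.Size([1728, 32])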

tests/openvino/test_exporters_cli.py (+13 -0)

@@ -341,6 +341,19 @@ class OVCLIExportTestCase(unittest.TestCase):
             ]
         )
 
+    if is_transformers_version(">=", "4.42.0"):
+        TEST_4BIT_CONFIGURATIONS.extend(
+            [
+                (
+                    "image-text-to-text",
+                    "llava_next_video",
+                    'int4 --group-size 16 --ratio 0.8 --sensitivity-metric "hessian_input_activation" '
+                    "--dataset contextual --num-samples 1",
+                    [{"int8": 6, "int4": 24}, {"int8": 1}, {"int8": 7}, {}, {"int8": 2}],
+                ),
+            ]
+        )
+
     if is_transformers_version(">=", "4.45.0"):
         TEST_4BIT_CONFIGURATIONS.extend(
             [
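
The third tuple element is the quantization-option string that the test harness appears to splice into the export command; a hypothetical standalone invocation along the same lines (model id and output directory are placeholders, and `--weight-format` is assumed to be prepended by the harness):

optimum-cli export openvino \
    --model <llava-next-video-model-id> \
    --task image-text-to-text \
    --weight-format int4 --group-size 16 --ratio 0.8 \
    --sensitivity-metric hessian_input_activation \
    --dataset contextual --num-samples 1 \
    <output-dir>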

tests/openvino/test_quantization.py (+24 -0)

@@ -581,6 +581,27 @@ class OVWeightCompressionTest(unittest.TestCase):
             ]
         )
 
+    if is_transformers_version(">=", "4.42.0"):
+        LOAD_IN_4_BITS_SCOPE.extend(
+            [
+                (
+                    OVModelForVisualCausalLM,
+                    "llava_next_video",
+                    False,
+                    dict(
+                        bits=4,
+                        group_size=16,
+                        dataset="contextual",
+                        ratio=0.8,
+                        sensitivity_metric="hessian_input_activation",
+                        num_samples=1,
+                        processor=MODEL_NAMES["llava_next_video"],
+                    ),
+                    [{"int8": 6, "int4": 24}, {"int8": 1}, {"int8": 7}, {}, {"int8": 2}],
+                ),
+            ]
+        )
+
     if is_transformers_version(">=", "4.45.0"):
         LOAD_IN_4_BITS_SCOPE.extend(
             [
@@ -668,6 +689,9 @@ class OVWeightCompressionTest(unittest.TestCase):
     if is_transformers_version(">=", "4.40.0"):
         SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "nanollava", True))
 
+    if is_transformers_version(">=", "4.42.0"):
+        SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "llava_next_video", False))
+
     if is_transformers_version(">=", "4.45.0"):
         SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "minicpmv", True))
         SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "qwen2_vl", False))
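
The dict in the new `LOAD_IN_4_BITS_SCOPE` entry maps directly onto optimum-intel's weight-quantization config; a sketch of the equivalent Python usage (model and processor ids are placeholders, not values from this commit):

from optimum.intel import OVModelForVisualCausalLM, OVWeightQuantizationConfig

quantization_config = OVWeightQuantizationConfig(
    bits=4,
    group_size=16,
    dataset="contextual",
    ratio=0.8,  # roughly 80% of eligible weights go to int4, the rest stay int8
    sensitivity_metric="hessian_input_activation",
    num_samples=1,
    processor="<processor-id>",  # placeholder; used to build the calibration dataset
)

model = OVModelForVisualCausalLM.from_pretrained(
    "<llava-next-video-model-id>",  # placeholder
    export=True,
    quantization_config=quantization_config,
)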

tests/openvino/utils_tests.py (+1 -0)

@@ -203,6 +203,7 @@
     "llava": (30, 1, 9),
     "llava_next": (30, 1, 9),
     "minicpmv": (30, 1, 26, 6),
+    "llava_next_video": (30, 1, 7, 0, 2),
     "nanollava": (30, 1, 15),
     "qwen2_vl": (30, 1, 1, 10),
     "sana": (58, 28, 28, 18),
