
Commit 7f70f2b

add support llava-next-video (#1183)

* add support llava-next-video
* add docs and tests
* add video input support in preprocess_input
* test for video
* fix test for video
* apply review comments
* fix image features packing
* fix video processing, add quantization tests
* Apply suggestions from code review

Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>

1 parent 6e4bb36 commit 7f70f2b

File tree

10 files changed: +611 -92 lines changed

docs/source/openvino/models.mdx (+1)

@@ -74,6 +74,7 @@ Here is the list of the supported architectures :
 - Llama
 - Llava
 - Llava-Next
+- Llava-Next-Video
 - M2-M100
 - MAIRA-2
 - MBart
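
With Llava-Next-Video in the supported list, checkpoints of this architecture can be converted through the usual optimum-intel entry points. A minimal export sketch, assuming the llava-hf/LLaVA-NeXT-Video-7B-hf checkpoint (illustrative) and enough memory for on-the-fly conversion:

```python
from optimum.intel import OVModelForVisualCausalLM

# export=True converts the Transformers checkpoint to OpenVINO IR on the fly;
# save_pretrained stores the resulting submodels for later reuse.
model = OVModelForVisualCausalLM.from_pretrained(
    "llava-hf/LLaVA-NeXT-Video-7B-hf",  # illustrative checkpoint id
    export=True,
)
model.save_pretrained("llava-next-video-ov")
```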

optimum/exporters/openvino/__main__.py (+1 -1)

@@ -313,7 +313,7 @@ def main_export(
         and framework == "pt"
         and (
             task.startswith("text-generation")
-            or getattr(config, "model_type", None) in MULTI_MODAL_TEXT_GENERATION_MODELS
+            or getattr(config, "model_type", "").replace("_", "-") in MULTI_MODAL_TEXT_GENERATION_MODELS
         )
         and getattr(config, "torch_dtype", torch.float32) in [torch.float16, torch.bfloat16]
     ):
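
The hyphen normalization matters because config.model_type for this architecture is reported with underscores ("llava_next_video"), while the keys in MULTI_MODAL_TEXT_GENERATION_MODELS use hyphens, so the old lookup could never match. A minimal sketch of the effect (the list below is an illustrative subset of the real one in utils.py):

```python
# Illustrative subset; the full list lives in optimum/exporters/openvino/utils.py.
MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "llava-next-video"]

model_type = "llava_next_video"  # value reported by config.model_type

print(model_type in MULTI_MODAL_TEXT_GENERATION_MODELS)                    # False (old check)
print(model_type.replace("_", "-") in MULTI_MODAL_TEXT_GENERATION_MODELS)  # True  (new check)
```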

optimum/exporters/openvino/model_configs.py (+123 -2)

@@ -94,6 +94,7 @@
     JaisModelPatcher,
     LlamaModelPatcher,
     LlavaImageEmbeddingModelPatcher,
+    LlavaNextVideoImageEmbeddingModelPatcher,
     LlavaQwen2ImageEmbeddingsModelPatcher,
     MiniCPM3Patcher,
     MiniCPMModelPatcher,

@@ -137,9 +138,17 @@ def init_model_configs():
         "AutoModelForImageTextToText",
     )
 
+    TasksManager._CUSTOM_CLASSES[("pt", "llava-next-video", "image-text-to-text")] = (
+        "transformers",
+        "AutoModelForVision2Seq",
+    )
+
     TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS[
         "image-text-to-text"
     ] = TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["text-generation"]
+
+    TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["video-text-to-text"] = "AutoModelForVision2Seq"
+
     if is_diffusers_available() and "fill" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS:
         TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["fill"] = "FluxFillPipeline"
         TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["fill"] = {"flux": "FluxFillPipeline"}
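
These registrations route both the ("pt", "llava-next-video", "image-text-to-text") combination and the new video-text-to-text task to AutoModelForVision2Seq when the exporter loads the checkpoint. A small, hedged sanity check, assuming init_model_configs() is importable from this module and safe to call directly (the exporter normally invokes it itself):

```python
from optimum.exporters.tasks import TasksManager
from optimum.exporters.openvino.model_configs import init_model_configs

init_model_configs()  # registers the mappings shown in the hunk above

print(TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["video-text-to-text"])
# expected: AutoModelForVision2Seq
print(TasksManager._CUSTOM_CLASSES[("pt", "llava-next-video", "image-text-to-text")])
# expected: ("transformers", "AutoModelForVision2Seq")
```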
@@ -1591,6 +1600,118 @@ class LlavaNextOpenVINOConfig(LlavaOpenVINOConfig):
     MIN_TRANSFORMERS_VERSION = version.parse("4.40.0")
 
 
+class DummyLLavaMultiModalProjectorInputGenerator(DummyInputGenerator):
+    SUPPORTED_INPUT_NAMES = ["image_features"]
+
+    def __init__(
+        self,
+        task: str,
+        normalized_config: NormalizedTextConfig,
+        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
+        random_batch_size_range: Optional[Tuple[int, int]] = None,
+        **kwargs,
+    ):
+        self.task = task
+
+        self.batch_size = batch_size
+        self.hidden_size = normalized_config.hidden_size
+        self.num_patches = (normalized_config.image_size // normalized_config.patch_size) ** 2
+        self.normalized_config = normalized_config
+
+    def generate(
+        self,
+        input_name: str,
+        framework: str = "pt",
+        int_dtype: str = "int64",
+        float_dtype: str = "fp32",
+    ):
+        shape = [self.batch_size, self.num_patches, self.hidden_size]
+        return self.random_float_tensor(shape, framework=framework, dtype=float_dtype)
+
+
+class LLavaMultimodalProjectorOpenVINOConfig(OnnxConfig):
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyLLavaMultiModalProjectorInputGenerator,)
+    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
+
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        return {"image_features": {0: "batch_size", 1: "sequence_length"}}
+
+    @property
+    def outputs(self) -> Dict[str, Dict[int, str]]:
+        return {"hidden_states": {0: "batch_size", 1: "sequence_length"}}
+
+
+class LlavaNextVideoConfigBehavior(str, enum.Enum):
+    LANGUAGE = "language"
+    VISION_EMBEDDINGS = "vision_embeddings"
+    VISION_RESAMPLER = "vision_resampler"
+    MULTI_MODAL_PROJECTOR = "multi_modal_projector"
+    TEXT_EMBEDDINGS = "text_embeddings"
+
+
+@register_in_tasks_manager(
+    "llava-next-video", *["image-text-to-text", "video-text-to-text"], library_name="transformers"
+)
+class LlavaNextVideoOpenVINOConfig(LlavaOpenVINOConfig):
+    MIN_TRANSFORMERS_VERSION = version.parse("4.42.0")
+    SUPPORTED_BEHAVIORS = [model_type.value for model_type in LlavaNextVideoConfigBehavior]
+
+    def with_behavior(
+        self,
+        behavior: Union[str, LlavaNextVideoConfigBehavior],
+    ):
+        """
+        Creates a config for different behaviour.
+
+        Args:
+            behavior ([`ConfigBehavior`]):
+                The behavior to use for the new instance.
+        """
+        if isinstance(behavior, str) and not isinstance(behavior, LlavaNextVideoConfigBehavior):
+            behavior = LlavaNextVideoConfigBehavior(behavior)
+
+        if behavior == LlavaNextVideoConfigBehavior.MULTI_MODAL_PROJECTOR:
+            export_config = LLavaMultimodalProjectorOpenVINOConfig(
+                self._orig_config.vision_config,
+                task="feature-extraction",
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+            )
+            return export_config
+
+        if behavior == LlavaNextVideoConfigBehavior.VISION_RESAMPLER:
+            export_config = LLavaMultimodalProjectorOpenVINOConfig(
+                self._orig_config.vision_config,
+                task="feature-extraction",
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+            )
+            return export_config
+
+        return super().with_behavior(behavior)
+
+    def get_model_for_behavior(self, model, behavior: Union[str, LlavaNextVideoConfigBehavior]):
+        if isinstance(behavior, str) and not isinstance(behavior, LlavaNextVideoConfigBehavior):
+            behavior = LlavaNextVideoConfigBehavior(behavior)
+
+        if behavior == LlavaNextVideoConfigBehavior.MULTI_MODAL_PROJECTOR:
+            return model.multi_modal_projector
+
+        if behavior == LlavaNextVideoConfigBehavior.VISION_RESAMPLER:
+            return model.vision_resampler
+
+        return super().get_model_for_behavior(model, behavior)
+
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ):
+        model_kwargs = model_kwargs or {}
+        if self._behavior != LlavaConfigBehavior.VISION_EMBEDDINGS:
+            return super().patch_model_for_export(model, model_kwargs)
+        return LlavaNextVideoImageEmbeddingModelPatcher(self, model, model_kwargs)
+
+
 @register_in_tasks_manager(
     "maira2", *["image-text-to-text", "text-generation", "text-generation-with-past"], library_name="transformers"
 )
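
The dummy generator above sizes the projector's traced input from the vision tower geometry: (image_size // patch_size) ** 2 patch tokens of width hidden_size. A quick sanity check with CLIP ViT-L/14 at 336 px, the vision tower typically paired with LLaVA-style models (the concrete numbers are illustrative; the generator reads them from the model's vision config):

```python
# Illustrative vision-config values for a CLIP ViT-L/14 tower at 336 px input.
image_size, patch_size, hidden_size = 336, 14, 1024
batch_size = 2

num_patches = (image_size // patch_size) ** 2    # 24 * 24 = 576
dummy_shape = [batch_size, num_patches, hidden_size]
print(dummy_shape)                               # [2, 576, 1024]
```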
@@ -2587,7 +2708,7 @@ class Qwen2VLConfigBehavior(str, enum.Enum):
     TEXT_EMBEDDINGS = "text_embeddings"
 
 
-@register_in_tasks_manager("qwen2-vl", *["image-text-to-text"], library_name="transformers")
+@register_in_tasks_manager("qwen2-vl", *["image-text-to-text", "video-text-to-text"], library_name="transformers")
 class Qwen2VLOpenVINOConfig(OnnxConfig):
     SUPPORTED_BEHAVIORS = [model_type.value for model_type in Qwen2VLConfigBehavior]
     NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig

@@ -2717,7 +2838,7 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
         return {}
 
 
-@register_in_tasks_manager("qwen2-5-vl", *["image-text-to-text"], library_name="transformers")
+@register_in_tasks_manager("qwen2-5-vl", *["image-text-to-text", "video-text-to-text"], library_name="transformers")
 class Qwen2_5_VLOpenVINOConfig(Qwen2VLOpenVINOConfig):
     MIN_TRANSFORMERS_VERSION = version.parse("4.49.0")

optimum/exporters/openvino/model_patcher.py (+38)

@@ -3111,6 +3111,27 @@ def llava_vision_embed_forward(self, pixel_values):
     return image_features
 
 
+def llava_next_video_vision_embed_forward(self, pixel_values):
+    # copied from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/llava_next_video/modeling_llava_next_video.py#L519
+    # this does not change the original behaviour, it only packs the inference of the model subcomponents together,
+    # which avoids memory overhead and code-level handling of their intermediate results
+    image_features = self.vision_tower(pixel_values, output_hidden_states=True)
+    vision_feature_layer = self.config.vision_feature_layer
+    if isinstance(vision_feature_layer, int):
+        selected_image_feature = image_features.hidden_states[vision_feature_layer]
+    else:
+        hs_pool = [image_features.hidden_states[layer_idx] for layer_idx in vision_feature_layer]
+        selected_image_feature = torch.cat(hs_pool, dim=-1)
+
+    if self.config.vision_feature_select_strategy == "default":
+        selected_image_feature = selected_image_feature[:, 1:]
+    elif self.config.vision_feature_select_strategy == "full":
+        selected_image_feature = selected_image_feature
+    else:
+        raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
+    return selected_image_feature
+
+
 class LlavaImageEmbeddingModelPatcher(ModelPatcher):
     def __init__(
         self,

@@ -3128,6 +3149,23 @@ def __exit__(self, exc_type, exc_value, traceback):
         self._model.forward = self._model.__orig_forward
 
 
+class LlavaNextVideoImageEmbeddingModelPatcher(ModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Dict[str, Any],
+    ):
+        model.__orig_forward = model.forward
+        model.forward = types.MethodType(llava_next_video_vision_embed_forward, model)
+
+        super().__init__(config, model, model_kwargs)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        self._model.forward = self._model.__orig_forward
+
+
 def _embednb_forward(self, ids: torch.Tensor) -> torch.Tensor:
     def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
         assert dim % 2 == 0, "The dimension must be even."
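
LlavaNextVideoImageEmbeddingModelPatcher follows the same swap-and-restore convention as the other patchers in this file: stash the bound forward, rebind the packed vision forward with types.MethodType, and put the original back on __exit__. A toy illustration of that pattern (the module and replacement function are hypothetical stand-ins):

```python
import types

import torch


class TinyVision(torch.nn.Module):  # hypothetical stand-in for the vision model wrapper
    def forward(self, x):
        return x + 1


def packed_forward(self, x):  # stands in for llava_next_video_vision_embed_forward
    return x * 2


module = TinyVision()
module.__orig_forward = module.forward                      # stash the original bound method
module.forward = types.MethodType(packed_forward, module)   # rebind the replacement
print(module(torch.tensor(3.0)))                            # tensor(6.) while patched
module.forward = module.__orig_forward                      # restore, as done in __exit__
print(module(torch.tensor(3.0)))                            # tensor(4.) after restore
```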

optimum/exporters/openvino/utils.py (+2 -1)

@@ -220,6 +220,7 @@ def get_submodels(model):
 MULTI_MODAL_TEXT_GENERATION_MODELS = [
     "llava",
     "llava-next",
+    "llava-next-video",
     "llava-qwen2",
     "internvl-chat",
     "maira2",

@@ -299,7 +300,7 @@ def save_preprocessors(
         preprocessors[1].chat_template = getattr(preprocessors[0], "chat_template", None)
     if (
         is_transformers_version(">=", "4.45")
-        and model_type in ["llava", "llava-next"]
+        and model_type in ["llava", "llava-next", "llava-next-video"]
         and preprocessors is not None
     ):
         if getattr(preprocessors[1], "patch_size", None) is None:
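
With the processor metadata (chat template, patch size) saved next to the converted model, a video prompt can be run end to end. A hedged usage sketch, assuming the llava-hf/LLaVA-NeXT-Video-7B-hf checkpoint and a dummy clip of eight frames; the prompt, frame count, and sizes are illustrative, and exact processor behaviour depends on the installed transformers version:

```python
import numpy as np
from transformers import AutoProcessor

from optimum.intel import OVModelForVisualCausalLM

model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"  # illustrative checkpoint id
processor = AutoProcessor.from_pretrained(model_id)
model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True)

# A dummy video: 8 RGB frames of 336x336 pixels.
video = np.zeros((8, 336, 336, 3), dtype=np.uint8)
conversation = [
    {"role": "user", "content": [{"type": "video"}, {"type": "text", "text": "Describe the video."}]},
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

inputs = processor(text=prompt, videos=[video], return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=50)
print(processor.batch_decode(outputs, skip_special_tokens=True)[0])
```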
