Commit 7ca2a02

add support llava-next-video
1 parent: c9ff040

File tree

5 files changed: +357 -42 lines

optimum/exporters/openvino/__main__.py (+1 -1)

@@ -311,7 +311,7 @@ def main_export(
         and framework == "pt"
         and (
             task.startswith("text-generation")
-            or getattr(config, "model_type", None) in MULTI_MODAL_TEXT_GENERATION_MODELS
+            or getattr(config, "model_type", "").replace("_", "-") in MULTI_MODAL_TEXT_GENERATION_MODELS
         )
         and getattr(config, "torch_dtype", torch.float32) in [torch.float16, torch.bfloat16]
     ):
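
The change above normalizes config.model_type before the membership check: Hugging Face configs report the type with underscores (e.g. llava_next_video), while MULTI_MODAL_TEXT_GENERATION_MODELS stores hyphenated names. A minimal sketch of the effect; the config object below is an illustrative stand-in, not the real transformers config:

import types

# Stand-in for a transformers PretrainedConfig whose model_type uses underscores.
config = types.SimpleNamespace(model_type="llava_next_video")

MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "llava-next-video"]

# Before: the raw model_type never matches the hyphenated registry entries.
print(getattr(config, "model_type", None) in MULTI_MODAL_TEXT_GENERATION_MODELS)  # False
# After: replacing underscores with hyphens makes the lookup succeed.
print(getattr(config, "model_type", "").replace("_", "-") in MULTI_MODAL_TEXT_GENERATION_MODELS)  # True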

optimum/exporters/openvino/model_configs.py (+116)

@@ -94,6 +94,7 @@
     JaisModelPatcher,
     LlamaModelPatcher,
     LlavaImageEmbeddingModelPatcher,
+    LlavaNextVideoImageEmbeddingModelPatcher,
     LlavaQwen2ImageEmbeddingsModelPatcher,
     MiniCPM3Patcher,
     MiniCPMModelPatcher,

@@ -140,6 +141,9 @@ def init_model_configs():
     TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS[
         "image-text-to-text"
     ] = TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["text-generation"]
+
+    TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["video-text-to-text"] = "AutoModelForVision2Seq"
+
     if is_diffusers_available() and "fill" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS:
         TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["fill"] = "FluxFillPipeline"
         TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["fill"] = {"flux": "FluxFillPipeline"}

@@ -1591,6 +1595,118 @@ class LlavaNextOpenVINOConfig(LlavaOpenVINOConfig):
     MIN_TRANSFORMERS_VERSION = version.parse("4.40.0")
 
 
+class DummyLLavaMultiModalProjectorInputGenerator(DummyInputGenerator):
+    SUPPORTED_INPUT_NAMES = ["image_features"]
+
+    def __init__(
+        self,
+        task: str,
+        normalized_config: NormalizedTextConfig,
+        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
+        random_batch_size_range: Optional[Tuple[int, int]] = None,
+        **kwargs,
+    ):
+        self.task = task
+
+        self.batch_size = batch_size
+        self.hidden_size = normalized_config.hidden_size
+        self.num_patches = (normalized_config.image_size // normalized_config.patch_size) ** 2
+        self.normalized_config = normalized_config
+
+    def generate(
+        self,
+        input_name: str,
+        framework: str = "pt",
+        int_dtype: str = "int64",
+        float_dtype: str = "fp32",
+    ):
+        shape = [self.batch_size, self.num_patches, self.hidden_size]
+        return self.random_float_tensor(shape, framework=framework, dtype=float_dtype)
+
+
+class LLavaMultimodalProjectorOpenVINOConfig(OnnxConfig):
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyLLavaMultiModalProjectorInputGenerator,)
+    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
+
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        return {"image_features": {0: "batch_size", 1: "sequence_length"}}
+
+    @property
+    def outputs(self) -> Dict[str, Dict[int, str]]:
+        return {"hidden_states": {0: "batch_size", 1: "sequence_length"}}
+
+
+class LlavaNextVideoConfigBehavior(str, enum.Enum):
+    LANGUAGE = "language"
+    VISION_EMBEDDINGS = "vision_embeddings"
+    VISION_RESAMPLER = "vision_resampler"
+    MULTI_MODAL_PROJECTOR = "multi_modal_projector"
+    TEXT_EMBEDDINGS = "text_embeddings"
+
+
+@register_in_tasks_manager(
+    "llava-next-video", *["image-text-to-text", "video-text-to-text"], library_name="transformers"
+)
+class LlavaNextVideoOpenVINOConfig(LlavaOpenVINOConfig):
+    MIN_TRANSFORMERS_VERSION = version.parse("4.42.0")
+    SUPPORTED_BEHAVIORS = [model_type.value for model_type in LlavaNextVideoConfigBehavior]
+
+    def with_behavior(
+        self,
+        behavior: Union[str, LlavaNextVideoConfigBehavior],
+    ):
+        """
+        Creates a config for different behaviour.
+
+        Args:
+            behavior ([`ConfigBehavior`]):
+                The behavior to use for the new instance.
+        """
+        if isinstance(behavior, str) and not isinstance(behavior, LlavaNextVideoConfigBehavior):
+            behavior = LlavaNextVideoConfigBehavior(behavior)
+
+        if behavior == LlavaNextVideoConfigBehavior.MULTI_MODAL_PROJECTOR:
+            export_config = LLavaMultimodalProjectorOpenVINOConfig(
+                self._orig_config.vision_config,
+                task="feature-extraction",
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+            )
+            return export_config
+
+        if behavior == LlavaNextVideoConfigBehavior.VISION_RESAMPLER:
+            export_config = LLavaMultimodalProjectorOpenVINOConfig(
+                self._orig_config.vision_config,
+                task="feature-extraction",
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+            )
+            return export_config
+
+        return super().with_behavior(behavior)
+
+    def get_model_for_behavior(self, model, behavior: Union[str, LlavaNextVideoConfigBehavior]):
+        if isinstance(behavior, str) and not isinstance(behavior, LlavaNextVideoConfigBehavior):
+            behavior = LlavaNextVideoConfigBehavior(behavior)
+
+        if behavior == LlavaNextVideoConfigBehavior.MULTI_MODAL_PROJECTOR:
+            return model.multi_modal_projector
+
+        if behavior == LlavaNextVideoConfigBehavior.VISION_RESAMPLER:
+            return model.vision_resampler
+
+        return super().get_model_for_behavior(model, behavior)
+
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ):
+        model_kwargs = model_kwargs or {}
+        if self._behavior != LlavaConfigBehavior.VISION_EMBEDDINGS:
+            return super().patch_model_for_export(model, model_kwargs)
+        return LlavaNextVideoImageEmbeddingModelPatcher(self, model, model_kwargs)
+
+
 @register_in_tasks_manager(
     "maira2", *["image-text-to-text", "text-generation", "text-generation-with-past"], library_name="transformers"
 )
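
The new DummyLLavaMultiModalProjectorInputGenerator produces a random image_features tensor of shape [batch_size, num_patches, hidden_size], with num_patches derived from the vision config. A rough sketch of the shape logic for a CLIP-ViT-L/14-336-style vision tower; the concrete sizes below are illustrative assumptions, not values taken from this commit:

import torch

# Illustrative vision-config values; the real ones come from the model's vision_config.
image_size = 336
patch_size = 14
hidden_size = 1024
batch_size = 2

# Same shape computation as the dummy input generator above.
num_patches = (image_size // patch_size) ** 2           # 576 patches
image_features = torch.rand(batch_size, num_patches, hidden_size)
print(image_features.shape)                              # torch.Size([2, 576, 1024])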

optimum/exporters/openvino/model_patcher.py (+39)

@@ -3111,6 +3111,28 @@ def llava_vision_embed_forward(self, pixel_values):
     return image_features
 
 
+def llava_next_video_vision_embed_forward(self, pixel_values):
+    # copied from https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/models/llava/modeling_llava.py#L428-L441
+    # these changes do not alter the original behavior; they only pack the model subcomponent inference together,
+    # which lets us avoid memory overheads and handling intermediate inference results at the code level
+    image_features = self.vision_tower(pixel_values, output_hidden_states=True)
+    # this is not memory efficient at all: output_hidden_states=True will save all the hidden states
+    vision_feature_layer = self.config.vision_feature_layer
+    if isinstance(vision_feature_layer, int):
+        selected_image_feature = image_features.hidden_states[vision_feature_layer]
+    else:
+        hs_pool = [image_features.hidden_states[layer_idx] for layer_idx in vision_feature_layer]
+        selected_image_feature = torch.cat(hs_pool, dim=-1)
+
+    if self.config.vision_feature_select_strategy == "default":
+        selected_image_feature = selected_image_feature[:, 1:]
+    elif self.config.vision_feature_select_strategy == "full":
+        selected_image_feature = selected_image_feature
+    else:
+        raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
+    return selected_image_feature
+
+
 class LlavaImageEmbeddingModelPatcher(ModelPatcher):
     def __init__(
         self,

@@ -3128,6 +3150,23 @@ def __exit__(self, exc_type, exc_value, traceback):
         self._model.forward = self._model.__orig_forward
 
 
+class LlavaNextVideoImageEmbeddingModelPatcher(ModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Dict[str, Any],
+    ):
+        model.__orig_forward = model.forward
+        model.forward = types.MethodType(llava_next_video_vision_embed_forward, model)
+
+        super().__init__(config, model, model_kwargs)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        self._model.forward = self._model.__orig_forward
+
+
 def _embednb_forward(self, ids: torch.Tensor) -> torch.Tensor:
     def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
         assert dim % 2 == 0, "The dimension must be even."
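
LlavaNextVideoImageEmbeddingModelPatcher follows the same pattern as the existing patchers: bind a replacement forward onto the model instance for the duration of the export, then restore the original afterwards. A stripped-down sketch of that pattern with a toy object; the names here are illustrative and are not the optimum classes:

import types

class ToyModel:
    def forward(self, x):
        return x

def patched_forward(self, x):
    # stand-in for llava_next_video_vision_embed_forward
    return x * 2

class ToyPatcher:
    """Swap forward on construction, restore the original on exit."""

    def __init__(self, model):
        self._model = model
        model.__orig_forward = model.forward
        model.forward = types.MethodType(patched_forward, model)

    def __enter__(self):
        return self._model

    def __exit__(self, exc_type, exc_value, traceback):
        self._model.forward = self._model.__orig_forward

model = ToyModel()
with ToyPatcher(model):
    print(model.forward(3))   # 6: patched behavior, used while tracing/exporting
print(model.forward(3))       # 3: original behavior restored afterwards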

optimum/exporters/openvino/utils.py (+2 -1)

@@ -220,6 +220,7 @@ def get_submodels(model)
 MULTI_MODAL_TEXT_GENERATION_MODELS = [
     "llava",
     "llava-next",
+    "llava-next-video",
     "llava-qwen2",
     "internvl-chat",
     "maira2",

@@ -299,7 +300,7 @@ def save_preprocessors(
         preprocessors[1].chat_template = getattr(preprocessors[0], "chat_template", None)
     if (
         is_transformers_version(">=", "4.45")
-        and model_type in ["llava", "llava-next"]
+        and model_type in ["llava", "llava-next", "llava-next-video"]
         and preprocessors is not None
     ):
         if getattr(preprocessors[1], "patch_size", None) is None:
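
Taken together, these changes let the OpenVINO exporter recognize llava-next-video checkpoints for the image-text-to-text and video-text-to-text tasks. A hedged end-to-end sketch of how such a model might be converted and loaded through optimum-intel; the checkpoint id is an assumption, and the exact loading class and arguments may differ depending on the optimum-intel version:

# Hypothetical usage sketch, not part of this commit.
from optimum.intel import OVModelForVisualCausalLM
from transformers import AutoProcessor

model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"  # assumed llava-next-video checkpoint

# export=True converts the PyTorch checkpoint to OpenVINO IR on the fly,
# going through the llava-next-video export config registered above.
model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True)
processor = AutoProcessor.from_pretrained(model_id)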
