94 | 94 |     JaisModelPatcher,
95 | 95 |     LlamaModelPatcher,
96 | 96 |     LlavaImageEmbeddingModelPatcher,
| 97 | +    LlavaNextVideoImageEmbeddingModelPatcher,
97 | 98 |     LlavaQwen2ImageEmbeddingsModelPatcher,
98 | 99 |     MiniCPM3Patcher,
99 | 100 |     MiniCPMModelPatcher,
@@ -140,6 +141,9 @@ def init_model_configs():
140 | 141 |     TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS[
141 | 142 |         "image-text-to-text"
142 | 143 |     ] = TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["text-generation"]
| 144 | +
| 145 | +    TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["video-text-to-text"] = "AutoModelForVision2Seq"
| 146 | +
143 | 147 |     if is_diffusers_available() and "fill" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS:
144 | 148 |         TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["fill"] = "FluxFillPipeline"
145 | 149 |         TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["fill"] = {"flux": "FluxFillPipeline"}
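A minimal sketch of what this registration buys, assuming `init_model_configs()` has already run (it is invoked when the OpenVINO exporter module is imported):

```python
# Sketch only: assumes init_model_configs() has been executed.
from optimum.exporters.tasks import TasksManager

# The new "video-text-to-text" task resolves straight to the Vision2Seq auto-loader:
assert TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["video-text-to-text"] == "AutoModelForVision2Seq"
```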
@@ -1591,6 +1595,118 @@ class LlavaNextOpenVINOConfig(LlavaOpenVINOConfig):
1591 | 1595 |     MIN_TRANSFORMERS_VERSION = version.parse("4.40.0")
1592 | 1596 |
1593 | 1597 |
| 1598 | +class DummyLLavaMultiModalProjectorInputGenerator(DummyInputGenerator):
| 1599 | +    SUPPORTED_INPUT_NAMES = ["image_features"]
| 1600 | +
| 1601 | +    def __init__(
| 1602 | +        self,
| 1603 | +        task: str,
| 1604 | +        normalized_config: NormalizedVisionConfig,
| 1605 | +        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
| 1606 | +        random_batch_size_range: Optional[Tuple[int, int]] = None,
| 1607 | +        **kwargs,
| 1608 | +    ):
| 1609 | +        self.task = task
| 1610 | +
| 1611 | +        self.batch_size = batch_size
| 1612 | +        self.hidden_size = normalized_config.hidden_size
| 1613 | +        self.num_patches = (normalized_config.image_size // normalized_config.patch_size) ** 2
| 1614 | +        self.normalized_config = normalized_config
| 1615 | +
| 1616 | +    def generate(
| 1617 | +        self,
| 1618 | +        input_name: str,
| 1619 | +        framework: str = "pt",
| 1620 | +        int_dtype: str = "int64",
| 1621 | +        float_dtype: str = "fp32",
| 1622 | +    ):
| 1623 | +        shape = [self.batch_size, self.num_patches, self.hidden_size]
| 1624 | +        return self.random_float_tensor(shape, framework=framework, dtype=float_dtype)
| 1625 | + |
| 1626 | + |
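A hypothetical usage sketch for the generator above; the CLIP-style vision hyperparameters are illustrative, not read from a real checkpoint:

```python
# Illustrative values: a ViT with 336px inputs and 14px patches yields
# (336 // 14) ** 2 == 576 patch features per image.
from optimum.utils import NormalizedVisionConfig
from transformers import CLIPVisionConfig

vision_config = CLIPVisionConfig(hidden_size=1024, image_size=336, patch_size=14)
generator = DummyLLavaMultiModalProjectorInputGenerator(
    task="feature-extraction",
    normalized_config=NormalizedVisionConfig(vision_config),
    batch_size=1,
)
features = generator.generate("image_features")
print(features.shape)  # torch.Size([1, 576, 1024])
```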
| 1627 | +class LLavaMultimodalProjectorOpenVINOConfig(OnnxConfig):
| 1628 | +    DUMMY_INPUT_GENERATOR_CLASSES = (DummyLLavaMultiModalProjectorInputGenerator,)
| 1629 | +    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
| 1630 | +
| 1631 | +    @property
| 1632 | +    def inputs(self) -> Dict[str, Dict[int, str]]:
| 1633 | +        return {"image_features": {0: "batch_size", 1: "sequence_length"}}
| 1634 | +
| 1635 | +    @property
| 1636 | +    def outputs(self) -> Dict[str, Dict[int, str]]:
| 1637 | +        return {"hidden_states": {0: "batch_size", 1: "sequence_length"}}
| 1638 | + |
| 1639 | + |
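The projector is thus exported as a plain tensor-in/tensor-out graph with both axes dynamic. A quick sketch, reusing the illustrative `vision_config` from the previous snippet:

```python
cfg = LLavaMultimodalProjectorOpenVINOConfig(vision_config, task="feature-extraction")
print(cfg.inputs)   # {'image_features': {0: 'batch_size', 1: 'sequence_length'}}
print(cfg.outputs)  # {'hidden_states': {0: 'batch_size', 1: 'sequence_length'}}
```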
| 1640 | +class LlavaNextVideoConfigBehavior(str, enum.Enum):
| 1641 | +    LANGUAGE = "language"
| 1642 | +    VISION_EMBEDDINGS = "vision_embeddings"
| 1643 | +    VISION_RESAMPLER = "vision_resampler"
| 1644 | +    MULTI_MODAL_PROJECTOR = "multi_modal_projector"
| 1645 | +    TEXT_EMBEDDINGS = "text_embeddings"
| 1646 | +
| 1647 | +
| 1648 | +@register_in_tasks_manager(
| 1649 | +    "llava-next-video", *["image-text-to-text", "video-text-to-text"], library_name="transformers"
| 1650 | +)
| 1651 | +class LlavaNextVideoOpenVINOConfig(LlavaOpenVINOConfig):
| 1652 | +    MIN_TRANSFORMERS_VERSION = version.parse("4.42.0")
| 1653 | +    SUPPORTED_BEHAVIORS = [model_type.value for model_type in LlavaNextVideoConfigBehavior]
| 1654 | + |
| 1655 | +    def with_behavior(
| 1656 | +        self,
| 1657 | +        behavior: Union[str, LlavaNextVideoConfigBehavior],
| 1658 | +    ):
| 1659 | +        """
| 1660 | +        Creates a config for the given behavior.
| 1661 | +
| 1662 | +        Args:
| 1663 | +            behavior ([`LlavaNextVideoConfigBehavior`]):
| 1664 | +                The behavior to use for the new instance.
| 1665 | +        """
| 1666 | +        if isinstance(behavior, str) and not isinstance(behavior, LlavaNextVideoConfigBehavior):
| 1667 | +            behavior = LlavaNextVideoConfigBehavior(behavior)
| 1668 | +
| 1669 | +        if behavior == LlavaNextVideoConfigBehavior.MULTI_MODAL_PROJECTOR:
| 1670 | +            export_config = LLavaMultimodalProjectorOpenVINOConfig(
| 1671 | +                self._orig_config.vision_config,
| 1672 | +                task="feature-extraction",
| 1673 | +                int_dtype=self.int_dtype,
| 1674 | +                float_dtype=self.float_dtype,
| 1675 | +            )
| 1676 | +            return export_config
| 1677 | +
| 1678 | +        if behavior == LlavaNextVideoConfigBehavior.VISION_RESAMPLER:
| 1679 | +            export_config = LLavaMultimodalProjectorOpenVINOConfig(
| 1680 | +                self._orig_config.vision_config,
| 1681 | +                task="feature-extraction",
| 1682 | +                int_dtype=self.int_dtype,
| 1683 | +                float_dtype=self.float_dtype,
| 1684 | +            )
| 1685 | +            return export_config
| 1686 | +
| 1687 | +        return super().with_behavior(behavior)
| 1688 | + |
| 1689 | +    def get_model_for_behavior(self, model, behavior: Union[str, LlavaNextVideoConfigBehavior]):
| 1690 | +        if isinstance(behavior, str) and not isinstance(behavior, LlavaNextVideoConfigBehavior):
| 1691 | +            behavior = LlavaNextVideoConfigBehavior(behavior)
| 1692 | +
| 1693 | +        if behavior == LlavaNextVideoConfigBehavior.MULTI_MODAL_PROJECTOR:
| 1694 | +            return model.multi_modal_projector
| 1695 | +
| 1696 | +        if behavior == LlavaNextVideoConfigBehavior.VISION_RESAMPLER:
| 1697 | +            return model.vision_resampler
| 1698 | +
| 1699 | +        return super().get_model_for_behavior(model, behavior)
| 1700 | + |
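The two hooks above pair up during export: `with_behavior` builds the per-submodule export config, and `get_model_for_behavior` pulls the matching module off the loaded model. A hedged sketch, assuming the parent `LlavaOpenVINOConfig` constructor takes `(config, task, ...)` and using an illustrative checkpoint id:

```python
from transformers import LlavaNextVideoForConditionalGeneration

model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    "llava-hf/LLaVA-NeXT-Video-7B-hf"  # illustrative checkpoint
)
export_config = LlavaNextVideoOpenVINOConfig(model.config, task="image-text-to-text")

for behavior in ("multi_modal_projector", "vision_resampler"):
    sub_config = export_config.with_behavior(behavior)                   # export config
    sub_module = export_config.get_model_for_behavior(model, behavior)   # the nn.Module
```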
| 1701 | +    def patch_model_for_export(
| 1702 | +        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
| 1703 | +    ):
| 1704 | +        model_kwargs = model_kwargs or {}
| 1705 | +        if self._behavior != LlavaConfigBehavior.VISION_EMBEDDINGS:
| 1706 | +            return super().patch_model_for_export(model, model_kwargs)
| 1707 | +        return LlavaNextVideoImageEmbeddingModelPatcher(self, model, model_kwargs)
| 1708 | + |
| 1709 | + |
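One subtlety in `patch_model_for_export`: it compares `self._behavior` against `LlavaConfigBehavior.VISION_EMBEDDINGS` from the parent class rather than the new enum. Assuming `LlavaConfigBehavior` is likewise a `str` enum (as the pattern here suggests), the check still dispatches correctly, since both members compare equal to the shared string value:

```python
# str-enum members compare equal to their underlying string value:
assert LlavaNextVideoConfigBehavior.VISION_EMBEDDINGS == "vision_embeddings"
assert LlavaConfigBehavior.VISION_EMBEDDINGS == "vision_embeddings"
```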
1594 | 1710 | @register_in_tasks_manager(
1595 | 1711 |     "maira2", *["image-text-to-text", "text-generation", "text-generation-with-past"], library_name="transformers"
1596 | 1712 | )