94 | 94 | JaisModelPatcher,
95 | 95 | LlamaModelPatcher,
96 | 96 | LlavaImageEmbeddingModelPatcher,
| 97 | + LlavaNextVideoImageEmbeddingModelPatcher, |
97 | 98 | LlavaQwen2ImageEmbeddingsModelPatcher,
98 | 99 | MiniCPM3Patcher,
99 | 100 | MiniCPMModelPatcher,
@@ -137,9 +138,17 @@ def init_model_configs():
137 | 138 | "AutoModelForImageTextToText",
138 | 139 | )
139 | 140 |
| 141 | + TasksManager._CUSTOM_CLASSES[("pt", "llava-next-video", "image-text-to-text")] = ( |
| 142 | + "transformers", |
| 143 | + "AutoModelForVision2Seq", |
| 144 | + ) |
| 145 | + |
140 | 146 | TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS[
141 | 147 | "image-text-to-text"
142 | 148 | ] = TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["text-generation"]
| 149 | + |
| 150 | + TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["video-text-to-text"] = "AutoModelForVision2Seq" |
| 151 | + |
143 | 152 | if is_diffusers_available() and "fill" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS:
144 | 153 | TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["fill"] = "FluxFillPipeline"
145 | 154 | TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["fill"] = {"flux": "FluxFillPipeline"}
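
A minimal sketch of what the registrations above give you once init_model_configs() has run; the TasksManager import path is the standard optimum one, but treat the snippet as illustrative rather than as part of this diff:

    # Illustrative only; assumes init_model_configs() has already been called.
    from optimum.exporters.tasks import TasksManager

    # Custom loader registered for llava-next-video on the image-text-to-text task
    print(TasksManager._CUSTOM_CLASSES[("pt", "llava-next-video", "image-text-to-text")])
    # expected: ("transformers", "AutoModelForVision2Seq")

    # The new video-text-to-text task resolves to AutoModelForVision2Seq
    print(TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["video-text-to-text"])
    # expected: "AutoModelForVision2Seq"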
@@ -1591,6 +1600,118 @@ class LlavaNextOpenVINOConfig(LlavaOpenVINOConfig):
1591 | 1600 | MIN_TRANSFORMERS_VERSION = version.parse("4.40.0")
1592 | 1601 |
1593 | 1602 |
| 1603 | +class DummyLLavaMultiModalProjectorInputGenerator(DummyInputGenerator): |
| 1604 | + SUPPORTED_INPUT_NAMES = ["image_features"] |
| 1605 | + |
| 1606 | + def __init__( |
| 1607 | + self, |
| 1608 | + task: str, |
| 1609 | + normalized_config: NormalizedTextConfig, |
| 1610 | + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], |
| 1611 | + random_batch_size_range: Optional[Tuple[int, int]] = None, |
| 1612 | + **kwargs, |
| 1613 | + ): |
| 1614 | + self.task = task |
| 1615 | + |
| 1616 | + self.batch_size = batch_size |
| 1617 | + self.hidden_size = normalized_config.hidden_size |
| 1618 | + self.num_patches = (normalized_config.image_size // normalized_config.patch_size) ** 2 |
| 1619 | + self.normalized_config = normalized_config |
| 1620 | + |
| 1621 | + def generate( |
| 1622 | + self, |
| 1623 | + input_name: str, |
| 1624 | + framework: str = "pt", |
| 1625 | + int_dtype: str = "int64", |
| 1626 | + float_dtype: str = "fp32", |
| 1627 | + ): |
| 1628 | + shape = [self.batch_size, self.num_patches, self.hidden_size] |
| 1629 | + return self.random_float_tensor(shape, framework=framework, dtype=float_dtype) |
| 1630 | + |
| 1631 | + |
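
As a worked example of the shape logic above (the numbers are illustrative, not taken from this diff): a CLIP-like vision tower with image_size=336, patch_size=14 and hidden_size=1024 yields 576 patches, so the dummy image_features tensor is [batch_size, 576, 1024]:

    # Illustrative values only; the real ones come from the model's vision_config.
    image_size, patch_size, hidden_size, batch_size = 336, 14, 1024, 1
    num_patches = (image_size // patch_size) ** 2   # 24 ** 2 == 576
    shape = [batch_size, num_patches, hidden_size]  # [1, 576, 1024]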
| 1632 | +class LLavaMultimodalProjectorOpenVINOConfig(OnnxConfig): |
| 1633 | + DUMMY_INPUT_GENERATOR_CLASSES = (DummyLLavaMultiModalProjectorInputGenerator,) |
| 1634 | + NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig |
| 1635 | + |
| 1636 | + @property |
| 1637 | + def inputs(self) -> Dict[str, Dict[int, str]]: |
| 1638 | + return {"image_features": {0: "batch_size", 1: "sequence_length"}} |
| 1639 | + |
| 1640 | + @property |
| 1641 | + def outputs(self) -> Dict[str, Dict[int, str]]: |
| 1642 | + return {"hidden_states": {0: "batch_size", 1: "sequence_length"}} |
| 1643 | + |
| 1644 | + |
| 1645 | +class LlavaNextVideoConfigBehavior(str, enum.Enum): |
| 1646 | + LANGUAGE = "language" |
| 1647 | + VISION_EMBEDDINGS = "vision_embeddings" |
| 1648 | + VISION_RESAMPLER = "vision_resampler" |
| 1649 | + MULTI_MODAL_PROJECTOR = "multi_modal_projector" |
| 1650 | + TEXT_EMBEDDINGS = "text_embeddings" |
| 1651 | + |
| 1652 | + |
| 1653 | +@register_in_tasks_manager( |
| 1654 | + "llava-next-video", *["image-text-to-text", "video-text-to-text"], library_name="transformers" |
| 1655 | +) |
| 1656 | +class LlavaNextVideoOpenVINOConfig(LlavaOpenVINOConfig): |
| 1657 | + MIN_TRANSFORMERS_VERSION = version.parse("4.42.0") |
| 1658 | + SUPPORTED_BEHAVIORS = [model_type.value for model_type in LlavaNextVideoConfigBehavior] |
| 1659 | + |
| 1660 | + def with_behavior( |
| 1661 | + self, |
| 1662 | + behavior: Union[str, LlavaNextVideoConfigBehavior], |
| 1663 | + ): |
| 1664 | + """ |
| 1665 | + Creates a config for a different behavior. |
| 1666 | + |
| 1667 | + Args: |
| 1668 | + behavior ([`ConfigBehavior`]): |
| 1669 | + The behavior to use for the new instance. |
| 1670 | + """ |
| 1671 | + if isinstance(behavior, str) and not isinstance(behavior, LlavaNextVideoConfigBehavior): |
| 1672 | + behavior = LlavaNextVideoConfigBehavior(behavior) |
| 1673 | + |
| 1674 | + if behavior == LlavaNextVideoConfigBehavior.MULTI_MODAL_PROJECTOR: |
| 1675 | + export_config = LLavaMultimodalProjectorOpenVINOConfig( |
| 1676 | + self._orig_config.vision_config, |
| 1677 | + task="feature-extraction", |
| 1678 | + int_dtype=self.int_dtype, |
| 1679 | + float_dtype=self.float_dtype, |
| 1680 | + ) |
| 1681 | + return export_config |
| 1682 | + |
| 1683 | + if behavior == LlavaNextVideoConfigBehavior.VISION_RESAMPLER: |
| 1684 | + export_config = LLavaMultimodalProjectorOpenVINOConfig( |
| 1685 | + self._orig_config.vision_config, |
| 1686 | + task="feature-extraction", |
| 1687 | + int_dtype=self.int_dtype, |
| 1688 | + float_dtype=self.float_dtype, |
| 1689 | + ) |
| 1690 | + return export_config |
| 1691 | + |
| 1692 | + return super().with_behavior(behavior) |
| 1693 | + |
| 1694 | + def get_model_for_behavior(self, model, behavior: Union[str, LlavaNextVideoConfigBehavior]): |
| 1695 | + if isinstance(behavior, str) and not isinstance(behavior, LlavaNextVideoConfigBehavior): |
| 1696 | + behavior = LlavaNextVideoConfigBehavior(behavior) |
| 1697 | + |
| 1698 | + if behavior == LlavaNextVideoConfigBehavior.MULTI_MODAL_PROJECTOR: |
| 1699 | + return model.multi_modal_projector |
| 1700 | + |
| 1701 | + if behavior == LlavaNextVideoConfigBehavior.VISION_RESAMPLER: |
| 1702 | + return model.vision_resampler |
| 1703 | + |
| 1704 | + return super().get_model_for_behavior(model, behavior) |
| 1705 | + |
| 1706 | + def patch_model_for_export( |
| 1707 | + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None |
| 1708 | + ): |
| 1709 | + model_kwargs = model_kwargs or {} |
| 1710 | + if self._behavior != LlavaConfigBehavior.VISION_EMBEDDINGS: |
| 1711 | + return super().patch_model_for_export(model, model_kwargs) |
| 1712 | + return LlavaNextVideoImageEmbeddingModelPatcher(self, model, model_kwargs) |
| 1713 | + |
| 1714 | + |
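
For orientation, a hedged sketch of how the extra behaviors might be exercised: the vision resampler and multi-modal projector are pulled out of the loaded model and get their own export configs, on top of the usual language/vision/text-embeddings split. The checkpoint id and constructor arguments below are assumptions for illustration, not something this diff defines:

    # Sketch only; constructor arguments and checkpoint id are illustrative.
    from transformers import AutoModelForVision2Seq

    model = AutoModelForVision2Seq.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
    export_config = LlavaNextVideoOpenVINOConfig(model.config, task="image-text-to-text")

    # Per-behavior export config and the matching sub-model to trace
    projector_config = export_config.with_behavior("multi_modal_projector")
    projector = export_config.get_model_for_behavior(model, "multi_modal_projector")

    resampler_config = export_config.with_behavior("vision_resampler")
    resampler = export_config.get_model_for_behavior(model, "vision_resampler")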
1594 | 1715 | @register_in_tasks_manager(
1595 | 1716 | "maira2", *["image-text-to-text", "text-generation", "text-generation-with-past"], library_name="transformers"
1596 | 1717 | )
@@ -2587,7 +2708,7 @@ class Qwen2VLConfigBehavior(str, enum.Enum):
2587 | 2708 | TEXT_EMBEDDINGS = "text_embeddings"
2588 | 2709 |
2589 | 2710 |
2590 | | -@register_in_tasks_manager("qwen2-vl", *["image-text-to-text"], library_name="transformers") |
| 2711 | +@register_in_tasks_manager("qwen2-vl", *["image-text-to-text", "video-text-to-text"], library_name="transformers") |
2591 | 2712 | class Qwen2VLOpenVINOConfig(OnnxConfig):
2592 | 2713 | SUPPORTED_BEHAVIORS = [model_type.value for model_type in Qwen2VLConfigBehavior]
2593 | 2714 | NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
@@ -2717,7 +2838,7 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
2717 | 2838 | return {}
2718 | 2839 |
2719 | 2840 |
2720 | | -@register_in_tasks_manager("qwen2-5-vl", *["image-text-to-text"], library_name="transformers") |
| 2841 | +@register_in_tasks_manager("qwen2-5-vl", *["image-text-to-text", "video-text-to-text"], library_name="transformers") |
2721 | 2842 | class Qwen2_5_VLOpenVINOConfig(Qwen2VLOpenVINOConfig):
2722 | 2843 | MIN_TRANSFORMERS_VERSION = version.parse("4.49.0")
2723 | 2844 |
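
With llava-next-video, qwen2-vl and qwen2-5-vl now registered for video-text-to-text, the OpenVINO export entry point should accept that task. A hedged sketch of an export call; the checkpoint id and output directory are illustrative, and the exact main_export signature should be checked against the installed optimum-intel:

    # Sketch only; verify main_export's signature in your optimum-intel version.
    from optimum.exporters.openvino import main_export

    main_export(
        "Qwen/Qwen2-VL-2B-Instruct",     # example checkpoint only
        output="qwen2_vl_openvino",
        task="video-text-to-text",       # task enabled by this change
    )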