diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx index ef5bd128e1..e1c8c7864e 100644 --- a/docs/source/openvino/models.mdx +++ b/docs/source/openvino/models.mdx @@ -72,6 +72,7 @@ Here is the list of the supported architectures : - Llava - Llava-Next - M2-M100 +- MAIRA-2 - MBart - MPNet - MPT diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 8cb54addf2..6807644b9e 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1488,6 +1488,7 @@ def __init__( float_dtype: str = "fp32", behavior: LlavaConfigBehavior = LlavaConfigBehavior.VISION_EMBEDDINGS, preprocessors: Optional[List[Any]] = None, + **kwargs, ): super().__init__( config=config, @@ -1584,6 +1585,14 @@ class LlavaNextOpenVINOConfig(LlavaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = version.parse("4.40.0") +@register_in_tasks_manager( + "maira2", *["image-text-to-text", "text-generation", "text-generation-with-past"], library_name="transformers" +) +class MairaOpenVINOConfig(LlavaOpenVINOConfig): + MIN_TRANSFORMERS_VERSION = version.parse("4.46.0") + SUPPORTS_PAST = True + + class InternVLChatConfigBehavior(str, enum.Enum): LANGUAGE = "language" VISION_EMBEDDINGS = "vision_embeddings" diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index e00708c8eb..c31026e5f9 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -222,6 +222,7 @@ def get_submodels(model): "llava-next", "llava-qwen2", "internvl-chat", + "maira2", "minicpmv", "phi3-v", "qwen2-vl", diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 4b2c5ee031..5ba4eec864 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -2331,11 +2331,33 @@ def preprocess_inputs( return inputs +class _OVMaira2ForCausalLM(_OVLlavaForCausalLM): + @staticmethod + def preprocess_inputs( + text: str, + image: Optional["Image"] = None, + processor: Optional[AutoImageProcessor] = None, + tokenizer: Optional[PreTrainedTokenizer] = None, + config: Optional[PretrainedConfig] = None, + ): + if processor is None: + raise ValueError("processor is required") + if image is None: + return processor(text=text, return_tensors="pt") + processed_inputs = processor.format_and_preprocess_phrase_grounding_input( + frontal_image=image, + phrase=text, + return_tensors="pt", + ) + return processed_inputs + + MODEL_TYPE_TO_CLS_MAPPING = { "llava": _OVLlavaForCausalLM, "llava_next": _OVLlavaNextForCausalLM, "minicpmv": _OVMiniCPMVForCausalLM, "llava-qwen2": _OVNanoLlavaForCausalLM, + "maira2": _OVMaira2ForCausalLM, "phi3_v": _OVPhi3VisionForCausalLM, "internvl_chat": _OVInternVLForCausalLM, "qwen2_vl": _OVQwen2VLForCausalLM, diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 8ca72d06c2..aa276673b5 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -2110,9 +2110,9 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.40.0"): SUPPORTED_ARCHITECTURES += ["llava_next", "nanollava"] if is_transformers_version(">=", "4.45.0"): - SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2", "phi3_v", "qwen2_vl"] + SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2", "phi3_v", "qwen2_vl", "maira2"] TASK = "image-text-to-text" - REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v"] + REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v", "maira2"] IMAGE = Image.open( requests.get( @@ -2192,7 +2192,7 @@ def test_compare_to_transformers(self, model_arch): with torch.no_grad(): transformers_outputs = transformers_model(**transformers_inputs) self.assertTrue( - torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4), + torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=4e-3), f"Max abs diff {(torch.abs(ov_outputs.logits - transformers_outputs.logits).max())}", ) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index e4c2ede8e9..ac39b065ca 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -91,6 +91,7 @@ "opt": "hf-internal-testing/tiny-random-OPTModel", "opt125m": "facebook/opt-125m", "opt_gptq": "ybelkada/opt-125m-gptq-4bit", + "maira2": "katuni4ka/tiny-random-maira2", "marian": "sshleifer/tiny-marian-en-de", "mbart": "hf-internal-testing/tiny-random-mbart", "minicpm": "katuni4ka/tiny-random-minicpm",