Skip to content

Commit 2113731

Browse files
committed
maira2 support
1 parent f601b8b commit 2113731

File tree

3 files changed

+29
-0
lines changed

3 files changed

+29
-0
lines changed

optimum/exporters/openvino/model_configs.py

+7
Original file line numberDiff line numberDiff line change
@@ -1488,6 +1488,7 @@ def __init__(
14881488
float_dtype: str = "fp32",
14891489
behavior: LlavaConfigBehavior = LlavaConfigBehavior.VISION_EMBEDDINGS,
14901490
preprocessors: Optional[List[Any]] = None,
1491+
**kwargs
14911492
):
14921493
super().__init__(
14931494
config=config,
@@ -1584,6 +1585,12 @@ class LlavaNextOpenVINOConfig(LlavaOpenVINOConfig):
15841585
MIN_TRANSFORMERS_VERSION = version.parse("4.40.0")
15851586

15861587

1588+
@register_in_tasks_manager(
    "maira2",
    "image-text-to-text",
    "text-generation",
    "text-generation-with-past",
    library_name="transformers",
)
class MairaOpenVINOConfig(LlavaOpenVINOConfig):
    """OpenVINO export configuration for MAIRA-2.

    MAIRA-2 follows the LLaVA architecture, so the whole export behavior is
    inherited from ``LlavaOpenVINOConfig``; only the model-type registration
    and version/feature flags differ.
    """

    # MAIRA-2 support landed in transformers 4.46, so exporting requires at
    # least that release.
    MIN_TRANSFORMERS_VERSION = version.parse("4.46.0")
    # The decoder supports KV-cache reuse ("with-past" generation).
    SUPPORTS_PAST = True
15871594
class InternVLChatConfigBehavior(str, enum.Enum):
15881595
LANGUAGE = "language"
15891596
VISION_EMBEDDINGS = "vision_embeddings"

optimum/exporters/openvino/utils.py

+1
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,7 @@ def get_submodels(model):
222222
"llava-next",
223223
"llava-qwen2",
224224
"internvl-chat",
225+
"maira2",
225226
"minicpmv",
226227
"phi3-v",
227228
"qwen2-vl",

optimum/intel/openvino/modeling_visual_language.py

+21
Original file line numberDiff line numberDiff line change
@@ -2331,11 +2331,32 @@ def preprocess_inputs(
23312331
return inputs
23322332

23332333

2334+
class _OVMaira2ForCausalLM(_OVLlavaForCausalLM):
    """OpenVINO runtime wrapper for MAIRA-2.

    Inference is identical to the LLaVA wrapper; only the input
    preprocessing differs, because the MAIRA-2 processor exposes a dedicated
    phrase-grounding formatting helper.
    """

    @staticmethod
    def preprocess_inputs(
        text: str,
        image: Optional["Image"] = None,
        processor: Optional[AutoImageProcessor] = None,
        tokenizer: Optional[PreTrainedTokenizer] = None,
        config: Optional[PretrainedConfig] = None,
    ):
        """Build model inputs for a phrase-grounding request.

        Args:
            text: phrase to ground in the image.
            image: frontal image the phrase refers to.
            processor: MAIRA-2 processor; mandatory, since it owns the
                formatting logic.
            tokenizer: unused; accepted only to match the sibling
                ``preprocess_inputs`` signatures.
            config: unused; accepted only to match the sibling signatures.

        Returns:
            The tensor dict produced by the processor (``return_tensors="pt"``).

        Raises:
            ValueError: if ``processor`` is not provided.
        """
        if processor is None:
            raise ValueError("processor is required")
        # Delegate all prompt formatting and image preprocessing to the
        # model-specific helper shipped with the MAIRA-2 processor.
        return processor.format_and_preprocess_phrase_grounding_input(
            frontal_image=image,
            phrase=text,
            return_tensors="pt",
        )
2353+
23342354
MODEL_TYPE_TO_CLS_MAPPING = {
23352355
"llava": _OVLlavaForCausalLM,
23362356
"llava_next": _OVLlavaNextForCausalLM,
23372357
"minicpmv": _OVMiniCPMVForCausalLM,
23382358
"llava-qwen2": _OVNanoLlavaForCausalLM,
2359+
"maira2": _OVMaira2ForCausalLM,
23392360
"phi3_v": _OVPhi3VisionForCausalLM,
23402361
"internvl_chat": _OVInternVLForCausalLM,
23412362
"qwen2_vl": _OVQwen2VLForCausalLM,

0 commit comments

Comments
 (0)