Commit fcbc4c0

maira2 support
1 parent dd4fe68 commit fcbc4c0

3 files changed: +30, -0 lines

optimum/exporters/openvino/model_configs.py (+9)

@@ -1488,6 +1488,7 @@ def __init__(
         float_dtype: str = "fp32",
         behavior: LlavaConfigBehavior = LlavaConfigBehavior.VISION_EMBEDDINGS,
         preprocessors: Optional[List[Any]] = None,
+        **kwargs,
     ):
         super().__init__(
             config=config,
@@ -1584,6 +1585,14 @@ class LlavaNextOpenVINOConfig(LlavaOpenVINOConfig):
     MIN_TRANSFORMERS_VERSION = version.parse("4.40.0")


+@register_in_tasks_manager(
+    "maira2", *["image-text-to-text", "text-generation", "text-generation-with-past"], library_name="transformers"
+)
+class MairaOpenVINOConfig(LlavaOpenVINOConfig):
+    MIN_TRANSFORMERS_VERSION = version.parse("4.46.0")
+    SUPPORTS_PAST = True
+
+
 class InternVLChatConfigBehavior(str, enum.Enum):
     LANGUAGE = "language"
     VISION_EMBEDDINGS = "vision_embeddings"
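
With MairaOpenVINOConfig registered for the maira2 model type (and transformers >= 4.46.0 per MIN_TRANSFORMERS_VERSION), a MAIRA-2 checkpoint can be exported through the same LLaVA-style export path as the other models in this family. A minimal sketch, assuming the public microsoft/maira-2 checkpoint; the model id, output directory, and the need for trust_remote_code are illustrative assumptions, not part of this commit:

from optimum.intel import OVModelForVisualCausalLM

# Convert the PyTorch checkpoint to OpenVINO IR on the fly.
# trust_remote_code=True is assumed to be needed for the Hub-side MAIRA-2 code.
model = OVModelForVisualCausalLM.from_pretrained(
    "microsoft/maira-2",
    export=True,
    trust_remote_code=True,
)
model.save_pretrained("maira2-openvino")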

optimum/exporters/openvino/utils.py (+1)

@@ -222,6 +222,7 @@ def get_submodels(model):
     "llava-next",
     "llava-qwen2",
     "internvl-chat",
+    "maira2",
     "minicpmv",
     "phi3-v",
     "qwen2-vl",

optimum/intel/openvino/modeling_visual_language.py (+20)

@@ -2331,11 +2331,31 @@ def preprocess_inputs(
         return inputs


+class _OVMaira2ForCausalLM(_OVLlavaForCausalLM):
+    @staticmethod
+    def preprocess_inputs(
+        text: str,
+        image: Optional["Image"] = None,
+        processor: Optional[AutoImageProcessor] = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
+        config: Optional[PretrainedConfig] = None,
+    ):
+        if processor is None:
+            raise ValueError("processor is required")
+        processed_inputs = processor.format_and_preprocess_phrase_grounding_input(
+            frontal_image=image,
+            phrase=text,
+            return_tensors="pt",
+        )
+        return processed_inputs
+
+
 MODEL_TYPE_TO_CLS_MAPPING = {
     "llava": _OVLlavaForCausalLM,
     "llava_next": _OVLlavaNextForCausalLM,
     "minicpmv": _OVMiniCPMVForCausalLM,
     "llava-qwen2": _OVNanoLlavaForCausalLM,
+    "maira2": _OVMaira2ForCausalLM,
     "phi3_v": _OVPhi3VisionForCausalLM,
     "internvl_chat": _OVInternVLForCausalLM,
     "qwen2_vl": _OVQwen2VLForCausalLM,
