
add support for got-ocr2 #1202

Merged (2 commits, Mar 17, 2025)
1 change: 1 addition & 0 deletions docs/source/openvino/models.mdx
@@ -62,6 +62,7 @@ Here is the list of the supported architectures :
- GPT-NeoX-Japanese
- Gemma
- Gemma2
- GOT-OCR 2.0
- Granite
- GraniteMoE
- Hubert
14 changes: 14 additions & 0 deletions optimum/exporters/openvino/model_configs.py
@@ -79,6 +79,7 @@
    FalconModelPatcher,
    FluxTransfromerModelPatcher,
    Gemma2ModelPatcher,
    GotOCR2ImageEmbeddingsModelPatcher,
    GptBigCodeModelPatcher,
    GptJModelPatcher,
    GptNeoModelPatcher,
@@ -3001,3 +3002,16 @@ def patch_model_for_export(
        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
    ) -> "ModelPatcher":
        return DeepseekPatcher(self, model, model_kwargs=model_kwargs)


@register_in_tasks_manager("got-ocr2", *["image-to-text", "image-text-to-text"], library_name="transformers")
class GotOCR2OpenVINOConfig(LlavaOpenVINOConfig):
    MIN_TRANSFORMERS_VERSION = "4.49.0"

    def patch_model_for_export(
        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
    ):
        model_kwargs = model_kwargs or {}
        if self._behavior != LlavaConfigBehavior.VISION_EMBEDDINGS:
            return super().patch_model_for_export(model, model_kwargs)
        return GotOCR2ImageEmbeddingsModelPatcher(self, model, model_kwargs)

Member commented (on the VISION_EMBEDDINGS check above):

Why is VISION_EMBEDDINGS not added directly to ConfigBehavior? Using LlavaConfigBehavior looks wrong here.

@eaidova (Collaborator, Author) commented on Mar 17, 2025:

I'm not sure I understand your comment: there is no separate config behavior for this model class; it inherits it from the parent class, LlavaOpenVINOConfig. Vision embeddings are part of it and of its LlavaConfigBehavior. This override is required so that GOT-OCR-specific patching is used only for the vision_embeddings part instead of the Llava patching (because the two differ).

If the inverted logic looks clearer, I can invert it.

Member commented:

My point is that those behaviors should probably be added to the original ConfigBehavior class, like decoder/encoder. Having them in LlavaConfigBehavior makes less sense the more multi-modal models we add.
Anyway, this can be addressed in a separate PR if it makes sense to you as well; I will proceed with merging.

@eaidova (Collaborator, Author) commented on Mar 17, 2025:

Every multimodal model may have its own partitioning; it is difficult to maintain everything in a single enum class, and doing so would require additional logic to maintain and validate each model-specific structure.

It may be refactored later if you have a better solution.

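To make the suggestion above concrete, here is a rough sketch of the two enum layouts being discussed. This is illustration only, not code from this PR: the LlavaConfigBehavior members mirror the existing optimum-intel enum, while the extended ConfigBehavior variant is the hypothetical refactoring the reviewer proposes.

import enum


# Current layout (simplified): Llava-family export configs carry their own behavior enum.
class LlavaConfigBehavior(str, enum.Enum):
    LANGUAGE = "language"
    VISION_EMBEDDINGS = "vision_embeddings"
    TEXT_EMBEDDINGS = "text_embeddings"


# Hypothetical merged layout from the review: multimodal stages would sit in the
# shared ConfigBehavior next to encoder/decoder, so new multi-modal models would
# not need per-family enums.
class ConfigBehavior(str, enum.Enum):
    DEFAULT = "default"
    ENCODER = "encoder"
    DECODER = "decoder"
    VISION_EMBEDDINGS = "vision_embeddings"
    TEXT_EMBEDDINGS = "text_embeddings"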
17 changes: 17 additions & 0 deletions optimum/exporters/openvino/model_patcher.py
@@ -4405,3 +4405,20 @@ def __init__(
            layer.mlp.down_proj.to(torch.float32)

        super().__init__(config, model, model_kwargs)


class GotOCR2ImageEmbeddingsModelPatcher(ModelPatcher):
    def __init__(
        self,
        config: "OnnxConfig",
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        model_kwargs: Dict[str, Any],
    ):
        model.__orig_forward = model.forward
        # Adapted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/got_ocr2/modeling_got_ocr2.py#L835
        model.forward = model.get_image_features
        super().__init__(config, model, model_kwargs)

    def __exit__(self, exc_type, exc_value, traceback):
        super().__exit__(exc_type, exc_value, traceback)
        self._model.forward = self._model.__orig_forward
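For orientation, a minimal sketch of how this patcher is used around model conversion; the export pipeline in optimum-intel drives this internally. Here export_config stands for the GotOCR2OpenVINOConfig above, model for the loaded GOT-OCR2 model, and the example input shape is an assumption for illustration.

import openvino as ov
import torch

# Constructing the patcher redirects model.forward to model.get_image_features,
# so tracing below captures only the image-embedding path.
patcher = GotOCR2ImageEmbeddingsModelPatcher(export_config, model, model_kwargs={})
with patcher:
    ov_model = ov.convert_model(
        model,
        example_input={"pixel_values": torch.zeros(1, 3, 384, 384)},  # shape is an assumption
    )
# Leaving the with-block runs __exit__, which restores the original forward.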
1 change: 1 addition & 0 deletions optimum/exporters/openvino/utils.py
@@ -228,6 +228,7 @@ def get_submodels(model):
    "phi3-v",
    "qwen2-vl",
    "qwen2-5-vl",
    "got-ocr2",
]


45 changes: 45 additions & 0 deletions optimum/intel/openvino/modeling_visual_language.py
@@ -3109,6 +3109,50 @@ def preprocess_inputs(
        return processed_inputs


class _OVGotOCR2ForCausalLM(OVModelForVisualCausalLM):
    def get_vision_embeddings(self, pixel_values, input_ids, **kwargs):
        if input_ids is not None and input_ids.shape[1] == 1 and kwargs.get("past_key_values") is not None:
            return None
        return self.vision_embeddings(pixel_values).last_hidden_state

    def merge_vision_text_embeddings(
        self, vision_embeds, inputs_embeds, input_ids=None, attention_mask=None, position_ids=None, **kwargs
    ):
        # Adapted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/got_ocr2/modeling_got_ocr2.py#L836-L845
        image_features = torch.from_numpy(vision_embeds) if isinstance(vision_embeds, np.ndarray) else vision_embeds
        inputs_embeds = torch.from_numpy(inputs_embeds) if isinstance(inputs_embeds, np.ndarray) else inputs_embeds
        n_image_tokens = (input_ids == self.config.image_token_index).sum()
        n_image_features = image_features.shape[0] * image_features.shape[1]
        if n_image_tokens != n_image_features:
            raise ValueError(
                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
            )
        special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
        special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
        image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        return inputs_embeds, attention_mask, position_ids

    @staticmethod
    def preprocess_inputs(
        text: Optional[str] = None,
        image: Optional["Image"] = None,
        processor: Optional[AutoImageProcessor] = None,
        tokenizer: Optional[PreTrainedTokenizer] = None,
        config: Optional[PretrainedConfig] = None,
        video: Optional["VideoInput"] = None,
    ):
        if processor is None:
            raise ValueError("processor is required")
        if video is not None:
            raise ValueError("Video input is not supported")
        if image is None:
            raise ValueError("Image is required")
        processed_inputs = processor(image, return_tensors="pt")
        return processed_inputs


MODEL_TYPE_TO_CLS_MAPPING = {
    "llava": _OVLlavaForCausalLM,
    "llava_next": _OVLlavaNextForCausalLM,
@@ -3120,4 +3164,5 @@
    "internvl_chat": _OVInternVLForCausalLM,
    "qwen2_vl": _OVQwen2VLForCausalLM,
    "qwen2_5_vl": _OVQwen2_5_VLForCausalLM,
    "got_ocr2": _OVGotOCR2ForCausalLM,
}
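To illustrate the merging step implemented in merge_vision_text_embeddings above, a tiny self-contained sketch of the masked_scatter mechanics, using toy sizes and a hypothetical image-token id:

import torch

image_token_index = 5                     # hypothetical placeholder-token id
input_ids = torch.tensor([[1, 5, 5, 2]])  # prompt with two image placeholders
inputs_embeds = torch.zeros(1, 4, 3)      # (batch, seq_len, hidden) text embeddings
image_features = torch.ones(1, 2, 3)      # (num_images, tokens_per_image, hidden)

# Same consistency check as above: 2 placeholder tokens match 1 * 2 feature rows.
assert (input_ids == image_token_index).sum() == image_features.shape[0] * image_features.shape[1]

# Broadcast the placeholder mask over the hidden dim, then scatter image features in.
mask = (input_ids == image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
merged = inputs_embeds.masked_scatter(mask, image_features)

print(merged[0, 1])  # tensor([1., 1., 1.]) -> placeholder rows now hold image features
print(merged[0, 0])  # tensor([0., 0., 0.]) -> text rows are untouched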
28 changes: 18 additions & 10 deletions tests/openvino/test_modeling.py
@@ -2141,7 +2141,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
        SUPPORTED_ARCHITECTURES += ["maira2"]

    if is_transformers_version(">=", "4.49.0"):
-        SUPPORTED_ARCHITECTURES += ["qwen2_5_vl"]
+        SUPPORTED_ARCHITECTURES += ["qwen2_5_vl", "got_ocr2"]
        SUPPORT_VIDEO.append("qwen2_5_vl")
    TASK = "image-text-to-text"
    REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v", "maira2"]
@@ -2154,7 +2154,13 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
    )

    def get_transformer_model_class(self, model_arch):
-        if is_transformers_version(">=", "4.46") and model_arch in ["llava", "llava_next", "qwen2_vl", "qwen2_5_vl"]:
+        if is_transformers_version(">=", "4.46") and model_arch in [
+            "llava",
+            "llava_next",
+            "qwen2_vl",
+            "qwen2_5_vl",
+            "got_ocr2",
+        ]:
            from transformers import AutoModelForImageTextToText

            return AutoModelForImageTextToText
@@ -2339,14 +2345,16 @@ def test_generate_utils(self, model_arch):
        outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
        self.assertIsInstance(outputs[0], str)

-        # No input image case
-        question = "Hi, how are you?"
-        inputs = model.preprocess_inputs(**preprocessors, text=question, image=None)
-        outputs = model.generate(**inputs, max_new_tokens=10)
-        # filter out original prompt because it may contain out-of-tokenizer tokens, e.g. in nanollava the text separator is -200
-        outputs = outputs[:, inputs["input_ids"].shape[1] :]
-        outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-        self.assertIsInstance(outputs[0], str)
+        # GOT-OCR2 does not support text-only input
+        if model_arch != "got_ocr2":
+            # No input image case
+            question = "Hi, how are you?"
+            inputs = model.preprocess_inputs(**preprocessors, text=question, image=None)
+            outputs = model.generate(**inputs, max_new_tokens=10)
+            # filter out original prompt because it may contain out-of-tokenizer tokens, e.g. in nanollava the text separator is -200
+            outputs = outputs[:, inputs["input_ids"].shape[1] :]
+            outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+            self.assertIsInstance(outputs[0], str)

        # video loader helper only available for transformers >= 4.49
        if model_arch in self.SUPPORT_VIDEO and is_transformers_version(">=", "4.49"):
1 change: 1 addition & 0 deletions tests/openvino/utils_tests.py
@@ -63,6 +63,7 @@
    "exaone": "katuni4ka/tiny-random-exaone",
    "gemma": "fxmarty/tiny-random-GemmaForCausalLM",
    "gemma2": "katuni4ka/tiny-random-gemma2",
    "got_ocr2": "katuni4ka/tiny-random-got-ocr2-hf",
    "falcon": "fxmarty/really-tiny-falcon-testing",
    "falcon-40b": "katuni4ka/tiny-random-falcon-40b",
    "flaubert": "hf-internal-testing/tiny-random-flaubert",
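Putting it together, a hedged end-to-end sketch of the feature this PR adds. The checkpoint id and image file name are assumptions; OVModelForVisualCausalLM and the image-only preprocessing match the code added above.

from PIL import Image
from transformers import AutoProcessor

from optimum.intel import OVModelForVisualCausalLM

model_id = "stepfun-ai/GOT-OCR-2.0-hf"  # assumed GOT-OCR2 checkpoint (needs transformers >= 4.49)
processor = AutoProcessor.from_pretrained(model_id)

# export=True converts the checkpoint to OpenVINO on the fly via the new got-ocr2 export config.
model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True)

# GOT-OCR2 takes image-only inputs, matching preprocess_inputs above.
image = Image.open("document.png")  # any local document image
inputs = processor(image, return_tensors="pt")
generated = model.generate(**inputs, max_new_tokens=64)
text = processor.batch_decode(generated[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
print(text[0])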