
add support for got-ocr2 #1202

Merged (2 commits, Mar 17, 2025)
1 change: 1 addition & 0 deletions docs/source/openvino/models.mdx
@@ -62,6 +62,7 @@ Here is the list of the supported architectures :
- GPT-NeoX-Japanese
- Gemma
- Gemma2
- GOT-OCR 2.0
- Granite
- GraniteMoE
- Hubert
14 changes: 14 additions & 0 deletions optimum/exporters/openvino/model_configs.py
@@ -79,6 +79,7 @@
    FalconModelPatcher,
    FluxTransfromerModelPatcher,
    Gemma2ModelPatcher,
    GotOCR2ImageEmbeddingsModelPatcher,
    GptBigCodeModelPatcher,
    GptJModelPatcher,
    GptNeoModelPatcher,
@@ -3001,3 +3002,16 @@ def patch_model_for_export(
        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
    ) -> "ModelPatcher":
        return DeepseekPatcher(self, model, model_kwargs=model_kwargs)


@register_in_tasks_manager("got-ocr2", *["image-to-text", "image-text-to-text"], library_name="transformers")
class GotOCR2OpenVINOConfig(LlavaOpenVINOConfig):
    MIN_TRANSFORMERS_VERSION = "4.49.0"

    def patch_model_for_export(
        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
    ):
        model_kwargs = model_kwargs or {}
        if self._behavior != LlavaConfigBehavior.VISION_EMBEDDINGS:
            return super().patch_model_for_export(model, model_kwargs)
        return GotOCR2ImageEmbeddingsModelPatcher(self, model, model_kwargs)

Member commented (on the VISION_EMBEDDINGS check above):

Why is VISION_EMBEDDINGS not added directly to ConfigBehavior? Using LlavaConfigBehavior looks wrong here.

@eaidova (Collaborator, Author) commented on Mar 17, 2025:

I'm not sure I understand your comment: there is no separate config behavior for this model class; it inherits it from the parent class, LlavaOpenVINOConfig. Vision embeddings are part of it and of its LlavaConfigBehavior. This override is required so that GOT-OCR-specific patching is used only for the vision_embeddings part instead of the Llava patching (because the two differ).

If the inverted logic looks clearer, I can invert it.

Member commented:

My point is that those behaviors should probably be added to the original ConfigBehavior class, like decoder/encoder. Having them in LlavaConfigBehavior makes less sense the more multi-modal models we add.
Anyway, this can be addressed in a separate PR if it makes sense to you as well; I will proceed with merging.

@eaidova (Collaborator, Author) commented on Mar 17, 2025:

Every multimodal model may have its own partitioning; it is difficult to maintain everything in a single enum class, and doing so would require additional logic to maintain and validate each model-specific structure.

It may be refactored later if you have a better solution.

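To make the suggestion above concrete, here is a rough sketch of the two enum layouts being discussed. This is illustration only, not code from this PR: the LlavaConfigBehavior members mirror the existing optimum-intel enum, while the extended ConfigBehavior variant is the hypothetical refactoring the reviewer proposes.

import enum


# Current layout (simplified): Llava-family export configs carry their own behavior enum.
class LlavaConfigBehavior(str, enum.Enum):
    LANGUAGE = "language"
    VISION_EMBEDDINGS = "vision_embeddings"
    TEXT_EMBEDDINGS = "text_embeddings"


# Hypothetical merged layout from the review: multimodal stages would sit in the
# shared ConfigBehavior next to encoder/decoder, so new multi-modal models would
# not need per-family enums.
class ConfigBehavior(str, enum.Enum):
    DEFAULT = "default"
    ENCODER = "encoder"
    DECODER = "decoder"
    VISION_EMBEDDINGS = "vision_embeddings"
    TEXT_EMBEDDINGS = "text_embeddings"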
17 changes: 17 additions & 0 deletions optimum/exporters/openvino/model_patcher.py
@@ -4405,3 +4405,20 @@ def __init__(
            layer.mlp.down_proj.to(torch.float32)

        super().__init__(config, model, model_kwargs)


class GotOCR2ImageEmbeddingsModelPatcher(ModelPatcher):
    def __init__(
        self,
        config: "OnnxConfig",
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        model_kwargs: Dict[str, Any],
    ):
        model.__orig_forward = model.forward
        # Adapted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/got_ocr2/modeling_got_ocr2.py#L835
        model.forward = model.get_image_features
        super().__init__(config, model, model_kwargs)

    def __exit__(self, exc_type, exc_value, traceback):
        super().__exit__(exc_type, exc_value, traceback)
        self._model.forward = self._model.__orig_forward
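For orientation, a minimal sketch of how this patcher is used around model conversion; the export pipeline in optimum-intel drives this internally. Here export_config stands for the GotOCR2OpenVINOConfig above, model for the loaded GOT-OCR2 model, and the example input shape is an assumption for illustration.

import openvino as ov
import torch

# Constructing the patcher redirects model.forward to model.get_image_features,
# so tracing below captures only the image-embedding path.
patcher = GotOCR2ImageEmbeddingsModelPatcher(export_config, model, model_kwargs={})
with patcher:
    ov_model = ov.convert_model(
        model,
        example_input={"pixel_values": torch.zeros(1, 3, 384, 384)},  # shape is an assumption
    )
# Leaving the with-block runs __exit__, which restores the original forward.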
1 change: 1 addition & 0 deletions optimum/exporters/openvino/utils.py
@@ -228,6 +228,7 @@ def get_submodels(model):
    "phi3-v",
    "qwen2-vl",
    "qwen2-5-vl",
    "got-ocr2",
]


45 changes: 45 additions & 0 deletions optimum/intel/openvino/modeling_visual_language.py
@@ -3109,6 +3109,50 @@ def preprocess_inputs(
        return processed_inputs


class _OVGotOCR2ForCausalLM(OVModelForVisualCausalLM):
    def get_vision_embeddings(self, pixel_values, input_ids, **kwargs):
        if input_ids is not None and input_ids.shape[1] == 1 and kwargs.get("past_key_values") is not None:
            return None
        return self.vision_embeddings(pixel_values).last_hidden_state

    def merge_vision_text_embeddings(
        self, vision_embeds, inputs_embeds, input_ids=None, attention_mask=None, position_ids=None, **kwargs
    ):
        # Adapted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/got_ocr2/modeling_got_ocr2.py#L836-L845
        image_features = torch.from_numpy(vision_embeds) if isinstance(vision_embeds, np.ndarray) else vision_embeds
        inputs_embeds = torch.from_numpy(inputs_embeds) if isinstance(inputs_embeds, np.ndarray) else inputs_embeds
        n_image_tokens = (input_ids == self.config.image_token_index).sum()
        n_image_features = image_features.shape[0] * image_features.shape[1]
        if n_image_tokens != n_image_features:
            raise ValueError(
                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
            )
        special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
        special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
        image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        return inputs_embeds, attention_mask, position_ids

    @staticmethod
    def preprocess_inputs(
        text: Optional[str] = None,
        image: Optional["Image"] = None,
        processor: Optional[AutoImageProcessor] = None,
        tokenizer: Optional[PreTrainedTokenizer] = None,
        config: Optional[PretrainedConfig] = None,
        video: Optional["VideoInput"] = None,
    ):
        if processor is None:
            raise ValueError("processor is required")
        if video is not None:
            raise ValueError("Video input is not supported")
        if image is None:
            raise ValueError("Image is required")
        processed_inputs = processor(image, return_tensors="pt")
        return processed_inputs


MODEL_TYPE_TO_CLS_MAPPING = {
    "llava": _OVLlavaForCausalLM,
    "llava_next": _OVLlavaNextForCausalLM,
@@ -3120,4 +3164,5 @@
    "internvl_chat": _OVInternVLForCausalLM,
    "qwen2_vl": _OVQwen2VLForCausalLM,
    "qwen2_5_vl": _OVQwen2_5_VLForCausalLM,
    "got_ocr2": _OVGotOCR2ForCausalLM,
}
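To illustrate the merging step implemented in merge_vision_text_embeddings above, a tiny self-contained sketch of the masked_scatter mechanics, using toy sizes and a hypothetical image-token id:

import torch

image_token_index = 5                     # hypothetical placeholder-token id
input_ids = torch.tensor([[1, 5, 5, 2]])  # prompt with two image placeholders
inputs_embeds = torch.zeros(1, 4, 3)      # (batch, seq_len, hidden) text embeddings
image_features = torch.ones(1, 2, 3)      # (num_images, tokens_per_image, hidden)

# Same consistency check as above: 2 placeholder tokens match 1 * 2 feature rows.
assert (input_ids == image_token_index).sum() == image_features.shape[0] * image_features.shape[1]

# Broadcast the placeholder mask over the hidden dim, then scatter image features in.
mask = (input_ids == image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
merged = inputs_embeds.masked_scatter(mask, image_features)

print(merged[0, 1])  # tensor([1., 1., 1.]) -> placeholder rows now hold image features
print(merged[0, 0])  # tensor([0., 0., 0.]) -> text rows are untouched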
28 changes: 18 additions & 10 deletions tests/openvino/test_modeling.py
@@ -2141,7 +2141,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
        SUPPORTED_ARCHITECTURES += ["maira2"]

    if is_transformers_version(">=", "4.49.0"):
-        SUPPORTED_ARCHITECTURES += ["qwen2_5_vl"]
+        SUPPORTED_ARCHITECTURES += ["qwen2_5_vl", "got_ocr2"]
        SUPPORT_VIDEO.append("qwen2_5_vl")
    TASK = "image-text-to-text"
    REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v", "maira2"]
@@ -2154,7 +2154,13 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
    )

    def get_transformer_model_class(self, model_arch):
-        if is_transformers_version(">=", "4.46") and model_arch in ["llava", "llava_next", "qwen2_vl", "qwen2_5_vl"]:
+        if is_transformers_version(">=", "4.46") and model_arch in [
+            "llava",
+            "llava_next",
+            "qwen2_vl",
+            "qwen2_5_vl",
+            "got_ocr2",
+        ]:
            from transformers import AutoModelForImageTextToText

            return AutoModelForImageTextToText
@@ -2339,14 +2345,16 @@ def test_generate_utils(self, model_arch):
        outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
        self.assertIsInstance(outputs[0], str)

-        # No input image case
-        question = "Hi, how are you?"
-        inputs = model.preprocess_inputs(**preprocessors, text=question, image=None)
-        outputs = model.generate(**inputs, max_new_tokens=10)
-        # filter out original prompt because it may contain out-of-tokenizer tokens, e.g. in nanollava the text separator is -200
-        outputs = outputs[:, inputs["input_ids"].shape[1] :]
-        outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-        self.assertIsInstance(outputs[0], str)
+        # GOT-OCR2 does not support text-only input
+        if model_arch != "got_ocr2":
+            # No input image case
+            question = "Hi, how are you?"
+            inputs = model.preprocess_inputs(**preprocessors, text=question, image=None)
+            outputs = model.generate(**inputs, max_new_tokens=10)
+            # filter out original prompt because it may contain out-of-tokenizer tokens, e.g. in nanollava the text separator is -200
+            outputs = outputs[:, inputs["input_ids"].shape[1] :]
+            outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+            self.assertIsInstance(outputs[0], str)

        # video loader helper only available for transformers >= 4.49
        if model_arch in self.SUPPORT_VIDEO and is_transformers_version(">=", "4.49"):
1 change: 1 addition & 0 deletions tests/openvino/utils_tests.py
@@ -63,6 +63,7 @@
    "exaone": "katuni4ka/tiny-random-exaone",
    "gemma": "fxmarty/tiny-random-GemmaForCausalLM",
    "gemma2": "katuni4ka/tiny-random-gemma2",
    "got_ocr2": "katuni4ka/tiny-random-got-ocr2-hf",
    "falcon": "fxmarty/really-tiny-falcon-testing",
    "falcon-40b": "katuni4ka/tiny-random-falcon-40b",
    "flaubert": "hf-internal-testing/tiny-random-flaubert",
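Putting it together, a hedged end-to-end sketch of the feature this PR adds. The checkpoint id and image file name are assumptions; OVModelForVisualCausalLM and the image-only preprocessing match the code added above.

from PIL import Image
from transformers import AutoProcessor

from optimum.intel import OVModelForVisualCausalLM

model_id = "stepfun-ai/GOT-OCR-2.0-hf"  # assumed GOT-OCR2 checkpoint (needs transformers >= 4.49)
processor = AutoProcessor.from_pretrained(model_id)

# export=True converts the checkpoint to OpenVINO on the fly via the new got-ocr2 export config.
model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True)

# GOT-OCR2 takes image-only inputs, matching preprocess_inputs above.
image = Image.open("document.png")  # any local document image
inputs = processor(image, return_tensors="pt")
generated = model.generate(**inputs, max_new_tokens=64)
text = processor.batch_decode(generated[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
print(text[0])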