
Commit bbf09cd

Merge branch 'main' into patch
2 parents 4db4db5 + 5ac3544 commit bbf09cd

File tree: 11 files changed, +195 -14 lines changed


docs/source/openvino/models.mdx (+1)

@@ -62,6 +62,7 @@ Here is the list of the supported architectures :
 - GPT-NeoX-Japanese
 - Gemma
 - Gemma2
+- GOT-OCR 2.0
 - Granite
 - GraniteMoE
 - Hubert

optimum/exporters/openvino/model_configs.py (+14)

@@ -79,6 +79,7 @@
     FalconModelPatcher,
     FluxTransfromerModelPatcher,
     Gemma2ModelPatcher,
+    GotOCR2ImageEmbeddingsModelPatcher,
     GptBigCodeModelPatcher,
     GptJModelPatcher,
     GptNeoModelPatcher,
@@ -3001,3 +3002,16 @@ def patch_model_for_export(
         self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
     ) -> "ModelPatcher":
         return DeepseekPatcher(self, model, model_kwargs=model_kwargs)
+
+
+@register_in_tasks_manager("got-ocr2", *["image-to-text", "image-text-to-text"], library_name="transformers")
+class GotOCR2OpenVINOConfig(LlavaOpenVINOConfig):
+    MIN_TRANSFORMERS_VERSION = "4.49.0"
+
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ):
+        model_kwargs = model_kwargs or {}
+        if self._behavior != LlavaConfigBehavior.VISION_EMBEDDINGS:
+            return super().patch_model_for_export(model, model_kwargs)
+        return GotOCR2ImageEmbeddingsModelPatcher(self, model, model_kwargs)
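
With this config registered, a GOT-OCR 2.0 checkpoint can be exported through the same path as the other supported VLMs. A minimal sketch, assuming a transformers >= 4.49 environment with the OpenVINO extra installed (the checkpoint id and output directory are illustrative):

# Hedged example: export a GOT-OCR 2.0 checkpoint to OpenVINO IR and save it locally.
from optimum.intel import OVModelForVisualCausalLM

model_id = "stepfun-ai/GOT-OCR-2.0-hf"  # illustrative checkpoint id
ov_model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True)
ov_model.save_pretrained("got-ocr2-openvino")  # illustrative output directory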

optimum/exporters/openvino/model_patcher.py (+17)

@@ -4405,3 +4405,20 @@ def __init__(
             layer.mlp.down_proj.to(torch.float32)

         super().__init__(config, model, model_kwargs)
+
+
+class GotOCR2ImageEmbeddingsModelPatcher(ModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Dict[str, Any],
+    ):
+        model.__orig_forward = model.forward
+        # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/got_ocr2/modeling_got_ocr2.py#L835
+        model.forward = model.get_image_features
+        super().__init__(config, model, model_kwargs)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        self._model.forward = self._model.__orig_forward
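
The patcher's only job is to expose the vision tower as the model's `forward` while the vision-embeddings sub-model is traced, and to restore the original `forward` on exit. A toy sketch of that swap-and-restore pattern (these class names are hypothetical, not part of optimum or transformers):

# Toy illustration of the context-managed forward swap used above.
class ToyModel:
    def forward(self, pixel_values):
        return "full multimodal forward"

    def get_image_features(self, pixel_values):
        return "vision embeddings only"


class SwapForwardForExport:
    def __init__(self, model):
        self._model = model

    def __enter__(self):
        # keep a handle to the original forward and expose only the vision path
        self._model._orig_forward = self._model.forward
        self._model.forward = self._model.get_image_features
        return self._model

    def __exit__(self, exc_type, exc_value, traceback):
        # restore the original forward once tracing is done
        self._model.forward = self._model._orig_forward


model = ToyModel()
with SwapForwardForExport(model):
    print(model.forward(None))  # "vision embeddings only"
print(model.forward(None))  # "full multimodal forward"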

optimum/exporters/openvino/utils.py (+1)

@@ -228,6 +228,7 @@ def get_submodels(model):
     "phi3-v",
     "qwen2-vl",
     "qwen2-5-vl",
+    "got-ocr2",
 ]

optimum/intel/ipex/modeling_base.py (+11)

@@ -71,6 +71,17 @@
 _COMPILE_NOT_READY_MODEL_TYPES = ("llama", "falcon", "gpt2", "qwen2")


+try:
+    import intel_extension_for_pytorch as ipex
+
+    if hasattr(torch, "xpu") and torch.xpu.is_available() and not ipex._C._has_xpu():
+        logger.warning(
+            "An XPU device was detected, but the installed IPEX build does not support XPU. Please install an XPU-enabled IPEX build, see https://pytorch-extension.intel.com/installation?platform=gpu"
+        )
+except ImportError:
+    logger.warning("intel_extension_for_pytorch not found, please run `pip install intel_extension_for_pytorch`")
+
+
 def _is_patched_with_ipex(model, task, use_cache: bool = True):
     if is_ipex_version("<", _IPEX_MINIMUM_VERSION_FOR_PATCHING):
         return False

optimum/intel/openvino/modeling_diffusion.py (+58 -3)

@@ -889,9 +889,7 @@ def reshape(
            )

        if self.text_encoder_3 is not None:
-            self.text_encoder_3.model = self._reshape_text_encoder(
-                self.text_encoder_3.model, batch_size, getattr(self.tokenizer_3, "model_max_length", -1)
-            )
+            self.text_encoder_3.model = self._reshape_text_encoder(self.text_encoder_3.model, batch_size, -1)

        self.clear_requests()
        return self
@@ -973,6 +971,63 @@ def __call__(self, *args, **kwargs):
        for k, v in kwargs.items():
            kwargs[k] = np_to_pt_generators(v, self.device)

+        height, width = None, None
+        height_idx, width_idx = None, None
+        shapes_overriden = False
+        sig = inspect.signature(self.auto_model_class.__call__)
+        sig_height_idx = list(sig.parameters).index("height") if "height" in sig.parameters else len(sig.parameters)
+        sig_width_idx = list(sig.parameters).index("width") if "width" in sig.parameters else len(sig.parameters)
+        if "height" in kwargs:
+            height = kwargs["height"]
+        elif len(args) > sig_height_idx:
+            height = args[sig_height_idx]
+            height_idx = sig_height_idx
+
+        if "width" in kwargs:
+            width = kwargs["width"]
+        elif len(args) > sig_width_idx:
+            width = args[sig_width_idx]
+            width_idx = sig_width_idx
+
+        if self.height != -1:
+            if height is not None and height != self.height:
+                logger.warning(f"Incompatible height argument provided {height}. Pipeline only supports {self.height}.")
+                height = self.height
+            else:
+                height = self.height
+
+            if height_idx is not None:
+                args[height_idx] = height
+            else:
+                kwargs["height"] = height
+
+            shapes_overriden = True
+
+        if self.width != -1:
+            if width is not None and width != self.width:
+                logger.warning(f"Incompatible width argument provided {width}. Pipeline only supports {self.width}.")
+                width = self.width
+            else:
+                width = self.width
+
+            if width_idx is not None:
+                args[width_idx] = width
+            else:
+                kwargs["width"] = width
+            shapes_overriden = True
+
+        # By default, Sana generates images on a fixed resolution grid and then resizes them to the requested size,
+        # which may contradict the pipeline's static height / width. Disable this behavior for static shape pipelines.
+        if self.auto_model_class.__name__.startswith("Sana") and shapes_overriden:
+            sig_resolution_bining_idx = (
+                list(sig.parameters).index("use_resolution_binning")
+                if "use_resolution_binning" in sig.parameters
+                else len(sig.parameters)
+            )
+            if len(args) > sig_resolution_bining_idx:
+                args[sig_resolution_bining_idx] = False
+            else:
+                kwargs["use_resolution_binning"] = False
        # we use auto_model_class.__call__ here because we can't call super().__call__
        # as OptimizedModel already defines a __call__ which is the first in the MRO
        return self.auto_model_class.__call__(self, *args, **kwargs)
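
In practice this means a pipeline reshaped to a static resolution enforces that resolution at call time and warns when the caller asks for something else. A hedged usage sketch (the tiny checkpoint id is illustrative; any diffusion model supported by optimum-intel would do):

# Hedged example: reshape an OpenVINO diffusion pipeline to a static 256x256 and call it
# with a mismatching size; the new __call__ logic clamps to the static shape and warns.
from optimum.intel import OVStableDiffusionPipeline

model_id = "hf-internal-testing/tiny-stable-diffusion-torch"  # illustrative tiny checkpoint
pipe = OVStableDiffusionPipeline.from_pretrained(model_id, export=True, compile=False)
pipe.reshape(batch_size=1, height=256, width=256)
pipe.compile()

# The requested 512x512 is overridden to the pipeline's static 256x256 with a warning.
image = pipe(prompt="a lighthouse at dusk", height=512, width=512).images[0]
assert image.size == (256, 256)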

optimum/intel/openvino/modeling_visual_language.py (+45)

@@ -3109,6 +3109,50 @@ def preprocess_inputs(
        return processed_inputs


+class _OVGotOCR2ForCausalLM(OVModelForVisualCausalLM):
+    def get_vision_embeddings(self, pixel_values, input_ids, **kwargs):
+        if input_ids is not None and input_ids.shape[1] == 1 and kwargs.get("past_key_values") is not None:
+            return None
+        return self.vision_embeddings(pixel_values).last_hidden_state
+
+    def merge_vision_text_embeddings(
+        self, vision_embeds, inputs_embeds, input_ids=None, attention_mask=None, position_ids=None, **kwargs
+    ):
+        # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/got_ocr2/modeling_got_ocr2.py#L836-L845
+        image_features = torch.from_numpy(vision_embeds) if isinstance(vision_embeds, np.ndarray) else vision_embeds
+        inputs_embeds = torch.from_numpy(inputs_embeds) if isinstance(inputs_embeds, np.ndarray) else inputs_embeds
+        n_image_tokens = (input_ids == self.config.image_token_index).sum()
+        n_image_features = image_features.shape[0] * image_features.shape[1]
+        if n_image_tokens != n_image_features:
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+            )
+        special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
+        special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
+        image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+
+        return inputs_embeds, attention_mask, position_ids
+
+    @staticmethod
+    def preprocess_inputs(
+        text: Optional[str] = None,
+        image: Optional["Image"] = None,
+        processor: Optional[AutoImageProcessor] = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
+        config: Optional[PretrainedConfig] = None,
+        video: Optional["VideoInput"] = None,
+    ):
+        if processor is None:
+            raise ValueError("processor is required")
+        if video is not None:
+            raise ValueError("Video input is not supported")
+        if image is None:
+            raise ValueError("Image is required")
+        processed_inputs = processor(image, return_tensors="pt")
+        return processed_inputs
+
+
 MODEL_TYPE_TO_CLS_MAPPING = {
     "llava": _OVLlavaForCausalLM,
     "llava_next": _OVLlavaNextForCausalLM,
@@ -3120,4 +3164,5 @@ def preprocess_inputs(
     "internvl_chat": _OVInternVLForCausalLM,
     "qwen2_vl": _OVQwen2VLForCausalLM,
     "qwen2_5_vl": _OVQwen2_5_VLForCausalLM,
+    "got_ocr2": _OVGotOCR2ForCausalLM,
 }
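
With the `got_ocr2` entry in `MODEL_TYPE_TO_CLS_MAPPING`, the model loads through the generic `OVModelForVisualCausalLM` entry point: the vision encoder runs only on the first generation step and its features are scattered into the text embeddings at the image-token positions. A hedged end-to-end sketch (checkpoint id and image path are illustrative):

# Hedged example: run OCR with a GOT-OCR 2.0 model exported to OpenVINO.
from PIL import Image
from transformers import AutoProcessor
from optimum.intel import OVModelForVisualCausalLM

model_id = "stepfun-ai/GOT-OCR-2.0-hf"  # illustrative checkpoint id
processor = AutoProcessor.from_pretrained(model_id)
model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True)

image = Image.open("document.png")  # illustrative input image
inputs = model.preprocess_inputs(image=image, processor=processor)
generated = model.generate(**inputs, max_new_tokens=128)
# strip the prompt tokens before decoding
text = processor.batch_decode(generated[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)[0]
print(text)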

setup.py (+1 -1)

@@ -67,7 +67,7 @@
     "nncf": ["nncf>=2.14.0"],
     "openvino": ["nncf>=2.14.0", "openvino>=2024.5.0", "openvino-tokenizers>=2024.5.0"],
     "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate", "transformers<4.46"],
-    "ipex": ["intel-extension-for-pytorch>=2.4", "transformers>4.48,<4.50", "accelerate"],
+    "ipex": ["intel-extension-for-pytorch>=2.6", "transformers>4.48,<4.50", "accelerate"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,
     "tests": TESTS_REQUIRE,

tests/openvino/test_diffusion.py (+28)

@@ -13,6 +13,7 @@
 # limitations under the License.

 import json
+import logging
 import unittest
 from pathlib import Path

@@ -438,6 +439,33 @@ def test_load_custom_weight_variant(self):

        np.testing.assert_allclose(ov_images, diffusers_images, atol=1e-4, rtol=1e-2)

+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_static_shape_image_generation(self, model_arch):
+        pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], compile=False)
+        pipeline.reshape(batch_size=1, height=32, width=32)
+        pipeline.compile()
+        # generation with incompatible size
+        height, width, batch_size = 64, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+        inputs["output_type"] = "pil"
+        from optimum.intel.openvino.modeling_diffusion import logger as diffusers_logger
+
+        with self.assertLogs(diffusers_logger, logging.WARN) as warning_log:
+            image = pipeline(**inputs).images[0]
+        self.assertTrue(
+            any(
+                "Incompatible width argument provided" in log or "Incompatible height argument provided" in log
+                for log in warning_log.output
+            )
+        )
+        self.assertTupleEqual(image.size, (32, 32))
+        # generation without height / width provided
+        inputs.pop("height")
+        inputs.pop("width")
+        image = pipeline(**inputs).images[0]
+        self.assertTupleEqual(image.size, (32, 32))
+

 class OVPipelineForImage2ImageTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"]

tests/openvino/test_modeling.py (+18 -10)

@@ -2141,7 +2141,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
        SUPPORTED_ARCHITECTURES += ["maira2"]

    if is_transformers_version(">=", "4.49.0"):
-        SUPPORTED_ARCHITECTURES += ["qwen2_5_vl"]
+        SUPPORTED_ARCHITECTURES += ["qwen2_5_vl", "got_ocr2"]
        SUPPORT_VIDEO.append("qwen2_5_vl")
    TASK = "image-text-to-text"
    REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v", "maira2"]
@@ -2154,7 +2154,13 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
    )

    def get_transformer_model_class(self, model_arch):
-        if is_transformers_version(">=", "4.46") and model_arch in ["llava", "llava_next", "qwen2_vl", "qwen2_5_vl"]:
+        if is_transformers_version(">=", "4.46") and model_arch in [
+            "llava",
+            "llava_next",
+            "qwen2_vl",
+            "qwen2_5_vl",
+            "got_ocr2",
+        ]:
            from transformers import AutoModelForImageTextToText

            return AutoModelForImageTextToText
@@ -2339,14 +2345,16 @@ def test_generate_utils(self, model_arch):
        outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
        self.assertIsInstance(outputs[0], str)

-        # No input image case
-        question = "Hi, how are you?"
-        inputs = model.preprocess_inputs(**preprocessors, text=question, image=None)
-        outputs = model.generate(**inputs, max_new_tokens=10)
-        # filter out original prompt becuase it may contains out of tokenizer tokens e.g. in nanollva text separator = -200
-        outputs = outputs[:, inputs["input_ids"].shape[1] :]
-        outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-        self.assertIsInstance(outputs[0], str)
+        # GOT-OCR2 does not support text-only input
+        if model_arch != "got_ocr2":
+            # No input image case
+            question = "Hi, how are you?"
+            inputs = model.preprocess_inputs(**preprocessors, text=question, image=None)
+            outputs = model.generate(**inputs, max_new_tokens=10)
+            # filter out the original prompt because it may contain tokens outside the tokenizer vocabulary, e.g. in nanollava the text separator is -200
+            outputs = outputs[:, inputs["input_ids"].shape[1] :]
+            outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+            self.assertIsInstance(outputs[0], str)

        # video loader helper only available for transformers >= 4.49
        if model_arch in self.SUPPORT_VIDEO and is_transformers_version(">=", "4.49"):

tests/openvino/utils_tests.py (+1)

@@ -63,6 +63,7 @@
     "exaone": "katuni4ka/tiny-random-exaone",
     "gemma": "fxmarty/tiny-random-GemmaForCausalLM",
     "gemma2": "katuni4ka/tiny-random-gemma2",
+    "got_ocr2": "katuni4ka/tiny-random-got-ocr2-hf",
     "falcon": "fxmarty/really-tiny-falcon-testing",
     "falcon-40b": "katuni4ka/tiny-random-falcon-40b",
     "flaubert": "hf-internal-testing/tiny-random-flaubert",
