
Commit bbf09cd

Merge branch 'main' into patch
2 parents 4db4db5 + 5ac3544 commit bbf09cd

File tree: 11 files changed, +195 -14 lines changed


docs/source/openvino/models.mdx (+1)

@@ -62,6 +62,7 @@ Here is the list of the supported architectures :
 - GPT-NeoX-Japanese
 - Gemma
 - Gemma2
+- GOT-OCR 2.0
 - Granite
 - GraniteMoE
 - Hubert

optimum/exporters/openvino/model_configs.py (+14)

@@ -79,6 +79,7 @@
     FalconModelPatcher,
     FluxTransfromerModelPatcher,
     Gemma2ModelPatcher,
+    GotOCR2ImageEmbeddingsModelPatcher,
     GptBigCodeModelPatcher,
     GptJModelPatcher,
     GptNeoModelPatcher,
@@ -3001,3 +3002,16 @@ def patch_model_for_export(
         self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
     ) -> "ModelPatcher":
         return DeepseekPatcher(self, model, model_kwargs=model_kwargs)
+
+
+@register_in_tasks_manager("got-ocr2", *["image-to-text", "image-text-to-text"], library_name="transformers")
+class GotOCR2OpenVINOConfig(LlavaOpenVINOConfig):
+    MIN_TRANSFORMERS_VERSION = "4.49.0"
+
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ):
+        model_kwargs = model_kwargs or {}
+        if self._behavior != LlavaConfigBehavior.VISION_EMBEDDINGS:
+            return super().patch_model_for_export(model, model_kwargs)
+        return GotOCR2ImageEmbeddingsModelPatcher(self, model, model_kwargs)
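
With this config registered, a GOT-OCR 2.0 checkpoint can be exported through the same path as the other supported VLMs. A minimal sketch, assuming a transformers >= 4.49 environment with the OpenVINO extra installed (the checkpoint id and output directory are illustrative):

# Hedged example: export a GOT-OCR 2.0 checkpoint to OpenVINO IR and save it locally.
from optimum.intel import OVModelForVisualCausalLM

model_id = "stepfun-ai/GOT-OCR-2.0-hf"  # illustrative checkpoint id
ov_model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True)
ov_model.save_pretrained("got-ocr2-openvino")  # illustrative output directory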

optimum/exporters/openvino/model_patcher.py (+17)

@@ -4405,3 +4405,20 @@ def __init__(
             layer.mlp.down_proj.to(torch.float32)

         super().__init__(config, model, model_kwargs)
+
+
+class GotOCR2ImageEmbeddingsModelPatcher(ModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Dict[str, Any],
+    ):
+        model.__orig_forward = model.forward
+        # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/got_ocr2/modeling_got_ocr2.py#L835
+        model.forward = model.get_image_features
+        super().__init__(config, model, model_kwargs)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        self._model.forward = self._model.__orig_forward
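
The patcher's only job is to expose the vision tower as the model's `forward` while the vision-embeddings sub-model is traced, and to restore the original `forward` on exit. A toy sketch of that swap-and-restore pattern (these class names are hypothetical, not part of optimum or transformers):

# Toy illustration of the context-managed forward swap used above.
class ToyModel:
    def forward(self, pixel_values):
        return "full multimodal forward"

    def get_image_features(self, pixel_values):
        return "vision embeddings only"


class SwapForwardForExport:
    def __init__(self, model):
        self._model = model

    def __enter__(self):
        # keep a handle to the original forward and expose only the vision path
        self._model._orig_forward = self._model.forward
        self._model.forward = self._model.get_image_features
        return self._model

    def __exit__(self, exc_type, exc_value, traceback):
        # restore the original forward once tracing is done
        self._model.forward = self._model._orig_forward


model = ToyModel()
with SwapForwardForExport(model):
    print(model.forward(None))  # "vision embeddings only"
print(model.forward(None))  # "full multimodal forward"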

optimum/exporters/openvino/utils.py (+1)

@@ -228,6 +228,7 @@ def get_submodels(model):
     "phi3-v",
     "qwen2-vl",
     "qwen2-5-vl",
+    "got-ocr2",
 ]

optimum/intel/ipex/modeling_base.py (+11)

@@ -71,6 +71,17 @@
 _COMPILE_NOT_READY_MODEL_TYPES = ("llama", "falcon", "gpt2", "qwen2")


+try:
+    import intel_extension_for_pytorch as ipex
+
+    if hasattr(torch, "xpu") and torch.xpu.is_available() and not ipex._C._has_xpu():
+        logger.warning(
+            "An XPU device was detected, but the installed IPEX build does not support XPU. Please install an XPU-enabled IPEX build, see https://pytorch-extension.intel.com/installation?platform=gpu"
+        )
+except ImportError:
+    logger.warning("intel_extension_for_pytorch not found, please run `pip install intel_extension_for_pytorch`")
+
+
 def _is_patched_with_ipex(model, task, use_cache: bool = True):
     if is_ipex_version("<", _IPEX_MINIMUM_VERSION_FOR_PATCHING):
         return False

optimum/intel/openvino/modeling_diffusion.py (+58 -3)

@@ -889,9 +889,7 @@ def reshape(
            )

        if self.text_encoder_3 is not None:
-            self.text_encoder_3.model = self._reshape_text_encoder(
-                self.text_encoder_3.model, batch_size, getattr(self.tokenizer_3, "model_max_length", -1)
-            )
+            self.text_encoder_3.model = self._reshape_text_encoder(self.text_encoder_3.model, batch_size, -1)

        self.clear_requests()
        return self
@@ -973,6 +971,63 @@ def __call__(self, *args, **kwargs):
        for k, v in kwargs.items():
            kwargs[k] = np_to_pt_generators(v, self.device)

+        height, width = None, None
+        height_idx, width_idx = None, None
+        shapes_overriden = False
+        sig = inspect.signature(self.auto_model_class.__call__)
+        sig_height_idx = list(sig.parameters).index("height") if "height" in sig.parameters else len(sig.parameters)
+        sig_width_idx = list(sig.parameters).index("width") if "width" in sig.parameters else len(sig.parameters)
+        if "height" in kwargs:
+            height = kwargs["height"]
+        elif len(args) > sig_height_idx:
+            height = args[sig_height_idx]
+            height_idx = sig_height_idx
+
+        if "width" in kwargs:
+            width = kwargs["width"]
+        elif len(args) > sig_width_idx:
+            width = args[sig_width_idx]
+            width_idx = sig_width_idx
+
+        if self.height != -1:
+            if height is not None and height != self.height:
+                logger.warning(f"Incompatible height argument provided {height}. Pipeline only supports {self.height}.")
+                height = self.height
+            else:
+                height = self.height
+
+            if height_idx is not None:
+                args[height_idx] = height
+            else:
+                kwargs["height"] = height
+
+            shapes_overriden = True
+
+        if self.width != -1:
+            if width is not None and width != self.width:
+                logger.warning(f"Incompatible width argument provided {width}. Pipeline only supports {self.width}.")
+                width = self.width
+            else:
+                width = self.width
+
+            if width_idx is not None:
+                args[width_idx] = width
+            else:
+                kwargs["width"] = width
+            shapes_overriden = True
+
+        # By default, Sana generates images on a fixed resolution grid and then resizes them to the requested size,
+        # which may contradict the pipeline's static height / width. Disable this behavior for static shape pipelines.
+        if self.auto_model_class.__name__.startswith("Sana") and shapes_overriden:
+            sig_resolution_bining_idx = (
+                list(sig.parameters).index("use_resolution_binning")
+                if "use_resolution_binning" in sig.parameters
+                else len(sig.parameters)
+            )
+            if len(args) > sig_resolution_bining_idx:
+                args[sig_resolution_bining_idx] = False
+            else:
+                kwargs["use_resolution_binning"] = False
        # we use auto_model_class.__call__ here because we can't call super().__call__
        # as OptimizedModel already defines a __call__ which is the first in the MRO
        return self.auto_model_class.__call__(self, *args, **kwargs)
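
In practice this means a pipeline reshaped to a static resolution enforces that resolution at call time and warns when the caller asks for something else. A hedged usage sketch (the tiny checkpoint id is illustrative; any diffusion model supported by optimum-intel would do):

# Hedged example: reshape an OpenVINO diffusion pipeline to a static 256x256 and call it
# with a mismatching size; the new __call__ logic clamps to the static shape and warns.
from optimum.intel import OVStableDiffusionPipeline

model_id = "hf-internal-testing/tiny-stable-diffusion-torch"  # illustrative tiny checkpoint
pipe = OVStableDiffusionPipeline.from_pretrained(model_id, export=True, compile=False)
pipe.reshape(batch_size=1, height=256, width=256)
pipe.compile()

# The requested 512x512 is overridden to the pipeline's static 256x256 with a warning.
image = pipe(prompt="a lighthouse at dusk", height=512, width=512).images[0]
assert image.size == (256, 256)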

optimum/intel/openvino/modeling_visual_language.py (+45)

@@ -3109,6 +3109,50 @@ def preprocess_inputs(
        return processed_inputs


+class _OVGotOCR2ForCausalLM(OVModelForVisualCausalLM):
+    def get_vision_embeddings(self, pixel_values, input_ids, **kwargs):
+        if input_ids is not None and input_ids.shape[1] == 1 and kwargs.get("past_key_values") is not None:
+            return None
+        return self.vision_embeddings(pixel_values).last_hidden_state
+
+    def merge_vision_text_embeddings(
+        self, vision_embeds, inputs_embeds, input_ids=None, attention_mask=None, position_ids=None, **kwargs
+    ):
+        # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/got_ocr2/modeling_got_ocr2.py#L836-L845
+        image_features = torch.from_numpy(vision_embeds) if isinstance(vision_embeds, np.ndarray) else vision_embeds
+        inputs_embeds = torch.from_numpy(inputs_embeds) if isinstance(inputs_embeds, np.ndarray) else inputs_embeds
+        n_image_tokens = (input_ids == self.config.image_token_index).sum()
+        n_image_features = image_features.shape[0] * image_features.shape[1]
+        if n_image_tokens != n_image_features:
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+            )
+        special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
+        special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
+        image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+
+        return inputs_embeds, attention_mask, position_ids
+
+    @staticmethod
+    def preprocess_inputs(
+        text: Optional[str] = None,
+        image: Optional["Image"] = None,
+        processor: Optional[AutoImageProcessor] = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
+        config: Optional[PretrainedConfig] = None,
+        video: Optional["VideoInput"] = None,
+    ):
+        if processor is None:
+            raise ValueError("processor is required")
+        if video is not None:
+            raise ValueError("Video input is not supported")
+        if image is None:
+            raise ValueError("Image is required")
+        processed_inputs = processor(image, return_tensors="pt")
+        return processed_inputs
+
+
 MODEL_TYPE_TO_CLS_MAPPING = {
     "llava": _OVLlavaForCausalLM,
     "llava_next": _OVLlavaNextForCausalLM,
@@ -3120,4 +3164,5 @@ def preprocess_inputs(
     "internvl_chat": _OVInternVLForCausalLM,
     "qwen2_vl": _OVQwen2VLForCausalLM,
     "qwen2_5_vl": _OVQwen2_5_VLForCausalLM,
+    "got_ocr2": _OVGotOCR2ForCausalLM,
 }
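
With the `got_ocr2` entry in `MODEL_TYPE_TO_CLS_MAPPING`, the model loads through the generic `OVModelForVisualCausalLM` entry point: the vision encoder runs only on the first generation step and its features are scattered into the text embeddings at the image-token positions. A hedged end-to-end sketch (checkpoint id and image path are illustrative):

# Hedged example: run OCR with a GOT-OCR 2.0 model exported to OpenVINO.
from PIL import Image
from transformers import AutoProcessor
from optimum.intel import OVModelForVisualCausalLM

model_id = "stepfun-ai/GOT-OCR-2.0-hf"  # illustrative checkpoint id
processor = AutoProcessor.from_pretrained(model_id)
model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True)

image = Image.open("document.png")  # illustrative input image
inputs = model.preprocess_inputs(image=image, processor=processor)
generated = model.generate(**inputs, max_new_tokens=128)
# strip the prompt tokens before decoding
text = processor.batch_decode(generated[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)[0]
print(text)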

setup.py (+1 -1)

@@ -67,7 +67,7 @@
     "nncf": ["nncf>=2.14.0"],
     "openvino": ["nncf>=2.14.0", "openvino>=2024.5.0", "openvino-tokenizers>=2024.5.0"],
     "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate", "transformers<4.46"],
-    "ipex": ["intel-extension-for-pytorch>=2.4", "transformers>4.48,<4.50", "accelerate"],
+    "ipex": ["intel-extension-for-pytorch>=2.6", "transformers>4.48,<4.50", "accelerate"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,
     "tests": TESTS_REQUIRE,

tests/openvino/test_diffusion.py (+28)

@@ -13,6 +13,7 @@
 # limitations under the License.

 import json
+import logging
 import unittest
 from pathlib import Path

@@ -438,6 +439,33 @@ def test_load_custom_weight_variant(self):

        np.testing.assert_allclose(ov_images, diffusers_images, atol=1e-4, rtol=1e-2)

+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_static_shape_image_generation(self, model_arch):
+        pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], compile=False)
+        pipeline.reshape(batch_size=1, height=32, width=32)
+        pipeline.compile()
+        # generation with incompatible size
+        height, width, batch_size = 64, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+        inputs["output_type"] = "pil"
+        from optimum.intel.openvino.modeling_diffusion import logger as diffusers_logger
+
+        with self.assertLogs(diffusers_logger, logging.WARN) as warning_log:
+            image = pipeline(**inputs).images[0]
+        self.assertTrue(
+            any(
+                "Incompatible width argument provided" in log or "Incompatible height argument provided" in log
+                for log in warning_log.output
+            )
+        )
+        self.assertTupleEqual(image.size, (32, 32))
+        # generation without height / width provided
+        inputs.pop("height")
+        inputs.pop("width")
+        image = pipeline(**inputs).images[0]
+        self.assertTupleEqual(image.size, (32, 32))
+

 class OVPipelineForImage2ImageTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"]

tests/openvino/test_modeling.py (+18 -10)

@@ -2141,7 +2141,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
        SUPPORTED_ARCHITECTURES += ["maira2"]

    if is_transformers_version(">=", "4.49.0"):
-        SUPPORTED_ARCHITECTURES += ["qwen2_5_vl"]
+        SUPPORTED_ARCHITECTURES += ["qwen2_5_vl", "got_ocr2"]
        SUPPORT_VIDEO.append("qwen2_5_vl")
    TASK = "image-text-to-text"
    REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v", "maira2"]
@@ -2154,7 +2154,13 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
    )

    def get_transformer_model_class(self, model_arch):
-        if is_transformers_version(">=", "4.46") and model_arch in ["llava", "llava_next", "qwen2_vl", "qwen2_5_vl"]:
+        if is_transformers_version(">=", "4.46") and model_arch in [
+            "llava",
+            "llava_next",
+            "qwen2_vl",
+            "qwen2_5_vl",
+            "got_ocr2",
+        ]:
            from transformers import AutoModelForImageTextToText

            return AutoModelForImageTextToText
@@ -2339,14 +2345,16 @@ def test_generate_utils(self, model_arch):
        outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
        self.assertIsInstance(outputs[0], str)

-        # No input image case
-        question = "Hi, how are you?"
-        inputs = model.preprocess_inputs(**preprocessors, text=question, image=None)
-        outputs = model.generate(**inputs, max_new_tokens=10)
-        # filter out original prompt becuase it may contains out of tokenizer tokens e.g. in nanollva text separator = -200
-        outputs = outputs[:, inputs["input_ids"].shape[1] :]
-        outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-        self.assertIsInstance(outputs[0], str)
+        # GOT-OCR2 does not support text-only input
+        if model_arch != "got_ocr2":
+            # No input image case
+            question = "Hi, how are you?"
+            inputs = model.preprocess_inputs(**preprocessors, text=question, image=None)
+            outputs = model.generate(**inputs, max_new_tokens=10)
+            # filter out the original prompt because it may contain tokens outside the tokenizer vocabulary, e.g. in nanollava the text separator is -200
+            outputs = outputs[:, inputs["input_ids"].shape[1] :]
+            outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+            self.assertIsInstance(outputs[0], str)

        # video loader helper only available for transformers >= 4.49
        if model_arch in self.SUPPORT_VIDEO and is_transformers_version(">=", "4.49"):

tests/openvino/utils_tests.py (+1)

@@ -63,6 +63,7 @@
     "exaone": "katuni4ka/tiny-random-exaone",
     "gemma": "fxmarty/tiny-random-GemmaForCausalLM",
     "gemma2": "katuni4ka/tiny-random-gemma2",
+    "got_ocr2": "katuni4ka/tiny-random-got-ocr2-hf",
     "falcon": "fxmarty/really-tiny-falcon-testing",
     "falcon-40b": "katuni4ka/tiny-random-falcon-40b",
     "flaubert": "hf-internal-testing/tiny-random-flaubert",
