diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index e147945878..9320303ff4 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -718,14 +718,15 @@ def _mistral_update_causal_mask(
 class MistralModelPatcher(DecoderModelPatcher):
     def __enter__(self):
         super().__enter__()
-        if is_transformers_version(">=", "4.42.0"):
+        if is_transformers_version(">=", "4.42.0") and is_transformers_version("<", "4.48.0"):
             # apply fix https://github.com/huggingface/transformers/commit/57d7594a79a9f5d835abf2d4d384db0e4818e548
             self._model.model._orig_update_causal_mask = self._model.model._update_causal_mask
             self._model.model._update_causal_mask = types.MethodType(_mistral_update_causal_mask, self._model.model)
 
         else:
             for layer in self._model.model.layers:
-                _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)
+                if hasattr(layer.self_attn, "rotary_emb"):
+                    _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)
 
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
@@ -734,7 +735,7 @@ def __exit__(self, exc_type, exc_value, traceback):
             self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask
 
         for layer in self._model.model.layers:
-            if hasattr(layer.self_attn.rotary_emb, "_orig_forward"):
+            if hasattr(layer.self_attn, "rotary_emb") and hasattr(layer.self_attn.rotary_emb, "_orig_forward"):
                 layer.self_attn.rotary_emb.forward = layer.self_attn.rotary_emb._orig_forward
 
 
@@ -1580,19 +1581,19 @@ def __enter__(self):
         ):
             self._model.config.max_position_embeddings = self._model.config.original_max_position_embeddings
 
-        if is_transformers_version(">=", "4.42.0"):
+        if is_transformers_version(">=", "4.42.0") and is_transformers_version("<", "4.48.0"):
             self._model.model._orig_forward = self._model.model.forward
             self._model.model.forward = types.MethodType(phi3_442_forward, self._model.model)
 
         # https://github.com/huggingface/transformers/blob/30ee508c6c92a1c0aa0281d193c7c0fb815b8d2f/src/transformers/models/phi3/modeling_phi3.py#L113
         # init inv_freq for torchscript tracing
         for layer in self._model.model.layers:
-            if is_torch_version(">=", "2.1.0"):
+            if is_torch_version(">=", "2.1.0") and is_transformers_version("<", "4.48.0"):
                 orig_self_attn_fwd = layer.self_attn.forward
                 layer.self_attn.forward = types.MethodType(_phi3_self_attn_sdpa_forward, layer.self_attn)
                 layer.self_attn._orig_forward = orig_self_attn_fwd
 
-            if layer.self_attn.rotary_emb.inv_freq is None:
+            if hasattr(layer.self_attn, "rotary_emb") and layer.self_attn.rotary_emb.inv_freq is None:
                 rotary_emb = layer.self_attn.rotary_emb
                 layer.self_attn.rotary_emb.inv_freq = 1.0 / (
                     rotary_emb.base ** (torch.arange(0, rotary_emb.dim, 2, dtype=torch.int64).float() / rotary_emb.dim)
@@ -2493,7 +2494,9 @@ class UpdateCausalMaskModelPatcher(DecoderModelPatcher):
     def __enter__(self):
         super().__enter__()
         patch_update_causal_mask(self._model, "4.42.0")
-        if hasattr(self._model.model.layers[0].self_attn.rotary_emb, "_set_cos_sin_cache"):
+        if hasattr(self._model.model.layers[0].self_attn, "rotary_emb") and hasattr(
+            self._model.model.layers[0].self_attn.rotary_emb, "_set_cos_sin_cache"
+        ):
             for layer in self._model.model.layers:
                 _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)
 
@@ -3045,15 +3048,16 @@ def patched_forward(self, fn):
     def __enter__(self):
         if is_torch_version(">=", "2.1.0"):
             if self._model.config.model_type == "qwen2" and self._model.config._attn_implementation != "sdpa":
-                from transformers.models.qwen2.modeling_qwen2 import QWEN2_ATTENTION_CLASSES
+                if is_transformers_version("<", "4.48"):
+                    from transformers.models.qwen2.modeling_qwen2 import QWEN2_ATTENTION_CLASSES
 
-                sdpa_attn = QWEN2_ATTENTION_CLASSES["sdpa"]
-                self._model.config._orig_attn_implementation = self._model.config._attn_implementation
-                self._model.config._attn_implementation = "sdpa"
+                    sdpa_attn = QWEN2_ATTENTION_CLASSES["sdpa"]
+                    self._model.config._orig_attn_implementation = self._model.config._attn_implementation
+                    self._model.config._attn_implementation = "sdpa"
 
-                for layer in self._model.model.layers:
-                    layer.self_attn._orig_forward = layer.self_attn.forward
-                    layer.self_attn.forward = types.MethodType(sdpa_attn.forward, layer.self_attn)
+                    for layer in self._model.model.layers:
+                        layer.self_attn._orig_forward = layer.self_attn.forward
+                        layer.self_attn.forward = types.MethodType(sdpa_attn.forward, layer.self_attn)
 
             if self._model.config.model_type == "llama" and self._model.config._attn_implementation != "sdpa":
                 self._model.config._orig_attn_implementation = self._model.config._attn_implementation
diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index b2519563f5..41e4c6e3e9 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -56,7 +56,11 @@
 
 
 if TYPE_CHECKING:
-    from transformers.generation.streamers import BaseStreamer
+    try:
+        from transformers.generation.streamers import BaseStreamer
+    except Exception:
+        from typing import Generator as BaseStreamer
+
     from transformers.modeling_utils import PreTrainedModel
 
 
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index e895a76e73..435f656c60 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -1020,6 +1020,18 @@ def preprocess_inputs(
                 prompt = "<image>\n" + text
             else:
                 prompt = text
+
+        if getattr(processor, "patch_size", None) is None:
+            if (
+                getattr(config, "vision_config", None) is not None
+                and getattr(config.vision_config, "patch_size", None) is not None
+            ):
+                processor.patch_size = config.vision_config.patch_size
+            else:
+                raise ValueError(
+                    "Processor does not have `patch_size` attribute. Please fix the processor or provide `patch_size` in the config."
+                )
+
         inputs = processor(images=image, text=prompt, return_tensors="pt")
         return inputs
 
@@ -1915,6 +1927,7 @@ def preprocess_inputs(
         input_ids = tokenizer(text, return_tensors="pt").input_ids
         attention_mask = torch.ones_like(input_ids, dtype=torch.int64)
         result = {"input_ids": input_ids, "attention_mask": attention_mask}
+
         if image is not None:
             result["images"] = processor(images=[image], return_tensors="pt")["pixel_values"]
         return result
diff --git a/setup.py b/setup.py
index cf4bfb3191..a8b898d230 100644
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@
 INSTALL_REQUIRE = [
     "torch>=1.11",
     "optimum~=1.24",
-    "transformers>=4.36,<4.48",
+    "transformers>=4.36,<4.49",
     "datasets>=1.4.0",
     "sentencepiece",
     "setuptools",
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 66f06354ee..8ca72d06c2 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -271,6 +271,7 @@ def test_load_from_hub_and_save_visual_language_model(self):
             else:
                 self.assertEqual(component.request.get_property("PERFORMANCE_HINT"), "LATENCY")
 
+        processor.patch_size = loaded_model.config.vision_config.patch_size
         inputs = processor(images=image, text=prompt, return_tensors="pt")
         set_seed(SEED)
         loaded_model_outputs = loaded_model(**inputs)
@@ -2170,6 +2171,7 @@ def test_compare_to_transformers(self, model_arch):
         for component_name, component in ov_model.components.items():
             self.assertIsInstance(component, MODEL_PARTS_CLS_MAPPING[component_name])
         self.assertIsInstance(ov_model.config, PretrainedConfig)
+
         inputs = ov_model.preprocess_inputs(**preprocessors, text=prompt, image=self.IMAGE.resize((600, 600)))
         transformers_inputs = copy.deepcopy(inputs)
         test_device = "AUTO"
@@ -2235,6 +2237,7 @@ def test_llava_with_new_preprocessing(self, model_arch):
             patch_size=config.vision_config.patch_size,
             vision_feature_select_strategy=config.vision_feature_select_strategy,
             trust_remote_code=model_arch in self.REMOTE_CODE_MODELS,
+            num_additional_image_tokens=1,
         )
         transformers_model = self.get_transformer_model_class(model_arch).from_pretrained(model_id)
         ov_model = OVModelForVisualCausalLM.from_pretrained(
@@ -2244,8 +2247,9 @@
         self.assertTrue(processor.patch_size is not None)
         self.assertTrue(processor.vision_feature_select_strategy is not None)
         inputs = processor(images=self.IMAGE, text=prompt, return_tensors="pt")
-        self.assertTrue(
-            (inputs.input_ids == ov_model.config.image_token_index).sum(1).max() >= ov_model.config.image_seq_length
+        self.assertGreaterEqual(
+            (inputs.input_ids == ov_model.config.image_token_index).sum().max().item(),
+            ov_model.config.image_seq_length,
         )
         set_seed(SEED)
         with torch.no_grad():
@@ -2308,17 +2312,17 @@ def test_generate_utils(self, model_arch):
 
     def get_preprocessors(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
+        config = AutoConfig.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
+
         if model_arch == "nanollava":
-            config = AutoConfig.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
             processor = AutoProcessor.from_pretrained(
                 config.mm_vision_tower, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
             )
             tokenizer = AutoTokenizer.from_pretrained(
                 model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
             )
-            preprocessors = {"processor": processor, "tokenizer": tokenizer}
+            preprocessors = {"processor": processor, "tokenizer": tokenizer, "config": config}
         elif model_arch == "internvl2":
-            config = AutoConfig.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
             tokenizer = AutoTokenizer.from_pretrained(
                 model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
             )
@@ -2327,7 +2331,8 @@ def get_preprocessors(self, model_arch):
             processor = AutoProcessor.from_pretrained(
                 model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
             )
-            preprocessors = {"processor": processor, "tokenizer": None}
+            preprocessors = {"processor": processor, "tokenizer": None, "config": config}
+
         return preprocessors
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)