From 47af979df236374f386377efaf0161a434d3caa5 Mon Sep 17 00:00:00 2001
From: eaidova
Date: Mon, 13 May 2024 17:06:28 +0400
Subject: [PATCH 1/4] add sdpa for phi3 openvino model

---
 optimum/exporters/openvino/model_patcher.py | 90 ++++++++++++++++++++-
 1 file changed, 89 insertions(+), 1 deletion(-)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index f68e873d40..b60f6a4741 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -951,15 +951,103 @@ def __exit__(self, exc_type, exc_value, traceback):
             block.attention.forward = block.attention._orig_forward
 
 
+# Adapted from Phi3Attention.forward
+def _phi3_self_attn_sdpa_forward(
+    self,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_value: Optional[Tuple[torch.Tensor]] = None,
+    output_attentions: bool = False,
+    use_cache: bool = False,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    if output_attentions:
+        return self._orig_forward(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+        )
+
+    from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv
+
+    bsz, q_len, _ = hidden_states.size()
+
+    qkv = self.qkv_proj(hidden_states)
+    query_pos = self.num_heads * self.head_dim
+    query_states = qkv[..., :query_pos]
+    key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
+    value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]
+
+    query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+    key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+    value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+    kv_seq_len = key_states.shape[-2]
+    if past_key_value is not None:
+        kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+    cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
+
+    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+    if past_key_value is not None:
+        cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+        key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+    key_states = repeat_kv(key_states, self.num_key_value_groups)
+    value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+    if attention_mask is not None:
+        if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+            )
+
+    # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+    # Reference: https://github.com/pytorch/pytorch/issues/112577.
+    if query_states.device.type == "cuda" and attention_mask is not None:
+        query_states = query_states.contiguous()
+        key_states = key_states.contiguous()
+        value_states = value_states.contiguous()
+
+    attn_output = torch.nn.functional.scaled_dot_product_attention(
+        query_states,
+        key_states,
+        value_states,
+        attn_mask=attention_mask,
+        dropout_p=self.attention_dropout if self.training else 0.0,
+        # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
+        is_causal=self.is_causal and attention_mask is None and q_len > 1,
+    )
+
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    attn_output = attn_output.view(bsz, q_len, self.hidden_size)
+
+    attn_output = self.o_proj(attn_output)
+
+    return attn_output, None, past_key_value
+
+
 class Phi3ModelPatcher(DecoderModelPatcher):
     def __enter__(self):
         super().__enter__()
-
         # https://github.com/huggingface/transformers/blob/30ee508c6c92a1c0aa0281d193c7c0fb815b8d2f/src/transformers/models/phi3/modeling_phi3.py#L113
         # init inv_freq for torchscript tracing
         for layer in self._model.model.layers:
+            if is_torch_version(">=", "2.1.0"):
+                orig_self_attn_fwd = layer.self_attn.forward
+                layer.self_attn.forward = types.MethodType(_phi3_self_attn_sdpa_forward, layer.self_attn)
+                layer.self_attn._orig_forward = orig_self_attn_fwd
+
             if layer.self_attn.rotary_emb.inv_freq is None:
                 rotary_emb = layer.self_attn.rotary_emb
                 layer.self_attn.rotary_emb.inv_freq = 1.0 / (
                     rotary_emb.base ** (torch.arange(0, rotary_emb.dim, 2, dtype=torch.int64).float() / rotary_emb.dim)
                 )
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        for layer in self._model.model.layers:
+            if hasattr(layer.self_attn, "_orig_forward"):
+                layer.self_attn.forward = layer.self_attn._orig_forward

From 9917b6b197c239de8c1fdecdd7b66b3174961398 Mon Sep 17 00:00:00 2001
From: eaidova
Date: Mon, 13 May 2024 18:54:37 +0400
Subject: [PATCH 2/4] fix pkv filling according to the model code

---
 optimum/exporters/openvino/model_configs.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 575f1cc4db..5a5d372d96 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -485,6 +485,12 @@ def patch_model_for_export(
     library_name="transformers",
 )
 class Phi3OpenVINOConfig(PhiOnnxConfig):
+    DUMMY_INPUT_GENERATOR_CLASSES = (
+        MistralDummyPastKeyValuesGenerator,
+    ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES
+    DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator
+    NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True)
+
     def patch_model_for_export(
         self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
     ) -> "ModelPatcher":
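
[Note, not part of the patch series] Phi-3's attention caches keys and values with num_key_value_heads along the head axis, which appears to be why this config reuses the Mistral-style dummy past-key-values generator and exposes num_key_value_heads through the normalized config. A minimal sketch of the cache layout the dummy inputs have to match, using illustrative sizes rather than values read from a real Phi-3 config:

import torch

# Illustrative sizes only; real values come from the model config.
batch_size, past_len, num_layers = 1, 8, 2
num_key_value_heads, head_dim = 4, 64

# One (key, value) pair per decoder layer, shaped by num_key_value_heads,
# which is the shape a Mistral-style dummy past-key-values generator produces.
past_key_values = [
    (
        torch.zeros(batch_size, num_key_value_heads, past_len, head_dim),
        torch.zeros(batch_size, num_key_value_heads, past_len, head_dim),
    )
    for _ in range(num_layers)
]
print([tuple(k.shape) for k, v in past_key_values])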
From a75b60957a0a10abd38784e3eeac069722c519ec Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova
Date: Wed, 15 May 2024 08:33:03 +0400
Subject: [PATCH 3/4] Update optimum/exporters/openvino/model_patcher.py

Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
---
 optimum/exporters/openvino/model_patcher.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index b60f6a4741..e47165f31d 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -951,7 +951,7 @@ def __exit__(self, exc_type, exc_value, traceback):
             block.attention.forward = block.attention._orig_forward
 
 
-# Adapted from Phi3Attention.forward
+# Adapted from https://github.com/huggingface/transformers/blob/ccdabc5642bf84849af93f591e207dc625c8e1e1/src/transformers/models/phi3/modeling_phi3.py#L426
 def _phi3_self_attn_sdpa_forward(
     self,
     hidden_states: torch.Tensor,

From 8010e6096c3aa488e6a4789548c63e55ab9577f0 Mon Sep 17 00:00:00 2001
From: eaidova
Date: Wed, 15 May 2024 18:32:45 +0400
Subject: [PATCH 4/4] import helpers from phi3 if available

---
 optimum/exporters/openvino/model_patcher.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index b60f6a4741..52d5551fad 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -971,7 +971,11 @@ def _phi3_self_attn_sdpa_forward(
             use_cache=use_cache,
         )
 
-    from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv
+    # TODO: remove the llama imports once a transformers release includes phi3 support
+    try:
+        from transformers.models.phi3.modeling_phi3 import apply_rotary_pos_emb, repeat_kv
+    except ImportError:
+        from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv
 
     bsz, q_len, _ = hidden_states.size()
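
[Note, not part of the patch series] The patcher changes above follow a swap-and-restore pattern: bind an SDPA-based forward onto each attention module in __enter__, stash the original as _orig_forward, and put it back in __exit__. Below is a self-contained sketch of that pattern with made-up names (TinyAttention, sdpa_forward, ForwardSwapPatcher); it assumes torch >= 2.0 for scaled_dot_product_attention and only illustrates the mechanism, not the Phi-3 specifics.

import types

import torch


class TinyAttention(torch.nn.Module):
    def forward(self, q, k, v):
        # Eager reference path: softmax(QK^T / sqrt(d)) V, no masking.
        scores = q @ k.transpose(-1, -2) / (q.shape[-1] ** 0.5)
        return torch.softmax(scores, dim=-1) @ v


def sdpa_forward(self, q, k, v):
    # Replacement path routed through torch SDPA, analogous to the patched Phi-3 attention.
    return torch.nn.functional.scaled_dot_product_attention(q, k, v)


class ForwardSwapPatcher:
    def __init__(self, module):
        self.module = module

    def __enter__(self):
        # Keep the original bound method around, then bind the SDPA version onto the instance.
        self.module._orig_forward = self.module.forward
        self.module.forward = types.MethodType(sdpa_forward, self.module)
        return self.module

    def __exit__(self, exc_type, exc_value, traceback):
        # Restore the original forward so the module behaves as before outside the context.
        self.module.forward = self.module._orig_forward


attn = TinyAttention()
q = k = v = torch.randn(1, 4, 8, 16)  # (batch, heads, seq_len, head_dim)
with ForwardSwapPatcher(attn):
    patched_out = attn(q, k, v)  # SDPA path, as used while tracing/exporting
eager_out = attn(q, k, v)  # original path is restored on exit
assert torch.allclose(patched_out, eager_out, atol=1e-5)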