diff --git a/notebooks/ipex/text_generation.ipynb b/notebooks/ipex/text_generation.ipynb index d1a62d9201..dd6b8c0abb 100644 --- a/notebooks/ipex/text_generation.ipynb +++ b/notebooks/ipex/text_generation.ipynb @@ -22,6 +22,7 @@ "source": [ "import torch\n", "from transformers import AutoTokenizer\n", + "\n", "from optimum.intel.ipex import IPEXModelForCausalLM" ] }, diff --git a/notebooks/openvino/optimum_openvino_inference.ipynb b/notebooks/openvino/optimum_openvino_inference.ipynb index 76c77aec55..5106fe1fba 100644 --- a/notebooks/openvino/optimum_openvino_inference.ipynb +++ b/notebooks/openvino/optimum_openvino_inference.ipynb @@ -78,6 +78,7 @@ "source": [ "from optimum.intel import OVModelForQuestionAnswering\n", "\n", + "\n", "# Load PyTorch model from the Hub and export to OpenVINO in the background\n", "model = OVModelForQuestionAnswering.from_pretrained(\"distilbert-base-uncased-distilled-squad\", export=True)\n", "\n", @@ -122,6 +123,7 @@ "source": [ "from transformers import AutoTokenizer\n", "\n", + "\n", "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased-distilled-squad\")\n", "tokenizer.save_pretrained(\"distilbert-base-uncased-distilled-squad-ov-fp32\")" ] @@ -182,9 +184,11 @@ } ], "source": [ - "from optimum.intel import OVModelForQuestionAnswering\n", "from transformers import AutoTokenizer, pipeline\n", "\n", + "from optimum.intel import OVModelForQuestionAnswering\n", + "\n", + "\n", "model = OVModelForQuestionAnswering.from_pretrained(\"distilbert-base-uncased-distilled-squad-ov-fp32\")\n", "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased-distilled-squad\")\n", "ov_pipe = pipeline(\"question-answering\", model=model, tokenizer=tokenizer)\n", @@ -240,9 +244,11 @@ ], "source": [ "import torch\n", - "from optimum.intel import OVModelForQuestionAnswering\n", "from transformers import AutoTokenizer, pipeline\n", "\n", + "from optimum.intel import OVModelForQuestionAnswering\n", + "\n", + "\n", "model = OVModelForQuestionAnswering.from_pretrained(\"distilbert-base-uncased-distilled-squad-ov-fp32\")\n", "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased-distilled-squad-ov-fp32\")\n", "\n", @@ -324,9 +330,11 @@ } ], "source": [ - "from optimum.intel import OVModelForQuestionAnswering\n", "from transformers import AutoTokenizer, pipeline\n", "\n", + "from optimum.intel import OVModelForQuestionAnswering\n", + "\n", + "\n", "model = OVModelForQuestionAnswering.from_pretrained(\n", " \"helenai/distilbert-base-uncased-distilled-squad-ov-fp32\", compile=False\n", ")\n", @@ -411,6 +419,7 @@ "source": [ "from openvino.runtime import Core\n", "\n", + "\n", "for device in Core().available_devices:\n", " print(device, Core().get_property(device, \"FULL_DEVICE_NAME\"))" ] @@ -528,10 +537,12 @@ } ], "source": [ + "from datasets import load_dataset\n", "from IPython.display import Audio\n", - "from optimum.intel import OVModelForAudioClassification\n", "from transformers import AutoFeatureExtractor, pipeline\n", - "from datasets import load_dataset\n", + "\n", + "from optimum.intel import OVModelForAudioClassification\n", + "\n", "\n", "model_id = \"helenai/MIT-ast-finetuned-speech-commands-v2-ov\"\n", "model = OVModelForAudioClassification.from_pretrained(model_id)\n", @@ -638,9 +649,11 @@ } ], "source": [ - "from optimum.intel import OVModelForCausalLM\n", "from transformers import AutoTokenizer, pipeline\n", "\n", + "from optimum.intel import OVModelForCausalLM\n", + "\n", + "\n", "model_id = \"helenai/gpt2-ov\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", "model = OVModelForCausalLM.from_pretrained(model_id)\n", @@ -704,9 +717,11 @@ ], "source": [ "from IPython.display import Image\n", - "from optimum.intel import OVModelForImageClassification\n", "from transformers import AutoImageProcessor, pipeline\n", "\n", + "from optimum.intel import OVModelForImageClassification\n", + "\n", + "\n", "model_id = \"helenai/microsoft-swin-tiny-patch4-window7-224-ov\"\n", "model = OVModelForImageClassification.from_pretrained(model_id, compile=False)\n", "image_processor = AutoImageProcessor.from_pretrained(model_id)\n", @@ -766,9 +781,11 @@ } ], "source": [ - "from optimum.intel import OVModelForMaskedLM\n", "from transformers import AutoTokenizer, pipeline\n", "\n", + "from optimum.intel import OVModelForMaskedLM\n", + "\n", + "\n", "model_id = \"helenai/bert-base-uncased-ov\"\n", "model = OVModelForMaskedLM.from_pretrained(model_id)\n", "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", @@ -835,9 +852,11 @@ } ], "source": [ - "from optimum.intel import OVModelForQuestionAnswering\n", "from transformers import AutoTokenizer, pipeline\n", "\n", + "from optimum.intel import OVModelForQuestionAnswering\n", + "\n", + "\n", "# Load the model and tokenizer saved in Part 1 of this notebook. Or use the line below to load them from the hub\n", "# model_id = \"helenai/distilbert-base-uncased-distilled-squad-ov-fp32\"\n", "model_id = \"distilbert-base-uncased-distilled-squad-ov-fp32\"\n", @@ -890,9 +909,11 @@ } ], "source": [ - "from optimum.intel import OVModelForSeq2SeqLM\n", "from transformers import AutoTokenizer, pipeline\n", "\n", + "from optimum.intel import OVModelForSeq2SeqLM\n", + "\n", + "\n", "model_id = \"helenai/t5-small-ov\"\n", "model = OVModelForSeq2SeqLM.from_pretrained(model_id, compile=False, trust_remote_code=True)\n", "tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)\n", @@ -998,9 +1019,11 @@ } ], "source": [ - "from optimum.intel import OVModelForSequenceClassification\n", "from transformers import AutoTokenizer, pipeline\n", "\n", + "from optimum.intel import OVModelForSequenceClassification\n", + "\n", + "\n", "model_id = \"helenai/papluca-xlm-roberta-base-language-detection-ov\"\n", "model = OVModelForSequenceClassification.from_pretrained(model_id)\n", "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", @@ -1047,9 +1070,11 @@ } ], "source": [ - "from optimum.intel import OVModelForTokenClassification\n", "from transformers import AutoTokenizer, pipeline\n", "\n", + "from optimum.intel import OVModelForTokenClassification\n", + "\n", + "\n", "model_id = \"helenai/dslim-bert-base-NER-ov-fp32\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", "model = OVModelForTokenClassification.from_pretrained(model_id)\n", diff --git a/notebooks/openvino/quantized_generation_demo.ipynb b/notebooks/openvino/quantized_generation_demo.ipynb index 5673243cb2..c160e735b0 100644 --- a/notebooks/openvino/quantized_generation_demo.ipynb +++ b/notebooks/openvino/quantized_generation_demo.ipynb @@ -45,6 +45,7 @@ "import os\n", "\n", "from transformers import AutoTokenizer\n", + "\n", "from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig" ] }, @@ -211,6 +212,7 @@ "source": [ "from transformers import TextStreamer\n", "\n", + "\n", "# Tokenize the sample\n", "inputs = tokenizer([sample], return_tensors='pt')\n", "\n", @@ -294,7 +296,7 @@ "\n", "\n", "# Tokenize the sample\n", - "inputs = tokenizer([sample], return_tensors='pt') \n", + "inputs = tokenizer([sample], return_tensors='pt')\n", "\n", "out = stateless_model.generate(\n", " **inputs,\n", @@ -302,7 +304,7 @@ " streamer=TextStreamer(tokenizer=tokenizer, skip_special_tokens=True),\n", " pad_token_id=tokenizer.eos_token_id,\n", " prompt_lookup_num_tokens=3,\n", - ") " + ")" ] }, { @@ -442,6 +444,7 @@ "outputs": [], "source": [ "from functools import wraps\n", + "\n", "import numpy as np\n", "\n", "\n", @@ -458,15 +461,15 @@ " if len(self.seq_lens) > 0 or len(self.win_sizes) > 0:\n", " raise RuntimeError(\"Always use a new instance, don't reuse!\")\n", " self.model_forward = self.model.forward\n", - " \n", + "\n", " @wraps(self.model_forward)\n", " def forward_wrapper(**kwargs):\n", " self.seq_lens[-1].append(kwargs.get(\"attention_mask\").shape[-1])\n", " self.win_sizes[-1].append(kwargs.get(\"input_ids\").shape[-1] - 1)\n", " return self.model_forward(**kwargs)\n", - " \n", + "\n", " self.model.forward = forward_wrapper\n", - " \n", + "\n", " # wrap generate method\n", " self.model_generate = self.model.generate\n", "\n", @@ -494,7 +497,7 @@ " self.seq_lens = [sl[1:] for sl in self.seq_lens]\n", " # Add window size for output to ease calculation later\n", " for ws, sl in zip(self.win_sizes, self.seq_lens):\n", - " ws.append(0) \n", + " ws.append(0)\n", "\n", " def acceptance_rate(self, return_mean=True, normalize=False):\n", " # ar_per_win = ((cur_seq_len - cur_win_size) - (prev_seq_len - prev_win_size) - 1) / prev_win_size\n", @@ -533,8 +536,9 @@ "metadata": {}, "outputs": [], "source": [ - "from tqdm import tqdm\n", "from datasets import load_dataset\n", + "from tqdm import tqdm\n", + "\n", "\n", "dataset_name = \"openai_humaneval\"\n", "dataset_subset_name = None\n", @@ -590,10 +594,10 @@ "from threading import Thread\n", "\n", "from transformers import (\n", - " TextIteratorStreamer,\n", + " GenerationConfig,\n", " StoppingCriteria,\n", " StoppingCriteriaList,\n", - " GenerationConfig,\n", + " TextIteratorStreamer,\n", ")\n", "\n", "\n", @@ -690,7 +694,7 @@ " prompt_char = \"▌\"\n", " history[-1][1] = prompt_char\n", " yield history, \"Status: Generating...\", *([gr.update(interactive=False)] * 4)\n", - " \n", + "\n", " streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n", "\n", " # Create a stopping criteria to prevent the model from playing the role of the user aswell.\n", @@ -770,6 +774,7 @@ "source": [ "import gradio as gr\n", "\n", + "\n", "try:\n", " demo.close()\n", "except:\n", @@ -808,7 +813,7 @@ " history: conversation history\n", " Returns:\n", " updated history\n", - " \"\"\" \n", + " \"\"\"\n", " history[-1][1] = None\n", " return history\n", "\n", diff --git a/notebooks/openvino/question_answering_quantization.ipynb b/notebooks/openvino/question_answering_quantization.ipynb index 2481c9b904..247a6f868b 100644 --- a/notebooks/openvino/question_answering_quantization.ipynb +++ b/notebooks/openvino/question_answering_quantization.ipynb @@ -51,9 +51,11 @@ "import transformers\n", "from evaluate import evaluator\n", "from openvino.runtime import Core\n", - "from optimum.intel import OVModelForQuestionAnswering, OVQuantizer, OVQuantizationConfig, OVConfig\n", "from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline\n", "\n", + "from optimum.intel import OVConfig, OVModelForQuestionAnswering, OVQuantizationConfig, OVQuantizer\n", + "\n", + "\n", "transformers.logging.set_verbosity_error()\n", "datasets.logging.set_verbosity_error()" ] diff --git a/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb b/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb index 8ef2e8ad6c..798aede77a 100644 --- a/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb +++ b/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb @@ -46,15 +46,18 @@ "outputs": [], "source": [ "import time\n", + "from pathlib import Path\n", + "\n", "import datasets\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import transformers\n", - "from pathlib import Path\n", "from openvino.runtime import Core\n", + "\n", "from optimum.intel import OVConfig, OVQuantizer, OVStableDiffusionPipeline, OVWeightQuantizationConfig\n", "from optimum.intel.openvino.configuration import OVQuantizationMethod\n", "\n", + "\n", "transformers.logging.set_verbosity_error()\n", "datasets.logging.set_verbosity_error()" ] diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py index 484fd38077..2a9af1cd52 100644 --- a/optimum/exporters/ipex/model_patcher.py +++ b/optimum/exporters/ipex/model_patcher.py @@ -29,11 +29,11 @@ from .modeling_utils import ( _IPEX_MINIMUM_VERSION_FOR_PATCHING, _gpt2_block_forward, - _ipex_rms_layer_norm_forward, _IPEXFalconDecoderLayer, _IPEXGPT2Attention, _IPEXIntermediate, _IPEXLlamaDecoderLayer, + _llama_layer_norm_forward, _llama_model_forward, ) @@ -79,7 +79,7 @@ def _patch_llama_model(model): 2. Linear fusion with (2 Linears + Silu + Mul) and (Linear + Add) """ convert_functions(model, LlamaModel, "forward", _llama_model_forward) - convert_functions(model, LlamaRMSNorm, "forward", _ipex_rms_layer_norm_forward) + convert_functions(model, LlamaRMSNorm, "forward", _llama_layer_norm_forward) convert_class(model, LlamaDecoderLayer, _IPEXLlamaDecoderLayer, model.config) return model diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index e415a40771..2e73fb9076 100644 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -84,14 +84,12 @@ def padding_attn_mask(attn_mask, alignment): return new_attn_mask -# Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L83 def _llama_layer_norm_forward(self, hidden_states): - return rms_norm(hidden_states, self.weight, self.variance_epsilon) - - -# Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L83 -def _ipex_rms_layer_norm_forward(self, hidden_states): - return torch.ops.torch_ipex.rmsnorm(hidden_states, self.weight, self.variance_epsilon) + if hidden_states.device.type == "xpu": + return rms_norm(hidden_states, self.weight, self.variance_epsilon) + else: + # Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L83 + return torch.ops.torch_ipex.rmsnorm(hidden_states, self.weight, self.variance_epsilon) # Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L1130 @@ -226,14 +224,82 @@ def __init__(self, module, config) -> None: super().__init__() _setattr_from_module(self, module) self.config = config - self.ipex_scale_dot_product = IndirectAccessKVCacheAttention(text_max_length=config.max_position_embeddings) - if hasattr(config, "rope_theta"): - self.ipex_rope = RotaryEmbedding( - config.max_position_embeddings, - config.hidden_size // config.num_attention_heads, - config.rope_theta, - config.architectures[0], + self.module_device = next(module.parameters()).device.type + if self.module_device == "xpu": + from intel_extension_for_pytorch.transformers.models.xpu.fusions.mha_fusion import _IPEXRopeXPU + + self.ipex_rope = _IPEXRopeXPU( + module.config.max_position_embeddings, + module.config.hidden_size // module.config.num_attention_heads, + module.config.rope_theta, + module.config.architectures[0], ) + self.port_parameters(module) + torch.xpu.empty_cache() + else: + self.ipex_scale_dot_product = IndirectAccessKVCacheAttention( + text_max_length=config.max_position_embeddings + ) + if hasattr(config, "rope_theta"): + self.ipex_rope = RotaryEmbedding( + config.max_position_embeddings, + config.hidden_size // config.num_attention_heads, + config.rope_theta, + config.architectures[0], + ) + + def port_parameters(self, module): + self.qkv_proj_bias = None + self.qkv_proj_weight = None + if self.num_heads == self.num_key_value_heads: + q_proj = module.q_proj.weight.transpose(0, 1) + k_proj = module.k_proj.weight.transpose(0, 1) + v_proj = module.v_proj.weight.transpose(0, 1) + self.qkv_proj_weight = torch.stack([q_proj, k_proj, v_proj]).contiguous().view([3, -1, q_proj.shape[-1]]) + module.q_proj.weight.data = self.qkv_proj_weight[0, :, :].transpose(0, 1) + module.k_proj.weight.data = self.qkv_proj_weight[1, :, :].transpose(0, 1) + module.v_proj.weight.data = self.qkv_proj_weight[2, :, :].transpose(0, 1) + if module.q_proj.bias is not None: + self.qkv_proj_bias = ( + torch.stack([module.q_proj.bias, module.k_proj.bias, module.v_proj.bias]) + .contiguous() + .view([3, -1]) + ) + module.q_proj.bias.data = self.qkv_proj_bias[0] + module.k_proj.bias.data = self.qkv_proj_bias[1] + module.v_proj.bias.data = self.qkv_proj_bias[2] + else: + q_proj = module.q_proj.weight.view( + self.num_key_value_heads, self.num_key_value_groups, self.head_dim, self.hidden_size + ) + k_proj = module.k_proj.weight.view(self.num_key_value_heads, 1, self.head_dim, self.hidden_size) + v_proj = module.v_proj.weight.view(self.num_key_value_heads, 1, self.head_dim, self.hidden_size) + self.qkv_proj_weight = torch.cat([q_proj, k_proj, v_proj], dim=1).view( + [self.num_key_value_heads, self.num_key_value_groups + 2, self.head_dim, self.hidden_size] + ) + module.q_proj.data = self.qkv_proj_weight[:, : self.num_key_value_groups, :, :].reshape( + [self.num_key_value_heads * self.num_key_value_groups * self.head_dim, self.hidden_size] + ) + module.k_proj.data = self.qkv_proj_weight[:, self.num_key_value_groups, :, :].reshape( + [self.num_key_value_heads * self.head_dim, self.hidden_size] + ) + module.v_proj.data = self.qkv_proj_weight[:, self.num_key_value_groups + 1, :, :].reshape( + [self.num_key_value_heads * self.head_dim, self.hidden_size] + ) + self.qkv_proj_weight = self.qkv_proj_weight.permute(3, 0, 1, 2).contiguous() + if module.q_proj.bias is not None: + q_bias = module.q_proj.bias.view(self.num_key_value_heads, self.num_key_value_groups, self.head_dim) + k_bias = module.k_proj.bias.view(self.num_key_value_heads, 1, self.head_dim) + v_bias = module.v_proj.bias.view(self.num_key_value_heads, 1, self.head_dim) + self.qkv_proj_bias = torch.cat([q_bias, k_bias, v_bias], dim=1).view( + [self.num_key_value_heads, self.num_key_value_groups + 2, self.head_dim] + ) + module.q_proj.bias.data = self.qkv_proj_bias[:, : self.num_key_value_groups, self.head_dim].view(-1) + module.k_proj.bias.data = self.qkv_proj_bias[:, self.num_key_value_groups, self.head_dim].view(-1) + module.v_proj.bias.data = self.qkv_proj_bias[:, self.num_key_value_groups + 1, self.head_dim].view(-1) + self.o_proj_weight = module.o_proj.weight.transpose(0, 1).contiguous() + module.o_proj.weight.data = self.o_proj_weight.transpose(0, 1) + self.o_proj_bias = module.o_proj.bias def qkv_gemm(self, hidden_states): raise NotImplementedError("Need to implement in specific model class") @@ -244,16 +310,25 @@ def rope(self, *args, **kwargs): def sdpa_with_cache(self, query, key, value, past_key_value, attention_mask, **kwargs): # This ipex op pre-allocates buffers for past_key_values and use beam index history # which to decide which beam should be used to make attention scale dot more efficient. - (attn_output, attn_weights, past_key_value) = self.ipex_scale_dot_product( - query, - key, - value, - math.sqrt(self.head_dim), - past_key_value, - kwargs.get("head_mask", None), - attention_mask, - kwargs.get("alibi", None), - ) + if self.module_device == "xpu": + scale = 1.0 / math.sqrt(self.head_dim) + is_causal = False + attn_output = torch.xpu.IpexSDP( + query, key, value, None, attention_mask, None, scale, 1.0, 0.0, is_causal, False + ) + attn_weights = None + past_key_value = (key, value) + else: + (attn_output, attn_weights, past_key_value) = self.ipex_scale_dot_product( + query, + key, + value, + math.sqrt(self.head_dim), + past_key_value, + kwargs.get("head_mask", None), + attention_mask, + kwargs.get("alibi", None), + ) return attn_output, past_key_value, attn_weights def sdpa_without_cache(self, query, key, value, past_key_value, attention_mask, **kwargs): @@ -287,10 +362,18 @@ def forward( qkv_out = self.qkv_gemm(hidden_states) if isinstance(qkv_out, tuple) and len(qkv_out) == 3: query, key, value = self.qkv_gemm(hidden_states) - query, key = self.rope(query, key, kv_seq_len, use_cache, position_ids=position_ids) + query, key = self.rope(query, key, kv_seq_len, use_cache, position_ids, **kwargs) else: query, key, value = self.rope(qkv_out, kv_seq_len, use_cache, past_len=past_len) + if self.module_device == "xpu": + if past_key_value is not None: + key = torch.cat([past_key_value[0].transpose(1, 2), key], dim=1) + value = torch.cat([past_key_value[1].transpose(1, 2), value], dim=1) + query = query.transpose(1, 2) + key = key.transpose(1, 2) + value = value.transpose(1, 2) + attention_mask = self.prepare_attention_mask_float(attention_mask, query.dtype) sdpa = self.sdpa_with_cache if use_cache else self.sdpa_without_cache attn_output, past_key_value, attn_weights = sdpa( @@ -315,9 +398,10 @@ def forward( class _IPEXLlamaAttention(_IPEXAttention): def __init__(self, module, config) -> None: super().__init__(module, config) - if module.o_proj.__class__.__name__ not in ["LinearAllreduce"]: - self.mha_linear_add = LinearAdd(module.o_proj) - del self.__dict__["_modules"]["o_proj"] + if self.module_device == "cpu": + if module.o_proj.__class__.__name__ not in ["LinearAllreduce"]: + self.mha_linear_add = LinearAdd(module.o_proj) + del self.__dict__["_modules"]["o_proj"] def qkv_gemm(self, hidden_states): bsz, seq_len, _ = hidden_states.size() @@ -327,11 +411,16 @@ def qkv_gemm(self, hidden_states): return query, key, value - def rope(self, query, key, kv_seq_len, use_cache, position_ids): - if use_cache: - args = (self.head_dim, self.head_dim // 2, self.head_dim, kv_seq_len) - key = self.ipex_rope(key, position_ids, self.num_key_value_heads, *args) - query = self.ipex_rope(query, position_ids, self.num_heads, *args) + def rope(self, query, key, kv_seq_len, use_cache, position_ids, **kwargs): + if self.module_device == "xpu": + sin = kwargs.pop("sin", None) + cos = kwargs.pop("cos", None) + self.ipex_rope.apply_embedding(query, sin, cos, self.head_dim // 2, key) + else: + if use_cache: + args = (self.head_dim, self.head_dim // 2, self.head_dim, kv_seq_len) + key = self.ipex_rope(key, position_ids, self.num_key_value_heads, *args) + query = self.ipex_rope(query, position_ids, self.num_heads, *args) return query, key # Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L341 @@ -418,59 +507,6 @@ def postprocess_attention_output(self, attn_output, bsz, seq_len): attn_output = self.resid_dropout(attn_output) return attn_output - def port_parameters(self, module): - self.qkv_proj_bias = None - self.qkv_proj_weight = None - if self.num_heads == self.num_key_value_heads: - q_proj = module.q_proj.weight.transpose(0, 1) - k_proj = module.k_proj.weight.transpose(0, 1) - v_proj = module.v_proj.weight.transpose(0, 1) - self.qkv_proj_weight = torch.stack([q_proj, k_proj, v_proj]).contiguous().view([3, -1, q_proj.shape[-1]]) - module.q_proj.weight.data = self.qkv_proj_weight[0, :, :].transpose(0, 1) - module.k_proj.weight.data = self.qkv_proj_weight[1, :, :].transpose(0, 1) - module.v_proj.weight.data = self.qkv_proj_weight[2, :, :].transpose(0, 1) - if module.q_proj.bias is not None: - self.qkv_proj_bias = ( - torch.stack([module.q_proj.bias, module.k_proj.bias, module.v_proj.bias]) - .contiguous() - .view([3, -1]) - ) - module.q_proj.bias.data = self.qkv_proj_bias[0] - module.k_proj.bias.data = self.qkv_proj_bias[1] - module.v_proj.bias.data = self.qkv_proj_bias[2] - else: - q_proj = module.q_proj.weight.view( - self.num_key_value_heads, self.num_key_value_groups, self.head_dim, self.hidden_size - ) - k_proj = module.k_proj.weight.view(self.num_key_value_heads, 1, self.head_dim, self.hidden_size) - v_proj = module.v_proj.weight.view(self.num_key_value_heads, 1, self.head_dim, self.hidden_size) - self.qkv_proj_weight = torch.cat([q_proj, k_proj, v_proj], dim=1).view( - [self.num_key_value_heads, self.num_key_value_groups + 2, self.head_dim, self.hidden_size] - ) - module.q_proj.data = self.qkv_proj_weight[:, : self.num_key_value_groups, :, :].reshape( - [self.num_key_value_heads * self.num_key_value_groups * self.head_dim, self.hidden_size] - ) - module.k_proj.data = self.qkv_proj_weight[:, self.num_key_value_groups, :, :].reshape( - [self.num_key_value_heads * self.head_dim, self.hidden_size] - ) - module.v_proj.data = self.qkv_proj_weight[:, self.num_key_value_groups + 1, :, :].reshape( - [self.num_key_value_heads * self.head_dim, self.hidden_size] - ) - self.qkv_proj_weight = self.qkv_proj_weight.permute(3, 0, 1, 2).contiguous() - if module.q_proj.bias is not None: - q_bias = module.q_proj.bias.view(self.num_key_value_heads, self.num_key_value_groups, self.head_dim) - k_bias = module.k_proj.bias.view(self.num_key_value_heads, 1, self.head_dim) - v_bias = module.v_proj.bias.view(self.num_key_value_heads, 1, self.head_dim) - self.qkv_proj_bias = torch.cat([q_bias, k_bias, v_bias], dim=1).view( - [self.num_key_value_heads, self.num_key_value_groups + 2, self.head_dim] - ) - module.q_proj.bias.data = self.qkv_proj_bias[:, : self.num_key_value_groups, self.head_dim].view(-1) - module.k_proj.bias.data = self.qkv_proj_bias[:, self.num_key_value_groups, self.head_dim].view(-1) - module.v_proj.bias.data = self.qkv_proj_bias[:, self.num_key_value_groups + 1, self.head_dim].view(-1) - self.o_proj_weight = module.o_proj.weight.transpose(0, 1).contiguous() - module.o_proj.weight.data = self.o_proj_weight.transpose(0, 1) - self.o_proj_bias = module.o_proj.bias - # Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L186 class _IPEXLlamaMLP(nn.Module): @@ -478,19 +514,34 @@ def __init__(self, module, config) -> None: super().__init__() _setattr_from_module(self, module) self.config = config - # LinearAllreduce and LinearLayer cannot use fused op LinearAdd - if module.down_proj.__class__.__name__ not in ["LinearAllreduce"]: - self.mlp_linear_add = LinearAdd(module.down_proj) - del self.__dict__["_modules"]["down_proj"] - self.linear_silu_mul = Linear2SiluMul(module.gate_proj, module.up_proj) - del self.__dict__["_modules"]["gate_proj"] - del self.__dict__["_modules"]["up_proj"] + self.module_device = next(module.parameters()).device.type + if self.module_device == "xpu": + self.port_parameter(module) + torch.xpu.empty_cache() + else: + # LinearAllreduce and LinearLayer cannot use fused op LinearAdd + if module.down_proj.__class__.__name__ not in ["LinearAllreduce"]: + self.mlp_linear_add = LinearAdd(module.down_proj) + del self.__dict__["_modules"]["down_proj"] + self.linear_silu_mul = Linear2SiluMul(module.gate_proj, module.up_proj) + del self.__dict__["_modules"]["gate_proj"] + del self.__dict__["_modules"]["up_proj"] def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor = None, **kwargs): - if hasattr(self, "linear_silu_mul"): - mlp_gate = self.linear_silu_mul(hidden_states) - if hasattr(self, "mlp_linear_add"): - hidden_states = self.mlp_linear_add(mlp_gate, residual) + if self.module_device == "xpu": + up = torch.ops.torch_ipex.mm_silu(hidden_states, self.gate_proj_weight) + hidden_states = torch.ops.torch_ipex.mm_resmul(hidden_states, self.up_proj_weight, up) + hidden_states = matmul_add_add(hidden_states, self.down_proj_weight, self.down_proj_bias, residual) + else: + if hasattr(self, "linear_silu_mul"): + mlp_gate = self.linear_silu_mul(hidden_states) + if hasattr(self, "mlp_linear_add"): + hidden_states = self.mlp_linear_add(mlp_gate, residual) + else: + hidden_states = self.down_proj( + self.act_fn(self.gate_proj(hidden_states)) * self.up_proj(hidden_states) + ) + hidden_states = residual + hidden_states else: hidden_states = self.down_proj( self.act_fn(self.gate_proj(hidden_states)) * self.up_proj(hidden_states) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 59d4bedb51..57185de6c1 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -404,9 +404,9 @@ def _llama_gemma_update_causal_mask_legacy(self, attention_mask, input_tensor, c offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[ - : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] - ] = mask_slice + causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = ( + mask_slice + ) if ( self.config._attn_implementation == "sdpa" @@ -1966,9 +1966,9 @@ def _dbrx_update_causal_mask_legacy( offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[ - : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] - ] = mask_slice + causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = ( + mask_slice + ) if ( self.config._attn_implementation == "sdpa" diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 6a820b4feb..b2be8a6b1d 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -154,7 +154,7 @@ def __init__( self._device = torch.device("cpu") # CPU only support jit model for now. - if export: + if export and self._device.type == "cpu": if isinstance(model, torch.jit.RecursiveScriptModule): logger.warning("The model has been exported already.") else: @@ -251,7 +251,6 @@ def _from_pretrained( ) token = use_auth_token - task = cls.export_feature commit_hash = kwargs.pop("_commit_hash", None) model_kwargs = { @@ -263,49 +262,11 @@ def _from_pretrained( "force_download": force_download, } - model = TasksManager.get_model_from_task(task, model_id, **model_kwargs) - - if is_torch_xpu_available(check_device=True): - model.to("xpu:0") - if _is_patched_with_ipex(model, task): - model = _patch_model(model) - else: - model = ipex_jit_trace(model, task, use_cache) - config.torchscript = True - config.torch_dtype = torch_dtype - return cls(model, config=config, model_save_dir=model_id, use_cache=use_cache, warmup=False) - - @classmethod - def _from_pretrained( - cls, - model_id: Union[str, Path], - config: PretrainedConfig, - use_auth_token: Optional[Union[bool, str]] = None, - token: Optional[Union[bool, str]] = None, - revision: Optional[str] = None, - force_download: bool = False, - cache_dir: str = HUGGINGFACE_HUB_CACHE, - file_name: Optional[str] = WEIGHTS_NAME, - local_files_only: bool = False, - subfolder: str = "", - **kwargs, - ): - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", - FutureWarning, - ) - if token is not None: - raise ValueError( - "Both the arguments `use_auth_token` and `token` were specified, which is not supported. Please specify only `token`." - ) - token = use_auth_token - if not getattr(config, "torchscript", False): logger.warning("Detect torchscript is false. Convert to torchscript model!") if is_torch_version("<", "2.1.0"): - raise ImportError("`torch>=2.0.0` is needed to trace your model") + raise ImportError("`torch>=2.1.0` is needed to trace your model") task = cls.export_feature config.torch_dtype = torch_dtype @@ -318,6 +279,15 @@ def _from_pretrained( _commit_hash=commit_hash, **model_kwargs, ) + if is_torch_xpu_available(check_device=True): + model.to("xpu:0") + if _is_patched_with_ipex(model, task): + model = _patch_model(model) + else: + use_cache = kwargs.get("use_cache", True) + model = ipex_jit_trace(model, task, use_cache) + config.torchscript = True + config.torch_dtype = torch_dtype return cls(model, config=config, export=True, **kwargs) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 89da349c82..8944ef6da2 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -84,9 +84,9 @@ def __init__( for idx, key in enumerate(model.inputs): names = tuple(key.get_names()) input_names[next((name for name in names if "/" not in name), names[0])] = idx - input_dtypes[ - next((name for name in names if "/" not in name), names[0]) - ] = key.get_element_type().get_type_name() + input_dtypes[next((name for name in names if "/" not in name), names[0])] = ( + key.get_element_type().get_type_name() + ) self.input_names = input_names self.input_dtypes = input_dtypes @@ -95,9 +95,9 @@ def __init__( for idx, key in enumerate(model.outputs): names = tuple(key.get_names()) output_names[next((name for name in names if "/" not in name), names[0])] = idx - output_dtypes[ - next((name for name in names if "/" not in name), names[0]) - ] = key.get_element_type().get_type_name() + output_dtypes[next((name for name in names if "/" not in name), names[0])] = ( + key.get_element_type().get_type_name() + ) self.output_names = output_names self.output_dtypes = output_dtypes