From 160f65c01a17b021105db0b423f288838bfc3c55 Mon Sep 17 00:00:00 2001
From: jiqing-feng
Date: Wed, 12 Mar 2025 12:24:00 +0000
Subject: [PATCH 1/3] upgrade transformers to 4.49 for patching models

Signed-off-by: jiqing-feng
---
 .github/workflows/test_ipex.yml          |  2 +-
 optimum/exporters/ipex/cache_utils.py    |  6 ++++--
 optimum/exporters/ipex/model_patcher.py  |  4 ++--
 optimum/exporters/ipex/modeling_utils.py | 25 +++++++++++++-----------
 4 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml
index b4cf5a16b9..3eea98ee54 100644
--- a/.github/workflows/test_ipex.yml
+++ b/.github/workflows/test_ipex.yml
@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        transformers-version: ["4.47.*"]
+        transformers-version: ["4.49.0"]
         torch-version: ["2.6.0"]
 
     runs-on: ubuntu-22.04
diff --git a/optimum/exporters/ipex/cache_utils.py b/optimum/exporters/ipex/cache_utils.py
index 7b8ab1cc7f..3c83d24c99 100755
--- a/optimum/exporters/ipex/cache_utils.py
+++ b/optimum/exporters/ipex/cache_utils.py
@@ -1,6 +1,7 @@
 import os
 from typing import List, Optional, Tuple
 
+import intel_extension_for_pytorch as ipex
 import torch
 from intel_extension_for_pytorch.llm.modules import PagedAttention
 from transformers import Cache, PretrainedConfig
@@ -38,13 +39,14 @@ def __init__(
         config: PretrainedConfig,
         max_batch_size: int,
         max_cache_len: int,
-        device,
+        device=None,
         dtype=None,
-        layer_device_map=None,
         **kwargs,
     ) -> None:
         super().__init__()
         self.max_batch_size = max_batch_size
+        default_device = torch.device("xpu") if ipex._C._has_xpu() else torch.device("cpu")
+        device = device or default_device
         self.device = device
         self._supports_flash_decoding = (
             is_ipex_version(">", "2.4.99") if device.type == "cpu" else is_ipex_version(">", "2.5.99")
diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py
index 506436bba0..a11a138c6b 100644
--- a/optimum/exporters/ipex/model_patcher.py
+++ b/optimum/exporters/ipex/model_patcher.py
@@ -46,8 +46,8 @@
 
 
 # Please also update in the setup.py and .github/workflows/test_ipex.yml if you change the transformers version
-_TRANSFORMERS_MIN_VERSION = "4.47.0"
-_TRANSFORMERS_MAX_VERSION = "4.47.99"
+_TRANSFORMERS_MIN_VERSION = "4.49.0"
+_TRANSFORMERS_MAX_VERSION = "4.49.0"
 
 _IPEX_EXPORTED_GENERATION_TASKS = ("text-generation",)
 
diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py
index 7ad4a13429..e0c7e0b3cc 100755
--- a/optimum/exporters/ipex/modeling_utils.py
+++ b/optimum/exporters/ipex/modeling_utils.py
@@ -346,8 +346,8 @@ def _falcon_model_forward(
 
     # Prepare head mask if needed
     # 1.0 in head_mask indicate we keep the head
-    # attention_probs has shape batch_size x num_heads x N x N
-    # head_mask has shape n_layer x batch x num_heads x N x N
+    # attention_probs has shape batch_size x num_attention_heads x N x N
+    # head_mask has shape n_layer x batch x num_attention_heads x N x N
     head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
 
     hidden_states = inputs_embeds
@@ -707,7 +707,9 @@ def __init__(self, module, device, config) -> None:
         _setattr_from_module(self, module)
         self.config = config
         self.module_device = device
-        self.num_groups = self.num_heads // self.num_key_value_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_attention_heads = config.num_attention_heads
+        self.num_groups = self.num_attention_heads // self.num_key_value_heads
         self.kv_head_mapping = torch.arange(
             0, self.num_key_value_heads, dtype=torch.int32, device=self.module_device
         ).repeat_interleave(self.num_groups)
@@ -894,11 +896,11 @@ def __init__(self, module, device, config) -> None:
     def qkv_gemm(self, hidden_states):
         if hasattr(self, "concat_qkv"):
             qkv_out = self.concat_qkv(hidden_states)
-            query = qkv_out[:, : self.q_slice].view(-1, self.num_heads, self.head_dim)
+            query = qkv_out[:, : self.q_slice].view(-1, self.num_attention_heads, self.head_dim)
             key = qkv_out[:, self.q_slice : self.k_slice].view(-1, self.num_key_value_heads, self.head_dim)
             value = qkv_out[:, self.k_slice :].view(-1, self.num_key_value_heads, self.head_dim)
         else:
-            query = self.q_proj(hidden_states).view(-1, self.num_heads, self.head_dim)
+            query = self.q_proj(hidden_states).view(-1, self.num_attention_heads, self.head_dim)
             key = self.k_proj(hidden_states).view(-1, self.num_key_value_heads, self.head_dim)
             value = self.v_proj(hidden_states).view(-1, self.num_key_value_heads, self.head_dim)
 
@@ -916,12 +918,14 @@ def __init__(self, module, device, config):
     def qkv_gemm(self, hidden_states):
         qkv_out = self.query_key_value(hidden_states)
         if self.new_decoder_architecture:
-            qkv_out = qkv_out.view(qkv_out.shape[0], -1, self.num_heads // self.num_kv_heads + 2, self.head_dim)
+            qkv_out = qkv_out.view(
+                qkv_out.shape[0], -1, self.num_attention_heads // self.num_kv_heads + 2, self.head_dim
+            )
             query = qkv_out[:, :, :-2, :].flatten(1, 2)
             key = qkv_out[:, :, [-2], :].flatten(1, 2)
             value = qkv_out[:, :, [-1], :].flatten(1, 2)
         else:
-            query = qkv_out[:, : self.q_slice].view(-1, self.num_heads, self.head_dim)
+            query = qkv_out[:, : self.q_slice].view(-1, self.num_attention_heads, self.head_dim)
             key = qkv_out[:, self.q_slice : self.k_slice].view(-1, self.num_key_value_heads, self.head_dim)
             value = qkv_out[:, self.k_slice :].view(-1, self.num_key_value_heads, self.head_dim)
         return query, key, value
@@ -929,7 +933,6 @@ def qkv_gemm(self, hidden_states):
 
 class _IPEXGPT2Attention(_IPEXAttention):
     def __init__(self, module, device, config) -> None:
-        self.num_key_value_heads = config.num_key_value_heads
         super().__init__(module, device, config)
         _setattr_from_module(self, module)
         if getattr(config, "quantization_config", None) is None:
@@ -952,9 +955,9 @@ def qkv_gemm(self, hidden_states):
             query, key, value = self.c_attn_linear(hidden_states).split(self.split_size, dim=-1)
         else:
             query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=-1)
-        query = query.view(-1, self.num_heads, self.head_dim)
-        key = key.view(-1, self.num_heads, self.head_dim)
-        value = value.view(-1, self.num_heads, self.head_dim)
+        query = query.view(-1, self.num_attention_heads, self.head_dim)
+        key = key.view(-1, self.num_attention_heads, self.head_dim)
+        value = value.view(-1, self.num_attention_heads, self.head_dim)
         return query, key, value
 
     def rope(self, query, key, *args, **kwargs):
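Patch 1 makes `device` optional on the paged-cache constructor, falling back to XPU only when the installed IPEX build reports XPU support, and reads `num_attention_heads`/`num_key_value_heads` from the config instead of relying on attributes that newer transformers releases no longer set on attention modules. A minimal sketch of the new device fallback, with a hypothetical `has_xpu` flag standing in for `ipex._C._has_xpu()` so the snippet runs without IPEX installed:

```python
import torch

def resolve_cache_device(device=None, has_xpu=False):
    # Mirror the patched __init__: prefer an explicit device, otherwise
    # default to XPU when the IPEX build supports it, else CPU.
    default_device = torch.device("xpu") if has_xpu else torch.device("cpu")
    return device or default_device

# On a CPU-only build the cache lands on CPU; an explicit device always wins.
assert resolve_cache_device(has_xpu=False).type == "cpu"
assert resolve_cache_device(torch.device("cpu"), has_xpu=True).type == "cpu"
```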
From 1b0dc0dc3664bd788fa4edc5bdc2eeffa74a029c Mon Sep 17 00:00:00 2001
From: jiqing-feng
Date: Wed, 12 Mar 2025 12:31:12 +0000
Subject: [PATCH 2/3] update setup

Signed-off-by: jiqing-feng
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 12871e962a..1431c861f4 100644
--- a/setup.py
+++ b/setup.py
@@ -67,7 +67,7 @@
     "nncf": ["nncf>=2.14.0"],
     "openvino": ["nncf>=2.14.0", "openvino>=2024.5.0", "openvino-tokenizers>=2024.5.0"],
     "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate", "transformers<4.46"],
-    "ipex": ["intel-extension-for-pytorch>=2.4", "transformers>4.46,<4.48", "accelerate"],
+    "ipex": ["intel-extension-for-pytorch>=2.4", "transformers>4.48,<4.50", "accelerate"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,
     "tests": TESTS_REQUIRE,
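Patch 2 widens the `ipex` extra to `transformers>4.48,<4.50`, consistent with the `4.49.0` min/max versions pinned in `model_patcher.py` by patch 1. A rough sketch of how that range could be verified at runtime; the use of `packaging` here is an assumption for illustration, not something the patch itself adds:

```python
from importlib.metadata import version

from packaging.version import Version

installed = Version(version("transformers"))
# The setup.py pin "transformers>4.48,<4.50" expressed as an explicit check.
if not (Version("4.48") < installed < Version("4.50")):
    raise RuntimeError(f"transformers {installed} is outside the range supported by the ipex extra")
```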
["intel-extension-for-pytorch>=2.4", "transformers>4.48,<4.50", "accelerate"], "diffusers": ["diffusers"], "quality": QUALITY_REQUIRE, "tests": TESTS_REQUIRE, From 15ddb8fbf474692065467145dd60166b9821eac1 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 17 Mar 2025 16:24:41 +0000 Subject: [PATCH 3/3] fix ipex version check Signed-off-by: jiqing-feng --- optimum/intel/ipex/modeling_base.py | 11 +++++++++++ setup.py | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index f9e7189899..c27a87bdba 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -71,6 +71,17 @@ _COMPILE_NOT_READY_MODEL_TYPES = ("llama", "falcon", "gpt2", "qwen2") +try: + import intel_extension_for_pytorch as ipex + + if hasattr(torch, "xpu") and torch.xpu.is_available() and not ipex._C._has_xpu(): + logger.warning( + "Detect you have XPU device but the ipex do not support XPU, please install a xpu version ipex by checking https://pytorch-extension.intel.com/installation?platform=gpu" + ) +except ImportError: + logger.warning("No intel_extension_for_pytorch found, please `pip install intel_extension_for_pytorch`") + + def _is_patched_with_ipex(model, task, use_cache: bool = True): if is_ipex_version("<", _IPEX_MINIMUM_VERSION_FOR_PATCHING): return False diff --git a/setup.py b/setup.py index 1431c861f4..60851f39a1 100644 --- a/setup.py +++ b/setup.py @@ -67,7 +67,7 @@ "nncf": ["nncf>=2.14.0"], "openvino": ["nncf>=2.14.0", "openvino>=2024.5.0", "openvino-tokenizers>=2024.5.0"], "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate", "transformers<4.46"], - "ipex": ["intel-extension-for-pytorch>=2.4", "transformers>4.48,<4.50", "accelerate"], + "ipex": ["intel-extension-for-pytorch>=2.6", "transformers>4.48,<4.50", "accelerate"], "diffusers": ["diffusers"], "quality": QUALITY_REQUIRE, "tests": TESTS_REQUIRE,