@@ -178,12 +178,6 @@ def _llama_model_forward(
 # Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L321
 class _IPEXLlamaAttentionRef(nn.Module):
     def __init__(self, module, config, distributed=False) -> None:
-        if is_ipex_version("<", "2.3.0"):
-            raise ImportError(
-                "Only ipex version > 2.3.0 supports LinearAdd, IndirectAccessKVCacheAttention, RotaryEmbedding"
-            )
-        from intel_extension_for_pytorch.llm.modules import IndirectAccessKVCacheAttention, LinearAdd, RotaryEmbedding
-
         super().__init__()
         for k, v in module.__dict__.items():
             setattr(self, k, v)
@@ -193,19 +187,33 @@ def __init__(self, module, config, distributed=False) -> None:
             setattr(self.__class__, k, getattr(module.__class__, k))
         self.config = config
         self.distributed = distributed
-        if not self.distributed:
-            self.mha_linear_add = LinearAdd(self.o_proj)
-            del self.__dict__["_modules"]["o_proj"]
-        self.ipex_scale_dot_product = IndirectAccessKVCacheAttention(
-            text_max_length=module.config.max_position_embeddings
-        )
-        self.ipex_rope = RotaryEmbedding(
-            module.config.max_position_embeddings,
-            module.config.hidden_size // module.config.num_attention_heads,
-            module.config.rope_theta,
-            module.config.architectures[0],
-        )
-
+        self.module_device = module.q_proj.weight.device.type
+        if self.module_device == "xpu":
+            from intel_extension_for_pytorch.transformers.models.xpu.fusions.mha_fusion import _IPEXRopeXPU
+            self.ipex_rope = _IPEXRopeXPU(
+                module.config.max_position_embeddings,
+                module.config.hidden_size // module.config.num_attention_heads,
+                module.config.rope_theta,
+                module.config.architectures[0],
+            )
+            self.port_parameters(module)
+            torch.xpu.empty_cache()
+        else:
+            from intel_extension_for_pytorch.llm.modules import IndirectAccessKVCacheAttention, LinearAdd, RotaryEmbedding
+            if not self.distributed:
+                self.mha_linear_add = LinearAdd(self.o_proj)
+                del self.__dict__["_modules"]["o_proj"]
+            self.ipex_scale_dot_product = IndirectAccessKVCacheAttention(
+                text_max_length=module.config.max_position_embeddings
+            )
+            self.ipex_rope = RotaryEmbedding(
+                module.config.max_position_embeddings,
+                module.config.hidden_size // module.config.num_attention_heads,
+                module.config.rope_theta,
+                module.config.architectures[0],
+            )
+
+
     def forward(
         self,
         hidden_states: torch.Tensor,
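The rewritten `__init__` now branches on the device that hosts the projection weights: on `"xpu"` it builds the `_IPEXRopeXPU` fusion and repacks the weights via `port_parameters`, otherwise it keeps the previous CPU path with `LinearAdd`, `IndirectAccessKVCacheAttention` and `RotaryEmbedding`. For reference, the four constructor arguments passed to either rotary-embedding fusion all come straight from the HF model config; a minimal sketch (default `LlamaConfig` used purely for illustration, not part of the patch):

```python
from transformers import LlamaConfig

config = LlamaConfig()  # default LLaMA-7B-like hyperparameters, illustration only
max_pos = config.max_position_embeddings                     # rotary table length (2048 by default)
head_dim = config.hidden_size // config.num_attention_heads  # per-head dimension, 4096 // 32 = 128
theta = config.rope_theta                                    # RoPE base frequency (10000.0 by default)
arch = config.architectures[0] if config.architectures else "LlamaForCausalLM"
```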
@@ -310,9 +318,60 @@ def forward(
         attn_weights = None
 
         return attn_output, past_key_value, attn_weights
-
-
-class _IPEXLlamaMLP(nn.Module):
+
+    def port_parameters(self, module):
+        self.qkv_proj_bias = None
+        self.qkv_proj_weight = None
+        if self.num_heads == self.num_key_value_heads:
+            q_proj = module.q_proj.weight.transpose(0, 1)
+            k_proj = module.k_proj.weight.transpose(0, 1)
+            v_proj = module.v_proj.weight.transpose(0, 1)
+            self.qkv_proj_weight = torch.stack([q_proj, k_proj, v_proj]).contiguous().view([3, -1, q_proj.shape[-1]])
+            module.q_proj.weight.data = self.qkv_proj_weight[0, :, :].transpose(0, 1)
+            module.k_proj.weight.data = self.qkv_proj_weight[1, :, :].transpose(0, 1)
+            module.v_proj.weight.data = self.qkv_proj_weight[2, :, :].transpose(0, 1)
+            if module.q_proj.bias is not None:
+                self.qkv_proj_bias = (
+                    torch.stack([module.q_proj.bias, module.k_proj.bias, module.v_proj.bias])
+                    .contiguous()
+                    .view([3, -1])
+                )
+                module.q_proj.bias.data = self.qkv_proj_bias[0]
+                module.k_proj.bias.data = self.qkv_proj_bias[1]
+                module.v_proj.bias.data = self.qkv_proj_bias[2]
+        else:
+            q_proj = module.q_proj.weight.view(self.num_kv_heads, self.num_key_value_groups, self.head_dim, self.embed_dim)
+            k_proj = module.k_proj.weight.view(self.num_kv_heads, 1, self.head_dim, self.embed_dim)
+            v_proj = module.v_proj.weight.view(self.num_kv_heads, 1, self.head_dim, self.embed_dim)
+            self.qkv_proj_weight = torch.cat([q_proj, k_proj, v_proj], dim=1).view(
+                [self.num_kv_heads, self.num_key_value_groups + 2, self.head_dim, self.embed_dim]
+            )
+            module.q_proj.data = self.qkv_proj_weight[:, :self.num_key_value_groups, :, :].view(
+                [self.num_kv_heads * self.num_key_value_groups * self.head_dim, self.embed_dim]
+            )
+            module.k_proj.data = self.qkv_proj_weight[:, self.num_key_value_groups, :, :].view(
+                [self.num_kv_heads * self.head_dim, self.embed_dim]
+            )
+            module.v_proj.data = self.qkv_proj_weight[:, self.num_key_value_groups + 1, :, :].view(
+                [self.num_kv_heads * self.head_dim, self.embed_dim]
+            )
+            if module.q_proj.bias is not None:
+                q_bias = module.q_proj.bias.view(self.num_kv_heads, self.num_key_value_groups, self.head_dim)
+                k_bias = module.k_proj.bias.view(self.num_kv_heads, 1, self.head_dim)
+                v_bias = module.v_proj.bias.view(self.num_kv_heads, 1, self.head_dim)
+                self.qkv_proj_bias = torch.cat([q_bias, k_bias, v_bias], dim=1).view(
+                    [self.num_kv_heads, self.num_key_value_groups + 2, self.head_dim]
+                )
+                module.q_proj.bias.data = self.qkv_proj_bias[:, :self.num_key_value_groups, self.head_dim].view(-1)
+                module.k_proj.bias.data = self.qkv_proj_bias[:, self.num_key_value_groups, self.head_dim].view(-1)
+                module.v_proj.bias.data = self.qkv_proj_bias[:, self.num_key_value_groups + 1, self.head_dim].view(-1)
+
+        self.o_proj_weight = module.o_proj.weight.transpose(0, 1).contiguous()
+        module.o_proj.weight.data = self.o_proj_weight.transpose(0, 1)
+        self.o_proj_bias = module.o_proj.bias
+
+
+class _IPEXLlamaMLPRef(nn.Module):
     def __init__(self, module, config, distributed=False) -> None:
         if is_ipex_version("<", "2.3.0"):
             raise ImportError("Only ipex version > 2.3.0 supports Linear2SiluMul and LinearAdd")
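For reference, the MHA branch of the new `port_parameters` stacks the transposed Q/K/V weights into a single contiguous `[3, in_features, out_features]` buffer and then points the original `nn.Linear` weights back at transposed slices of it, so the module keeps producing the same projections. A minimal plain-PyTorch sketch of that round trip (toy sizes, my own check, not taken from the patch):

```python
import torch

hidden = 64  # toy size, illustration only
q_w = torch.randn(hidden, hidden)  # q_proj/k_proj/v_proj weights, shape [out_features, in_features]
k_w = torch.randn(hidden, hidden)
v_w = torch.randn(hidden, hidden)

# Same packing as port_parameters when num_heads == num_key_value_heads:
# stack the transposed weights, then view as [3, in_features, out_features].
packed = torch.stack([q_w.t(), k_w.t(), v_w.t()]).contiguous().view([3, -1, q_w.t().shape[-1]])

# Each original projection is recoverable as a transposed slice of the packed buffer.
assert torch.equal(packed[0].t(), q_w)
assert torch.equal(packed[1].t(), k_w)
assert torch.equal(packed[2].t(), v_w)
```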
@@ -327,31 +386,50 @@ def __init__(self, module, config, distributed=False) -> None:
327
386
setattr (self .__class__ , k , getattr (module .__class__ , k ))
328
387
self .config = config
329
388
self .distributed = distributed
330
- if not self .distributed :
331
- self .mlp_linear_add = LinearAdd (module .down_proj )
332
- del self .__dict__ ["_modules" ]["down_proj" ]
333
- self .linear_silu_mul = Linear2SiluMul (module .gate_proj , module .up_proj )
334
- del self .__dict__ ["_modules" ]["gate_proj" ]
335
- del self .__dict__ ["_modules" ]["up_proj" ]
389
+ self .module_device = module .gate_proj .weight .device .type
390
+ if self .module_device == "xpu" :
391
+ self .port_parameter (module )
392
+ torch .xpu .empty_cache ()
393
+ else :
394
+ if not self .distributed :
395
+ self .mlp_linear_add = LinearAdd (module .down_proj )
396
+ del self .__dict__ ["_modules" ]["down_proj" ]
397
+ self .linear_silu_mul = Linear2SiluMul (module .gate_proj , module .up_proj )
398
+ del self .__dict__ ["_modules" ]["gate_proj" ]
399
+ del self .__dict__ ["_modules" ]["up_proj" ]
336
400
337
401
def forward (self , hidden_states : torch .Tensor , residual : torch .Tensor = None , ** kwargs ):
338
402
"""
339
403
Args:
340
404
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
341
405
"""
342
- if hasattr (self , "linear_silu_mul" ):
343
- mlp_gate = self .linear_silu_mul (hidden_states )
344
- if hasattr (self , "mlp_linear_add" ):
345
- hidden_states = self .mlp_linear_add (mlp_gate , residual )
346
- else :
347
- hidden_states = self .down_proj (mlp_gate )
348
- hidden_states = residual + hidden_states
406
+ if self .module_device == "xpu" :
407
+ up = torch .ops .torch_ipex .mm_silu (hidden_states , self .gate_proj_weight )
408
+ hidden_states = torch .ops .torch_ipex .mm_resmul (hidden_states , self .up_proj_weight , up )
409
+ hidden_states = matmul_add_add (hidden_states , self .down_proj_weight , self .down_proj_bias , residual )
349
410
else :
350
- hidden_states = self .down_proj (self .act_fn (self .gate_proj (hidden_states )) * self .up_proj (hidden_states ))
351
- hidden_states = residual + hidden_states
352
-
411
+ if hasattr (self , "linear_silu_mul" ):
412
+ mlp_gate = self .linear_silu_mul (hidden_states )
413
+ if hasattr (self , "mlp_linear_add" ):
414
+ hidden_states = self .mlp_linear_add (mlp_gate , residual )
415
+ else :
416
+ hidden_states = self .down_proj (mlp_gate )
417
+ hidden_states = residual + hidden_states
418
+ else :
419
+ hidden_states = self .down_proj (self .act_fn (self .gate_proj (hidden_states )) * self .up_proj (hidden_states ))
420
+ hidden_states = residual + hidden_states
353
421
return hidden_states
354
422
423
+ def port_parameter (self , module ):
424
+ self .up_proj_weight = module .up_proj .weight .transpose (0 , 1 ).contiguous ()
425
+ module .up_proj .weight .data = self .up_proj_weight .transpose (0 , 1 )
426
+ self .gate_proj_weight = module .gate_proj .weight .transpose (0 , 1 ).contiguous ()
427
+ module .gate_proj .weight .data = self .gate_proj_weight .transpose (0 , 1 )
428
+ self .down_proj_weight = module .down_proj .weight .transpose (0 , 1 ).contiguous ()
429
+ module .down_proj .weight .data = self .down_proj_weight .transpose (0 , 1 )
430
+ self .up_proj_bias = module .up_proj .bias
431
+ self .gate_proj_bias = module .gate_proj .bias
432
+ self .down_proj_bias = module .down_proj .bias
355
433
356
434
# Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L694
357
435
class _IPEXLlamaDecoderLayerRef (nn .Module ):
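The XPU branch of `_IPEXLlamaMLPRef.forward` expresses the LLaMA SwiGLU MLP through fused `torch_ipex` ops acting on the pre-transposed weights stored by `port_parameter`. Assuming `mm_silu` fuses matmul+SiLU, `mm_resmul` fuses matmul+elementwise multiply, and `matmul_add_add` fuses matmul+bias+residual (my reading of these ops, not stated in the patch), the result matches the eager fallback in the `else` branch, i.e. `down_proj(silu(gate_proj(x)) * up_proj(x)) + residual`. A plain-PyTorch sketch of the equivalent math:

```python
import torch
import torch.nn.functional as F

def swiglu_mlp_reference(hidden_states, gate_w, up_w, down_w, down_b, residual):
    # gate_w / up_w / down_w are the [in_features, out_features] (transposed) weights
    # that port_parameter stores for the XPU path.
    up = F.silu(hidden_states @ gate_w)   # ~ torch.ops.torch_ipex.mm_silu (assumed semantics)
    gated = (hidden_states @ up_w) * up   # ~ torch.ops.torch_ipex.mm_resmul (assumed semantics)
    out = gated @ down_w                  # matmul part of matmul_add_add
    if down_b is not None:
        out = out + down_b                # bias add of matmul_add_add
    return out + residual                 # residual add of matmul_add_add
```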
@@ -365,7 +443,7 @@ def __init__(self, module, config, distributed=False):
             setattr(self.__class__, k, getattr(module.__class__, k))
         self.distributed = distributed
         self.self_attn = _IPEXLlamaAttentionRef(module.self_attn, config, distributed)
-        self.mlp = _IPEXLlamaMLP(module.mlp, config, distributed)
+        self.mlp = _IPEXLlamaMLPRef(module.mlp, config, distributed)
     def forward(
         self,
449
self ,