from huggingface_hub import hf_hub_download
from intel_extension_for_pytorch.cpu._auto_kernel_selection import _enable_tpp
from intel_extension_for_pytorch.transformers.optimize import get_dummy_input
+ from packaging import version
from transformers import (
    AutoConfig,
    AutoModel,
    ...
from optimum.modeling_base import OptimizedModel
from optimum.utils import NormalizedConfigManager

- from ...exporters.ipex import export_model
+ from ...exporters.ipex.model_patcher import IPEX_EXPORTED_ARCH, IPEX_EXPORTED_TASK, _patch_model
from ..generation.modeling import jit_trace, prepare_jit_inputs
from ..utils.import_utils import is_torch_version, is_transformers_version
from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS, patch_decoder_attention_mask
logger = logging.getLogger(__name__)


- IPEX_EXPORTED_LIST = ("LlamaForCausalLM",)
+ IPEX_SUPPORT_MODEL_TYPES = ("llama",)


- def is_ipex_exported_model(model_name):
-     for name in IPEX_EXPORTED_LIST:
-         if model_name == name:
-             return True
-     return False
+ def is_model_support_ipex_export(model, task):
+     if isinstance(model, torch.jit.ScriptModule):
+         is_ipex_exported = model.original_name in IPEX_EXPORTED_ARCH
+     else:
+         is_ipex_exported = model.config.model_type in IPEX_SUPPORT_MODEL_TYPES and task in IPEX_EXPORTED_TASK
+
+     return is_ipex_exported
+
+
+ def ipex_jit_trace(model, task, use_cache):
+     if version.parse(ipex.__version__) <= version.parse("2.3.0") or not is_model_support_ipex_export(model, task):
+         model = patch_decoder_attention_mask(model)
+         model = ipex.optimize(model, dtype=model.dtype, level="O1", auto_kernel_selection=True)
+         return jit_trace(model, task, use_cache)

+     if is_torch_version("<", "2.1.0"):
+         raise ImportError("`torch>=2.1.0` is needed to trace your model")

- def ipex_jit_trace(model):
+     model = _patch_model(model)
    sample_inputs = get_dummy_input(model, return_dict=True)
    model.config.return_dict = False
    _enable_tpp()
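
To make the new dispatch concrete, here is a small, hedged sketch of the support check defined above. The import path, checkpoint id, and task name are placeholder assumptions for illustration only; they are not taken from this diff.

```python
# Hedged sketch: exercising is_model_support_ipex_export() from the hunk above.
# The module path, checkpoint id, and task name are placeholder assumptions.
from transformers import AutoConfig, AutoModelForCausalLM

from optimum.intel.ipex.modeling_base import is_model_support_ipex_export  # assumed module path

config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-LlamaForCausalLM")
model = AutoModelForCausalLM.from_config(config)

# Truthy only when both the model type ("llama") and the task are in the
# exporter's supported lists; otherwise ipex_jit_trace() falls back to
# ipex.optimize() + jit_trace().
print(is_model_support_ipex_export(model, task="text-generation"))
```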
@@ -104,7 +116,7 @@ def __init__(
        self._dtype = self.config.torch_dtype if self.config.torch_dtype is not None else torch.float32
        self.model.to(self._device)
        self.model_save_dir = model_save_dir
-         self.is_ipex_exported = kwargs.get("is_ipex_exported", None)
+         self.is_ipex_exported = is_model_support_ipex_export(model, self.export_feature)

        self.input_names = {
            inputs.debugName().split(".")[0] for inputs in model.graph.inputs() if inputs.debugName() != "self"
@@ -148,14 +160,7 @@ def _from_transformers(
        }

        model = TasksManager.get_model_from_task(task, model_id, **model_kwargs)
-         is_ipex_exported = is_ipex_exported_model(model.__class__.__name__)
-         if is_ipex_exported:
-             model = export_model(model)
-             traced_model = ipex_jit_trace(model)
-         else:
-             model = patch_decoder_attention_mask(model)
-             model = ipex.optimize(model, dtype=torch_dtype, level="O1", auto_kernel_selection=True)
-             traced_model = jit_trace(model, task, use_cache)
+         traced_model = ipex_jit_trace(model, task, use_cache)

        save_dir = TemporaryDirectory()
        save_dir_path = Path(save_dir.name)
@@ -173,7 +178,6 @@ def _from_transformers(
            local_files_only=local_files_only,
            use_cache=use_cache,
            model_dtype=torch_dtype,
-             is_ipex_exported=is_ipex_exported,
        )

    @classmethod
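
As a hedged end-to-end sketch of the loading path the hunks above simplify (the checkpoint id is a placeholder, and routing `export=True` through `_from_transformers` follows optimum's usual loading convention):

```python
from transformers import AutoTokenizer
from optimum.intel import IPEXModelForCausalLM

model_id = "meta-llama/Llama-2-7b-hf"  # placeholder: any llama checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)

# export=True goes through _from_transformers, which now always delegates the
# tracing decision to ipex_jit_trace(model, task, use_cache).
model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True)

inputs = tokenizer("Hello, my name is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=8)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```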
@@ -210,8 +214,6 @@ def _from_pretrained(

        model = torch.jit.load(model_cache_path)
        torch.jit.freeze(model.eval())
-         is_ipex_exported = is_ipex_exported_model(model.original_name)
-         kwargs["is_ipex_exported"] = is_ipex_exported

        return cls(model, config=config, model_save_dir=model_save_dir, **kwargs)
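
For completeness, a hedged sketch of the reload path this hunk touches: once exported and saved, the TorchScript model can be loaded back without re-export, and the support flag is now recomputed from the loaded module rather than passed through kwargs. The save directory is a placeholder, and `model` refers to the export sketch above.

```python
from optimum.intel import IPEXModelForCausalLM

save_dir = "./ipex_llama_exported"  # placeholder path
model.save_pretrained(save_dir)     # `model` from the export sketch above

# Reloading goes through _from_pretrained -> torch.jit.load; __init__ then derives
# is_ipex_exported from the ScriptModule's original_name via is_model_support_ipex_export.
reloaded = IPEXModelForCausalLM.from_pretrained(save_dir)
```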
@@ -372,7 +374,6 @@ def __init__(
        model_type = config.model_type.replace("_", "-")
        self.normalized_config = NormalizedConfigManager.get_normalized_config_class(model_type)(config)
        self.use_cache = "past_key_values" in self.input_names
-         self.is_ipex_exported = kwargs.get("is_ipex_exported", None)

        if use_cache ^ self.use_cache:
            raise ValueError(
@@ -422,7 +423,11 @@ def _prepare_past_key_values(self, input_ids):
        num_attention_heads = self.normalized_config.num_attention_heads

        if self.is_ipex_exported:
-             beam_idx_tmp = torch.zeros((2048, input_ids.shape[0]), dtype=torch.long).contiguous()
+             # The indirect-access KV cache has a different data layout than most transformers models,
+             # see https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/llm.html#indirect-access-kv-cache
+             beam_idx_tmp = torch.zeros(
+                 (self.config.max_position_embeddings, input_ids.shape[0]), dtype=torch.long
+             ).contiguous()
            past_key_values = tuple(
                [
                    (
@@ -562,8 +567,8 @@ def _prepare_inputs_for_generation_for_llama(
def _ipex_reorder_cache(
    past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
) -> Tuple[Tuple[torch.Tensor]]:
-
-     if len(past_key_values[0]) == 4 and past_key_values[0][0].shape[-1] == 1:  # discrete kv_cache
+     # The IPEX-patched model uses an indirect-access KV cache, whose shape differs from that of other transformers models
+     if len(past_key_values[0]) == 4 and past_key_values[0][0].shape[-1] == 1:
        for layer_past in past_key_values:
            layer_past[3][layer_past[0].size(-2) - 1] = beam_idx
        return past_key_values
@@ -577,8 +582,3 @@ def _ipex_reorder_cache(
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            for layer_past in past_key_values
        )
-
-     return tuple(
-         tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
-         for layer_past in past_key_values
-     )
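
For context on the two branches above, a hedged illustration only; the tensor shapes are assumptions and are not taken from IPEX internals. An indirect-access KV cache entry is a 4-tuple whose first tensor has a trailing dimension of 1, which is what `_ipex_reorder_cache` keys on before rewriting the beam-index buffer in place; standard caches take the `index_select` path instead.

```python
import torch

# Assumed, illustrative shapes for one layer of an indirect-access KV cache entry:
max_positions, batch_size = 2048, 1
dummy_iakv_layer = (
    torch.zeros(1, 0, 0, 1),                                   # sequence-tracking tensor, trailing dim == 1
    torch.zeros(1, 1, 1, 1),                                   # key cache placeholder
    torch.zeros(1, 1, 1, 1),                                   # value cache placeholder
    torch.zeros(max_positions, batch_size, dtype=torch.long),  # beam-index buffer (see _prepare_past_key_values)
)
past_key_values = (dummy_iakv_layer,)

# Mirrors the check in _ipex_reorder_cache: a 4-tuple whose first tensor ends in a
# dimension of size 1 marks the IPEX layout, so only the beam-index buffer is updated.
is_iakv = len(past_key_values[0]) == 4 and past_key_values[0][0].shape[-1] == 1
print(is_iakv)  # True
```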