from optimum.intel.utils.import_utils import is_ipex_version


-def llama_layer_norm_forward(self, hidden_states):
+# Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L83
+def _llama_layer_norm_forward(self, hidden_states):
     return torch.ops.torch_ipex.rmsnorm(hidden_states, self.weight, self.variance_epsilon)

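For reference, a minimal eager-mode sketch of the computation that the fused torch.ops.torch_ipex.rmsnorm call replaces, following the upstream LlamaRMSNorm.forward; the name _rmsnorm_reference is illustrative and not part of this diff:

def _rmsnorm_reference(self, hidden_states):
    # Plain PyTorch RMSNorm: normalize by the root mean square in float32,
    # then scale by the learned weight and cast back to the input dtype.
    input_dtype = hidden_states.dtype
    hidden_states = hidden_states.to(torch.float32)
    variance = hidden_states.pow(2).mean(-1, keepdim=True)
    hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
    return self.weight * hidden_states.to(input_dtype)
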
-def llama_attn_forward(
+# Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L321
+def _llama_attn_forward(
     self,
     hidden_states: torch.Tensor,
     attention_mask: Optional[torch.Tensor] = None,
@@ -111,7 +113,8 @@ def llama_attn_forward(
     return attn_output, attn_weights, past_key_value

-def llama_model_forward(
+# Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L1130
+def _llama_model_forward(
     self,
     input_ids: torch.LongTensor = None,
     attention_mask: Optional[torch.Tensor] = None,
@@ -168,9 +171,6 @@ def llama_model_forward(
     # embed positions
     hidden_states = inputs_embeds

-    if self.gradient_checkpointing and self.training:
-        use_cache = False
-
     # decoder layers
     all_hidden_states = () if output_hidden_states else None
     all_self_attns = () if output_attentions else None
@@ -182,25 +182,14 @@ def llama_model_forward(

         past_key_value = past_key_values[idx] if past_key_values is not None else None

-        if self.gradient_checkpointing and self.training:
-            layer_outputs = self._gradient_checkpointing_func(
-                decoder_layer.__call__,
-                hidden_states,
-                attention_mask,
-                position_ids,
-                past_key_value,
-                output_attentions,
-                use_cache,
-            )
-        else:
-            layer_outputs = decoder_layer(
-                hidden_states,
-                attention_mask=attention_mask,
-                position_ids=position_ids,
-                past_key_value=past_key_value,
-                output_attentions=output_attentions,
-                use_cache=use_cache,
-            )
+        layer_outputs = decoder_layer(
+            hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+        )

         hidden_states = layer_outputs[0]
@@ -227,6 +216,7 @@ def llama_model_forward(
     )

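For illustration only, a hedged sketch of how the patched forwards defined above could be bound onto an already-loaded transformers Llama model via types.MethodType; the checkpoint name is just an example, and the actual patching helper used by optimum-intel may differ:

import types
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
llama = model.model  # the inner LlamaModel
# Replace the model-level forward and every RMSNorm / attention forward
# with the IPEX-oriented variants defined in this file.
llama.forward = types.MethodType(_llama_model_forward, llama)
llama.norm.forward = types.MethodType(_llama_layer_norm_forward, llama.norm)
for layer in llama.layers:
    layer.self_attn.forward = types.MethodType(_llama_attn_forward, layer.self_attn)
    layer.input_layernorm.forward = types.MethodType(_llama_layer_norm_forward, layer.input_layernorm)
    layer.post_attention_layernorm.forward = types.MethodType(_llama_layer_norm_forward, layer.post_attention_layernorm)
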
+# Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L694
 class _IPEXLlamaDecoderLayerRef(nn.Module):
     def __init__(self, module, config, distributed=False):
         if is_ipex_version("<=", "2.3.0"):