From 3888824b620c37d87536d84d8b35a8be5f985ac8 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 9 Dec 2024 12:31:18 +0000 Subject: [PATCH 01/58] enable IPEXModelForSeq2SeqLM Signed-off-by: jiqing-feng --- optimum/intel/__init__.py | 2 + optimum/intel/ipex/__init__.py | 1 + optimum/intel/ipex/modeling_base.py | 85 ++++++++++++++++++++++- optimum/intel/ipex/utils.py | 1 + optimum/intel/pipelines/pipeline_base.py | 7 ++ optimum/intel/utils/dummy_ipex_objects.py | 11 +++ 6 files changed, 104 insertions(+), 3 deletions(-) diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index 0230394d29..6091243894 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -46,6 +46,7 @@ else: _import_structure["ipex"] = [ "IPEXModelForCausalLM", + "IPEXModelForSeq2SeqLM", "IPEXModelForSequenceClassification", "IPEXModelForMaskedLM", "IPEXModelForTokenClassification", @@ -237,6 +238,7 @@ IPEXModelForImageClassification, IPEXModelForMaskedLM, IPEXModelForQuestionAnswering, + IPEXModelForSeq2SeqLM, IPEXModelForSequenceClassification, IPEXModelForTokenClassification, ) diff --git a/optimum/intel/ipex/__init__.py b/optimum/intel/ipex/__init__.py index c1f711acfc..79ed4734d3 100644 --- a/optimum/intel/ipex/__init__.py +++ b/optimum/intel/ipex/__init__.py @@ -19,6 +19,7 @@ IPEXModelForImageClassification, IPEXModelForMaskedLM, IPEXModelForQuestionAnswering, + IPEXModelForSeq2SeqLM, IPEXModelForSequenceClassification, IPEXModelForTokenClassification, ) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 8611bddd21..161684d9c2 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -30,6 +30,7 @@ AutoModelForImageClassification, AutoModelForMaskedLM, AutoModelForQuestionAnswering, + AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, AutoModelForTokenClassification, GenerationConfig, @@ -57,6 +58,7 @@ logger = logging.getLogger(__name__) +_IPEX_SUPPORTED_GENERATION_TASKS = ("text-generation", "text2text-generation") _IPEX_SUPPORT_MODEL_TYPES = ("llama", "bert", "vit", "falcon", "gpt2") _IPEX_EXPORTED_GENERATION_METHODS = ("sample", "greedy_search", "beam_sample", "beam_search", "assisted_generation") _IPEX_MINIMUM_VERSION_FOR_COMPILE = "2.5.0" @@ -106,9 +108,9 @@ def __init__( # Non-generation tasks can use torch.compile to get acceleration. 
if ( - model.device.type == "cpu" - and self.export_feature not in _IPEX_EXPORTED_GENERATION_TASKS - and config.model_type not in _COMPILE_NOT_READY_MODEL_TYPES + self.model.device.type == "cpu" + and self.export_feature not in _IPEX_SUPPORTED_GENERATION_TASKS + and self.config.model_type not in _COMPILE_NOT_READY_MODEL_TYPES and is_ipex_version(">=", _IPEX_MINIMUM_VERSION_FOR_COMPILE) ): from torch._inductor import config @@ -336,6 +338,83 @@ def generate(self, *args, **kwargs): return result +class IPEXModelForSeq2SeqLM(IPEXModel, GenerationMixin): + auto_model_class = AutoModelForSeq2SeqLM + export_feature = "text2text-generation" + + def __init__( + self, + model, + config: PretrainedConfig = None, + model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + use_cache: bool = True, + **kwargs, + ): + super().__init__(model, config, model_save_dir=model_save_dir, use_cache=use_cache) + + self._supports_cache_class = getattr(model, "_supports_cache_class", None) + self._supports_sdpa = getattr(model, "_supports_sdpa", None) + self._supports_quantized_cache = getattr(model, "_supports_quantized_cache", None) + self._supports_static_cache = getattr(model, "_supports_static_cache", None) + + GenerationMixin.__init__(self) + + model_type = self.config.model_type.replace("_", "-") + self.normalized_config = NormalizedConfigManager.get_normalized_config_class(model_type)(self.config) + + self.config.is_decoder = False + self.config.is_encoder_decoder = True + + self.generation_config = GenerationConfig.from_model_config(self.config) + try: + self.model_cls = get_class_from_dynamic_module( + self.config.auto_map["AutoModelForSeq2SeqLM"], model_save_dir + ) + except AttributeError: + self.model_cls = get_model_class(self.config, AutoModelForSeq2SeqLM._model_mapping) + + if hasattr(self.model_cls, "_convert_to_standard_cache"): + self._convert_to_standard_cache = self.model_cls._convert_to_standard_cache + if ( + self._supports_static_cache + and self.model.device.type == "cpu" + and self.config.model_type not in _COMPILE_NOT_READY_MODEL_TYPES + and is_ipex_version(">=", _IPEX_MINIMUM_VERSION_FOR_COMPILE) + ): + from torch._inductor import config + + # Use static cache for torch.compile + self.model.config.cache_implementation = "static" + self.config.cache_implementation = "static" + # System level optimization + torch._inductor.config.cpp_wrapper = True + os.environ["TORCHINDUCTOR_FREEZING"] = "1" + logger.info("Enable torch.compile optimization, start warm up") + self.model.forward = torch.compile(self.model.forward) + inputs = prepare_jit_inputs(model, self.export_feature, False) + self.model.generate(**inputs, max_length=4) + self.model.generate(**inputs, max_length=4) + logger.info("Warm up end") + + @torch.no_grad() + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.FloatTensor] = None, + **kwargs, + ) -> CausalLMOutputWithPast: + return self.model(input_ids=input_ids, attention_mask=attention_mask, **kwargs) + + def _reorder_cache(self, *args, **kwargs): + return self.model._reorder_cache(*args, **kwargs) + + def prepare_inputs_for_generation(self, *args, **kwargs): + return self.model.prepare_inputs_for_generation(*args, **kwargs) + + def get_encoder(self, *args, **kwargs): + return self.model.get_encoder(*args, **kwargs) + + def _ipex_crop_past_key_values(model, past_key_values, max_length): if isinstance(model, IPEXModel) and _is_patched_with_ipex(model, "text-generation"): if isinstance(past_key_values, IPEXPagedCache): diff 
--git a/optimum/intel/ipex/utils.py b/optimum/intel/ipex/utils.py index 3d3feb3db2..23126bcd4c 100644 --- a/optimum/intel/ipex/utils.py +++ b/optimum/intel/ipex/utils.py @@ -16,6 +16,7 @@ _HEAD_TO_AUTOMODELS = { "feature-extraction": "IPEXModel", "text-generation": "IPEXModelForCausalLM", + "text2text-generation": "IPEXModelForSeq2SeqLM", "text-classification": "IPEXModelForSequenceClassification", "token-classification": "IPEXModelForTokenClassification", "question-answering": "IPEXModelForQuestionAnswering", diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py index d26d8c42b6..e301f1e995 100644 --- a/optimum/intel/pipelines/pipeline_base.py +++ b/optimum/intel/pipelines/pipeline_base.py @@ -58,6 +58,7 @@ IPEXModelForImageClassification, IPEXModelForMaskedLM, IPEXModelForQuestionAnswering, + IPEXModelForSeq2SeqLM, IPEXModelForSequenceClassification, IPEXModelForTokenClassification, ) @@ -69,6 +70,12 @@ "default": "gpt2", "type": "text", }, + "text2text-generation": { + "impl": TextGenerationPipeline, + "class": (IPEXModelForSeq2SeqLM,), + "default": "google-t5/t5-small", + "type": "text", + }, "fill-mask": { "impl": FillMaskPipeline, "class": (IPEXModelForMaskedLM,), diff --git a/optimum/intel/utils/dummy_ipex_objects.py b/optimum/intel/utils/dummy_ipex_objects.py index 4bd7eee630..2a0db565d3 100644 --- a/optimum/intel/utils/dummy_ipex_objects.py +++ b/optimum/intel/utils/dummy_ipex_objects.py @@ -70,6 +70,17 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["ipex"]) +class IPEXModelForSeq2SeqLM(metaclass=DummyObject): + _backends = ["ipex"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["ipex"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["ipex"]) + + class IPEXModelForQuestionAnswering(metaclass=DummyObject): _backends = ["ipex"] From f9fa8074ec251de2060fc4a81e1eb5805fdcef4b Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 9 Dec 2024 13:23:53 +0000 Subject: [PATCH 02/58] set static cache Signed-off-by: jiqing-feng --- optimum/intel/ipex/modeling_base.py | 16 +++++++++++----- optimum/intel/pipelines/pipeline_base.py | 16 ++++++++++++++-- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 161684d9c2..7038515ad3 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -383,17 +383,14 @@ def __init__( ): from torch._inductor import config - # Use static cache for torch.compile - self.model.config.cache_implementation = "static" - self.config.cache_implementation = "static" # System level optimization torch._inductor.config.cpp_wrapper = True os.environ["TORCHINDUCTOR_FREEZING"] = "1" logger.info("Enable torch.compile optimization, start warm up") self.model.forward = torch.compile(self.model.forward) inputs = prepare_jit_inputs(model, self.export_feature, False) - self.model.generate(**inputs, max_length=4) - self.model.generate(**inputs, max_length=4) + self.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_length=4) + self.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_length=4) logger.info("Warm up end") @torch.no_grad() @@ -405,6 +402,15 @@ def forward( ) -> CausalLMOutputWithPast: return self.model(input_ids=input_ids, attention_mask=attention_mask, **kwargs) + def _prepare_generation_config( + self, generation_config: Optional[GenerationConfig], 
**kwargs: Dict + ) -> Tuple[GenerationConfig, Dict]: + generation_config, model_kwargs = super()._prepare_generation_config(generation_config, **kwargs) + # Use static cache for torch.compile + setattr(generation_config, "cache_implementation", "static") + + return generation_config, model_kwargs + def _reorder_cache(self, *args, **kwargs): return self.model._reorder_cache(*args, **kwargs) diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py index e301f1e995..7bbcbc00cc 100644 --- a/optimum/intel/pipelines/pipeline_base.py +++ b/optimum/intel/pipelines/pipeline_base.py @@ -70,10 +70,22 @@ "default": "gpt2", "type": "text", }, + "summarization": { + "impl": SummarizationPipeline, + "class": (IPEXModelForSeq2SeqLM,), + "default": "t5-base", + "type": "text", + }, + "translation": { + "impl": TranslationPipeline, + "class": (IPEXModelForSeq2SeqLM,), + "default": "t5-small", + "type": "text", + }, "text2text-generation": { - "impl": TextGenerationPipeline, + "impl": Text2TextGenerationPipeline, "class": (IPEXModelForSeq2SeqLM,), - "default": "google-t5/t5-small", + "default": "t5-small", "type": "text", }, "fill-mask": { From 202df432124c3a0f3c132c5d4654bea20931a9d6 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 9 Dec 2024 14:42:21 +0000 Subject: [PATCH 03/58] add tests for IPEXModelForSeq2SeqLM Signed-off-by: jiqing-feng --- tests/ipex/test_modeling.py | 122 +++++++++++++++++++++++++++++++++++ tests/ipex/test_pipelines.py | 44 +++++++++++++ 2 files changed, 166 insertions(+) diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index 7f1104d7f7..9e921d30af 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -26,6 +26,7 @@ from transformers import ( AutoFeatureExtractor, AutoModelForCausalLM, + AutoModelForSeq2SeqLM, AutoModelForQuestionAnswering, AutoTokenizer, GenerationConfig, @@ -38,6 +39,7 @@ IPEXModel, IPEXModelForAudioClassification, IPEXModelForCausalLM, + IPEXModelForSeq2SeqLM, IPEXModelForImageClassification, IPEXModelForMaskedLM, IPEXModelForQuestionAnswering, @@ -510,3 +512,123 @@ def test_patched_model(self): transformers_outputs = transformers_model(**inputs) outputs = ipex_model(**inputs) self.assertTrue(torch.allclose(outputs.logits, transformers_outputs.logits, atol=1e-4)) + + +class IPEXModelForSeq2SeqLMTest(unittest.TestCase): + IPEX_MODEL_CLASS = IPEXModelForSeq2SeqLM + SUPPORTED_ARCHITECTURES = ("t5",) + GENERATION_LENGTH = 2 + SPEEDUP_CACHE = 1.0 + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_compare_to_transformers(self, model_arch): + model_id = MODEL_NAMES[model_arch] + set_seed(SEED) + dtype = torch.float16 if IS_XPU_AVAILABLE else torch.float32 + # Test model forward do not need cache. 
+ ipex_model = self.IPEX_MODEL_CLASS.from_pretrained(model_id, torch_dtype=dtype) + transformers_model = AutoModelForSeq2SeqLM.from_pretrained(model_id, torch_dtype=dtype) + self.assertIsInstance(ipex_model.config, PretrainedConfig) + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokens = tokenizer( + "This is a sample", + return_tensors="pt", + return_token_type_ids=False if model_arch in ("llama", "llama2") else None, + ) + decoder_start_token_id = transformers_model.config.decoder_start_token_id if model_arch != "mbart" else 2 + decoder_inputs = {"decoder_input_ids": torch.ones((1, 1), dtype=torch.long) * decoder_start_token_id} + outputs = ipex_model(**tokens, **decoder_inputs) + + self.assertIsInstance(outputs.logits, torch.Tensor) + + with torch.no_grad(): + transformers_outputs = transformers_model(**tokens, **decoder_inputs) + + # Test re-load model + with tempfile.TemporaryDirectory() as tmpdirname: + ipex_model.save_pretrained(tmpdirname) + loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname, torch_dtype=dtype) + loaded_model_outputs = loaded_model(**tokens, **decoder_inputs) + + # Test init method + init_model = self.IPEX_MODEL_CLASS(transformers_model) + init_model_outputs = init_model(**tokens, **decoder_inputs) + + # Compare tensor outputs + self.assertTrue(torch.allclose(outputs.logits, transformers_outputs.logits, atol=1e-4)) + # To avoid floating point error + self.assertTrue(torch.allclose(outputs.logits, loaded_model_outputs.logits, atol=1e-7)) + self.assertTrue(torch.allclose(outputs.logits, init_model_outputs.logits, atol=1e-7)) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_pipeline(self, model_arch): + dtype = torch.float16 if IS_XPU_AVAILABLE else torch.float32 + model_id = MODEL_NAMES[model_arch] + tokenizer = AutoTokenizer.from_pretrained(model_id) + model = self.IPEX_MODEL_CLASS.from_pretrained(model_id, torch_dtype=dtype) + model.config.encoder_no_repeat_ngram_size = 0 + # model.to("cpu") + pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer) + outputs = pipe("This is a sample", max_new_tokens=10, do_sample=False) + self.assertEqual(pipe.device, model.device) + + def test_compare_with_and_without_past_key_values(self): + model_id = "hf-internal-testing/tiny-random-t5" + dtype = torch.float16 if IS_XPU_AVAILABLE else torch.float32 + model_with_pkv = self.IPEX_MODEL_CLASS.from_pretrained(model_id, use_cache=True, torch_dtype=dtype) + device = model_with_pkv.device + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokens = tokenizer("This is a sample input", return_tensors="pt").to(device) + # Warmup + model_with_pkv.generate(**tokens) + with Timer() as with_pkv_timer: + outputs_model_with_pkv = model_with_pkv.generate( + **tokens, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 + ) + model_without_pkv = self.IPEX_MODEL_CLASS.from_pretrained(model_id, use_cache=False, torch_dtype=dtype) + # Warmup + model_without_pkv.generate(**tokens) + with Timer() as without_pkv_timer: + outputs_model_without_pkv = model_without_pkv.generate( + **tokens, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 + ) + self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) + self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH + 1) + self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH + 1) + + @parameterized.expand( + grid_parameters( + { + "model_arch": SUPPORTED_ARCHITECTURES,
+ "use_cache": [True, False], + } + ) + ) + def test_ipex_beam_search(self, test_name, model_arch, use_cache): + model_id = MODEL_NAMES[model_arch] + set_seed(SEED) + dtype = torch.float16 if IS_XPU_AVAILABLE else torch.float32 + model = self.IPEX_MODEL_CLASS.from_pretrained(model_id, use_cache=use_cache, torch_dtype=dtype) + device = model.device + transformers_model = AutoModelForSeq2SeqLM.from_pretrained(model_id, torch_dtype=dtype).to(device) + self.assertEqual(model.use_cache, use_cache) + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer.pad_token = tokenizer.eos_token + # Test with batch_size is 1 and 2. + texts = ["This is a sample", ["This is the first input", "This is the second input"]] + generation_configs = ( + GenerationConfig(max_new_tokens=4, num_beams=2, do_sample=False), + GenerationConfig(max_new_tokens=4, num_beams=4, do_sample=False), + GenerationConfig(max_new_tokens=4, num_beams=8, do_sample=False), + GenerationConfig(max_new_tokens=4, num_beams=32, do_sample=False), + GenerationConfig( + max_new_tokens=4, do_sample=False, top_p=0.9, top_k=0, pad_token_id=tokenizer.eos_token_id + ), + ) + for text in texts: + tokens = tokenizer(text, padding=True, return_tensors="pt").to(device) + for generation_config in generation_configs: + outputs = model.generate(**tokens, generation_config=generation_config) + transformers_outputs = transformers_model.generate(**tokens, generation_config=generation_config) + self.assertIsInstance(outputs, torch.Tensor) + self.assertTrue(torch.equal(outputs, transformers_outputs)) diff --git a/tests/ipex/test_pipelines.py b/tests/ipex/test_pipelines.py index 77790e19f4..e67a4a6e18 100644 --- a/tests/ipex/test_pipelines.py +++ b/tests/ipex/test_pipelines.py @@ -28,6 +28,7 @@ IPEXModelForImageClassification, IPEXModelForMaskedLM, IPEXModelForQuestionAnswering, + IPEXModelForSeq2SeqLM, IPEXModelForSequenceClassification, IPEXModelForTokenClassification, ) @@ -82,6 +83,7 @@ class PipelinesIntegrationTest(unittest.TestCase): "resnet", "vit", ) + TEXT2TEXT_GENERATION_SUPPORTED_ARCHITECTURES = ("t5",) @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) def test_token_classification_pipeline_inference(self, model_arch): @@ -215,3 +217,45 @@ def test_pipeline_load_from_jit_model(self, model_arch): ipex_output = ipex_generator(inputs) self.assertTrue(isinstance(ipex_generator.model, IPEXModelForSequenceClassification)) self.assertGreaterEqual(ipex_output[0]["score"], 0.0) + + @parameterized.expand(TEXT2TEXT_GENERATION_SUPPORTED_ARCHITECTURES) + def test_text2text_generation_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + dtype = torch.float16 if IS_XPU_AVAILABLE else torch.float32 + transformers_generator = transformers_pipeline("text2text-generation", model_id, torch_dtype=dtype) + ipex_generator = ipex_pipeline("text2text-generation", model_id, accelerator="ipex", torch_dtype=dtype) + inputs = "Describe a real-world application of AI." 
+ with torch.inference_mode(): + transformers_output = transformers_generator(inputs, do_sample=False, max_new_tokens=10) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs, do_sample=False, max_new_tokens=10) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForSeq2SeqLM)) + self.assertEqual(transformers_output[0]["generated_text"], ipex_output[0]["generated_text"]) + + @parameterized.expand(TEXT2TEXT_GENERATION_SUPPORTED_ARCHITECTURES) + def test_summarization_generation_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + dtype = torch.float16 if IS_XPU_AVAILABLE else torch.float32 + transformers_generator = transformers_pipeline("summarization", model_id, torch_dtype=dtype) + ipex_generator = ipex_pipeline("summarization", model_id, accelerator="ipex", torch_dtype=dtype) + inputs = "Describe a real-world application of AI." + with torch.inference_mode(): + transformers_output = transformers_generator(inputs, do_sample=False, max_new_tokens=10) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs, do_sample=False, max_new_tokens=10) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForSeq2SeqLM)) + self.assertEqual(transformers_output[0]["summary_text"], ipex_output[0]["summary_text"]) + + @parameterized.expand(TEXT2TEXT_GENERATION_SUPPORTED_ARCHITECTURES) + def test_translation_generation_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + dtype = torch.float16 if IS_XPU_AVAILABLE else torch.float32 + transformers_generator = transformers_pipeline("translation", model_id, torch_dtype=dtype) + ipex_generator = ipex_pipeline("translation", model_id, accelerator="ipex", torch_dtype=dtype) + inputs = "Describe a real-world application of AI." + with torch.inference_mode(): + transformers_output = transformers_generator(inputs, do_sample=False, max_new_tokens=10) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs, do_sample=False, max_new_tokens=10) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForSeq2SeqLM)) + self.assertEqual(transformers_output[0]["translation_text"], ipex_output[0]["translation_text"]) From 44880736a165f7548a2e9c8e4485a3f11f03e3f6 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 9 Dec 2024 14:54:48 +0000 Subject: [PATCH 04/58] add docs Signed-off-by: jiqing-feng --- docs/source/ipex/inference.mdx | 1 + docs/source/ipex/models.mdx | 1 + 2 files changed, 2 insertions(+) diff --git a/docs/source/ipex/inference.mdx b/docs/source/ipex/inference.mdx index 54b586924d..bcbf234566 100644 --- a/docs/source/ipex/inference.mdx +++ b/docs/source/ipex/inference.mdx @@ -43,3 +43,4 @@ As shown in the table below, each task is associated with a class enabling to au | `IPEXModelForMaskedLM` | `fill-mask` | | `IPEXModelForAudioClassification` | `audio-classification` | | `IPEXModelForCausalLM` | `text-generation` | +| `IPEXModelForSeq2SeqLM` | `text2text-generation` | diff --git a/docs/source/ipex/models.mdx b/docs/source/ipex/models.mdx index 346ca26599..b8cd6c482f 100644 --- a/docs/source/ipex/models.mdx +++ b/docs/source/ipex/models.mdx @@ -40,6 +40,7 @@ Here is the list of the supported architectures : - Roberta - Roformer - SqueezeBert +- T5 - UniSpeech - Vit - Wav2Vec2 From 16fecf8b1bbbcd496eebcef99f7c8d18dbef44d1 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 9 Dec 2024 15:01:31 +0000 Subject: [PATCH 05/58] fix readme Signed-off-by: jiqing-feng --- docs/source/ipex/inference.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/docs/source/ipex/inference.mdx b/docs/source/ipex/inference.mdx index bcbf234566..72826da595 100644 --- a/docs/source/ipex/inference.mdx +++ b/docs/source/ipex/inference.mdx @@ -14,7 +14,7 @@ Optimum Intel can be used to load models from the [Hub](https://huggingface.co/m ## Loading -You can load your model and apply IPEX optimizations (apply torch.compile for non-generation tasks). For supported architectures like LLaMA, BERT and ViT, further optimizations will be applied by patching the model to use custom operators. +You can load your model and apply IPEX optimizations (apply torch.compile except text-generation tasks). For supported architectures like LLaMA, BERT and ViT, further optimizations will be applied by patching the model to use custom operators. For now, support is enabled for Intel CPU/GPU. Previous models converted to TorchScript will be deprecated in v1.22. ```diff From 4225bf03b03bc403754e6dd10d2dd285c129bde2 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 11 Dec 2024 12:06:44 +0000 Subject: [PATCH 06/58] refactor compile Signed-off-by: jiqing-feng --- optimum/intel/ipex/modeling_base.py | 119 +++++++++++++++------------- 1 file changed, 64 insertions(+), 55 deletions(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 7038515ad3..55b6e93522 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -58,12 +58,11 @@ logger = logging.getLogger(__name__) -_IPEX_SUPPORTED_GENERATION_TASKS = ("text-generation", "text2text-generation") _IPEX_SUPPORT_MODEL_TYPES = ("llama", "bert", "vit", "falcon", "gpt2") _IPEX_EXPORTED_GENERATION_METHODS = ("sample", "greedy_search", "beam_sample", "beam_search", "assisted_generation") _IPEX_MINIMUM_VERSION_FOR_COMPILE = "2.5.0" -# TODO: Already fixed in torch 2.6, will enable when torch upgrading to 2.6 -_COMPILE_NOT_READY_MODEL_TYPES = ("electra", "roformer", "beit") +# TODO: Some models are already fixed in torch 2.6, will enable them when torch upgrading to 2.6 +_COMPILE_NOT_READY_MODEL_TYPES = ("electra", "roformer", "beit", "llama", "falcon", "gpt2") def _is_patched_with_ipex(model, task, use_cache: bool = True): @@ -86,15 +85,21 @@ def __init__( model, config: PretrainedConfig = None, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + warmup: Optional[bool] = True, **kwargs, ): config = config or model.config OptimizedModel.__init__(self, model=model, config=config) + self._supports_cache_class = getattr(model, "_supports_cache_class", None) + self._supports_sdpa = getattr(model, "_supports_sdpa", None) + self._supports_quantized_cache = getattr(model, "_supports_quantized_cache", None) + self._supports_static_cache = getattr(model, "_supports_static_cache", None) self._dtype = self.model.dtype if self.model.dtype is not None else torch.float32 self.use_cache = kwargs.get("use_cache", False) self.model_save_dir = model_save_dir self._add_patch = _is_patched_with_ipex(model, self.export_feature, self.use_cache) + self.compiled = False self.input_names = set(inspect.signature(model.forward).parameters) @@ -106,25 +111,10 @@ def __init__( if hasattr(self.auto_model_class, "register"): self.auto_model_class.register(AutoConfig, self.__class__) - # Non-generation tasks can use torch.compile to get acceleration. 
- if ( - self.model.device.type == "cpu" - and self.export_feature not in _IPEX_SUPPORTED_GENERATION_TASKS - and self.config.model_type not in _COMPILE_NOT_READY_MODEL_TYPES - and is_ipex_version(">=", _IPEX_MINIMUM_VERSION_FOR_COMPILE) - ): - from torch._inductor import config - - # System level optimization - torch._inductor.config.cpp_wrapper = True - os.environ["TORCHINDUCTOR_FREEZING"] = "1" - logger.info("Enable torch.compile optimization, start warm up") - self.model.forward = torch.compile(self.model.forward) - inputs = prepare_jit_inputs(model, self.export_feature, False) - with torch.no_grad(): - self.model(**inputs) - self.model(**inputs) - logger.info("Warm up end") + self.maybe_apply_torch_compile() + + if warmup: + self._init_warmup() @classmethod def _from_transformers(cls, *args, **kwargs): @@ -194,6 +184,31 @@ def to(self, device: Union[torch.device, str]): def can_generate(self): return isinstance(self, GenerationMixin) + def maybe_apply_torch_compile(self): + if ( + not self.model.device.type != "cpu" + or self.config.model_type in _COMPILE_NOT_READY_MODEL_TYPES + or is_ipex_version("<", _IPEX_MINIMUM_VERSION_FOR_COMPILE) + ): + return + if self.use_cache and not self._supports_static_cache: + return + from torch._inductor import config + + # System level optimization + torch._inductor.config.cpp_wrapper = True + os.environ["TORCHINDUCTOR_FREEZING"] = "1" + logger.info("Enable torch.compile optimization") + self.model.forward = torch.compile(self.model.forward) + self.compiled = True + + def _init_warmup(self): + inputs = prepare_jit_inputs(self.model, self.export_feature, False) + with torch.no_grad(): + self.model(**inputs) + self.model(**inputs) + logger.info("Warm up end") + class IPEXModelForSequenceClassification(IPEXModel): auto_model_class = AutoModelForSequenceClassification @@ -238,16 +253,10 @@ def __init__( config: PretrainedConfig = None, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, use_cache: bool = True, + warmup: Optional[bool] = True, **kwargs, ): - super().__init__(model, config, model_save_dir=model_save_dir, use_cache=use_cache) - - self._supports_cache_class = getattr(model, "_supports_cache_class", None) - self._supports_sdpa = getattr(model, "_supports_sdpa", None) - self._supports_cache_class = getattr(model, "_supports_cache_class", None) - self._supports_quantized_cache = getattr(model, "_supports_quantized_cache", None) - self._supports_static_cache = getattr(model, "_supports_static_cache", None) - + super().__init__(model, config, model_save_dir=model_save_dir, warmup=False, use_cache=use_cache) if self._add_patch: self._supports_cache_class = True GenerationMixin.__init__(self) @@ -271,6 +280,9 @@ def __init__( if hasattr(self.model_cls, "_convert_to_bloom_cache"): self._convert_to_bloom_cache = self.model_cls._convert_to_bloom_cache + if warmup: + self._init_warmup() + @torch.no_grad() def forward( self, @@ -285,6 +297,9 @@ def _prepare_generation_config( ) -> Tuple[GenerationConfig, Dict]: generation_config, model_kwargs = super()._prepare_generation_config(generation_config, **kwargs) generation_method = generation_config.get_generation_mode().value + if self.compiled: + # Use static cache for torch compile + generation_config.cache_implementation = "static" if generation_method not in _IPEX_EXPORTED_GENERATION_METHODS: raise ValueError( f"The generation method {generation_method} is not supported for IPEXModelForCausalLM for now, support methods are {_IPEX_EXPORTED_GENERATION_METHODS}" @@ -337,6 +352,12 @@ def 
generate(self, *args, **kwargs): return result + def _init_warmup(self): + inputs = prepare_jit_inputs(self.model, self.export_feature, False) + self.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=4) + self.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=4) + logger.info("Warm up end") + class IPEXModelForSeq2SeqLM(IPEXModel, GenerationMixin): auto_model_class = AutoModelForSeq2SeqLM @@ -348,15 +369,10 @@ def __init__( config: PretrainedConfig = None, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, use_cache: bool = True, + warmup: Optional[bool] = True, **kwargs, ): - super().__init__(model, config, model_save_dir=model_save_dir, use_cache=use_cache) - - self._supports_cache_class = getattr(model, "_supports_cache_class", None) - self._supports_sdpa = getattr(model, "_supports_sdpa", None) - self._supports_quantized_cache = getattr(model, "_supports_quantized_cache", None) - self._supports_static_cache = getattr(model, "_supports_static_cache", None) - + super().__init__(model, config, model_save_dir=model_save_dir, warmup=False, use_cache=use_cache) GenerationMixin.__init__(self) model_type = self.config.model_type.replace("_", "-") @@ -375,23 +391,9 @@ def __init__( if hasattr(self.model_cls, "_convert_to_standard_cache"): self._convert_to_standard_cache = self.model_cls._convert_to_standard_cache - if ( - self._supports_static_cache - and self.model.device.type == "cpu" - and self.config.model_type not in _COMPILE_NOT_READY_MODEL_TYPES - and is_ipex_version(">=", _IPEX_MINIMUM_VERSION_FOR_COMPILE) - ): - from torch._inductor import config - - # System level optimization - torch._inductor.config.cpp_wrapper = True - os.environ["TORCHINDUCTOR_FREEZING"] = "1" - logger.info("Enable torch.compile optimization, start warm up") - self.model.forward = torch.compile(self.model.forward) - inputs = prepare_jit_inputs(model, self.export_feature, False) - self.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_length=4) - self.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_length=4) - logger.info("Warm up end") + + if warmup: + self._init_warmup() @torch.no_grad() def forward( @@ -407,7 +409,8 @@ def _prepare_generation_config( ) -> Tuple[GenerationConfig, Dict]: generation_config, model_kwargs = super()._prepare_generation_config(generation_config, **kwargs) # Use static cache for torch.compile - setattr(generation_config, "cache_implementation", "static") + if self.compiled: + generation_config.cache_implementation = "static" return generation_config, model_kwargs @@ -420,6 +423,12 @@ def prepare_inputs_for_generation(self, *args, **kwargs): def get_encoder(self, *args, **kwargs): return self.model.get_encoder(*args, **kwargs) + def _init_warmup(self): + inputs = prepare_jit_inputs(self.model, self.export_feature, False) + self.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=4) + self.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=4) + logger.info("Warm up end") + def _ipex_crop_past_key_values(model, past_key_values, max_length): if isinstance(model, IPEXModel) and _is_patched_with_ipex(model, "text-generation"): From 2ac7ecf1c4af12a32a7f78fad91e0ba86d8c689d Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 11 Dec 2024 12:10:23 +0000 Subject: [PATCH 07/58] fix check Signed-off-by: jiqing-feng --- 
optimum/intel/ipex/modeling_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 55b6e93522..a040fdcf57 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -297,7 +297,7 @@ def _prepare_generation_config( ) -> Tuple[GenerationConfig, Dict]: generation_config, model_kwargs = super()._prepare_generation_config(generation_config, **kwargs) generation_method = generation_config.get_generation_mode().value - if self.compiled: + if self.compiled and generation_config.cache_implementation != "ipex_paged": # Use static cache for torch compile generation_config.cache_implementation = "static" if generation_method not in _IPEX_EXPORTED_GENERATION_METHODS: From 24b988ccc60092f56e28d22903d3b5ae02b70c96 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 11 Dec 2024 12:12:59 +0000 Subject: [PATCH 08/58] fix ruff check Signed-off-by: jiqing-feng --- optimum/intel/ipex/modeling_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index a040fdcf57..de004db403 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -193,10 +193,10 @@ def maybe_apply_torch_compile(self): return if self.use_cache and not self._supports_static_cache: return - from torch._inductor import config + from torch._inductor import config as inductor_config # System level optimization - torch._inductor.config.cpp_wrapper = True + inductor_config.cpp_wrapper = True os.environ["TORCHINDUCTOR_FREEZING"] = "1" logger.info("Enable torch.compile optimization") self.model.forward = torch.compile(self.model.forward) From 46b93a4c695b695021578b5d8f13eb9a3edd562b Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 16 Dec 2024 12:10:51 +0000 Subject: [PATCH 09/58] enable quantized model Signed-off-by: jiqing-feng --- optimum/exporters/ipex/modeling_utils.py | 134 +++++++++++++++-------- optimum/intel/ipex/modeling_base.py | 1 + 2 files changed, 90 insertions(+), 45 deletions(-) diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index 8d5f8afa1a..3434b22422 100755 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -30,6 +30,7 @@ logger = logging.getLogger(__name__) _IPEX_MINIMUM_VERSION_FOR_PATCHING = "2.4.0" +_accelerate_added_attributes = ["to", "cuda", "npu", "xpu", "mlu", "musa"] if is_ipex_version("<", _IPEX_MINIMUM_VERSION_FOR_PATCHING): @@ -133,6 +134,32 @@ def forward(self, x, y, z): return x +# Adapted from https://github.com/huggingface/accelerate/blob/v1.2.1/src/accelerate/hooks.py#L183 +def _remove_hooks_for_ipex(module, recurse): + if hasattr(module, "_hf_hook"): + module._hf_hook.detach_hook(module) + delattr(module, "_hf_hook") + + if hasattr(module, "_old_forward"): + # Overriding a GraphModuleImpl forward freezes the forward call and later modifications on the graph will fail. 
+ # Reference: https://pytorch.slack.com/archives/C3PDTEV8E/p1705929610405409 + if "GraphModuleImpl" in str(type(module)): + module.__class__.forward = module.__class__.forward.__get__(module) + else: + module.forward = module.__class__.forward.__get__(module) + delattr(module, "_old_forward") + + # Remove accelerate added warning hooks from dispatch_model + for attr in _accelerate_added_attributes: + module.__dict__.pop(attr, None) + + if recurse: + for child in module.children(): + _remove_hooks_for_ipex(child, recurse) + + return module + + # Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L83 def _ipex_rms_layer_norm_forward(self, hidden_states): return rms_norm(hidden_states, self.weight, self.variance_epsilon) @@ -656,30 +683,36 @@ def forward( class _IPEXLlamaAttention(_IPEXAttention): def __init__(self, module, config) -> None: super().__init__(module, config) - concat_weight = torch.concat([self.q_proj.weight, self.k_proj.weight, self.v_proj.weight]).contiguous() - bias_list = [bias for bias in [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias] if bias] - use_bias = bias_list != [] - self.concat_qkv = nn.Linear(concat_weight.shape[1], concat_weight.shape[0], bias=use_bias) - self.concat_qkv.weight = nn.Parameter(concat_weight) - if use_bias: - concat_bias = torch.concat(bias_list, 0).contiguous() - self.concat_linear.bias = nn.Parameter(concat_bias) - self.q_slice = self.q_proj.weight.shape[0] - self.k_slice = self.q_slice + self.k_proj.weight.shape[0] - self.v_slice = self.k_slice + self.v_proj.weight.shape[0] - if self.module_device.type == "cpu": - if module.o_proj.__class__.__name__ not in ["LinearAllreduce"]: - self.mha_linear_add = LinearAdd(module.o_proj) - - elif self.module_device.type == "xpu": - if module.o_proj.__class__.__name__ not in ["LinearAllreduce"]: - self.mha_linear_add = XPULinearAdd(module.o_proj) + if getattr(config, "quantization_config", None) is None: + concat_weight = torch.concat([self.q_proj.weight, self.k_proj.weight, self.v_proj.weight]).contiguous() + bias_list = [bias for bias in [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias] if bias] + use_bias = bias_list != [] + self.concat_qkv = nn.Linear(concat_weight.shape[1], concat_weight.shape[0], bias=use_bias) + self.concat_qkv.weight = nn.Parameter(concat_weight) + if use_bias: + concat_bias = torch.concat(bias_list, 0).contiguous() + self.concat_linear.bias = nn.Parameter(concat_bias) + self.q_slice = self.q_proj.weight.shape[0] + self.k_slice = self.q_slice + self.k_proj.weight.shape[0] + self.v_slice = self.k_slice + self.v_proj.weight.shape[0] + if self.module_device.type == "cpu": + if module.o_proj.__class__.__name__ not in ["LinearAllreduce"]: + self.mha_linear_add = LinearAdd(module.o_proj) + + elif self.module_device.type == "xpu": + if module.o_proj.__class__.__name__ not in ["LinearAllreduce"]: + self.mha_linear_add = XPULinearAdd(module.o_proj) def qkv_gemm(self, hidden_states): - qkv_out = self.concat_qkv(hidden_states) - query = qkv_out[:, : self.q_slice].view(-1, self.num_heads, self.head_dim) - key = qkv_out[:, self.q_slice : self.k_slice].view(-1, self.num_key_value_heads, self.head_dim) - value = qkv_out[:, self.k_slice :].view(-1, self.num_key_value_heads, self.head_dim) + if hasattr(self, "concat_qkv"): + qkv_out = self.concat_qkv(hidden_states) + query = qkv_out[:, : self.q_slice].view(-1, self.num_heads, self.head_dim) + key = qkv_out[:, self.q_slice : self.k_slice].view(-1, self.num_key_value_heads, 
self.head_dim) + value = qkv_out[:, self.k_slice :].view(-1, self.num_key_value_heads, self.head_dim) + else: + query = self.q_proj(hidden_states).view(-1, self.num_heads, self.head_dim) + key = self.k_proj(hidden_states).view(-1, self.num_key_value_heads, self.head_dim) + value = self.v_proj(hidden_states).view(-1, self.num_key_value_heads, self.head_dim) return query, key, value @@ -745,16 +778,17 @@ def __init__(self, module, config) -> None: _setattr_from_module(self, module) self.config = config self.module_device = next(module.parameters()).device - if self.module_device.type == "cpu": - # LinearAllreduce and LinearLayer cannot use fused op LinearAdd - if module.down_proj.__class__.__name__ not in ["LinearAllreduce"]: - self.mlp_linear_add = LinearAdd(module.down_proj) - self.linear_silu_mul = Linear2SiluMul(module.gate_proj, module.up_proj) - elif self.module_device.type == "xpu": - # LinearAllreduce and LinearLayer cannot use fused op LinearAdd - if module.down_proj.__class__.__name__ not in ["LinearAllreduce"]: - self.mlp_linear_add = XPULinearAdd(module.down_proj) - self.linear_silu_mul = XPULinear2SiluMul(module.gate_proj, module.up_proj) + if getattr(config, "quantization_config", None) is None: + if self.module_device.type == "cpu": + # LinearAllreduce and LinearLayer cannot use fused op LinearAdd + if module.down_proj.__class__.__name__ not in ["LinearAllreduce"]: + self.mlp_linear_add = LinearAdd(module.down_proj) + self.linear_silu_mul = Linear2SiluMul(module.gate_proj, module.up_proj) + elif self.module_device.type == "xpu": + # LinearAllreduce and LinearLayer cannot use fused op LinearAdd + if module.down_proj.__class__.__name__ not in ["LinearAllreduce"]: + self.mlp_linear_add = XPULinearAdd(module.down_proj) + self.linear_silu_mul = XPULinear2SiluMul(module.gate_proj, module.up_proj) def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor = None, **kwargs): if hasattr(self, "linear_silu_mul"): @@ -776,17 +810,18 @@ def __init__(self, module, config) -> None: super().__init__() _setattr_from_module(self, module) self.config = config - # LinearAllreduce and LinearLayer cannot use fused op LinearAdd self.module_device = next(module.parameters()).device - if self.module_device.type == "cpu": - self.linear_gelu = LinearGelu(module.dense_h_to_4h) - elif self.module_device.type == "xpu": - self.linear_gelu = XPULinearGelu(module.dense_h_to_4h) - if module.dense_4h_to_h.__class__.__name__ not in ["LinearAllreduce"]: + if getattr(config, "quantization_config", None) is None: + # LinearAllreduce and LinearLayer cannot use fused op LinearAdd if self.module_device.type == "cpu": - self.linear_add_add = LinearAddAdd(module.dense_4h_to_h) + self.linear_gelu = LinearGelu(module.dense_h_to_4h) elif self.module_device.type == "xpu": - self.linear_add_add = XPUlinearAddAdd(module.dense_4h_to_h) + self.linear_gelu = XPULinearGelu(module.dense_h_to_4h) + if module.dense_4h_to_h.__class__.__name__ not in ["LinearAllreduce"]: + if self.module_device.type == "cpu": + self.linear_add_add = LinearAddAdd(module.dense_4h_to_h) + elif self.module_device.type == "xpu": + self.linear_add_add = XPUlinearAddAdd(module.dense_4h_to_h) def forward( self, @@ -812,6 +847,8 @@ def __init__(self, module, config): _setattr_from_module(self, module) self.self_attn = _IPEXLlamaAttention(module.self_attn, config) self.mlp = _IPEXLlamaMLP(module.mlp, config) + if getattr(config, "quantization_config", None): + _remove_hooks_for_ipex(self, True) def forward(self, hidden_states: torch.Tensor, 
**kwargs): # Please see the original model's forward to check the parameter @@ -845,6 +882,8 @@ def __init__(self, module, config): _setattr_from_module(self, module) self.self_attention = _IPEXFalconAttention(module.self_attention, config) self.mlp = _IPEXFalconMLP(module.mlp, config) + if getattr(config, "quantization_config", None): + _remove_hooks_for_ipex(self, True) def forward(self, hidden_states: torch.Tensor, **kwargs): # Please see the original model's forward to check the parameter @@ -871,11 +910,16 @@ def __init__(self, module, config): super().__init__() _setattr_from_module(self, module) self.module_device = next(module.parameters()).device - if self.module_device.type == "cpu": - self.linear_gelu = LinearGelu(module.dense) - elif self.module_device.type == "xpu": - self.linear_gelu = XPULinearGelu(module.dense) + if getattr(config, "quantization_config", None) is None: + if self.module_device.type == "cpu": + self.linear_gelu = LinearGelu(module.dense) + elif self.module_device.type == "xpu": + self.linear_gelu = XPULinearGelu(module.dense) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.linear_gelu(hidden_states) + if hasattr(self, "linear_gelu"): + hidden_states = self.linear_gelu(hidden_states) + else: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index de004db403..64bb42217f 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -189,6 +189,7 @@ def maybe_apply_torch_compile(self): not self.model.device.type != "cpu" or self.config.model_type in _COMPILE_NOT_READY_MODEL_TYPES or is_ipex_version("<", _IPEX_MINIMUM_VERSION_FOR_COMPILE) + or getattr(self.config, "quantization_config", None) ): return if self.use_cache and not self._supports_static_cache: From 82d39ce65377c88687f913d59a9f59498c552e5d Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 16 Dec 2024 12:53:41 +0000 Subject: [PATCH 10/58] add bnb test Signed-off-by: jiqing-feng --- tests/ipex/test_modeling.py | 167 ++++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index 68ca27534c..b3cdeebfe3 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -29,11 +29,13 @@ AutoModelForSeq2SeqLM, AutoModelForQuestionAnswering, AutoTokenizer, + BitsAndBytesConfig, GenerationConfig, PretrainedConfig, pipeline, set_seed, ) +from transformers.testing_utils import is_bitsandbytes_available, require_bitsandbytes from optimum.intel import ( IPEXModel, IPEXModelForAudioClassification, @@ -128,6 +130,40 @@ def test_pipeline(self, model_arch): _ = pipe(text) self.assertEqual(pipe.device, model.device) + @require_bitsandbytes + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_bnb(self, model_arch): + model_id = MODEL_NAMES[model_arch] + quantization_config = BitsAndBytesConfig(load_in_8bit=True) + ipex_model = self.IPEX_MODEL_CLASS.from_pretrained( + model_id, device_map=DEVICE, quantization_config=quantization_config + ) + transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained( + model_id, device_map=DEVICE, quantization_config=quantization_config + ) + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = "This is a sample input" + tokens = tokenizer(inputs, return_tensors="pt").to(DEVICE) + with torch.no_grad(): + transformers_outputs = 
transformers_model(**tokens) + outputs = ipex_model(**tokens) + + # Test re-load model + with tempfile.TemporaryDirectory() as tmpdirname: + ipex_model.save_pretrained(tmpdirname) + loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname, device_map=DEVICE) + loaded_model_outputs = loaded_model(**tokens) + # Test init method + init_model = self.IPEX_MODEL_CLASS(transformers_model) + init_model_outputs = init_model(**tokens) + + # Compare tensor outputs + for output_name in {"logits", "last_hidden_state"}: + if output_name in transformers_outputs: + self.assertTrue(torch.allclose(outputs[output_name], transformers_outputs[output_name], atol=1e-3)) + self.assertTrue(torch.allclose(outputs[output_name], loaded_model_outputs[output_name])) + self.assertTrue(torch.allclose(outputs[output_name], init_model_outputs[output_name])) + class IPEXModelForSequenceClassificationTest(IPEXModelTest): IPEX_MODEL_CLASS = IPEXModelForSequenceClassification @@ -212,6 +248,46 @@ def test_patched_model(self): self.assertTrue(torch.allclose(outputs.start_logits, transformers_outputs.start_logits, atol=1e-4)) self.assertTrue(torch.allclose(outputs.end_logits, transformers_outputs.end_logits, atol=1e-4)) + @require_bitsandbytes + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_bnb(self, model_arch): + model_id = MODEL_NAMES[model_arch] + set_seed(SEED) + quantization_config = BitsAndBytesConfig(load_in_8bit=True) + ipex_model = IPEXModelForQuestionAnswering.from_pretrained( + model_id, device_map=DEVICE, quantization_config=quantization_config + ) + self.assertIsInstance(ipex_model.config, PretrainedConfig) + transformers_model = AutoModelForQuestionAnswering.from_pretrained( + model_id, device_map=DEVICE, quantization_config=quantization_config + ) + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = "This is a sample input" + tokens = tokenizer(inputs, return_tensors="pt").to(DEVICE) + with torch.no_grad(): + transformers_outputs = transformers_model(**tokens) + outputs = ipex_model(**tokens) + + # Test re-load model + with tempfile.TemporaryDirectory() as tmpdirname: + ipex_model.save_pretrained(tmpdirname) + loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname, device_map=DEVICE) + loaded_model_outputs = loaded_model(**tokens) + + # Test init method + init_model = self.IPEX_MODEL_CLASS(transformers_model) + init_model_outputs = init_model(**tokens) + + self.assertIn("start_logits", outputs) + self.assertIn("end_logits", outputs) + # Compare tensor outputs + self.assertTrue(torch.allclose(outputs.start_logits, transformers_outputs.start_logits, atol=1e-4)) + self.assertTrue(torch.allclose(outputs.end_logits, transformers_outputs.end_logits, atol=1e-4)) + self.assertTrue(torch.equal(outputs.start_logits, loaded_model_outputs.start_logits)) + self.assertTrue(torch.equal(outputs.end_logits, loaded_model_outputs.end_logits)) + self.assertTrue(torch.equal(outputs.start_logits, init_model_outputs.start_logits)) + self.assertTrue(torch.equal(outputs.end_logits, init_model_outputs.end_logits)) + class IPEXModelForCausalLMTest(unittest.TestCase): IPEX_MODEL_CLASS = IPEXModelForCausalLM @@ -399,6 +475,51 @@ def test_patched_model(self, model_arch): ) self.assertTrue(torch.allclose(ipex_outputs.logits[0], exported_outputs.logits[0], atol=1e-7)) + @require_bitsandbytes + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_bnb(self, model_arch): + model_id = MODEL_NAMES[model_arch] + set_seed(SEED) + dtype = torch.float16 if IS_XPU_AVAILABLE else torch.float32 + 
quantization_config = BitsAndBytesConfig(load_in_8bit=True) + # Test model forward do not need cache. + ipex_model = IPEXModelForCausalLM.from_pretrained( + model_id, torch_dtype=dtype, device_map=DEVICE, quantization_config=quantization_config + ) + self.assertIsInstance(ipex_model.config, PretrainedConfig) + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokens = tokenizer( + "This is a sample", + return_tensors="pt", + return_token_type_ids=False if model_arch in ("llama", "llama2") else None, + ).to(DEVICE) + inputs = ipex_model.prepare_inputs_for_generation(**tokens) + outputs = ipex_model(**inputs) + + self.assertIsInstance(outputs.logits, torch.Tensor) + + transformers_model = AutoModelForCausalLM.from_pretrained( + model_id, torch_dtype=dtype, device_map=DEVICE, quantization_config=quantization_config + ) + with torch.no_grad(): + transformers_outputs = transformers_model(**tokens) + + # Test re-load model + with tempfile.TemporaryDirectory() as tmpdirname: + ipex_model.save_pretrained(tmpdirname) + loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname, torch_dtype=dtype, device_map=DEVICE) + loaded_model_outputs = loaded_model(**inputs) + + # Test init method + init_model = self.IPEX_MODEL_CLASS(transformers_model) + init_model_outputs = init_model(**inputs) + + # Compare tensor outputs + self.assertTrue(torch.allclose(outputs.logits, transformers_outputs.logits, atol=1e-4)) + # To avoid floating point error + self.assertTrue(torch.allclose(outputs.logits, loaded_model_outputs.logits, atol=1e-7)) + self.assertTrue(torch.allclose(outputs.logits, init_model_outputs.logits, atol=1e-7)) + class IPEXModelForAudioClassificationTest(unittest.TestCase): IPEX_MODEL_CLASS = IPEXModelForAudioClassification @@ -644,6 +765,52 @@ def test_ipex_beam_search(self, test_name, model_arch, use_cache): self.assertIsInstance(outputs, torch.Tensor) self.assertTrue(torch.equal(outputs, transformers_outputs)) + @require_bitsandbytes + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_bnb(self, model_arch): + model_id = MODEL_NAMES[model_arch] + set_seed(SEED) + dtype = torch.float16 if IS_XPU_AVAILABLE else torch.float32 + quantization_config = BitsAndBytesConfig(load_in_8bit=True) + # Test model forward do not need cache.
+ ipex_model = self.IPEX_MODEL_CLASS.from_pretrained( + model_id, torch_dtype=dtype, quantization_config=quantization_config + ) + transformers_model = AutoModelForSeq2SeqLM.from_pretrained( + model_id, torch_dtype=dtype, quantization_config=quantization_config + ) + self.assertIsInstance(ipex_model.config, PretrainedConfig) + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokens = tokenizer( + "This is a sample", + return_tensors="pt", + return_token_type_ids=False if model_arch in ("llama", "llama2") else None, + ) + decoder_start_token_id = transformers_model.config.decoder_start_token_id if model_arch != "mbart" else 2 + decoder_inputs = {"decoder_input_ids": torch.ones((1, 1), dtype=torch.long) * decoder_start_token_id} + outputs = ipex_model(**tokens, **decoder_inputs) + + self.assertIsInstance(outputs.logits, torch.Tensor) + + with torch.no_grad(): + transformers_outputs = transformers_model(**tokens, **decoder_inputs) + + # Test re-load model + with tempfile.TemporaryDirectory() as tmpdirname: + ipex_model.save_pretrained(tmpdirname) + loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname, torch_dtype=dtype) + loaded_model_outputs = loaded_model(**tokens, **decoder_inputs) + + # Test init method + init_model = self.IPEX_MODEL_CLASS(transformers_model) + init_model_outputs = init_model(**tokens, **decoder_inputs) + + # Compare tensor outputs + self.assertTrue(torch.allclose(outputs.logits, transformers_outputs.logits, atol=1e-4)) + # To avoid floating point error + self.assertTrue(torch.allclose(outputs.logits, loaded_model_outputs.logits, atol=1e-7)) + self.assertTrue(torch.allclose(outputs.logits, init_model_outputs.logits, atol=1e-7)) + class IPEXSTModel(unittest.TestCase): SUPPORTED_ARCHITECTURES = ( From 7dc08da2b58fcaa9cde4de524089ba35f7b489de Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 16 Dec 2024 12:56:39 +0000 Subject: [PATCH 11/58] add bnb tests in yaml Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index de933e3795..e948a4b2f2 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -47,3 +47,8 @@ jobs: - name: Test with Pytest run: | pytest tests/ipex + + - if: ${{ matrix.torch-version != '2.4.0' }} + name: Install bitsandbytes + run: | + pip install https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.45.0.dev0-py3-none-manylinux_2_24_x86_64.whl From 30027ff55a40fc3a29e52e355a6f0c9140e60a44 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 16 Dec 2024 13:54:49 +0000 Subject: [PATCH 12/58] fix tests Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 8 ++++---- tests/ipex/test_modeling.py | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index e948a4b2f2..b97f6def91 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -44,11 +44,11 @@ jobs: python -c "import intel_extension_for_pytorch; print(intel_extension_for_pytorch.__version__); assert intel_extension_for_pytorch.__version__.startswith('${{ matrix.torch-version }}'.replace('.*', ''))" python -c "import transformers; print(transformers.__version__); assert transformers.__version__.startswith('${{ matrix.transformers-version }}'.replace('.*', ''))" - - name: Test with Pytest - run: | - pytest tests/ipex - - if:
${{ matrix.torch-version != '2.4.0' }} name: Install bitsandbytes run: | pip install https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.45.0.dev0-py3-none-manylinux_2_24_x86_64.whl + + - name: Test with Pytest + run: | + pytest tests/ipex diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index b3cdeebfe3..3c4b652c18 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -34,8 +34,8 @@ PretrainedConfig, pipeline, set_seed, + is_bitsandbytes_available, ) -from transformers.testing_utils import is_bitsandbytes_available, require_bitsandbytes from optimum.intel import ( IPEXModel, IPEXModelForAudioClassification, @@ -130,8 +130,8 @@ def test_pipeline(self, model_arch): _ = pipe(text) self.assertEqual(pipe.device, model.device) - @require_bitsandbytes @parameterized.expand(SUPPORTED_ARCHITECTURES) + @unittest.skipIf(not is_bitsandbytes_available(), reason="Test requires bitsandbytes") def test_bnb(self, model_arch): model_id = MODEL_NAMES[model_arch] quantization_config = BitsAndBytesConfig(load_in_8bit=True) @@ -248,8 +248,8 @@ def test_patched_model(self): self.assertTrue(torch.allclose(outputs.start_logits, transformers_outputs.start_logits, atol=1e-4)) self.assertTrue(torch.allclose(outputs.end_logits, transformers_outputs.end_logits, atol=1e-4)) - @require_bitsandbytes @parameterized.expand(SUPPORTED_ARCHITECTURES) + @unittest.skipIf(not is_bitsandbytes_available(), reason="Test requires bitsandbytes") def test_bnb(self, model_arch): model_id = MODEL_NAMES[model_arch] set_seed(SEED) @@ -475,8 +475,8 @@ def test_patched_model(self, model_arch): ) self.assertTrue(torch.allclose(ipex_outputs.logits[0], exported_outputs.logits[0], atol=1e-7)) - @require_bitsandbytes @parameterized.expand(SUPPORTED_ARCHITECTURES) + @unittest.skipIf(not is_bitsandbytes_available(), reason="Test requires bitsandbytes") def test_bnb(self, model_arch): model_id = MODEL_NAMES[model_arch] set_seed(SEED) @@ -765,8 +765,8 @@ def test_ipex_beam_search(self, test_name, model_arch, use_cache): self.assertIsInstance(outputs, torch.Tensor) self.assertTrue(torch.equal(outputs, transformers_outputs)) - @require_bitsandbytes @parameterized.expand(SUPPORTED_ARCHITECTURES) + @unittest.skipIf(not is_bitsandbytes_available(), reason="Test requires bitsandbytes") def test_bnb(self, model_arch): model_id = MODEL_NAMES[model_arch] set_seed(SEED) From 314db046d382c54ec8e9143a49a3448c0033ac48 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 16 Dec 2024 14:19:57 +0000 Subject: [PATCH 13/58] disable bnb tests Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 5 ----- optimum/intel/ipex/modeling_base.py | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index b97f6def91..de933e3795 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -44,11 +44,6 @@ jobs: python -c "import intel_extension_for_pytorch; print(intel_extension_for_pytorch.__version__); assert intel_extension_for_pytorch.__version__.startswith('${{ matrix.torch-version }}'.replace('.*', ''))" python -c "import transformers; print(transformers.__version__); assert transformers.__version__.startswith('${{ matrix.transformers-version }}'.replace('.*', ''))" - - if: ${{ matrix.torch-version != '2.4.0' }} - name: Install bitsandbytes - run: | - pip install 
https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.45.0.dev0-py3-none-manylinux_2_24_x86_64.whl - - name: Test with Pytest run: | pytest tests/ipex diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 64bb42217f..bf2be43e52 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -186,7 +186,7 @@ def can_generate(self): def maybe_apply_torch_compile(self): if ( - not self.model.device.type != "cpu" + self.model.device.type != "cpu" or self.config.model_type in _COMPILE_NOT_READY_MODEL_TYPES or is_ipex_version("<", _IPEX_MINIMUM_VERSION_FOR_COMPILE) or getattr(self.config, "quantization_config", None) From 87656cae05be110810373d99de8b034b0e5ebf34 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 16 Dec 2024 14:34:22 +0000 Subject: [PATCH 14/58] fix gpt2 Signed-off-by: jiqing-feng --- optimum/exporters/ipex/modeling_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index 3434b22422..169f46076d 100755 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -753,6 +753,8 @@ class _IPEXGPT2Attention(_IPEXAttention): def __init__(self, module, config) -> None: self.num_key_value_heads = config.num_key_value_heads super().__init__(module, config) + if getattr(config, "quantization_config", None): + _remove_hooks_for_ipex(self, True) def qkv_gemm(self, hidden_states): query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=-1) From b0cec9c696d8fc2d35765fb965d7458abe4bf91b Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 18 Dec 2024 13:13:45 +0000 Subject: [PATCH 15/58] set actual device Signed-off-by: jiqing-feng --- optimum/exporters/ipex/model_patcher.py | 1 + optimum/exporters/ipex/modeling_utils.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py index 03937754a6..5b3dbe42f8 100644 --- a/optimum/exporters/ipex/model_patcher.py +++ b/optimum/exporters/ipex/model_patcher.py @@ -133,6 +133,7 @@ def _patch_vit_model(model): def _patch_model(model): + setattr(model.config, "device", model.device) if is_ipex_version("<", _IPEX_MINIMUM_VERSION_FOR_PATCHING): raise ImportError(f"Only ipex version >= {_IPEX_MINIMUM_VERSION_FOR_PATCHING} supports llama model patching") if is_transformers_version("<", _TRANSFORMERS_MIN_VERSION) or is_transformers_version( diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index ec9a18e04c..1b8c2da41d 100755 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -599,7 +599,7 @@ def __init__(self, module, config) -> None: super().__init__() _setattr_from_module(self, module) self.config = config - self.module_device = next(module.parameters()).device + self.module_device = config.device self.num_groups = self.num_heads // self.num_key_value_heads self.kv_head_mapping = torch.arange( 0, self.num_key_value_heads, dtype=torch.int32, device=self.module_device @@ -779,7 +779,7 @@ def __init__(self, module, config) -> None: super().__init__() _setattr_from_module(self, module) self.config = config - self.module_device = next(module.parameters()).device + self.module_device = config.device if getattr(config, "quantization_config", None) is None: if self.module_device.type 
== "cpu": # LinearAllreduce and LinearLayer cannot use fused op LinearAdd @@ -812,7 +812,7 @@ def __init__(self, module, config) -> None: super().__init__() _setattr_from_module(self, module) self.config = config - self.module_device = next(module.parameters()).device + self.module_device = config.device if getattr(config, "quantization_config", None) is None: # LinearAllreduce and LinearLayer cannot use fused op LinearAdd if self.module_device.type == "cpu": @@ -911,7 +911,7 @@ class _IPEXIntermediate(nn.Module): def __init__(self, module, config): super().__init__() _setattr_from_module(self, module) - self.module_device = next(module.parameters()).device + self.module_device = config.device if getattr(config, "quantization_config", None) is None: if self.module_device.type == "cpu": self.linear_gelu = LinearGelu(module.dense) From 94cf35d1e418d01df30af5c57628040876db2469 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 18 Dec 2024 13:29:15 +0000 Subject: [PATCH 16/58] assign device when convert class Signed-off-by: jiqing-feng --- optimum/exporters/ipex/model_patcher.py | 17 +++++++++-------- optimum/exporters/ipex/modeling_utils.py | 16 ++++++++-------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py index 5b3dbe42f8..4bfc07ece3 100644 --- a/optimum/exporters/ipex/model_patcher.py +++ b/optimum/exporters/ipex/model_patcher.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch from transformers.models.bert.modeling_bert import BertIntermediate from transformers.models.falcon.modeling_falcon import FalconDecoderLayer, FalconModel from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2Block, GPT2Model @@ -58,12 +59,12 @@ def convert_functions(m, target_m, new_function_name, new_function): convert_functions(sub_m, target_m, new_function_name, new_function) -def convert_class(m, target_m, new_class, config=None): +def convert_class(m, target_m, new_class, device, config): for name, sub_m in m.named_children(): if isinstance(sub_m, target_m): - new_m = new_class(sub_m, config) + new_m = new_class(sub_m, device, config) setattr(m, name, new_m) - convert_class(sub_m, target_m, new_class, config) + convert_class(sub_m, target_m, new_class, device, config) def patch_op(m, target_m, new_op_name, new_op): @@ -81,7 +82,7 @@ def _patch_llama_model(model): """ convert_functions(model, LlamaModel, "forward", _llama_model_forward) convert_functions(model, LlamaRMSNorm, "forward", _ipex_rms_layer_norm_forward) - convert_class(model, LlamaDecoderLayer, _IPEXLlamaDecoderLayer, model.config) + convert_class(model, LlamaDecoderLayer, _IPEXLlamaDecoderLayer, model.device, model.config) return model @@ -97,7 +98,7 @@ def _patch_falcon_model(model): setattr(model.config, "num_key_value_heads", num_key_value_heads) convert_functions(model, FalconModel, "forward", _falcon_model_forward) replace_customized_linear_with_linear(model) - convert_class(model, FalconDecoderLayer, _IPEXFalconDecoderLayer, model.config) + convert_class(model, FalconDecoderLayer, _IPEXFalconDecoderLayer, model.device, model.config) return model @@ -110,7 +111,7 @@ def _patch_gpt2_model(model): setattr(model.config, "num_key_value_heads", num_key_value_heads) convert_functions(model, GPT2Model, "forward", _gpt2_model_forward) convert_functions(model, GPT2Block, "forward", _gpt2_block_forward) - convert_class(model, GPT2Attention, 
_IPEXGPT2Attention, model.config) + convert_class(model, GPT2Attention, _IPEXGPT2Attention, model.device, model.config) return model @@ -119,7 +120,7 @@ def _patch_bert_model(model): Patch bert model: 1. Linear fusion with Linear + Gelu """ - convert_class(model, BertIntermediate, _IPEXIntermediate) + convert_class(model, BertIntermediate, _IPEXIntermediate, model.device, model.config) return model @@ -128,7 +129,7 @@ def _patch_vit_model(model): Patch vit model: 1. Linear fusion with Linear + Gelu """ - convert_class(model, ViTIntermediate, _IPEXIntermediate) + convert_class(model, ViTIntermediate, _IPEXIntermediate, model.device, model.config) return model diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index 1b8c2da41d..6b4e10fe60 100755 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -595,11 +595,11 @@ def _gpt2_block_forward( class _IPEXAttention(nn.Module): - def __init__(self, module, config) -> None: + def __init__(self, module, device, config) -> None: super().__init__() _setattr_from_module(self, module) self.config = config - self.module_device = config.device + self.module_device = device self.num_groups = self.num_heads // self.num_key_value_heads self.kv_head_mapping = torch.arange( 0, self.num_key_value_heads, dtype=torch.int32, device=self.module_device @@ -775,11 +775,11 @@ def postprocess_attention_output(self, attn_output): # Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L186 class _IPEXLlamaMLP(nn.Module): - def __init__(self, module, config) -> None: + def __init__(self, module, device, config) -> None: super().__init__() _setattr_from_module(self, module) self.config = config - self.module_device = config.device + self.module_device = device if getattr(config, "quantization_config", None) is None: if self.module_device.type == "cpu": # LinearAllreduce and LinearLayer cannot use fused op LinearAdd @@ -808,11 +808,11 @@ def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor = None, ** class _IPEXFalconMLP(nn.Module): - def __init__(self, module, config) -> None: + def __init__(self, module, device, config) -> None: super().__init__() _setattr_from_module(self, module) self.config = config - self.module_device = config.device + self.module_device = device if getattr(config, "quantization_config", None) is None: # LinearAllreduce and LinearLayer cannot use fused op LinearAdd if self.module_device.type == "cpu": @@ -908,10 +908,10 @@ def forward(self, hidden_states: torch.Tensor, **kwargs): # Adapted from https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/bert/modeling_bert.py#L524 class _IPEXIntermediate(nn.Module): - def __init__(self, module, config): + def __init__(self, module, device, config): super().__init__() _setattr_from_module(self, module) - self.module_device = config.device + self.module_device = device if getattr(config, "quantization_config", None) is None: if self.module_device.type == "cpu": self.linear_gelu = LinearGelu(module.dense) From 9af46d1cc336f2f3fa84984b7192acfc0d38d62a Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 18 Dec 2024 13:30:56 +0000 Subject: [PATCH 17/58] fix class init Signed-off-by: jiqing-feng --- optimum/exporters/ipex/modeling_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index 
6b4e10fe60..cb24399bb5 100755 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -681,7 +681,7 @@ def forward( class _IPEXLlamaAttention(_IPEXAttention): - def __init__(self, module, config) -> None: + def __init__(self, module, device, config) -> None: super().__init__(module, config) if getattr(config, "quantization_config", None) is None: concat_weight = torch.concat([self.q_proj.weight, self.k_proj.weight, self.v_proj.weight]).contiguous() @@ -723,7 +723,7 @@ def rope(self, query, key, **kwargs): class _IPEXFalconAttention(_IPEXAttention): - def __init__(self, module, config): + def __init__(self, module, device, config): self.num_key_value_heads = config.num_key_value_heads super().__init__(module, config) self.q_slice = self.head_dim * config.num_kv_heads @@ -750,7 +750,7 @@ def rope(self, query, key, **kwargs): class _IPEXGPT2Attention(_IPEXAttention): - def __init__(self, module, config) -> None: + def __init__(self, module, device, config) -> None: self.num_key_value_heads = config.num_key_value_heads super().__init__(module, config) if getattr(config, "quantization_config", None): @@ -844,7 +844,7 @@ def forward( # Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L694 class _IPEXLlamaDecoderLayer(nn.Module): - def __init__(self, module, config): + def __init__(self, module, device, config): super().__init__() _setattr_from_module(self, module) self.self_attn = _IPEXLlamaAttention(module.self_attn, config) @@ -879,7 +879,7 @@ def forward(self, hidden_states: torch.Tensor, **kwargs): class _IPEXFalconDecoderLayer(nn.Module): - def __init__(self, module, config): + def __init__(self, module, device, config): super().__init__() _setattr_from_module(self, module) self.self_attention = _IPEXFalconAttention(module.self_attention, config) From 18b2a6a226fd4cca0c703d37126894a5e72974db Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 18 Dec 2024 13:32:58 +0000 Subject: [PATCH 18/58] fix ipex attn init Signed-off-by: jiqing-feng --- optimum/exporters/ipex/modeling_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index cb24399bb5..0626c9f8b9 100755 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -682,7 +682,7 @@ def forward( class _IPEXLlamaAttention(_IPEXAttention): def __init__(self, module, device, config) -> None: - super().__init__(module, config) + super().__init__(module, device, config) if getattr(config, "quantization_config", None) is None: concat_weight = torch.concat([self.q_proj.weight, self.k_proj.weight, self.v_proj.weight]).contiguous() bias_list = [bias for bias in [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias] if bias] @@ -725,7 +725,7 @@ def rope(self, query, key, **kwargs): class _IPEXFalconAttention(_IPEXAttention): def __init__(self, module, device, config): self.num_key_value_heads = config.num_key_value_heads - super().__init__(module, config) + super().__init__(module, device, config) self.q_slice = self.head_dim * config.num_kv_heads self.k_slice = self.q_slice + self.head_dim self.v_slice = self.k_slice + self.head_dim @@ -752,7 +752,7 @@ def rope(self, query, key, **kwargs): class _IPEXGPT2Attention(_IPEXAttention): def __init__(self, module, device, config) -> None: self.num_key_value_heads = config.num_key_value_heads - super().__init__(module, config) + 
super().__init__(module, device, config) if getattr(config, "quantization_config", None): _remove_hooks_for_ipex(self, True) From 9f6db336dd6ec6ba40eab948f2af92d12506e1db Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 18 Dec 2024 13:34:03 +0000 Subject: [PATCH 19/58] rm set device on config Signed-off-by: jiqing-feng --- optimum/exporters/ipex/model_patcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py index 4bfc07ece3..af3c789f99 100644 --- a/optimum/exporters/ipex/model_patcher.py +++ b/optimum/exporters/ipex/model_patcher.py @@ -134,7 +134,6 @@ def _patch_vit_model(model): def _patch_model(model): - setattr(model.config, "device", model.device) if is_ipex_version("<", _IPEX_MINIMUM_VERSION_FOR_PATCHING): raise ImportError(f"Only ipex version >= {_IPEX_MINIMUM_VERSION_FOR_PATCHING} supports llama model patching") if is_transformers_version("<", _TRANSFORMERS_MIN_VERSION) or is_transformers_version( From 6d8a969ac4e675dc44a46e4525c6b6f89a299ee8 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 18 Dec 2024 13:40:11 +0000 Subject: [PATCH 20/58] fix format Signed-off-by: jiqing-feng --- optimum/exporters/ipex/model_patcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py index af3c789f99..dd2e35e967 100644 --- a/optimum/exporters/ipex/model_patcher.py +++ b/optimum/exporters/ipex/model_patcher.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import torch from transformers.models.bert.modeling_bert import BertIntermediate from transformers.models.falcon.modeling_falcon import FalconDecoderLayer, FalconModel from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2Block, GPT2Model From dd811f9b3a651f47b7285ffd8de7db7bbfcb9ffc Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 18 Dec 2024 14:03:06 +0000 Subject: [PATCH 21/58] fix mlp class init Signed-off-by: jiqing-feng --- optimum/exporters/ipex/modeling_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index 0626c9f8b9..2721cd0689 100755 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -847,8 +847,8 @@ class _IPEXLlamaDecoderLayer(nn.Module): def __init__(self, module, device, config): super().__init__() _setattr_from_module(self, module) - self.self_attn = _IPEXLlamaAttention(module.self_attn, config) - self.mlp = _IPEXLlamaMLP(module.mlp, config) + self.self_attn = _IPEXLlamaAttention(module.self_attn, device, config) + self.mlp = _IPEXLlamaMLP(module.mlp, device, config) if getattr(config, "quantization_config", None): _remove_hooks_for_ipex(self, True) @@ -882,8 +882,8 @@ class _IPEXFalconDecoderLayer(nn.Module): def __init__(self, module, device, config): super().__init__() _setattr_from_module(self, module) - self.self_attention = _IPEXFalconAttention(module.self_attention, config) - self.mlp = _IPEXFalconMLP(module.mlp, config) + self.self_attention = _IPEXFalconAttention(module.self_attention, device, config) + self.mlp = _IPEXFalconMLP(module.mlp, device, config) if getattr(config, "quantization_config", None): _remove_hooks_for_ipex(self, True) From dab4a78fe2855627184307e5839b9b158794a935 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 21 Jan 2025 15:38:15 +0000 Subject: [PATCH 22/58] add 
use_cache param when init generation config Signed-off-by: jiqing-feng --- optimum/intel/ipex/modeling_base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 6309e6b9a5..81172090be 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -298,6 +298,7 @@ def forward( def _prepare_generation_config( self, generation_config: Optional[GenerationConfig], **kwargs: Dict ) -> Tuple[GenerationConfig, Dict]: + kwargs["use_cache"] = self.use_cache generation_config, model_kwargs = super()._prepare_generation_config(generation_config, **kwargs) generation_method = generation_config.get_generation_mode().value if self.compiled and generation_config.cache_implementation != "ipex_paged" and self._supports_static_cache: From 6bf3b8b5e547bdd4b13851fe5927527d915d3c3a Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 21 Jan 2025 16:00:10 +0000 Subject: [PATCH 23/58] fix gpt2 quant model Signed-off-by: jiqing-feng --- optimum/exporters/ipex/model_patcher.py | 11 +- optimum/exporters/ipex/modeling_utils.py | 225 ++++++++++++----------- 2 files changed, 122 insertions(+), 114 deletions(-) diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py index 9b38abad6b..ee6082d1ae 100644 --- a/optimum/exporters/ipex/model_patcher.py +++ b/optimum/exporters/ipex/model_patcher.py @@ -14,7 +14,7 @@ from transformers.models.bert.modeling_bert import BertIntermediate from transformers.models.falcon.modeling_falcon import FalconDecoderLayer, FalconModel -from transformers.models.gpt2.modeling_gpt2 import GPT2MLP, GPT2Attention, GPT2Block, GPT2Model +from transformers.models.gpt2.modeling_gpt2 import GPT2Block, GPT2Model from transformers.models.llama.modeling_llama import ( LlamaDecoderLayer, LlamaModel, @@ -27,13 +27,11 @@ from .modeling_utils import ( _IPEX_MINIMUM_VERSION_FOR_PATCHING, - _IPEXGPT2MLP, _falcon_model_forward, - _gpt2_block_forward, _gpt2_model_forward, _ipex_rms_layer_norm_forward, _IPEXFalconDecoderLayer, - _IPEXGPT2Attention, + _IPEXGPT2Block, _IPEXIntermediate, _IPEXLlamaDecoderLayer, _llama_model_forward, @@ -106,13 +104,12 @@ def _patch_gpt2_model(model): """ Patch gpt2 model: 1. Use IPEX paged attention + 2. 
Linear fusion with (Linear + Add) """ num_key_value_heads = model.config.num_attention_heads setattr(model.config, "num_key_value_heads", num_key_value_heads) convert_functions(model, GPT2Model, "forward", _gpt2_model_forward) - convert_functions(model, GPT2Block, "forward", _gpt2_block_forward) - convert_class(model, GPT2Attention, _IPEXGPT2Attention, model.device, model.config) - convert_class(model, GPT2MLP, _IPEXGPT2MLP, model.device, model.config) + convert_class(model, GPT2Block, _IPEXGPT2Block, model.device, model.config) return model diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index cd85c25315..1d9ef4b086 100755 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -558,78 +558,6 @@ def _gpt2_model_forward( ) -# To pass input_lens, adapted from https://github.com/huggingface/transformers/blob/v4.46.3/src/transformers/models/gpt2/modeling_gpt2.py#L602 -def _gpt2_block_forward( - self, - hidden_states: Optional[Tuple[torch.FloatTensor]], - layer_past: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = False, - output_attentions: Optional[bool] = False, - **kwargs, -) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]: - residual = hidden_states - hidden_states = self.ln_1(hidden_states) - attn_outputs = self.attn( - hidden_states, - layer_past=layer_past, - attention_mask=attention_mask, - head_mask=head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - **kwargs, - ) - attn_output = attn_outputs[0] # output_attn: a, present, (attentions) - outputs = attn_outputs[1:] - # residual connection - if hasattr(self.attn, "linear_add"): - hidden_states = self.attn.linear_add(attn_output, residual) - else: - hidden_states = attn_output + residual - - if encoder_hidden_states is not None: - # add one self-attention block for cross-attention - if not hasattr(self, "crossattention"): - raise ValueError( - f"If `encoder_hidden_states` are passed, {self} has to be instantiated with " - "cross-attention layers by setting `config.add_cross_attention=True`" - ) - residual = hidden_states - hidden_states = self.ln_cross_attn(hidden_states) - cross_attn_outputs = self.crossattention( - hidden_states, - attention_mask=attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_attentions=output_attentions, - **kwargs, - ) - attn_output = cross_attn_outputs[0] - # residual connection - hidden_states = residual + attn_output - outputs = outputs + cross_attn_outputs[2:] # add cross attentions if we output attention weights - - residual = hidden_states - hidden_states = self.ln_2(hidden_states) - feed_forward_hidden_states = self.mlp(hidden_states) - # residual connection - if hasattr(self.mlp, "linear_add"): - hidden_states = self.mlp.linear_add(feed_forward_hidden_states, residual) - else: - hidden_states = residual + feed_forward_hidden_states - - if use_cache: - outputs = (hidden_states,) + outputs - else: - outputs = (hidden_states,) + outputs[1:] - - return outputs # hidden_states, present, (attentions, cross_attentions) - - class _IPEXAttention(nn.Module): def __init__(self, module, device, config) -> None: super().__init__() @@ 
-844,26 +772,27 @@ class _IPEXGPT2Attention(_IPEXAttention): def __init__(self, module, device, config) -> None: self.num_key_value_heads = config.num_key_value_heads super().__init__(module, device, config) - if getattr(config, "quantization_config", None): - _remove_hooks_for_ipex(self, True) - _setattr_from_module(self, module) - self.c_attn_linear = nn.Linear(self.c_attn.weight.shape[0], self.c_attn.weight.shape[1]) - self.c_attn_linear.weight = nn.Parameter(self.c_attn.weight.t()) - self.c_attn_linear.bias = self.c_attn.bias - self.c_proj_linear = nn.Linear(self.c_proj.weight.shape[0], self.c_proj.weight.shape[1]) - self.c_proj_linear.weight = nn.Parameter(self.c_proj.weight.t()) - self.c_proj_linear.bias = self.c_proj.bias - if self.module_device.type == "cpu": - if self.c_proj_linear not in ["LinearAllreduce"]: - self.linear_add = LinearAdd(self.c_proj_linear) - - elif self.module_device.type == "xpu": - if self.c_proj_linear not in ["LinearAllreduce"]: - self.linear_add = XPULinearAdd(self.c_proj_linear) + if getattr(config, "quantization_config", None) is None: + self.c_attn_linear = nn.Linear(self.c_attn.weight.shape[0], self.c_attn.weight.shape[1]) + self.c_attn_linear.weight = nn.Parameter(self.c_attn.weight.t()) + self.c_attn_linear.bias = self.c_attn.bias + self.c_proj_linear = nn.Linear(self.c_proj.weight.shape[0], self.c_proj.weight.shape[1]) + self.c_proj_linear.weight = nn.Parameter(self.c_proj.weight.t()) + self.c_proj_linear.bias = self.c_proj.bias + if self.module_device.type == "cpu": + if self.c_proj_linear not in ["LinearAllreduce"]: + self.linear_add = LinearAdd(self.c_proj_linear) + + elif self.module_device.type == "xpu": + if self.c_proj_linear not in ["LinearAllreduce"]: + self.linear_add = XPULinearAdd(self.c_proj_linear) def qkv_gemm(self, hidden_states): - query, key, value = self.c_attn_linear(hidden_states).split(self.split_size, dim=-1) + if hasattr(self, "c_attn_linear"): + query, key, value = self.c_attn_linear(hidden_states).split(self.split_size, dim=-1) + else: + query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=-1) query = query.view(-1, self.num_heads, self.head_dim) key = key.view(-1, self.num_heads, self.head_dim) value = value.view(-1, self.num_heads, self.head_dim) @@ -951,27 +880,29 @@ def forward( class _IPEXGPT2MLP(nn.Module): - def __init__(self, module, config) -> None: + def __init__(self, module, device, config) -> None: super().__init__() _setattr_from_module(self, module) self.config = config - self.module_device = next(module.parameters()).device - self.c_fc_linear = nn.Linear(self.c_fc.weight.shape[0], self.c_fc.weight.shape[1]) - self.c_fc_linear.weight = nn.Parameter(self.c_fc.weight.t()) - self.c_fc_linear.bias = self.c_fc.bias - self.c_proj_linear = nn.Linear(self.c_proj.weight.shape[0], self.c_proj.weight.shape[1]) - self.c_proj_linear.weight = nn.Parameter(self.c_proj.weight.t()) - self.c_proj_linear.bias = self.c_proj.bias - if self.module_device.type == "cpu": - self.linear_new_gelu = LinearNewGelu(self.c_fc_linear) - - if self.module_device.type == "cpu": - if self.c_proj_linear not in ["LinearAllreduce"]: - self.linear_add = LinearAdd(self.c_proj_linear) - - elif self.module_device.type == "xpu": - if self.c_proj_linear not in ["LinearAllreduce"]: - self.linear_add = XPULinearAdd(self.c_proj_linear) + self.module_device = device + + if getattr(config, "quantization_config", None) is None: + self.c_fc_linear = nn.Linear(self.c_fc.weight.shape[0], self.c_fc.weight.shape[1]) + self.c_fc_linear.weight = 
nn.Parameter(self.c_fc.weight.t()) + self.c_fc_linear.bias = self.c_fc.bias + self.c_proj_linear = nn.Linear(self.c_proj.weight.shape[0], self.c_proj.weight.shape[1]) + self.c_proj_linear.weight = nn.Parameter(self.c_proj.weight.t()) + self.c_proj_linear.bias = self.c_proj.bias + if self.module_device.type == "cpu": + self.linear_new_gelu = LinearNewGelu(self.c_fc_linear) + + if self.module_device.type == "cpu": + if self.c_proj_linear not in ["LinearAllreduce"]: + self.linear_add = LinearAdd(self.c_proj_linear) + + elif self.module_device.type == "xpu": + if self.c_proj_linear not in ["LinearAllreduce"]: + self.linear_add = XPULinearAdd(self.c_proj_linear) def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor: if hasattr(self, "linear_new_gelu"): @@ -1048,6 +979,86 @@ def forward(self, hidden_states: torch.Tensor, **kwargs): return outputs +class _IPEXGPT2Block(nn.Module): + def __init__(self, module, device, config): + super().__init__() + _setattr_from_module(self, module) + self.attn = _IPEXGPT2Attention(module.attn, device, config) + self.mlp = _IPEXGPT2MLP(module.mlp, device, config) + if getattr(config, "quantization_config", None): + _remove_hooks_for_ipex(self, True) + + def forward( + self, + hidden_states: Optional[Tuple[torch.FloatTensor]], + layer_past: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + **kwargs, + ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]: + residual = hidden_states + hidden_states = self.ln_1(hidden_states) + attn_outputs = self.attn( + hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + **kwargs, + ) + attn_output = attn_outputs[0] # output_attn: a, present, (attentions) + outputs = attn_outputs[1:] + # residual connection + if hasattr(self.attn, "linear_add"): + hidden_states = self.attn.linear_add(attn_output, residual) + else: + hidden_states = attn_output + residual + + if encoder_hidden_states is not None: + # add one self-attention block for cross-attention + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with " + "cross-attention layers by setting `config.add_cross_attention=True`" + ) + residual = hidden_states + hidden_states = self.ln_cross_attn(hidden_states) + cross_attn_outputs = self.crossattention( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + **kwargs, + ) + attn_output = cross_attn_outputs[0] + # residual connection + hidden_states = residual + attn_output + outputs = outputs + cross_attn_outputs[2:] # add cross attentions if we output attention weights + + residual = hidden_states + hidden_states = self.ln_2(hidden_states) + feed_forward_hidden_states = self.mlp(hidden_states) + # residual connection + if hasattr(self.mlp, "linear_add"): + hidden_states = self.mlp.linear_add(feed_forward_hidden_states, residual) + else: + hidden_states = residual + feed_forward_hidden_states + + if use_cache: + outputs = 
(hidden_states,) + outputs + else: + outputs = (hidden_states,) + outputs[1:] + + return outputs # hidden_states, present, (attentions, cross_attentions) + + # Adapted from https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/bert/modeling_bert.py#L524 class _IPEXIntermediate(nn.Module): def __init__(self, module, device, config): From 356d51d57ec34e42a652e99eed55598580957b72 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 22 Jan 2025 09:25:42 +0000 Subject: [PATCH 24/58] fix falcon linear fusion Signed-off-by: jiqing-feng --- optimum/exporters/ipex/modeling_utils.py | 6 +- tests/ipex/test_modeling.py | 120 ----------------------- 2 files changed, 5 insertions(+), 121 deletions(-) diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index 1d9ef4b086..b8285ef145 100755 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -869,7 +869,11 @@ def forward( residual: torch.Tensor = None, **kwargs, ): - mlp_hidden_states = self.linear_gelu(hidden_states) + if hasattr(self, "linear_gelu"): + mlp_hidden_states = self.linear_gelu(hidden_states) + else: + mlp_hidden_states = self.act(self.dense_h_to_4h(hidden_states)) + if hasattr(self, "linear_add_add"): output = self.linear_add_add(mlp_hidden_states, attention_output, residual) else: diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index e3584bb112..ac311f40f6 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -130,40 +130,6 @@ def test_pipeline(self, model_arch): _ = pipe(text) self.assertEqual(pipe.device, model.device) - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @unittest.skipIf(not is_bitsandbytes_available(), reason="Test requires bitsandbytes") - def test_bnb(self, model_arch): - model_id = MODEL_NAMES[model_arch] - quantization_config = BitsAndBytesConfig(load_in_8bit=True) - ipex_model = self.IPEX_MODEL_CLASS.from_pretrained( - model_id, device_map=DEVICE, quantization_config=quantization_config - ) - transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained( - model_id, device_map=DEVICE, quantization_config=quantization_config - ) - tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs = "This is a sample input" - tokens = tokenizer(inputs, return_tensors="pt").to(DEVICE) - with torch.no_grad(): - transformers_outputs = transformers_model(**tokens) - outputs = ipex_model(**tokens) - - # Test re-load model - with tempfile.TemporaryDirectory() as tmpdirname: - ipex_model.save_pretrained(tmpdirname) - loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname, device_map=DEVICE) - loaded_model_outputs = loaded_model(**tokens) - # Test init method - init_model = self.IPEX_MODEL_CLASS(transformers_model) - init_model_outputs = init_model(**tokens) - - # Compare tensor outputs - for output_name in {"logits", "last_hidden_state"}: - if output_name in transformers_outputs: - self.assertTrue(torch.allclose(outputs[output_name], transformers_outputs[output_name], atol=1e-3)) - self.assertTrue(torch.allclose(outputs[output_name], loaded_model_outputs[output_name])) - self.assertTrue(torch.allclose(outputs[output_name], init_model_outputs[output_name])) - class IPEXModelForSequenceClassificationTest(IPEXModelTest): IPEX_MODEL_CLASS = IPEXModelForSequenceClassification @@ -248,46 +214,6 @@ def test_patched_model(self): self.assertTrue(torch.allclose(outputs.start_logits, transformers_outputs.start_logits, atol=1e-4)) 
self.assertTrue(torch.allclose(outputs.end_logits, transformers_outputs.end_logits, atol=1e-4)) - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @unittest.skipIf(not is_bitsandbytes_available(), reason="Test requires bitsandbytes") - def test_bnb(self, model_arch): - model_id = MODEL_NAMES[model_arch] - set_seed(SEED) - quantization_config = BitsAndBytesConfig(load_in_8bit=True) - ipex_model = IPEXModelForQuestionAnswering.from_pretrained( - model_id, device_map=DEVICE, quantization_config=quantization_config - ) - self.assertIsInstance(ipex_model.config, PretrainedConfig) - transformers_model = AutoModelForQuestionAnswering.from_pretrained( - model_id, device_map=DEVICE, quantization_config=quantization_config - ) - tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs = "This is a sample input" - tokens = tokenizer(inputs, return_tensors="pt").to(DEVICE) - with torch.no_grad(): - transformers_outputs = transformers_model(**tokens) - outputs = ipex_model(**tokens) - - # Test re-load model - with tempfile.TemporaryDirectory() as tmpdirname: - ipex_model.save_pretrained(tmpdirname) - loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname, device_map=DEVICE) - loaded_model_outputs = loaded_model(**tokens) - - # Test init method - init_model = self.IPEX_MODEL_CLASS(transformers_model) - init_model_outputs = init_model(**tokens) - - self.assertIn("start_logits", outputs) - self.assertIn("end_logits", outputs) - # Compare tensor outputs - self.assertTrue(torch.allclose(outputs.start_logits, transformers_outputs.start_logits, atol=1e-4)) - self.assertTrue(torch.allclose(outputs.end_logits, transformers_outputs.end_logits, atol=1e-4)) - self.assertTrue(torch.equal(outputs.start_logits, loaded_model_outputs.start_logits)) - self.assertTrue(torch.equal(outputs.end_logits, loaded_model_outputs.end_logits)) - self.assertTrue(torch.equal(outputs.start_logits, init_model_outputs.start_logits)) - self.assertTrue(torch.equal(outputs.end_logits, init_model_outputs.end_logits)) - class IPEXModelForCausalLMTest(unittest.TestCase): IPEX_MODEL_CLASS = IPEXModelForCausalLM @@ -799,52 +725,6 @@ def test_ipex_beam_search(self, test_name, model_arch, use_cache): self.assertIsInstance(outputs, torch.Tensor) self.assertTrue(torch.equal(outputs, transformers_outputs)) - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @unittest.skipIf(not is_bitsandbytes_available(), reason="Test requires bitsandbytes") - def test_bnb(self, model_arch): - model_id = MODEL_NAMES[model_arch] - set_seed(SEED) - dtype = torch.float16 if IS_XPU_AVAILABLE else torch.float32 - quantization_config = BitsAndBytesConfig(load_in_8bit=True) - # Test model forward do not need cache. 
- ipex_model = self.IPEX_MODEL_CLASS.from_pretrained( - model_id, torch_dtype=dtype, quantization_config=quantization_config - ) - transformers_model = AutoModelForSeq2SeqLM.from_pretrained( - model_id, torch_dtype=dtype, quantization_config=quantization_config - ) - self.assertIsInstance(ipex_model.config, PretrainedConfig) - tokenizer = AutoTokenizer.from_pretrained(model_id) - tokens = tokenizer( - "This is a sample", - return_tensors="pt", - return_token_type_ids=False if model_arch in ("llama", "llama2") else None, - ) - decoder_start_token_id = transformers_model.config.decoder_start_token_id if model_arch != "mbart" else 2 - decoder_inputs = {"decoder_input_ids": torch.ones((1, 1), dtype=torch.long) * decoder_start_token_id} - outputs = ipex_model(**tokens, **decoder_inputs) - - self.assertIsInstance(outputs.logits, torch.Tensor) - - with torch.no_grad(): - transformers_outputs = transformers_model(**tokens, **decoder_inputs) - - # Test re-load model - with tempfile.TemporaryDirectory() as tmpdirname: - ipex_model.save_pretrained(tmpdirname) - loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname, torch_dtype=dtype) - loaded_model_outputs = loaded_model(**tokens, **decoder_inputs) - - # Test init method - init_model = self.IPEX_MODEL_CLASS(transformers_model) - init_model_outputs = init_model(**tokens, **decoder_inputs) - - # Compare tensor outputs - self.assertTrue(torch.allclose(outputs.logits, transformers_outputs.logits, atol=1e-4)) - # To avoid float pointing error - self.assertTrue(torch.allclose(outputs.logits, loaded_model_outputs.logits, atol=1e-7)) - self.assertTrue(torch.allclose(outputs.logits, init_model_outputs.logits, atol=1e-7)) - class IPEXSTModel(unittest.TestCase): SUPPORTED_ARCHITECTURES = ( From d1eee874add5f0b7663c6208910f59cbe81a5aaa Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 22 Jan 2025 09:41:24 +0000 Subject: [PATCH 25/58] fix falcon Signed-off-by: jiqing-feng --- optimum/exporters/ipex/modeling_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index b8285ef145..cb22bc1c22 100755 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -877,7 +877,7 @@ def forward( if hasattr(self, "linear_add_add"): output = self.linear_add_add(mlp_hidden_states, attention_output, residual) else: - mlp_output = self.mlp.dense_4h_to_h(mlp_hidden_states) + mlp_output = self.dense_4h_to_h(mlp_hidden_states) output = mlp_output + attention_output + residual return output From 57e3c274e52b5e6b4147e7df2b19963c9467764d Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 7 Feb 2025 12:10:30 +0000 Subject: [PATCH 26/58] enable awq model test Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 1 + tests/ipex/test_modeling.py | 17 ++++++++--------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 0aa881badc..357a693ff5 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -36,6 +36,7 @@ jobs: run: | pip install --upgrade pip pip install torch==${{ matrix.torch-version }} torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu + pip install autoawq pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }} - name: Assert versions diff --git a/tests/ipex/test_modeling.py 
b/tests/ipex/test_modeling.py index 953d6050c3..e13eb06267 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -30,13 +30,13 @@ AutoModelForSeq2SeqLM, AutoModelForQuestionAnswering, AutoTokenizer, - BitsAndBytesConfig, + AwqConfig, GenerationConfig, PretrainedConfig, pipeline, set_seed, - is_bitsandbytes_available, ) +from transformers.utils import is_auto_awq_available from packaging import version from optimum.intel import ( IPEXModel, @@ -442,13 +442,12 @@ def test_patched_model(self, model_arch): ) self.assertTrue(torch.allclose(ipex_outputs.logits[0], exported_outputs.logits[0], atol=1e-7)) - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @unittest.skipIf(not is_bitsandbytes_available(), reason="Test requires bitsandbytes") - def test_bnb(self, model_arch): - model_id = MODEL_NAMES[model_arch] + @unittest.skipIf(not is_auto_awq_available(), reason="Test requires auto-awq") + def test_awq(self): + model_id = "PrunaAI/JackFram-llama-68m-AWQ-4bit-smashed" set_seed(SEED) dtype = torch.float16 if IS_XPU_AVAILABLE else torch.float32 - quantization_config = BitsAndBytesConfig(load_in_8bit=True) + quantization_config = AwqConfig(version="ipex") # Test model forward do not need cache. ipex_model = IPEXModelForCausalLM.from_pretrained( model_id, torch_dtype=dtype, device_map=DEVICE, quantization_config=quantization_config @@ -458,7 +457,7 @@ def test_bnb(self, model_arch): tokens = tokenizer( "This is a sample", return_tensors="pt", - return_token_type_ids=False if model_arch in ("llama", "llama2") else None, + return_token_type_ids=False, ).to(DEVICE) inputs = ipex_model.prepare_inputs_for_generation(**tokens) outputs = ipex_model(**inputs) @@ -482,7 +481,7 @@ def test_bnb(self, model_arch): init_model_outputs = init_model(**inputs) # Compare tensor outputs - self.assertTrue(torch.allclose(outputs.logits, transformers_outputs.logits, atol=1e-4)) + self.assertTrue(torch.allclose(outputs.logits, transformers_outputs.logits, atol=5e-2)) # To avoid float pointing error self.assertTrue(torch.allclose(outputs.logits, loaded_model_outputs.logits, atol=1e-7)) self.assertTrue(torch.allclose(outputs.logits, init_model_outputs.logits, atol=1e-7)) From 8f6ba5c522535e7064c47e151de8a8803c23a4eb Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 7 Feb 2025 12:13:03 +0000 Subject: [PATCH 27/58] fix install Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 357a693ff5..4ff636ff5d 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -36,8 +36,8 @@ jobs: run: | pip install --upgrade pip pip install torch==${{ matrix.torch-version }} torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu - pip install autoawq pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }} + pip install autoawq - name: Assert versions run: | From 8870714c09df6a96df3b08f4c03fed492aff0e91 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 7 Feb 2025 12:18:27 +0000 Subject: [PATCH 28/58] fix install Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 4ff636ff5d..72c8ff4563 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -37,6 +37,9 @@ jobs: pip install 
--upgrade pip pip install torch==${{ matrix.torch-version }} torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }} + + - name: Install dependencies + run: | pip install autoawq - name: Assert versions From 5828fc02364468b1ce988a267b35d109784fda22 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 7 Feb 2025 12:21:56 +0000 Subject: [PATCH 29/58] fix install Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 72c8ff4563..b04b2890db 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -38,16 +38,16 @@ jobs: pip install torch==${{ matrix.torch-version }} torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }} - - name: Install dependencies - run: | - pip install autoawq - - name: Assert versions run: | python -c "import torch; print(torch.__version__); assert torch.__version__.startswith('${{ matrix.torch-version }}'.replace('.*', ''))" python -c "import intel_extension_for_pytorch; print(intel_extension_for_pytorch.__version__); assert intel_extension_for_pytorch.__version__.startswith('${{ matrix.torch-version }}'.replace('.*', ''))" python -c "import transformers; print(transformers.__version__); assert transformers.__version__.startswith('${{ matrix.transformers-version }}'.replace('.*', ''))" + - name: Install autoawq + run: | + pip install autoawq + - name: Test with Pytest run: | pytest tests/ipex From c616d57751b39a7800c59d482982690ec5a6c2c8 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 7 Feb 2025 12:25:04 +0000 Subject: [PATCH 30/58] fix install Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index b04b2890db..7e889fd4f9 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -46,7 +46,7 @@ jobs: - name: Install autoawq run: | - pip install autoawq + pip install autoawq[cpu] - name: Test with Pytest run: | From e1715b847d211c6325b6d02e42defaddd1579874 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 7 Feb 2025 13:13:09 +0000 Subject: [PATCH 31/58] fix install Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 7e889fd4f9..59b13e6949 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -36,6 +36,8 @@ jobs: run: | pip install --upgrade pip pip install torch==${{ matrix.torch-version }} torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu + pip setuptools wheel + pip install autoawq[cpu] pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }} - name: Assert versions @@ -44,10 +46,6 @@ jobs: python -c "import intel_extension_for_pytorch; print(intel_extension_for_pytorch.__version__); assert intel_extension_for_pytorch.__version__.startswith('${{ matrix.torch-version }}'.replace('.*', ''))" python 
-c "import transformers; print(transformers.__version__); assert transformers.__version__.startswith('${{ matrix.transformers-version }}'.replace('.*', ''))" - - name: Install autoawq - run: | - pip install autoawq[cpu] - - name: Test with Pytest run: | pytest tests/ipex From e88faf2f7f24de3349fda818df98715a207b649a Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 7 Feb 2025 13:14:33 +0000 Subject: [PATCH 32/58] fix install Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 59b13e6949..e3b9c80503 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -36,7 +36,7 @@ jobs: run: | pip install --upgrade pip pip install torch==${{ matrix.torch-version }} torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu - pip setuptools wheel + pip install setuptools wheel pip install autoawq[cpu] pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }} From 882f2b2ab5fcab9cb56d322c849d2920252de039 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 7 Feb 2025 13:15:03 +0000 Subject: [PATCH 33/58] fix install Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index e3b9c80503..9bcbceba5f 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -37,7 +37,7 @@ jobs: pip install --upgrade pip pip install torch==${{ matrix.torch-version }} torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu pip install setuptools wheel - pip install autoawq[cpu] + pip install autoawq pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }} - name: Assert versions From d8208c74286ac38a247b60b100a102e86c8c337a Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 7 Feb 2025 13:19:33 +0000 Subject: [PATCH 34/58] fix install Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 9bcbceba5f..b11acfa160 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -36,10 +36,14 @@ jobs: run: | pip install --upgrade pip pip install torch==${{ matrix.torch-version }} torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu - pip install setuptools wheel - pip install autoawq pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }} + - name: Install autoawq + run: | + pip install --upgrade pip setuptools wheel + pip install autoawq + pip install transformers[testing]==${{ matrix.transformers-version }} + - name: Assert versions run: | python -c "import torch; print(torch.__version__); assert torch.__version__.startswith('${{ matrix.torch-version }}'.replace('.*', ''))" From 80b9ccb2117172209b07841fa1dede0e0615f175 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 7 Feb 2025 13:23:20 +0000 Subject: [PATCH 35/58] fix install Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index b11acfa160..8bed570d02 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -38,18 +38,18 @@ jobs: pip install torch==${{ matrix.torch-version }} torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }} - - name: Install autoawq - run: | - pip install --upgrade pip setuptools wheel - pip install autoawq - pip install transformers[testing]==${{ matrix.transformers-version }} - - name: Assert versions run: | python -c "import torch; print(torch.__version__); assert torch.__version__.startswith('${{ matrix.torch-version }}'.replace('.*', ''))" python -c "import intel_extension_for_pytorch; print(intel_extension_for_pytorch.__version__); assert intel_extension_for_pytorch.__version__.startswith('${{ matrix.torch-version }}'.replace('.*', ''))" python -c "import transformers; print(transformers.__version__); assert transformers.__version__.startswith('${{ matrix.transformers-version }}'.replace('.*', ''))" + - name: Install autoawq + run: | + pip install --upgrade pip setuptools wheel + pip install autoawq + pip install transformers[testing]==${{ matrix.transformers-version }} + - name: Test with Pytest run: | pytest tests/ipex From f05fb2f0b383e8e948d1c47518e93a8372d04033 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 7 Feb 2025 13:35:58 +0000 Subject: [PATCH 36/58] fix install Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 8bed570d02..fab2d1465e 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -36,6 +36,7 @@ jobs: run: | pip install --upgrade pip pip install torch==${{ matrix.torch-version }} torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu + pip install build setuptools wheel autoawq pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }} - name: Assert versions @@ -44,12 +45,6 @@ jobs: python -c "import intel_extension_for_pytorch; print(intel_extension_for_pytorch.__version__); assert intel_extension_for_pytorch.__version__.startswith('${{ matrix.torch-version }}'.replace('.*', ''))" python -c "import transformers; print(transformers.__version__); assert transformers.__version__.startswith('${{ matrix.transformers-version }}'.replace('.*', ''))" - - name: Install autoawq - run: | - pip install --upgrade pip setuptools wheel - pip install autoawq - pip install transformers[testing]==${{ matrix.transformers-version }} - - name: Test with Pytest run: | pytest tests/ipex From bd8e8707d46bfb82c5fd27c9936a46633e1ab13e Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 7 Feb 2025 13:37:55 +0000 Subject: [PATCH 37/58] fix install Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index fab2d1465e..72d6be4afe 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -36,7 +36,8 @@ jobs: run: | pip install --upgrade pip pip install torch==${{ matrix.torch-version }} torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu - pip 
install build setuptools wheel autoawq + pip install build setuptools wheel + pip install autoawq pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }} - name: Assert versions From e471be37905962914b1c660264b5383df824c364 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 7 Feb 2025 13:43:26 +0000 Subject: [PATCH 38/58] fix install Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 72d6be4afe..93ee49c418 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -37,7 +37,6 @@ jobs: pip install --upgrade pip pip install torch==${{ matrix.torch-version }} torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu pip install build setuptools wheel - pip install autoawq pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }} - name: Assert versions From 96f46226a8782848ce2c9f5bb71ba2cf1aeb8d03 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 7 Feb 2025 13:47:34 +0000 Subject: [PATCH 39/58] fix install Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 93ee49c418..42820a4eba 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -37,6 +37,7 @@ jobs: pip install --upgrade pip pip install torch==${{ matrix.torch-version }} torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu pip install build setuptools wheel + pip install autoawq[cpu] pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }} - name: Assert versions From 32bf0a15ec769af090f044c1b77da6d71e054578 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 7 Feb 2025 13:50:43 +0000 Subject: [PATCH 40/58] fix install Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 42820a4eba..95b88057b1 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -36,9 +36,9 @@ jobs: run: | pip install --upgrade pip pip install torch==${{ matrix.torch-version }} torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu + pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }} pip install build setuptools wheel pip install autoawq[cpu] - pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }} - name: Assert versions run: | From ad3467b9727fe3044c2b9292eda2864a2a9c3525 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 7 Feb 2025 13:56:08 +0000 Subject: [PATCH 41/58] fix install Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 95b88057b1..c5df4a36b5 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -38,7 +38,7 @@ jobs: pip install torch==${{ matrix.torch-version }} torchaudio torchvision 
--extra-index-url https://download.pytorch.org/whl/cpu pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }} pip install build setuptools wheel - pip install autoawq[cpu] + pip install git+https://github.com/casper-hansen/AutoAWQ.git - name: Assert versions run: | From 4a21d2629064f98f0d1fcb8d0d2911834ecdc7cf Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 7 Feb 2025 13:59:02 +0000 Subject: [PATCH 42/58] fix install Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index c5df4a36b5..b8944753dd 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -37,7 +37,6 @@ jobs: pip install --upgrade pip pip install torch==${{ matrix.torch-version }} torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }} - pip install build setuptools wheel pip install git+https://github.com/casper-hansen/AutoAWQ.git - name: Assert versions From fb8002c0d5af8caa51d7208b05ede5a41e2e216a Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 7 Feb 2025 14:06:19 +0000 Subject: [PATCH 43/58] fix install Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index b8944753dd..0aa881badc 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -37,7 +37,6 @@ jobs: pip install --upgrade pip pip install torch==${{ matrix.torch-version }} torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }} - pip install git+https://github.com/casper-hansen/AutoAWQ.git - name: Assert versions run: | From 2ad2371aee3a818caa86a7cf66b9c1255ab42346 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 7 Feb 2025 14:22:24 +0000 Subject: [PATCH 44/58] fix install Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 0aa881badc..37e8fe162c 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -38,6 +38,12 @@ jobs: pip install torch==${{ matrix.torch-version }} torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }} + - name: Install bitsandbytes + run: | + git clone --branch multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git + cd bitsandbytes + pip install . 
+
       - name: Assert versions
         run: |
           python -c "import torch; print(torch.__version__); assert torch.__version__.startswith('${{ matrix.torch-version }}'.replace('.*', ''))"

From 3c2ddef32467d03fb5cb0d00aa26918a2829dd9e Mon Sep 17 00:00:00 2001
From: jiqing-feng
Date: Fri, 7 Feb 2025 14:52:33 +0000
Subject: [PATCH 45/58] enable bnb test

Signed-off-by: jiqing-feng
---
 tests/ipex/test_modeling.py | 44 +++++++++++++++++++++++++++++++++++--
 1 file changed, 42 insertions(+), 2 deletions(-)

diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py
index e13eb06267..6f84604783 100644
--- a/tests/ipex/test_modeling.py
+++ b/tests/ipex/test_modeling.py
@@ -31,12 +31,13 @@
     AutoModelForQuestionAnswering,
     AutoTokenizer,
     AwqConfig,
+    BitsAndBytesConfig,
     GenerationConfig,
     PretrainedConfig,
     pipeline,
     set_seed,
 )
-from transformers.utils import is_auto_awq_available
+from transformers.utils import is_auto_awq_available, is_bitsandbytes_available
 from packaging import version
 from optimum.intel import (
     IPEXModel,
@@ -442,7 +443,7 @@ def test_patched_model(self, model_arch):
         )
         self.assertTrue(torch.allclose(ipex_outputs.logits[0], exported_outputs.logits[0], atol=1e-7))

-    @unittest.skipIf(not is_auto_awq_available(), reason="Test requires auto-awq")
+    @unittest.skipIf(not is_auto_awq_available(), reason="Test requires autoawq")
     def test_awq(self):
         model_id = "PrunaAI/JackFram-llama-68m-AWQ-4bit-smashed"
         set_seed(SEED)
@@ -486,6 +487,45 @@ def test_awq(self):
         self.assertTrue(torch.allclose(outputs.logits, loaded_model_outputs.logits, atol=1e-7))
         self.assertTrue(torch.allclose(outputs.logits, init_model_outputs.logits, atol=1e-7))

+    @unittest.skipIf(not is_bitsandbytes_available(), reason="Test requires bitsandbytes")
+    def test_bnb(self):
+        model_id = "PrunaAI/JackFram-llama-68m-bnb-4bit-smashed"
+        set_seed(SEED)
+        dtype = torch.float16 if IS_XPU_AVAILABLE else torch.float32
+        # Test that model forward does not need cache.
+        ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype, device_map=DEVICE)
+        self.assertIsInstance(ipex_model.config, PretrainedConfig)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        tokens = tokenizer(
+            "This is a sample",
+            return_tensors="pt",
+            return_token_type_ids=False,
+        ).to(DEVICE)
+        inputs = ipex_model.prepare_inputs_for_generation(**tokens)
+        outputs = ipex_model(**inputs)
+
+        self.assertIsInstance(outputs.logits, torch.Tensor)
+
+        transformers_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype, device_map=DEVICE)
+        with torch.no_grad():
+            transformers_outputs = transformers_model(**tokens)
+
+        # Test re-load model
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            ipex_model.save_pretrained(tmpdirname)
+            loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname, torch_dtype=dtype, device_map=DEVICE)
+            loaded_model_outputs = loaded_model(**inputs)
+
+        # Test init method
+        init_model = self.IPEX_MODEL_CLASS(transformers_model)
+        init_model_outputs = init_model(**inputs)
+
+        # Compare tensor outputs
+        self.assertTrue(torch.allclose(outputs.logits, transformers_outputs.logits, atol=5e-2))
+        # To avoid floating point error
+        self.assertTrue(torch.allclose(outputs.logits, loaded_model_outputs.logits, atol=1e-7))
+        self.assertTrue(torch.allclose(outputs.logits, init_model_outputs.logits, atol=1e-7))
+

 class IPEXModelForAudioClassificationTest(unittest.TestCase):
     IPEX_MODEL_CLASS = IPEXModelForAudioClassification

From 757ea8c0372c5bbaae4f3421c7b42677adfa1ebf Mon Sep 17 00:00:00 2001
From: jiqing-feng
Date: Fri, 7 Feb 2025 15:01:07 +0000
Subject: [PATCH 46/58] remove useless device

Signed-off-by: jiqing-feng
---
 optimum/exporters/ipex/modeling_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py
index cb22bc1c22..2c72c1de1c 100755
--- a/optimum/exporters/ipex/modeling_utils.py
+++ b/optimum/exporters/ipex/modeling_utils.py
@@ -33,7 +33,7 @@
 logger = logging.getLogger(__name__)

 _IPEX_MINIMUM_VERSION_FOR_PATCHING = "2.4.0"
-_accelerate_added_attributes = ["to", "cuda", "npu", "xpu", "mlu", "musa"]
+_accelerate_added_attributes = ["to", "xpu"]


 if is_ipex_version("<", _IPEX_MINIMUM_VERSION_FOR_PATCHING):

From 0a6ab0f1d4dfe86398cabb35367e9b22bd59482f Mon Sep 17 00:00:00 2001
From: jiqing-feng
Date: Tue, 11 Feb 2025 16:04:10 +0000
Subject: [PATCH 47/58] update python to 3.10 on test_ipex

Signed-off-by: jiqing-feng
---
 .github/workflows/test_ipex.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml
index 37e8fe162c..0cacd5765d 100644
--- a/.github/workflows/test_ipex.yml
+++ b/.github/workflows/test_ipex.yml
@@ -30,7 +30,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
-          python-version: 3.9
+          python-version: 3.10

       - name: Install dependencies
         run: |

From 8c4884b2bb29728bc25c05edbed44ff9cfc11f75 Mon Sep 17 00:00:00 2001
From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
Date: Tue, 11 Feb 2025 09:43:01 +0100
Subject: [PATCH 48/58] Apply suggestions from code review

---
 .github/workflows/test_ipex.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml
index 0cacd5765d..7b7fa4666e 100644
--- a/.github/workflows/test_ipex.yml
+++ b/.github/workflows/test_ipex.yml
@@ -30,7 +30,7 @@ jobs:
       - name: Setup Python
         uses:
actions/setup-python@v5 with: - python-version: 3.10 + python-version: "3.10" - name: Install dependencies run: | From 7fa23a5a2b01b146f140cb9ca0d4a3e663f7fc9b Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 11 Feb 2025 16:37:23 +0000 Subject: [PATCH 49/58] install autoawq Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 7b7fa4666e..5e3e55b905 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -36,6 +36,7 @@ jobs: run: | pip install --upgrade pip pip install torch==${{ matrix.torch-version }} torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu + pip install autoawq pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }} - name: Install bitsandbytes From 5386bbe1b8c2e9c4977424db4cde6ff9b07a8fb1 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 11 Feb 2025 16:39:17 +0000 Subject: [PATCH 50/58] install wheel Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 5e3e55b905..2ab89d78f2 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -36,7 +36,7 @@ jobs: run: | pip install --upgrade pip pip install torch==${{ matrix.torch-version }} torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu - pip install autoawq + pip install setuptools wheel autoawq pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }} - name: Install bitsandbytes From c5f5d16f4b8cb282ccb3f217f8afa9c50bc6f468 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 11 Feb 2025 16:44:35 +0000 Subject: [PATCH 51/58] fix install autoawq Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 2ab89d78f2..3c757a0492 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -36,7 +36,8 @@ jobs: run: | pip install --upgrade pip pip install torch==${{ matrix.torch-version }} torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu - pip install setuptools wheel autoawq + pip install setuptools wheel + pip install autoawq pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }} - name: Install bitsandbytes From 41513f0d641e00f2a2a7cf0f426a1697caca7cd4 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 11 Feb 2025 16:50:09 +0000 Subject: [PATCH 52/58] rm autoawq Signed-off-by: jiqing-feng --- .github/workflows/test_ipex.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 3c757a0492..7b7fa4666e 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -36,8 +36,6 @@ jobs: run: | pip install --upgrade pip pip install torch==${{ matrix.torch-version }} torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu - pip install setuptools wheel - pip install autoawq pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} 
intel_extension_for_pytorch==${{ matrix.torch-version }} - name: Install bitsandbytes From f73c08dba54a331ff90c82dc8004f6a733e88465 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 12 Feb 2025 09:33:58 +0000 Subject: [PATCH 53/58] fix concat qkv Signed-off-by: jiqing-feng --- optimum/exporters/ipex/modeling_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index 4af1e38d0f..c963459f58 100755 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -831,7 +831,7 @@ def __init__(self, module, device, config) -> None: self.concat_qkv.weight = nn.Parameter(concat_weight) if use_bias: concat_bias = torch.concat(bias_list, 0).contiguous() - self.concat_linear.bias = nn.Parameter(concat_bias) + self.concat_qkv.bias = nn.Parameter(concat_bias) self.q_slice = self.q_proj.weight.shape[0] self.k_slice = self.q_slice + self.k_proj.weight.shape[0] self.v_slice = self.k_slice + self.v_proj.weight.shape[0] From f51777b02b9aec4b637623362d660c86bd4130a2 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 12 Feb 2025 09:37:26 +0000 Subject: [PATCH 54/58] fix format Signed-off-by: jiqing-feng --- optimum/exporters/ipex/modeling_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index c963459f58..6af89c9c07 100755 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -1093,6 +1093,7 @@ def forward(self, hidden_states: torch.Tensor, **kwargs): return outputs + class _IPEXGPT2Block(nn.Module): def __init__(self, module, device, config): super().__init__() From f64b2512d290a2f453f11d8741602196bf20c90d Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 12 Feb 2025 09:50:30 +0000 Subject: [PATCH 55/58] fix qwen patch Signed-off-by: jiqing-feng --- optimum/exporters/ipex/model_patcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py index f7a0fc34ce..506436bba0 100644 --- a/optimum/exporters/ipex/model_patcher.py +++ b/optimum/exporters/ipex/model_patcher.py @@ -128,7 +128,7 @@ def _patch_qwen2_model(model): """ convert_functions(model, Qwen2Model, "forward", _qwen2_model_forward) convert_functions(model, Qwen2RMSNorm, "forward", _ipex_rms_layer_norm_forward) - convert_class(model, Qwen2DecoderLayer, _IPEXQwen2DecoderLayer, model.config) + convert_class(model, Qwen2DecoderLayer, _IPEXQwen2DecoderLayer, model.device, model.config) return model From 778bf15f123a66b31f87af2596446eeb6b34ac9b Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 12 Feb 2025 10:08:57 +0000 Subject: [PATCH 56/58] fix bias Signed-off-by: jiqing-feng --- optimum/exporters/ipex/modeling_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index 6af89c9c07..8a3045ae1e 100755 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -825,7 +825,7 @@ def __init__(self, module, device, config) -> None: super().__init__(module, device, config) if getattr(config, "quantization_config", None) is None: concat_weight = torch.concat([self.q_proj.weight, self.k_proj.weight, self.v_proj.weight]).contiguous() - bias_list = [bias for bias in [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias] if bias] + bias_list = [bias 
for bias in [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias] if bias is not None] use_bias = bias_list != [] self.concat_qkv = nn.Linear(concat_weight.shape[1], concat_weight.shape[0], bias=use_bias) self.concat_qkv.weight = nn.Parameter(concat_weight) From 6ba1895e1a131a42aea58193016cd10b95494c7d Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 12 Feb 2025 11:02:55 +0000 Subject: [PATCH 57/58] rm autoawq test Signed-off-by: jiqing-feng --- tests/ipex/test_modeling.py | 43 ------------------------------------- 1 file changed, 43 deletions(-) diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index b7d66cbe06..f0c0f52d72 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -444,49 +444,6 @@ def test_patched_model(self, model_arch): ) self.assertTrue(torch.allclose(ipex_outputs.logits[0], exported_outputs.logits[0], atol=1e-7)) - @unittest.skipIf(not is_auto_awq_available(), reason="Test requires autoawq") - def test_awq(self): - model_id = "PrunaAI/JackFram-llama-68m-AWQ-4bit-smashed" - set_seed(SEED) - dtype = torch.float16 if IS_XPU_AVAILABLE else torch.float32 - quantization_config = AwqConfig(version="ipex") - # Test model forward do not need cache. - ipex_model = IPEXModelForCausalLM.from_pretrained( - model_id, torch_dtype=dtype, device_map=DEVICE, quantization_config=quantization_config - ) - self.assertIsInstance(ipex_model.config, PretrainedConfig) - tokenizer = AutoTokenizer.from_pretrained(model_id) - tokens = tokenizer( - "This is a sample", - return_tensors="pt", - return_token_type_ids=False, - ).to(DEVICE) - inputs = ipex_model.prepare_inputs_for_generation(**tokens) - outputs = ipex_model(**inputs) - - self.assertIsInstance(outputs.logits, torch.Tensor) - - transformers_model = AutoModelForCausalLM.from_pretrained( - model_id, torch_dtype=dtype, device_map=DEVICE, quantization_config=quantization_config - ) - with torch.no_grad(): - transformers_outputs = transformers_model(**tokens) - - # Test re-load model - with tempfile.TemporaryDirectory() as tmpdirname: - ipex_model.save_pretrained(tmpdirname) - loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname, torch_dtype=dtype, device_map=DEVICE) - loaded_model_outputs = loaded_model(**inputs) - - # Test init method - init_model = self.IPEX_MODEL_CLASS(transformers_model) - init_model_outputs = init_model(**inputs) - - # Compare tensor outputs - self.assertTrue(torch.allclose(outputs.logits, transformers_outputs.logits, atol=5e-2)) - # To avoid float pointing error - self.assertTrue(torch.allclose(outputs.logits, loaded_model_outputs.logits, atol=1e-7)) - self.assertTrue(torch.allclose(outputs.logits, init_model_outputs.logits, atol=1e-7)) @unittest.skipIf(not is_bitsandbytes_available(), reason="Test requires bitsandbytes") def test_bnb(self): From 88dba292e66ad12f59dbd08944258c591aa5ff17 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 12 Feb 2025 11:04:28 +0000 Subject: [PATCH 58/58] fix style Signed-off-by: jiqing-feng --- tests/ipex/test_modeling.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index f0c0f52d72..79ab8329d7 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -444,7 +444,6 @@ def test_patched_model(self, model_arch): ) self.assertTrue(torch.allclose(ipex_outputs.logits[0], exported_outputs.logits[0], atol=1e-7)) - @unittest.skipIf(not is_bitsandbytes_available(), reason="Test requires bitsandbytes") def test_bnb(self): model_id = 
"PrunaAI/JackFram-llama-68m-bnb-4bit-smashed"