Enable quant model support #1074
Changes from all commits
```diff
@@ -14,7 +14,7 @@
 from transformers.models.bert.modeling_bert import BertIntermediate
 from transformers.models.falcon.modeling_falcon import FalconDecoderLayer, FalconModel
-from transformers.models.gpt2.modeling_gpt2 import GPT2MLP, GPT2Attention, GPT2Block, GPT2Model
+from transformers.models.gpt2.modeling_gpt2 import GPT2Block, GPT2Model
 from transformers.models.llama.modeling_llama import (
     LlamaDecoderLayer,
     LlamaModel,
@@ -32,13 +32,11 @@
 from .modeling_utils import (
     _IPEX_MINIMUM_VERSION_FOR_PATCHING,
-    _IPEXGPT2MLP,
     _falcon_model_forward,
-    _gpt2_block_forward,
     _gpt2_model_forward,
     _ipex_rms_layer_norm_forward,
     _IPEXFalconDecoderLayer,
-    _IPEXGPT2Attention,
+    _IPEXGPT2Block,
     _IPEXIntermediate,
     _IPEXLlamaDecoderLayer,
     _IPEXQwen2DecoderLayer,
```
```diff
@@ -66,12 +64,12 @@ def convert_functions(m, target_m, new_function_name, new_function):
         convert_functions(sub_m, target_m, new_function_name, new_function)
 
 
-def convert_class(m, target_m, new_class, config=None):
+def convert_class(m, target_m, new_class, device, config):
     for name, sub_m in m.named_children():
         if isinstance(sub_m, target_m):
-            new_m = new_class(sub_m, config)
+            new_m = new_class(sub_m, device, config)
             setattr(m, name, new_m)
-        convert_class(sub_m, target_m, new_class, config)
+        convert_class(sub_m, target_m, new_class, device, config)
 
 
 def patch_op(m, target_m, new_op_name, new_op):
```
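For reference, the new signature means every wrapper class is now constructed from the original submodule, the model's device, and the model config. Below is a minimal, self-contained sketch of the recursion in action; `ToyLayer` and `WrappedToyLayer` are made-up stand-ins for the real `_IPEX*` classes, not code from this PR:

```python
import torch
from torch import nn


def convert_class(m, target_m, new_class, device, config):
    # Same logic as the helper in the diff above: recursively replace every
    # instance of `target_m` with `new_class(sub_m, device, config)`.
    for name, sub_m in m.named_children():
        if isinstance(sub_m, target_m):
            new_m = new_class(sub_m, device, config)
            setattr(m, name, new_m)
        convert_class(sub_m, target_m, new_class, device, config)


class ToyLayer(nn.Module):
    def __init__(self, hidden=8):
        super().__init__()
        self.linear = nn.Linear(hidden, hidden)

    def forward(self, x):
        return self.linear(x)


class WrappedToyLayer(nn.Module):
    # Stand-in for an _IPEX* wrapper: it now receives the device explicitly
    # in addition to the original module and the model config.
    def __init__(self, module, device, config):
        super().__init__()
        self.linear = module.linear.to(device)

    def forward(self, x):
        return self.linear(x)


model = nn.Sequential(ToyLayer(), ToyLayer())
convert_class(model, ToyLayer, WrappedToyLayer, torch.device("cpu"), config=None)
print(model)  # both ToyLayer children have been replaced by WrappedToyLayer
```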
```diff
@@ -89,7 +87,7 @@ def _patch_llama_model(model):
     """
     convert_functions(model, LlamaModel, "forward", _llama_model_forward)
     convert_functions(model, LlamaRMSNorm, "forward", _ipex_rms_layer_norm_forward)
-    convert_class(model, LlamaDecoderLayer, _IPEXLlamaDecoderLayer, model.config)
+    convert_class(model, LlamaDecoderLayer, _IPEXLlamaDecoderLayer, model.device, model.config)
     return model
```
```diff
@@ -105,21 +103,20 @@ def _patch_falcon_model(model):
     setattr(model.config, "num_key_value_heads", num_key_value_heads)
     convert_functions(model, FalconModel, "forward", _falcon_model_forward)
     replace_customized_linear_with_linear(model)
-    convert_class(model, FalconDecoderLayer, _IPEXFalconDecoderLayer, model.config)
+    convert_class(model, FalconDecoderLayer, _IPEXFalconDecoderLayer, model.device, model.config)
     return model
 
 
 def _patch_gpt2_model(model):
     """
     Patch gpt2 model:
     1. Use IPEX paged attention
     2. Linear fusion with (Linear + Add)
     """
     num_key_value_heads = model.config.num_attention_heads
     setattr(model.config, "num_key_value_heads", num_key_value_heads)
     convert_functions(model, GPT2Model, "forward", _gpt2_model_forward)
-    convert_functions(model, GPT2Block, "forward", _gpt2_block_forward)
-    convert_class(model, GPT2Attention, _IPEXGPT2Attention, model.config)
-    convert_class(model, GPT2MLP, _IPEXGPT2MLP, model.config)
+    convert_class(model, GPT2Block, _IPEXGPT2Block, model.device, model.config)
     return model
```

Review comment (on lines -120 to +119): Why are the MLP and attention no longer patched here?

Reply: Because they are now handled inside `_IPEXGPT2Block`.
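To make that reply concrete: a block-level wrapper can take over (or rebuild) its attention and MLP submodules when it is constructed, so separate `convert_class` passes for `GPT2Attention` and `GPT2MLP` become redundant. The classes below are purely illustrative toys, not the actual `_IPEXGPT2Block` implementation:

```python
from torch import nn


class ToyAttention(nn.Module):
    def __init__(self, hidden=8):
        super().__init__()
        self.proj = nn.Linear(hidden, hidden)

    def forward(self, x):
        return self.proj(x)


class ToyMLP(nn.Module):
    def __init__(self, hidden=8):
        super().__init__()
        self.fc = nn.Linear(hidden, hidden)

    def forward(self, x):
        return self.fc(x)


class ToyBlock(nn.Module):
    def __init__(self, hidden=8):
        super().__init__()
        self.attn = ToyAttention(hidden)
        self.mlp = ToyMLP(hidden)

    def forward(self, x):
        return x + self.mlp(x + self.attn(x))


class ToyBlockWrapper(nn.Module):
    """Illustrative stand-in for a block-level wrapper like _IPEXGPT2Block."""

    def __init__(self, block, device, config=None):
        super().__init__()
        # The block wrapper takes ownership of its attention and MLP submodules
        # (a real implementation would build fused IPEX variants here), which is
        # why the attention and MLP no longer need their own convert_class calls.
        self.attn = block.attn.to(device)
        self.mlp = block.mlp.to(device)

    def forward(self, x):
        return x + self.mlp(x + self.attn(x))
```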
```diff
@@ -131,7 +128,7 @@ def _patch_qwen2_model(model):
     """
     convert_functions(model, Qwen2Model, "forward", _qwen2_model_forward)
     convert_functions(model, Qwen2RMSNorm, "forward", _ipex_rms_layer_norm_forward)
-    convert_class(model, Qwen2DecoderLayer, _IPEXQwen2DecoderLayer, model.config)
+    convert_class(model, Qwen2DecoderLayer, _IPEXQwen2DecoderLayer, model.device, model.config)
     return model
@@ -140,7 +137,7 @@ def _patch_bert_model(model):
     Patch bert model:
     1. Linear fusion with Linear + Gelu
     """
-    convert_class(model, BertIntermediate, _IPEXIntermediate)
+    convert_class(model, BertIntermediate, _IPEXIntermediate, model.device, model.config)
     return model
@@ -149,7 +146,7 @@ def _patch_vit_model(model):
     Patch vit model:
     1. Linear fusion with Linear + Gelu
     """
-    convert_class(model, ViTIntermediate, _IPEXIntermediate)
+    convert_class(model, ViTIntermediate, _IPEXIntermediate, model.device, model.config)
     return model
```
Large diffs are not rendered by default, so the rest of this PR's changes are not shown here.
Review comment: No luck with autoawq?

Reply: Yes, the autoawq installation has some issues; I will figure out why this environment cannot install autoawq. The tests passed in my local environment, which has autoawq installed.
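For context, the kind of quantized-model flow such tests exercise might look roughly like the following; the checkpoint id is a placeholder and the exact test setup is an assumption, not part of this PR:

```python
# Sketch only: assumes autoawq is installed and that the placeholder
# checkpoint id below points to an AWQ-quantized causal LM.
from transformers import AutoTokenizer
from optimum.intel import IPEXModelForCausalLM

model_id = "example-org/example-llama-awq"  # hypothetical quantized checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = IPEXModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer("Hello, IPEX!", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```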