
Commit 3b8900d

Merge branch 'ipex-cpu' into ipex-xpu
2 parents: 872a3eb + d1d0ca0

25 files changed (+2152, -319 lines)

.github/workflows/test_inc.yml (+3, -3)
@@ -32,7 +32,7 @@ jobs:
           python -m pip install --upgrade pip
           pip install cmake
           pip install py-cpuinfo
-          pip install torch==2.2 torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cpu
           pip install .[neural-compressor,diffusers,tests]
           pip install intel-extension-for-transformers
           pip install peft
@@ -43,7 +43,7 @@ jobs:
       - name: Test IPEX
         run: |
           pip uninstall -y intel-extension-for-transformers
-          pip install torch==2.1.0 torchaudio==2.1.0 torchvision==0.16 --extra-index-url https://download.pytorch.org/whl/cpu
-          pip install intel-extension-for-pytorch==2.1.100
+          pip install torch==2.3.0 torchaudio==2.3.0 torchvision==0.18 --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install intel-extension-for-pytorch==2.3.0
           pytest tests/neural_compressor/test_ipex.py


.github/workflows/test_ipex.yml (+1, -1)
@@ -30,7 +30,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install torch==2.2 torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install torch torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu
           pip install .[ipex,tests]
       - name: Test with Pytest
         run: |

optimum/commands/export/openvino.py (+28, -3)
@@ -119,6 +119,15 @@ def parse_args_openvino(parser: "ArgumentParser"):
             "or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models."
         ),
     )
+    optional_group.add_argument(
+        "--all-layers",
+        action="store_true",
+        default=None,
+        help=(
+            "Whether embeddings and last MatMul layers should be compressed to INT4. If not provided, they are "
+            "compressed to INT8 during weight compression."
+        ),
+    )
     optional_group.add_argument(
         "--disable-stateful",
         action="store_true",
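Worth noting: `action="store_true"` combined with `default=None` makes `--all-layers` tri-state, so downstream code can tell "flag never passed" (None) apart from an explicit opt-in (True); the default-config check in the next hunk relies on exactly that. A minimal standalone sketch using only the standard library (names ours):

import argparse

# Tri-state CLI flag: None when omitted, True when passed;
# there is no False state reachable from the command line.
parser = argparse.ArgumentParser()
parser.add_argument("--all-layers", action="store_true", default=None)

assert parser.parse_args([]).all_layers is None             # omitted
assert parser.parse_args(["--all-layers"]).all_layers is True  # passed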
@@ -198,6 +207,7 @@ def run(self):
                 and self.args.ratio is None
                 and self.args.group_size is None
                 and self.args.sym is None
+                and self.args.all_layers is None
                 and self.args.model in _DEFAULT_4BIT_CONFIGS
             ):
                 quantization_config = _DEFAULT_4BIT_CONFIGS[self.args.model]
@@ -207,6 +217,7 @@ def run(self):
                 "ratio": 1 if is_int8 else (self.args.ratio or 0.8),
                 "sym": self.args.sym or False,
                 "group_size": -1 if is_int8 else self.args.group_size,
+                "all_layers": None if is_int8 else self.args.all_layers,
             }

         if self.args.weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}:
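Worked through by hand (a sketch covering only the keys visible in this hunk; `is_int8` presumably reflects `--weight-format int8`), the mapping above gives:

# --weight-format int8, no other flags passed:
quantization_config = {"ratio": 1, "sym": False, "group_size": -1, "all_layers": None}

# an int4 weight format with no explicit --ratio/--sym/--group-size/--all-layers:
quantization_config = {"ratio": 0.8, "sym": False, "group_size": None, "all_layers": None}

For int4 the group size is left as None at this point; the `int4_*_g64` / `int4_*_g128` branch that follows presumably fills it in from the format name.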
@@ -226,6 +237,9 @@ def run(self):
             )
             library_name = "transformers"

+        if self.args.convert_tokenizer:
+            logger.warning("`--convert-tokenizer` option is deprecated. Tokenizer will be converted by default.")
+
         if (
             library_name == "diffusers"
             and ov_config
and ov_config
@@ -261,10 +275,21 @@ def run(self):
261275
)
262276
model.save_pretrained(self.args.output)
263277

264-
else:
265-
if self.args.convert_tokenizer:
266-
logger.warning("`--convert-tokenizer` option is deprecated. Tokenizer will be converted by default.")
278+
if self.args.disable_convert_tokenizer:
279+
return
280+
281+
# avoid import when using other exporters (IPEX, INC)
282+
from ...exporters.openvino.convert import export_tokenizer
267283

284+
output = Path(self.args.output)
285+
tokenizer = getattr(model, "tokenizer", None)
286+
if tokenizer is not None:
287+
export_tokenizer(tokenizer, output / "tokenizer")
288+
289+
tokenizer_2 = getattr(model, "tokenizer_2", None)
290+
if tokenizer_2 is not None:
291+
export_tokenizer(tokenizer_2, output / "tokenizer_2")
292+
else:
268293
# TODO : add input shapes
269294
main_export(
270295
model_name_or_path=self.args.model,
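The tokenizer block above reduces to: unless `--disable-convert-tokenizer` is passed, export whichever of `tokenizer` / `tokenizer_2` the pipeline carries (some diffusion pipelines, e.g. SDXL, have two). A self-contained sketch of that pattern, with `export_tokenizer` injected as a parameter rather than imported (our simplification, not the command's actual signature):

from pathlib import Path

def export_attached_tokenizers(model, output_dir, export_tokenizer, disabled=False):
    # Mirrors the flow in the diff: bail out when disabled, otherwise export
    # each tokenizer attribute that exists into its own subfolder.
    if disabled:
        return
    output = Path(output_dir)
    for name in ("tokenizer", "tokenizer_2"):
        tok = getattr(model, name, None)
        if tok is not None:
            export_tokenizer(tok, output / name)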

optimum/exporters/ipex/model_patcher.py (+5, -28)
@@ -13,7 +13,6 @@
 # limitations under the License.

 from transformers.models.llama.modeling_llama import (
-    LlamaAttention,
     LlamaDecoderLayer,
     LlamaForCausalLM,
     LlamaModel,
@@ -24,7 +23,6 @@

 from .modeling_utils import (
     _IPEXLlamaDecoderLayerRef,
-    _llama_attn_forward,
     _llama_layer_norm_forward,
     _llama_model_forward,
 )
@@ -63,34 +61,13 @@ def patch_op(m, target_m, new_op_name, new_op):


 def _patch_llama_model(model):
-
     ipex_version = "2.1.0" if "xpu" in str(model.device) else "2.5.0"
     if is_ipex_version("<", ipex_version):
-        raise ImportError(f"Only ipex version >= {ipex_version} supports RotaryEmbedding and IndirectAccessKVCache")
-
-    if "cpu" in str(model.device):
-        from intel_extension_for_pytorch.llm.modules import RotaryEmbedding
-        from intel_extension_for_pytorch.llm.modules import IndirectAccessKVCache
-
-        ipex_rope = RotaryEmbedding(
-            model.config.max_position_embeddings,
-            model.config.hidden_size // model.config.num_attention_heads,
-            model.config.rope_theta,
-            model.config.architectures[0],
-        )
-        ipex_scale_dot_product = IndirectAccessKVCache(text_max_length=model.config.max_position_embeddings)
-
-        patch_op(model, LlamaAttention, "ipex_rope", ipex_rope)
-        patch_op(model, LlamaAttention, "ipex_scale_dot_product", ipex_scale_dot_product)
-
-        convert_functions(model, LlamaModel, "forward", _llama_model_forward)
-        convert_functions(model, LlamaAttention, "forward", _llama_attn_forward)
-        convert_functions(model, LlamaRMSNorm, "forward", _llama_layer_norm_forward)
-
-        convert_class(model, LlamaDecoderLayer, _IPEXLlamaDecoderLayerRef, model.config)
-    else:
-        convert_class(model, LlamaDecoderLayer, _IPEXLlamaDecoderLayer, model.config)
-        convert_functions(model, LlamaModel, "forward", _llama_model_forward)
+        raise ImportError(f"Only ipex version >= {ipex_version} supports llama model patching")
+
+    convert_functions(model, LlamaModel, "forward", _llama_model_forward)
+    convert_functions(model, LlamaRMSNorm, "forward", _llama_layer_norm_forward)
+    convert_class(model, LlamaDecoderLayer, _IPEXLlamaDecoderLayerRef, model.config)
     return model

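`convert_functions` and `convert_class` are defined elsewhere in model_patcher.py and are not part of this diff; judging from the call sites, `convert_functions` walks the module tree and rebinds a method on every instance of the target class. A rough sketch of that rebinding pattern (our guess at the shape, not the file's actual implementation):

import torch

def convert_functions(model: torch.nn.Module, target_cls, method_name, new_fn):
    # Rebind `method_name` on every submodule that is an instance of
    # `target_cls`, e.g. LlamaModel.forward -> _llama_model_forward.
    # new_fn.__get__(module, ...) binds the plain function as a method.
    for module in model.modules():
        if isinstance(module, target_cls):
            setattr(module, method_name, new_fn.__get__(module, type(module)))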