
Commit f431f7b

Merge branch 'test' of https://github.com/NoushNabi/optimum-intel into test

2 parents: 4c98dc4 + 444860d

24 files changed: +742 -117 lines

.github/workflows/test_openvino.yml (+1 -1)

```diff
@@ -32,7 +32,7 @@ jobs:
           python -m pip install --upgrade pip
           # install PyTorch CPU version to avoid installing CUDA packages on GitHub runner without GPU
           pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          pip install .[openvino,openvino-tokenizers,nncf,tests,diffusers]
+          pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime
       - name: Test with Pytest
         run: |
           pytest tests/openvino/ --ignore test_modeling_basic
```

docs/source/optimization_ov.mdx (+11 -1)

````diff
@@ -82,7 +82,17 @@ from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig
 
 model = OVModelForCausalLM.from_pretrained(
     model_id,
-    export=True,
+    quantization_config=OVWeightQuantizationConfig(bits=4),
+)
+```
+
+You can tune the quantization parameters to achieve a better performance/accuracy trade-off as follows:
+
+```python
+from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig
+
+model = OVModelForCausalLM.from_pretrained(
+    model_id,
     quantization_config=OVWeightQuantizationConfig(bits=4, sym=False, ratio=0.8, dataset="ptb"),
 )
 ```
````
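For orientation, the sketch below exercises the documented API end to end. It is an illustration, not part of the commit: the checkpoint name and output directory are placeholders, and the exact behavior follows the updated docs above.

```python
# A minimal sketch of 4-bit weight-only quantization with optimum-intel.
# Assumptions: "gpt2" and "gpt2-ov-int4" are placeholder names.
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

model = OVModelForCausalLM.from_pretrained(
    "gpt2",
    quantization_config=OVWeightQuantizationConfig(bits=4),
)
model.save_pretrained("gpt2-ov-int4")  # persist the quantized OpenVINO IR for reuse
```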

examples/openvino/image-classification/run_image_classification.py (+8 -10)

```diff
@@ -151,12 +151,12 @@ class ModelArguments:
         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
     )
     feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
-    use_auth_token: bool = field(
-        default=False,
+    token: str = field(
+        default=None,
         metadata={
             "help": (
-                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
-                "with private models)."
+                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
+                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
             )
         },
     )
@@ -239,8 +239,7 @@ def main():
             data_args.dataset_name,
             data_args.dataset_config_name,
             cache_dir=model_args.cache_dir,
-            task="image-classification",
-            use_auth_token=True if model_args.use_auth_token else None,
+            token=model_args.token,
         )
     else:
         data_files = {}
@@ -252,7 +251,6 @@ def main():
             "imagefolder",
             data_files=data_files,
             cache_dir=model_args.cache_dir,
-            task="image-classification",
         )
 
     # If we don't have a validation split, split off a percentage of train as validation.
@@ -287,15 +285,15 @@ def compute_metrics(p):
         finetuning_task="image-classification",
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
+        token=model_args.token,
     )
     model = AutoModelForImageClassification.from_pretrained(
         model_args.model_name_or_path,
         from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
+        token=model_args.token,
         ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
     )
 
@@ -311,7 +309,7 @@ def compute_metrics(p):
         model_args.feature_extractor_name or model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
+        token=model_args.token,
     )
 
     # Define torchvision transforms to be applied to each image.
```
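The practical effect of the `use_auth_token` to `token` migration is that a string (or `None`) flows straight through to `from_pretrained` instead of a boolean flag. A minimal standalone sketch, assuming a hypothetical private checkpoint and a placeholder token value:

```python
# Sketch of the new calling convention; both names below are placeholders.
from transformers import AutoModelForImageClassification

model = AutoModelForImageClassification.from_pretrained(
    "your-org/private-vit",  # hypothetical private checkpoint
    token="hf_xxx",  # with token=None, the cached `huggingface-cli login` token is used
)
```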

optimum/exporters/ipex/__init__.py

Whitespace-only changes.

+91 (new file, shown in full below)
```python
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from transformers.models.llama.modeling_llama import (
    LlamaAttention,
    LlamaDecoderLayer,
    LlamaForCausalLM,
    LlamaModel,
    LlamaRMSNorm,
)

from optimum.intel.utils.import_utils import is_ipex_version

from .modeling_utils import (
    _IPEXLlamaDecoderLayerRef,
    _llama_attn_forward,
    _llama_layer_norm_forward,
    _llama_model_forward,
)


_IPEX_EXPORTED_ARCH = ("LlamaForCausalLM",)
_IPEX_EXPORTED_TASK = ("text-generation",)


def convert_func(m, func_name, new_function):
    # Bind `new_function` to the instance `m` so it replaces the method `func_name`.
    bound_method = new_function.__get__(m, m.__class__)
    setattr(m, func_name, bound_method)


def convert_functions(m, target_m, new_function_name, new_function):
    # Recursively walk submodules and rebind the method on every instance of `target_m`.
    for _, sub_m in m.named_children():
        if isinstance(sub_m, target_m):
            convert_func(sub_m, new_function_name, new_function)
        convert_functions(sub_m, target_m, new_function_name, new_function)


def convert_class(m, target_m, new_class, config, distributed=False):
    # Recursively replace every instance of `target_m` with a wrapper built by `new_class`.
    for name, sub_m in m.named_children():
        if isinstance(sub_m, target_m):
            new_m = new_class(sub_m, config, distributed)
            setattr(m, name, new_m)
        convert_class(sub_m, target_m, new_class, config, distributed)


def patch_op(m, target_m, new_op_name, new_op):
    # Recursively attach `new_op` as an attribute on every instance of `target_m`.
    for name, sub_m in m.named_children():
        if isinstance(sub_m, target_m):
            setattr(sub_m, new_op_name, new_op)
        patch_op(sub_m, target_m, new_op_name, new_op)


def _patch_llama_model(model):
    # Note: the original hunk checked `< "2.5.0"` while the message said "> 2.3.0";
    # the check and message are aligned to 2.3.0 here.
    if is_ipex_version("<", "2.3.0"):
        raise ImportError("Only ipex version >= 2.3.0 supports RotaryEmbedding and IndirectAccessKVCache")

    from intel_extension_for_pytorch.llm.modules import IndirectAccessKVCache, RotaryEmbedding

    ipex_rope = RotaryEmbedding(
        model.config.max_position_embeddings,
        model.config.hidden_size // model.config.num_attention_heads,
        model.config.rope_theta,
        model.config.architectures[0],
    )
    ipex_scale_dot_product = IndirectAccessKVCache(text_max_length=model.config.max_position_embeddings)
    patch_op(model, LlamaAttention, "ipex_rope", ipex_rope)
    patch_op(model, LlamaAttention, "ipex_scale_dot_product", ipex_scale_dot_product)

    convert_functions(model, LlamaModel, "forward", _llama_model_forward)
    convert_functions(model, LlamaAttention, "forward", _llama_attn_forward)
    convert_functions(model, LlamaRMSNorm, "forward", _llama_layer_norm_forward)

    convert_class(model, LlamaDecoderLayer, _IPEXLlamaDecoderLayerRef, model.config)
    return model


def _patch_model(model):
    if isinstance(model, LlamaForCausalLM):
        model = _patch_llama_model(model)
    return model
```
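For orientation, here is a minimal sketch of how this patcher would be driven, assuming an IPEX build that passes the version check; the checkpoint name is a placeholder, not something this commit prescribes:

```python
# Hypothetical usage of the patcher above.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16)
model = _patch_model(model)  # LlamaForCausalLM gets IPEX-patched forwards and decoder layers
```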
