
Commit f3b8ce8

Merge branch 'main' into openvino_tokenizers
2 parents 09b067f + 1c14957 · commit f3b8ce8

30 files changed: +1006 −489 lines changed

README.md

+2 −2

@@ -78,10 +78,10 @@ It is possible to export your model to the [OpenVINO](https://docs.openvino.ai/2
 optimum-cli export openvino --model gpt2 ov_model
 ```

-If you add `--int8`, the model linear and embedding weights will be quantized to INT8, the activations will be kept in floating point precision.
+You can also apply 8-bit weight-only quantization when exporting your model: the model linear and embedding weights will be quantized to INT8, the activations will be kept in floating point precision.

 ```plain
-optimum-cli export openvino --model gpt2 --int8 ov_model
+optimum-cli export openvino --model gpt2 --weight-format int8 ov_model
 ```

 To apply quantization on both weights and activations, you can find more information in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov).
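For context, a minimal sketch of how a model exported with the new `--weight-format int8` flag could be loaded back for inference. The `ov_model` directory and `gpt2` checkpoint follow the README example; the prompt and generation settings are illustrative.

```python
# Hedged sketch: load the INT8 export produced by
# `optimum-cli export openvino --model gpt2 --weight-format int8 ov_model`.
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer

model = OVModelForCausalLM.from_pretrained("ov_model")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

inputs = tokenizer("Hello, my name is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)  # illustrative settings
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```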

optimum/exporters/openvino/convert.py

+2 −20

@@ -33,14 +33,7 @@
 from optimum.exporters.onnx.model_patcher import DecoderModelPatcher
 from optimum.utils import is_diffusers_available

-from ...intel.utils.import_utils import (
-    _torch_version,
-    _transformers_version,
-    is_nncf_available,
-    is_optimum_version,
-    is_torch_version,
-    is_transformers_version,
-)
+from ...intel.utils.import_utils import is_nncf_available, is_optimum_version
 from .model_patcher import patch_model_with_bettertransformer
 from .stateful import ensure_stateful_is_available, patch_stateful
 from .utils import (
@@ -97,6 +90,7 @@ def _save_model(model, path: str, compression_option: Optional[str] = None, comp
                 "ratio": compression_ratio,
             },
         }
+
         model = nncf.compress_weights(model, **COMPRESSION_OPTIONS[compression_option])

     compress_to_fp16 = compression_option == "fp16"
@@ -332,18 +326,6 @@ def export_pytorch(
     output = Path(output)

     if stateful:
-        if is_transformers_version("<", "4.36") or is_torch_version("<", "2.1.1"):
-            COLOR_RED = "\033[1;31m"
-            COLOR_RESET = "\033[0m"
-            logger.warning(
-                COLOR_RED
-                + "[WARNING] For good performance with stateful models, transformers>=4.36.2 and PyTorch>=2.1.1 are required. "
-                f"This Python environment has Transformers {_transformers_version} and PyTorch {_torch_version}. "
-                "Consider upgrading PyTorch and Transformers, for example by running "
-                "`pip install --upgrade --upgrade-strategy eager optimum[openvino,nncf]`, and export the model again"
-                + COLOR_RESET
-            )
-
         # Trigger bettertransformer together with stateful model because OpenVINO HW-dependent transformations expect
         # both of them are applied to demonstrate the best performance.
         # TODO: Consider applying bettertransformer regardless of stateful flag -- requires additional validation.
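The `_save_model` hunk above routes weight compression through `nncf.compress_weights`. As a standalone sketch, assuming `nncf` and `openvino` are installed, the same call looks like the following; the options dict here is illustrative, not the `COMPRESSION_OPTIONS` table defined in `convert.py`.

```python
# Illustrative standalone weight compression with NNCF; the mode mapping
# below is an assumption, not the COMPRESSION_OPTIONS table from convert.py.
import nncf
import openvino as ov

options = {"int8": {"mode": nncf.CompressWeightsMode.INT8}}

core = ov.Core()
model = core.read_model("model.xml")  # hypothetical exported IR
model = nncf.compress_weights(model, **options["int8"])
ov.save_model(model, "model_int8.xml", compress_to_fp16=False)
```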

optimum/exporters/openvino/model_patcher.py

+20 −5

@@ -14,16 +14,31 @@

 import logging as log

-from optimum.intel.utils.import_utils import is_torch_version
+from optimum.intel.utils.import_utils import (
+    _torch_version,
+    _transformers_version,
+    is_torch_version,
+    is_transformers_version,
+)


 def patch_model_with_bettertransformer(model):
-    if is_torch_version("<", "2.0"):
+    # check that the model has not yet been patched
+    if hasattr(model, "use_bettertransformer") and model.use_bettertransformer is True:
+        return model
+
+    if is_transformers_version("<", "4.36") or is_torch_version("<", "2.1.1"):
+        COLOR_RED = "\033[1;31m"
+        COLOR_RESET = "\033[0m"
         log.warn(
-            "integration Scaled Dot Product Attention optimization supported only with torch > 2.0."
-            "Usage model with stateful=True may be non-effective if model does not contain torch.functional.scaled_dot_product_attention"
-            "It is recommended to upgrade PyTorch version for using stateful model or use stateful=False"
+            COLOR_RED
+            + "[WARNING] For good performance with stateful models, transformers>=4.36.2 and PyTorch>=2.1.1 are required. "
+            f"This Python environment has Transformers {_transformers_version} and PyTorch {_torch_version}. "
+            "Consider upgrading PyTorch and Transformers, for example by running "
+            "`pip install --upgrade --upgrade-strategy eager optimum[openvino,nncf]`, and export the model again"
+            + COLOR_RESET
         )
+
     # model already has required SDPA implementation
     if getattr(model, "_supports_sdpa", False) and getattr(model.config, "_attn_implementation", "eager") == "sdpa":
         return model
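For reference, a hedged sketch of the conversion that `patch_model_with_bettertransformer` guards: `to_bettertransformer()` is the public transformers API (it requires optimum installed), and the attribute check mirrors the new idempotency guard above. The fallback handling is illustrative.

```python
# Hedged sketch of applying BetterTransformer once, mirroring the new guard.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")

if not getattr(model, "use_bettertransformer", False):
    try:
        model = model.to_bettertransformer()
    except Exception as exc:  # unsupported architectures raise here
        print(f"BetterTransformer not applied, keeping eager attention: {exc}")
```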

optimum/exporters/openvino/stateful.py

+1 −4

@@ -22,7 +22,6 @@
 from openvino.runtime import opset13
 from optimum.exporters import TasksManager
 from optimum.intel.utils.import_utils import _openvino_version, is_openvino_version
-from optimum.utils.normalized_config import NormalizedConfigManager


 def model_has_state(ov_model: ov.Model):
@@ -217,9 +216,7 @@ def patch_stateful(config: PretrainedConfig, ov_model: ov.Model):
     batch_dim = 1 if config.model_type == "chatglm" else 0

     fuse_cache_reorder(ov_model, not_kv_inputs, key_value_input_names, batch_dim)
-
-    normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config)
-    num_attention_heads = normalized_config.num_attention_heads if config.model_type == "bloom" else 1
+    num_attention_heads = config.num_attention_heads if config.model_type == "bloom" else 1
     make_stateful(
         ov_model, not_kv_inputs, key_value_input_names, key_value_output_names, batch_dim, num_attention_heads, None
     )
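The replacement works because `config.num_attention_heads` resolves directly on the Bloom config (transformers aliases it to `n_head` via the config's `attribute_map`). A defensive variant, shown purely as a hypothetical sketch and not part of this commit, could fall back across common aliases:

```python
# Hypothetical helper, not part of this commit: resolve the head count
# across the attribute names different configs use.
def get_num_attention_heads(config) -> int:
    for name in ("num_attention_heads", "n_head", "n_heads"):
        if hasattr(config, name):
            return getattr(config, name)
    raise AttributeError("config exposes no attention-head attribute")
```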

optimum/intel/__init__.py

+18 −4

@@ -48,9 +48,11 @@
     "IPEXModelForMaskedLM",
     "IPEXModelForTokenClassification",
     "IPEXModelForQuestionAnswering",
+    "IPEXModelForImageClassification",
+    "IPEXModelForAudioClassification",
+    "IPEXModel",
 ]

-
 try:
     if not (is_openvino_available() and is_nncf_available()):
         raise OptionalDependencyNotAvailable()
@@ -60,9 +62,12 @@
         "OVQuantizer",
         "OVTrainer",
         "OVTrainingArguments",
+        "OVWeightQuantizationConfig",
     ]
 else:
-    _import_structure["openvino"].extend(["OVConfig", "OVQuantizer", "OVTrainer", "OVTrainingArguments"])
+    _import_structure["openvino"].extend(
+        ["OVConfig", "OVQuantizer", "OVTrainer", "OVTrainingArguments", "OVWeightQuantizationConfig"]
+    )

 try:
     if not (is_openvino_available() and is_diffusers_available()):
@@ -159,7 +164,10 @@
     from .utils.dummy_ipex_objects import *
 else:
     from .ipex import (
+        IPEXModel,
+        IPEXModelForAudioClassification,
         IPEXModelForCausalLM,
+        IPEXModelForImageClassification,
         IPEXModelForMaskedLM,
         IPEXModelForQuestionAnswering,
         IPEXModelForSequenceClassification,
@@ -171,9 +179,15 @@
     if not (is_openvino_available() and is_nncf_available()):
         raise OptionalDependencyNotAvailable()
 except OptionalDependencyNotAvailable:
-    from .utils.dummy_openvino_and_nncf_objects import OVConfig, OVQuantizer, OVTrainer, OVTrainingArguments
+    from .utils.dummy_openvino_and_nncf_objects import (
+        OVConfig,
+        OVQuantizer,
+        OVTrainer,
+        OVTrainingArguments,
+        OVWeightQuantizationConfig,
+    )
 else:
-    from .openvino import OVConfig, OVQuantizer, OVTrainer, OVTrainingArguments
+    from .openvino import OVConfig, OVQuantizer, OVTrainer, OVTrainingArguments, OVWeightQuantizationConfig

 try:
     if not (is_openvino_available() and is_diffusers_available()):
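The edits above extend the `_import_structure` dict that drives lazy imports in this `__init__.py`. A condensed sketch of the pattern, assuming the `_LazyModule` machinery from `transformers.utils` and a structure trimmed for illustration:

```python
# Condensed sketch of the lazy-import pattern this __init__.py follows;
# belongs in a package __init__.py, names trimmed for illustration.
import importlib.util
import sys

from transformers.utils import _LazyModule

_import_structure = {"utils": []}

if importlib.util.find_spec("nncf") is not None:
    _import_structure["openvino"] = ["OVQuantizer", "OVWeightQuantizationConfig"]
else:
    _import_structure["utils.dummy_openvino_and_nncf_objects"] = ["OVQuantizer"]

sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
```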

optimum/intel/generation/modeling.py

+1 −3

@@ -66,13 +66,11 @@ def prepare_jit_inputs(model: PreTrainedModel, task: str, use_cache: bool = Fals

 def jit_trace(model: PreTrainedModel, task: str, use_cache: bool = False):
     model_inputs = prepare_jit_inputs(model, task, use_cache)
-    model.config.return_dict = False
+    model.config.return_dict = task not in {"text-generation", "audio-classification"}
     # check if the model_inputs is correct.
     model(**model_inputs)

     torch._C._jit_set_texpr_fuser_enabled(False)
-    if "past_key_values" in model_inputs.keys():
-        model.config.return_dict = False
     if is_torch_version(">=", "2.1.0"):
         traced_model = torch.jit.trace(model, example_kwarg_inputs=model_inputs, strict=False)
     else:
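The torch>=2.1 branch traces with `example_kwarg_inputs`, which binds keyword arguments by name instead of positionally. A minimal hedged sketch of that path; the model choice and the freezing step are illustrative, not taken from `jit_trace`.

```python
# Hedged sketch of kwarg-based tracing on the torch>=2.1 path.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
model.config.return_dict = False  # traced graphs return tuples, not ModelOutput
model.eval()

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
example = dict(tokenizer("hello world", return_tensors="pt"))

with torch.no_grad():
    traced = torch.jit.trace(model, example_kwarg_inputs=example, strict=False)
    traced = torch.jit.freeze(traced)  # illustrative follow-up optimization
```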

optimum/intel/ipex/__init__.py

+3 −0

@@ -1,5 +1,8 @@
 from optimum.intel.ipex.modeling_base import (
+    IPEXModel,
+    IPEXModelForAudioClassification,
     IPEXModelForCausalLM,
+    IPEXModelForImageClassification,
     IPEXModelForMaskedLM,
     IPEXModelForQuestionAnswering,
     IPEXModelForSequenceClassification,
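A hedged usage sketch for one of the newly exported classes. The checkpoint id is illustrative, and `export=True` is assumed to trigger conversion here the way it does for the other `IPEXModelFor*` classes.

```python
# Hedged sketch: loading one of the newly exported IPEX classes.
from optimum.intel.ipex import IPEXModelForImageClassification

# export=True converts the PyTorch checkpoint on the fly (assumed behavior,
# matching the other IPEXModelFor* classes).
model = IPEXModelForImageClassification.from_pretrained(
    "google/vit-base-patch16-224", export=True
)
```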

optimum/intel/ipex/inference.py

+1 −19

@@ -31,25 +31,13 @@
     IPEXModelForMaskedLM,
     IPEXModelForSequenceClassification,
     IPEXModelForTokenClassification,
-    IPEXBloomForCausalLM,
-    IPEXMPTForCausalLM,
-    IPEXOPTForCausalLM,
-    IPEXGPTBigCodeForCausalLM,
     IPEXModelForQuestionAnswering,
 )


 from .utils import _HEAD_TO_AUTOMODELS


-_MODEL_TYPE_TO_AUTOMODELS = {
-    "bloom": IPEXBloomForCausalLM,
-    "mpt": IPEXMPTForCausalLM,
-    "opt": IPEXOPTForCausalLM,
-    "big_code": IPEXGPTBigCodeForCausalLM,
-}
-
-
 logger = logging.getLogger(__name__)

 IPEX_NOT_AVAILABLE_ERROR_MSG = (
@@ -146,13 +134,7 @@ def __enter__(self):
             )
             if task in _HEAD_TO_AUTOMODELS:
                 model = jit_trace(model, task, use_cache)
-                model_type = getattr(self._original.config, "model_type", "").replace("_", "-")
-
-                if task == "text-generation" and model_type in _MODEL_TYPE_TO_AUTOMODELS.keys():
-                    auto_model_class = _MODEL_TYPE_TO_AUTOMODELS[task]
-                else:
-                    auto_model_class = eval(_HEAD_TO_AUTOMODELS[task])
-
+                auto_model_class = eval(_HEAD_TO_AUTOMODELS[task])
                 model = auto_model_class(model, self._original.config, use_cache=use_cache)

     # Enable automatic mixed precision (AMP) if we are going to target `bfloat16`
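After this change every task resolves through `eval(_HEAD_TO_AUTOMODELS[task])`, where `_HEAD_TO_AUTOMODELS` maps task names to class-name strings. An eval-free equivalent, shown only as an illustrative alternative, dispatches through an explicit dict:

```python
# Illustrative eval-free dispatch; the mapping mirrors the idea of
# _HEAD_TO_AUTOMODELS but is not the table defined in .utils.
from optimum.intel.ipex import (
    IPEXModelForCausalLM,
    IPEXModelForSequenceClassification,
)

_TASK_TO_CLASS = {
    "text-generation": IPEXModelForCausalLM,
    "text-classification": IPEXModelForSequenceClassification,
}

def resolve_automodel(task: str):
    try:
        return _TASK_TO_CLASS[task]
    except KeyError as exc:
        raise ValueError(f"Unsupported task: {task}") from exc
```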
