
Commit 6c8fa79

Add bits and sym parameters to the OV quantization config (#560)
* Add bits and sym parameters to the OV quantization config
* format
* add nncf version
* Fix config saving
* add ov config test
* remove load_in_4bit argument
* add weight only quant for int8
* fix style
* add nncf check
* remove _int4_weight_only_quantization
* fix typo
* fix style
1 parent 7e1a21e commit 6c8fa79

9 files changed, +238 -232 lines changed


optimum/exporters/openvino/convert.py (+2)

@@ -500,6 +500,8 @@ def export_models(
         Returns:
             list of input_names and output_names from ONNX configuration
     """
+
+    # TODO : modify compression_option to quantization_config
     outputs = []

     if output_names is not None and len(output_names) != len(models_and_onnx_configs):

optimum/intel/openvino/__init__.py (+1 -2)

@@ -36,11 +36,10 @@

 patch_torch_operators()

-from .configuration import OVConfig
+from .configuration import OVConfig, OVWeightQuantizationConfig
 from .quantization import OVQuantizer
 from .trainer import OVTrainer
 from .training_args import OVTrainingArguments
-from .weight_quantization import OVWeightQuantizationConfig

 from .modeling import (
     OVModelForAudioClassification,
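
With this commit, OVWeightQuantizationConfig is defined in configuration.py instead of the removed weight_quantization module, so the public import path shown in this __init__.py stays the same; a minimal import sketch (assuming optimum-intel is installed with its OpenVINO dependencies):

# Both classes are re-exported from the subpackage, as in the diff above.
from optimum.intel.openvino import OVConfig, OVWeightQuantizationConfig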

optimum/intel/openvino/configuration.py (+113 -3)

@@ -12,15 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Dict, List, Optional, Union
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Union

 import torch
+from transformers import PretrainedConfig
 from transformers.utils.quantization_config import QuantizationConfigMixin

 from optimum.configuration_utils import BaseConfig

-from .weight_quantization import OVWeightQuantizationConfig
-

 DEFAULT_QUANTIZATION_CONFIG = {
     "algorithm": "quantization",
@@ -77,6 +77,28 @@
 }


+DEFAULT_4BIT_CONFIGS = {
+    "databricks/dolly-v2-3b": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.5},
+    "EleutherAI/gpt-j-6b": {"bits": 4, "sym": False, "group_size": 64},
+    "facebook/opt-6.7b": {"bits": 4, "sym": False, "group_size": 64, "ratio": 0.8},
+    "bigscience/bloomz-7b1": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.6},
+    "togethercomputer/RedPajama-INCITE-7B-Instruct": {"bits": 4, "sym": False, "group_size": 128},
+    "HuggingFaceH4/zephyr-7b-beta": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.6},
+    "meta-llama/Llama-2-7b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.6},
+    "meta-llama/Llama-2-7b-chat": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8},
+    "meta-llama/Llama-2-13b-chat": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8},
+    "stabilityai/stablelm-3b-4e1t": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8},
+    "stablelm-epoch-3b-preview": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8},
+    "stable-zephyr-3b-dpo": {"bits": 4, "sym": False, "group_size": 64, "ratio": 0.8},
+    "pansophic/rocket-3B": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8},
+    "THUDM/chatglm2-6b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.72},
+    "Qwen/Qwen-7B-Chat": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.6},
+    "openlm-research/open_llama_3b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True},
+    "tiiuae/falcon-7b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True},
+    "psmathur/orca_mini_3b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True},
+}
+
+
 class OVConfig(BaseConfig):
     CONFIG_NAME = "openvino_config.json"
     FULL_CONFIGURATION_FILE = "openvino_config.json"
@@ -127,3 +149,91 @@ def _enable_standard_onnx_export_option(self):
         for i, algo_config in enumerate(self.compression):
             if algo_config["algorithm"] == "quantization":
                 self.compression[i]["export_to_onnx_standard_ops"] = self.save_onnx_model
+
+
+@dataclass
+class OVWeightQuantizationConfig(QuantizationConfigMixin):
+    """
+    This is a wrapper class for all the attributes and features used to quantize, with NNCF, the weights of a model
+    loaded through the `optimum-intel` API.
+
+    Args:
+
+        bits (`int`, defaults to 8):
+            The number of bits to quantize to.
+        sym (`bool`, *optional*, defaults to `False`):
+            Whether to use symmetric quantization.
+        tokenizer (`str` or `PreTrainedTokenizerBase`, *optional*):
+            The tokenizer used to process the dataset. You can pass either:
+                - A custom tokenizer object.
+                - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
+                    Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
+                    user or organization name, like `dbmdz/bert-base-german-cased`.
+                - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
+                    using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
+        dataset (`Union[List[str]]`, *optional*):
+            The dataset used for data-aware compression. You can provide your own dataset as a list of strings or use
+            one of ['wikitext2', 'c4', 'c4-new', 'ptb', 'ptb-new'].
+        group_size (`int`, *optional*, defaults to 128):
+            The group size to use for quantization. The recommended value is 128, and -1 uses per-column quantization.
+        ratio (`float`, *optional*, defaults to 1.0):
+            The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers are quantized to INT4_ASYM
+            and the rest to INT8_ASYM).
+        all_layers (`bool`, *optional*):
+            Whether to compress all layers to 4-bit rather than keeping a subset of them in 8-bit precision.
+        sensitivity_metric (`nncf.SensitivityMetric`, *optional*):
+            The sensitivity metric for assigning quantization precision to layers. In order to preserve the accuracy
+            of the model, the more sensitive layers receive a higher precision.
+        awq (`bool`, *optional*):
+            Enables the AWQ method to unify weight ranges and improve overall model accuracy.
+        ignored_scope (`nncf.IgnoredScope`, *optional*):
+            An ignored scope that defines the list of model control flow graph nodes to be ignored during quantization.
+    """
+
+    def __init__(
+        self,
+        bits: int = 8,
+        sym: bool = False,
+        tokenizer: Any = None,
+        dataset: Optional[str] = None,
+        ratio: Optional[float] = None,
+        group_size: Optional[int] = None,
+        all_layers: Optional[bool] = None,
+        sensitivity_metric: Optional[str] = None,
+        ignored_scope: Optional[dict] = None,
+        **kwargs,
+    ):
+        self.bits = bits
+        self.sym = sym
+        self.tokenizer = tokenizer
+        self.dataset = dataset
+        self.group_size = group_size
+        self.ratio = ratio
+        self.all_layers = all_layers
+        self.sensitivity_metric = sensitivity_metric
+        self.ignored_scope = ignored_scope
+        self.quant_method = "default"  # TODO : enable AWQ after nncf v2.9.0 release
+        self.post_init()
+
+    def post_init(self):
+        r"""
+        Safety checker that arguments are correct.
+        """
+        if self.ratio is not None and not (0 <= self.ratio <= 1):
+            raise ValueError("`ratio` must be between 0 and 1.")
+        if self.group_size is not None and self.group_size != -1 and self.group_size <= 0:
+            raise ValueError("`group_size` must be greater than 0 or equal to -1")
+        if self.dataset is not None and isinstance(self.dataset, str):
+            if self.dataset not in ["wikitext2", "c4", "c4-new", "ptb", "ptb-new"]:
+                raise ValueError(
+                    f"""You have entered a string value for dataset. You can only choose between
+                    ['wikitext2','c4','c4-new','ptb','ptb-new'], but we found {self.dataset}"""
+                )
+
+        if self.bits not in [4, 8]:
+            raise ValueError(f"Only 4-bit and 8-bit quantization is supported, but found bits={self.bits}")
+
+
+def _check_default_4bit_configs(config: PretrainedConfig):
+    return DEFAULT_4BIT_CONFIGS.get(config.name_or_path, None)
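
To illustrate the new parameters, here is a short, hypothetical usage sketch (not part of the diff; it assumes optimum-intel is installed): the config is built with the new bits and sym arguments, and post_init() rejects unsupported values at construction time.

from optimum.intel.openvino import OVWeightQuantizationConfig

# 4-bit symmetric weight-only quantization, 80% of layers in INT4, per-group scales
wq_config = OVWeightQuantizationConfig(bits=4, sym=True, group_size=128, ratio=0.8)

# post_init() runs inside __init__ and validates the arguments:
try:
    OVWeightQuantizationConfig(bits=3)  # only 4 and 8 bits are accepted
except ValueError as err:
    print(err)

try:
    OVWeightQuantizationConfig(bits=4, ratio=1.5)  # ratio must lie in [0, 1]
except ValueError as err:
    print(err)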

optimum/intel/openvino/modeling_base.py (-6)

@@ -155,7 +155,6 @@ def _from_pretrained(
         from_onnx: bool = False,
         local_files_only: bool = False,
         load_in_8bit: bool = False,
-        load_in_4bit: bool = False,
         **kwargs,
     ):
         """
@@ -185,11 +184,7 @@
                 Whether or not to only look at local files (i.e., do not try to download the model).
             load_in_8bit (`bool`, *optional*, defaults to `False`):
                 Whether or not to apply 8-bit weight quantization.
-            load_in_4bit (`bool`, *optional*, defaults to `False`):
-                Whether or not to apply 4-bit weight quantization.
         """
-        if load_in_4bit:
-            raise ValueError("load_in_4bit is available for OVModelForCausalLM only.")
         model_path = Path(model_id)
         default_file_name = ONNX_WEIGHTS_NAME if from_onnx else OV_XML_FILE_NAME
         file_name = file_name or default_file_name
@@ -257,7 +252,6 @@ def _from_transformers(
         task: Optional[str] = None,
         trust_remote_code: bool = False,
         load_in_8bit: Optional[bool] = None,
-        load_in_4bit: Optional[bool] = None,
         **kwargs,
     ):
         """

optimum/intel/openvino/modeling_decoder.py (+22 -8)

@@ -32,10 +32,11 @@

 from ...exporters.openvino import ensure_stateful_is_available, main_export, patch_stateful
 from ...exporters.openvino.stateful import model_has_state
+from ..utils.import_utils import is_nncf_available
 from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS
+from .configuration import OVWeightQuantizationConfig, _check_default_4bit_configs
 from .modeling import _TOKENIZER_FOR_DOC, INPUTS_DOCSTRING, MODEL_START_DOCSTRING, OVModel
 from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, STR_TO_OV_TYPE
-from .weight_quantization import OVWeightQuantizationConfig, compress_decoder_weights


 logger = logging.getLogger(__name__)
@@ -238,7 +239,6 @@ def _from_transformers(
         use_cache: bool = True,
         trust_remote_code: bool = False,
         load_in_8bit: Optional[bool] = None,
-        load_in_4bit: Optional[bool] = None,
         quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None,
         **kwargs,
     ):
@@ -258,8 +258,9 @@

         # If load_in_8bit is not specified then compression_option should be set to None and will be set by default in main_export depending on the model size
         compression_option = None
-        if load_in_8bit is not None or load_in_4bit is not None:
+        if load_in_8bit is not None or quantization_config is not None:
             compression_option = "fp32"
+
         stateful = kwargs.pop("stateful", ensure_stateful_is_available(warn=False) and use_cache)
         main_export(
             model_name_or_path=model_id,
@@ -285,7 +286,6 @@
             use_cache=use_cache,
             load_in_8bit=load_in_8bit,
             stateful=None,
-            load_in_4bit=load_in_4bit,
             quantization_config=quantization_config,
             **kwargs,
         )
@@ -556,7 +556,6 @@ def _from_pretrained(
         from_onnx: bool = False,
         local_files_only: bool = False,
         load_in_8bit: bool = False,
-        load_in_4bit: bool = False,
         quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
         **kwargs,
     ):
@@ -575,8 +574,10 @@
             local_files_only=local_files_only,
         )

-        if load_in_8bit and load_in_4bit:
-            raise ValueError("Either load_in_8bit or load_in_4bit should be set to True.")
+        if isinstance(quantization_config, dict):
+            quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)
+
+        load_in_4bit = quantization_config.bits == 4 if quantization_config else False
         model = cls.load_model(model_cache_path, load_in_8bit=False if load_in_4bit else load_in_8bit)

         model_type = config.model_type.replace("_", "-")
@@ -594,7 +595,20 @@
         causal_model = init_cls(model=model, config=config, model_save_dir=model_cache_path.parent, **kwargs)

         if load_in_4bit:
-            compress_decoder_weights(causal_model, quantization_config)
+            if not is_nncf_available():
+                raise ImportError(
+                    "Quantization of the weights requires nncf, please install it with `pip install nncf`"
+                )
+            from .quantization import _weight_only_quantization
+
+            default_config = _check_default_4bit_configs(config)
+
+            if default_config:
+                logger.info(
+                    f"For the given model, we recommend the following `quantization_config` : {default_config}"
+                )
+
+            _weight_only_quantization(causal_model, quantization_config)
         return causal_model
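
End to end, 4-bit weight-only quantization is now driven by quantization_config instead of the removed load_in_4bit flag. A minimal sketch of the new call (hypothetical usage consistent with this diff; it requires nncf, and the checkpoint name is only an example taken from DEFAULT_4BIT_CONFIGS):

from optimum.intel import OVModelForCausalLM
from optimum.intel.openvino import OVWeightQuantizationConfig

# Pass the config object directly...
wq_config = OVWeightQuantizationConfig(bits=4, sym=True, group_size=128, ratio=0.8)
model = OVModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat", export=True, quantization_config=wq_config
)

# ...or as a plain dict: _from_pretrained converts it with
# OVWeightQuantizationConfig.from_dict and triggers _weight_only_quantization
# whenever quantization_config.bits == 4.
model = OVModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat",
    export=True,
    quantization_config={"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8},
)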
