
Commit 2d14e25

Deprecate compression options (#565)
* deprecate compression options
* style
* fix configuration
* Update CLI argument
* update documentation
* deprecate torch nn modules for ov quantizer
* fix ov config for fp32 models
* fix format
* update documentation
* Add check for configuration
* fix ratio default value for SD models
* add quantization_config argument for OVModel
* remove commented line
* Update docs/source/inference.mdx
  Co-authored-by: Alexander Kozlov <alexander.kozlov@intel.com>
* add default config for causal LM
* fix warning message

---------

Co-authored-by: Alexander Kozlov <alexander.kozlov@intel.com>
1 parent 62f570f commit 2d14e25

18 files changed: +362 -227 lines

README.md

+1 -1
@@ -126,7 +126,7 @@ from optimum.intel import OVQuantizer, OVModelForSequenceClassification
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 
 model_id = "distilbert-base-uncased-finetuned-sst-2-english"
-model = AutoModelForSequenceClassification.from_pretrained(model_id)
+model = OVModelForSequenceClassification.from_pretrained(model_id, export=True)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 def preprocess_fn(examples, tokenizer):
     return tokenizer(
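The change above swaps the vanilla `AutoModelForSequenceClassification` for its OpenVINO counterpart. As a quick orientation (not part of this diff), `export=True` converts the checkpoint to OpenVINO IR on the fly, and the resulting model can be saved like any other optimum model; a minimal sketch, assuming the standard `save_pretrained` API and an illustrative output folder name:

```python
from optimum.intel import OVModelForSequenceClassification

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
# export=True converts the PyTorch checkpoint to OpenVINO IR while loading
model = OVModelForSequenceClassification.from_pretrained(model_id, export=True)
# "ov_distilbert" is an illustrative directory name, not taken from the diff
model.save_pretrained("ov_distilbert")
```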

docs/source/inference.mdx

+12 -14
@@ -47,6 +47,8 @@ Here we set the `task` to `text-generation-with-past`, with the `-with-past` suf
 optimum-cli export openvino --model local_path --task text-generation-with-past ov_model
 ```
 
+To export your model in fp16, you can add `--weight-format fp16` when exporting your model.
+
 Once the model is exported, you can load the OpenVINO model using :
 
 ```python
@@ -96,15 +98,23 @@ tokenizer.save_pretrained(save_directory)
 
 ### Weight-only quantization
 
-You can also apply 8-bit or 4-bit weight quantization when exporting your model with the CLI:
+You can also apply 8-bit or 4-bit weight quantization when exporting your model with the CLI by setting the `weight-format` argument to respectively `int8` or `int4`:
 
 ```bash
 optimum-cli export openvino --model gpt2 --weight-format int8 ov_model
 ```
 
 This will result in the exported model linear and embedding layers to be quantized to INT8 or INT4, the activations will be kept in floating point precision. This type of optimization allows reducing the footprint and latency of LLMs.
 
-This can also be done when loading your model by setting the `load_in_8bit` argument when calling the `from_pretrained()` method.
+By default the quantization scheme will be [asymmetric](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#asymmetric-quantization); to make it [symmetric](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#symmetric-quantization) you can add `--sym`.
+
+For INT4 quantization you can also specify the following arguments:
+* The `--group-size` parameter will define the group size to use for quantization; `-1` will result in per-column quantization.
+* The `--ratio` CLI parameter controls the ratio between 4-bit and 8-bit quantization. If set to 0.9, it means that 90% of the layers will be quantized to `int4` while 10% will be quantized to `int8`.
+
+Smaller `group_size` and `ratio` values usually improve accuracy at the sacrifice of model size and inference latency.
+
+You can also apply 8-bit quantization on your model's weights when loading your model by setting the `load_in_8bit=True` argument when calling the `from_pretrained()` method.
 
 ```python
 from optimum.intel import OVModelForCausalLM
@@ -114,18 +124,6 @@ model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=True)
 
 > **NOTE:** `load_in_8bit` is enabled by default for the models larger than 1 billion parameters.
 
-There are also alternative compression options for a different performance-accuracy trade-off:
-
-| Option | Description |
-|---------------------------------------------------------------------|-------------------|
-| `fp16` | Float16 weights |
-| `int8` | INT8 weights |
-| `int4_sym_g128`, `int4_asym_g128`, `int4_sym_g64`, `int4_asym_g64`* | INT4 weights |
-
-*`sym` and `asym` stand for symmetric and asymmetric quantization, `g128` and `g64` means the group size `128` and `64` respectively.
-
-`--ratio` CLI parameter controls the ratio between 4-bit and 8-bit quantized layers and can also change performance-accuracy trade-off for the optimized model. It is valid only for INT4 quantization options.
-
 
 To apply quantization on both weights and activations, you can use the `OVQuantizer`, more information in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov#optimization).
 
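The documentation added above introduces `--weight-format int4` together with the `--sym`, `--group-size` and `--ratio` options (defined in `optimum/commands/export/openvino.py` further down in this commit). A hedged example combining them, reusing the `gpt2` model from the existing snippet; the particular values are illustrative, not recommendations from this commit:

```bash
# Illustrative INT4 export: symmetric quantization, group size 128,
# 80% of the layers in int4 and the remaining 20% kept in int8
optimum-cli export openvino --model gpt2 --weight-format int4 --sym --group-size 128 --ratio 0.8 ov_model
```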

docs/source/optimization_ov.mdx

+3 -3
@@ -26,11 +26,11 @@ Here is how to apply static quantization on a fine-tuned DistilBERT:
 
 ```python
 from functools import partial
-from transformers import AutoModelForSequenceClassification, AutoTokenizer
-from optimum.intel import OVConfig, OVQuantizer
+from transformers import AutoTokenizer
+from optimum.intel import OVConfig, OVQuantizer, OVModelForSequenceClassification
 
 model_id = "distilbert-base-uncased-finetuned-sst-2-english"
-model = AutoModelForSequenceClassification.from_pretrained(model_id)
+model = OVModelForSequenceClassification.from_pretrained(model_id, export=True)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 # The directory where the quantized model will be saved
 save_dir = "ptq_model"
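For readers following the static-quantization walkthrough, a hedged sketch of how the snippet above typically continues once the imports are fixed; the calibration dataset, preprocessing function and keyword names are assumptions based on the surrounding optimum-intel documentation, not part of this commit:

```python
from functools import partial

from transformers import AutoTokenizer
from optimum.intel import OVConfig, OVQuantizer, OVModelForSequenceClassification

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
model = OVModelForSequenceClassification.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
save_dir = "ptq_model"

def preprocess_fn(examples, tokenizer):
    # Tokenize the calibration samples; the "sentence" column is an assumption for the sst2 dataset
    return tokenizer(examples["sentence"], padding="max_length", max_length=128, truncation=True)

quantizer = OVQuantizer.from_pretrained(model)
calibration_dataset = quantizer.get_calibration_dataset(
    "glue",
    dataset_config_name="sst2",
    preprocess_function=partial(preprocess_fn, tokenizer=tokenizer),
    num_samples=300,
    dataset_split="train",
)
# Apply static quantization and save the resulting OpenVINO model to save_dir
quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir)
```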

optimum/commands/export/openvino.py

+49 -4
@@ -77,7 +77,7 @@ def parse_args_openvino(parser: "ArgumentParser"):
     optional_group.add_argument(
         "--weight-format",
         type=str,
-        choices=["fp32", "fp16", "int8", "int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"],
+        choices=["fp32", "fp16", "int8", "int4", "int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"],
         default=None,
         help=(
             "The weight format of the exporting model, e.g. f32 stands for float32 weights, f16 - for float16 weights, i8 - INT8 weights, int4_* - for INT4 compressed weights."
@@ -86,12 +86,24 @@ def parse_args_openvino(parser: "ArgumentParser"):
     optional_group.add_argument(
         "--ratio",
         type=float,
-        default=0.8,
+        default=None,
         help=(
             "Compression ratio between primary and backup precision. In the case of INT4, NNCF evaluates layer sensitivity and keeps the most impactful layers in INT8"
             "precision (by default 20%% in INT8). This helps to achieve better accuracy after weight compression."
         ),
     )
+    optional_group.add_argument(
+        "--sym",
+        action="store_true",
+        default=None,
+        help=("Whether to apply symmetric quantization"),
+    )
+    optional_group.add_argument(
+        "--group-size",
+        type=int,
+        default=None,
+        help=("The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization."),
+    )
     optional_group.add_argument(
         "--disable-stateful",
         action="store_true",
@@ -132,6 +144,7 @@ def parse_args(parser: "ArgumentParser"):
 
     def run(self):
         from ...exporters.openvino.__main__ import main_export
+        from ...intel.openvino.configuration import _DEFAULT_4BIT_CONFIGS, OVConfig
 
         if self.args.fp16:
             logger.warning(
@@ -144,6 +157,39 @@ def run(self):
             )
             self.args.weight_format = "int8"
 
+        weight_format = self.args.weight_format or "fp32"
+
+        ov_config = None
+        if weight_format in {"fp16", "fp32"}:
+            ov_config = OVConfig(dtype=weight_format)
+        else:
+            is_int8 = weight_format == "int8"
+
+            # For int4 quantization, if no parameter is provided, then use the default config if one exists
+            if (
+                not is_int8
+                and self.args.ratio is None
+                and self.args.group_size is None
+                and self.args.sym is None
+                and self.args.model in _DEFAULT_4BIT_CONFIGS
+            ):
+                quantization_config = _DEFAULT_4BIT_CONFIGS[self.args.model]
+            else:
+                quantization_config = {
+                    "bits": 8 if is_int8 else 4,
+                    "ratio": 1 if is_int8 else (self.args.ratio or 0.8),
+                    "sym": self.args.sym or False,
+                    "group_size": -1 if is_int8 else self.args.group_size,
+                }
+
+            if weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}:
+                logger.warning(
+                    f"--weight-format {weight_format} is deprecated, possible choices are fp32, fp16, int8, int4"
+                )
+                quantization_config["sym"] = "asym" not in weight_format
+                quantization_config["group_size"] = 128 if "128" in weight_format else 64
+            ov_config = OVConfig(quantization_config=quantization_config)
+
         # TODO : add input shapes
         main_export(
             model_name_or_path=self.args.model,
@@ -153,8 +199,7 @@ def run(self):
             cache_dir=self.args.cache_dir,
             trust_remote_code=self.args.trust_remote_code,
             pad_token_id=self.args.pad_token_id,
-            compression_option=self.args.weight_format,
-            compression_ratio=self.args.ratio,
+            ov_config=ov_config,
             stateful=not self.args.disable_stateful,
             convert_tokenizer=self.args.convert_tokenizer,
             # **input_shapes,
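The `run()` changes above keep the old composite INT4 values working but route them through a deprecation warning. A hedged sketch of roughly equivalent invocations (the model name is reused from the docs example; defaults such as `--ratio` follow the code above):

```bash
# Deprecated spelling: still accepted, but logs a deprecation warning
optimum-cli export openvino --model gpt2 --weight-format int4_sym_g128 ov_model

# Roughly equivalent with the flags introduced in this commit
optimum-cli export openvino --model gpt2 --weight-format int4 --sym --group-size 128 ov_model
```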

optimum/exporters/openvino/__init__.py

+14
@@ -1,3 +1,17 @@
1+
# Copyright 2024 The HuggingFace Team. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
115
from .__main__ import main_export
216
from .convert import export, export_from_model, export_models, export_pytorch_via_onnx
317
from .stateful import ensure_stateful_is_available, patch_stateful

optimum/exporters/openvino/__main__.py

+38 -3
@@ -14,7 +14,7 @@
 
 import logging
 from pathlib import Path
-from typing import Any, Callable, Dict, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union
 
 from requests.exceptions import ConnectionError as RequestsConnectionError
 from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase
@@ -41,6 +41,18 @@
 ]
 
 
+if TYPE_CHECKING:
+    from optimum.intel.openvino.configuration import OVConfig
+
+_COMPRESSION_OPTIONS = {
+    "int8": {"bits": 8},
+    "int4_sym_g128": {"bits": 4, "sym": True, "group_size": 128},
+    "int4_asym_g128": {"bits": 4, "sym": False, "group_size": 128},
+    "int4_sym_g64": {"bits": 4, "sym": True, "group_size": 64},
+    "int4_asym_g64": {"bits": 4, "sym": False, "group_size": 64},
+}
+
+
 logger = logging.getLogger(__name__)
 
 
@@ -63,6 +75,7 @@ def main_export(
     fn_get_submodels: Optional[Callable] = None,
     compression_option: Optional[str] = None,
     compression_ratio: Optional[float] = None,
+    ov_config: "OVConfig" = None,
     stateful: bool = True,
     convert_tokenizer: bool = False,
     library_name: Optional[str] = None,
@@ -137,6 +150,29 @@ def main_export(
     >>> main_export("gpt2", output="gpt2_onnx/")
     ```
     """
+
+    if compression_option is not None:
+        logger.warning(
+            "The `compression_option` argument is deprecated and will be removed in optimum-intel v1.17.0. "
+            "Please, pass an `ov_config` argument instead `OVConfig(..., quantization_config=quantization_config)`."
+        )
+
+    if compression_ratio is not None:
+        logger.warning(
+            "The `compression_ratio` argument is deprecated and will be removed in optimum-intel v1.17.0. "
+            "Please, pass an `ov_config` argument instead `OVConfig(quantization_config={ratio=compression_ratio})`."
+        )
+
+    if ov_config is None and compression_option is not None:
+        from ...intel.openvino.configuration import OVConfig
+
+        if compression_option == "fp16":
+            ov_config = OVConfig(dtype="fp16")
+        elif compression_option != "fp32":
+            q_config = _COMPRESSION_OPTIONS[compression_option] if compression_option in _COMPRESSION_OPTIONS else {}
+            q_config["ratio"] = compression_ratio or 1.0
+            ov_config = OVConfig(quantization_config=q_config)
+
     original_task = task
     task = TasksManager.map_from_synonym(task)
     framework = TasksManager.determine_framework(model_name_or_path, subfolder=subfolder, framework=framework)
@@ -293,8 +329,7 @@ class StoreAttr(object):
         model=model,
         output=output,
         task=task,
-        compression_option=compression_option,
-        compression_ratio=compression_ratio,
+        ov_config=ov_config,
         stateful=stateful,
         model_kwargs=model_kwargs,
         custom_onnx_configs=custom_onnx_configs,
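For API callers, the deprecation shim above points at the migration path away from `compression_option`/`compression_ratio`; a minimal sketch, assuming the `OVConfig` import used in the docs and the `quantization_config` dict keys built elsewhere in this commit:

```python
from optimum.exporters.openvino import main_export
from optimum.intel import OVConfig

# Before (deprecated, still works but logs a warning):
# main_export("gpt2", output="ov_model", compression_option="int4_sym_g128", compression_ratio=0.8)

# After: build the weight-compression settings into an OVConfig and pass it explicitly
ov_config = OVConfig(quantization_config={"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8})
main_export("gpt2", output="ov_model", ov_config=ov_config)
```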
