Add dynamic quantization config #661

Merged: 27 commits, Apr 22, 2024
Changes from 14 commits
2 changes: 1 addition & 1 deletion .github/workflows/test_openvino.yml
@@ -35,7 +35,7 @@ jobs:
pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime
- name: Test with Pytest
run: |
pytest tests/openvino/ --ignore test_modeling_basic --durations=0
pytest tests/openvino/ --ignore tests/openvino/test_modeling_basic.py --durations=0
- name: Test openvino-nightly
run: |
pip uninstall -y openvino
80 changes: 37 additions & 43 deletions optimum/intel/openvino/configuration.py
@@ -62,7 +62,6 @@ def __init__(
self,
ignored_scope: Optional[dict] = None,
num_samples: Optional[int] = None,
weight_only: Optional[bool] = None,
**kwargs,
):
"""
@@ -72,14 +71,11 @@
entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class.
num_samples (`int`, *optional*):
The maximum number of samples composing the calibration dataset.
weight_only (`bool`, *optional*):
Used to explicitly specify the type of quantization (weight-only or full) to apply.
"""
if isinstance(ignored_scope, nncf.IgnoredScope):
ignored_scope = ignored_scope.__dict__
self.ignored_scope = ignored_scope
self.num_samples = num_samples
self.weight_only = weight_only

def post_init(self):
try:
@@ -191,6 +187,8 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
Args:
bits (`int`, defaults to 8):
The number of bits to quantize to.
group_size (`int`, *optional*):
The group size to use for quantization. The recommended value is 128; -1 applies per-column quantization.
sym (`bool`, defaults to `False`):
Whether to use symmetric quantization.
tokenizer (`str`, *optional*):
@@ -209,8 +207,6 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
ratio (`float`, defaults to 1.0):
The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM
and the rest to INT8_ASYM).
group_size (`int`, *optional*):
The group size to use for quantization. The recommended value is 128; -1 applies per-column quantization.
all_layers (`bool`, *optional*):
Whether to compress all layers, including embedding and final layers, to 4-bit instead of keeping them in 8-bit precision.
sensitivity_metric (`str`, *optional*):
@@ -223,33 +219,24 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
The maximum number of samples composing the calibration dataset.
quant_method (`str`, defaults to OVQuantizationMethod.DEFAULT):
Weight compression method to apply.
weight_only (`bool`, *optional*):
Used to explicitly specify the type of quantization (weight-only or full) to apply. Useful when building
the config from a dictionary.
"""

def __init__(
self,
bits: int = 8,
group_size: Optional[int] = None,
sym: bool = False,
tokenizer: Optional[str] = None,
dataset: Optional[Union[str, List[str]]] = None,
ratio: float = 1.0,
group_size: Optional[int] = None,
all_layers: Optional[bool] = None,
sensitivity_metric: Optional[str] = None,
ignored_scope: Optional[dict] = None,
num_samples: Optional[int] = None,
quant_method: Optional[Union[QuantizationMethod, OVQuantizationMethod]] = OVQuantizationMethod.DEFAULT,
weight_only: Optional[bool] = True,
**kwargs,
):
if weight_only is False:
logger.warning(
"Trying to create an instance of `OVWeightQuantizationConfig` with `weight_only` being "
"False. Please check your configuration."
)
super().__init__(ignored_scope, num_samples, True)
super().__init__(ignored_scope, num_samples)
self.bits = bits
self.sym = sym
self.tokenizer = tokenizer
@@ -305,83 +292,90 @@ def post_init(self):
raise ValueError(f"Tokenizer is expected to be a string, but found {self.tokenizer}")


@dataclass
class OVDynamicQuantizationConfig(OVWeightQuantizationConfig):
def __init__(
self,
bits: int = 8,
weights_group_size: Optional[int] = None,
activations_group_size: int = 32,
**kwargs,
):
super().__init__(bits=bits, group_size=weights_group_size, **kwargs)
# TODO add kv_cache_dtype
self.activations_group_size = activations_group_size


@dataclass
class OVQuantizationConfig(OVQuantizationConfigBase):
def __init__(
self,
bits: int = 8,
sym: bool = False,
ignored_scope: Optional[dict] = None,
num_samples: Optional[int] = 300,
preset: nncf.QuantizationPreset = None,
model_type: nncf.ModelType = nncf.ModelType.TRANSFORMER,
fast_bias_correction: bool = True,
overflow_fix: OverflowFix = OverflowFix.DISABLE,
weight_only: Optional[bool] = False,
**kwargs,
):
"""
Configuration class containing parameters related to model quantization with NNCF. Compared to weight
compression, during quantization both weights and activations are converted to lower precision.
For weight-only model quantization please see OVWeightQuantizationConfig.
Args:
bits (`int`, defaults to 8):
The number of bits to quantize to.
ignored_scope (`dict`, *optional*):
An ignored scope that defines the list of model nodes to be ignored during quantization. Dictionary
entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class.
num_samples (`int`, *optional*):
The maximum number of samples composing the calibration dataset.
preset (`nncf.QuantizationPreset`, *optional*):
A preset controls the quantization mode (symmetric and asymmetric).
It can take the following values:
- `performance`: Symmetric quantization of weights and activations.
- `mixed`: Symmetric quantization of weights and asymmetric quantization of activations.
Default value is None; in this case, the `mixed` preset is used for the `transformer`
model type, and `performance` otherwise.
sym (`bool`, defaults to `False`):
Whether to use symmetric quantization on the activations. Symmetric quantization will be applied on the weights in any case.
model_type (`nncf.ModelType`, defaults to nncf.ModelType.TRANSFORMER):
Model type is needed to specify additional patterns in the model. Currently only `transformer` is supported.
fast_bias_correction (`bool`, defaults to True):
Whether to apply fast or full bias correction algorithm.
overflow_fix (`nncf.OverflowFix`, defaults to OverflowFix.DISABLE):
Parameter for controlling overflow fix setting.
weight_only (`bool`, *optional*):
Used to explicitly specify the type of quantization (weight-only or full) to apply. Useful when building
the config from a dictionary.
"""
if weight_only is True:
logger.warning(
"Trying to create an instance of `OVQuantizationConfig` with `weight_only` being True. "
"Please check your configuration."
)
super().__init__(ignored_scope, num_samples, False)
super().__init__(ignored_scope, num_samples)
# TODO: remove checks below once NNCF is updated to 2.10
if isinstance(overflow_fix, str):
overflow_fix = OverflowFix(overflow_fix)
if isinstance(preset, str):
preset = nncf.QuantizationPreset(preset)

self.preset = preset
self.bits = bits
self.sym = sym
self.model_type = model_type
self.fast_bias_correction = fast_bias_correction
self.overflow_fix = overflow_fix
self.post_init()

def to_dict(self) -> Dict[str, Any]:
# TODO: remove code below once NNCF is updated to 2.10
if isinstance(self.overflow_fix, Enum) or isinstance(self.preset, Enum):
if isinstance(self.overflow_fix, Enum):
overflow_fix_value = (
None
if self.overflow_fix is None
else self.overflow_fix
if isinstance(self.overflow_fix, str)
else self.overflow_fix.value
)
preset_value = (
None if self.preset is None else self.preset if isinstance(self.preset, str) else self.preset.value
)
self_copy = copy.deepcopy(self)
self_copy.overflow_fix = overflow_fix_value
self_copy.preset = preset_value
return self_copy.to_dict()
return super().to_dict()

def post_init(self):
r"""
Safety checker that arguments are correct
"""
super().post_init()

if self.bits != 8:
raise ValueError(f"Only support 8-bit for static quantization but found {self.bits}")


def _check_default_4bit_configs(config: PretrainedConfig):
return _DEFAULT_4BIT_CONFIGS.get(config.name_or_path, None)
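Similarly, a sketch of using the new `OVDynamicQuantizationConfig`; the model ID and group sizes are placeholders, and the top-level `OVModelForCausalLM` import is the usual optimum-intel entry point:

```python
# Sketch: 8-bit weight compression combined with grouped dynamic
# quantization of activations. Values are illustrative, not recommendations.
from optimum.intel import OVModelForCausalLM
from optimum.intel.openvino.configuration import OVDynamicQuantizationConfig

config = OVDynamicQuantizationConfig(
    bits=8,                     # weight precision
    weights_group_size=128,     # forwarded as `group_size` to the parent config
    activations_group_size=32,  # surfaced to the runtime, see modeling_base.py below
)

model = OVModelForCausalLM.from_pretrained(
    "gpt2",  # placeholder model ID
    export=True,
    quantization_config=config,
)
```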
21 changes: 12 additions & 9 deletions optimum/intel/openvino/modeling_base.py
@@ -31,7 +31,7 @@

from ...exporters.openvino import export, main_export
from ..utils.import_utils import is_nncf_available
from .configuration import OVConfig, OVWeightQuantizationConfig
from .configuration import OVConfig, OVDynamicQuantizationConfig, OVWeightQuantizationConfig
from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, _print_compiled_model_properties


@@ -64,10 +64,7 @@ def __init__(
self.model_save_dir = model_save_dir
self._device = device.upper()
self.is_dynamic = dynamic_shapes
self.ov_config = ov_config if ov_config is not None else {}
if self.ov_config.get("PERFORMANCE_HINT") is None:
self.ov_config["PERFORMANCE_HINT"] = "LATENCY"

self.ov_config = {} if ov_config is None else {**ov_config}
self.preprocessors = kwargs.get("preprocessors", [])
enable_compilation = kwargs.get("compile", True)

@@ -98,12 +95,12 @@ def __init__(
self._openvino_config = None
if quantization_config:
self._openvino_config = OVConfig(quantization_config=quantization_config)
self._set_ov_config_parameters()

@staticmethod
def load_model(
file_name: Union[str, Path],
quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
calibration_dataset: Optional = None,
):
"""
Loads the model.
@@ -113,8 +110,6 @@ def load_model(
The path of the model ONNX or XML file.
quantization_config (`OVWeightQuantizationConfig` or `Dict`, *optional*):
Quantization config to apply after model is loaded.
calibration_dataset (`nncf.Dataset`, *optional*):
Optional nncf.Dataset to feed to model weight compression when quantization config is provided.
"""

def fix_op_names_duplicates(model: openvino.runtime.Model):
@@ -143,7 +138,7 @@ def fix_op_names_duplicates(model: openvino.runtime.Model):

from optimum.intel.openvino.quantization import _weight_only_quantization

model = _weight_only_quantization(model, quantization_config, calibration_dataset=calibration_dataset)
model = _weight_only_quantization(model, quantization_config)

return model

@@ -251,6 +246,14 @@ def _prepare_weight_quantization_config(

return quantization_config

def _set_ov_config_parameters(self):
if self.ov_config.get("PERFORMANCE_HINT") is None:
self.ov_config["PERFORMANCE_HINT"] = "LATENCY"

q_config = self._openvino_config.quantization_config if self._openvino_config else None
if isinstance(q_config, OVDynamicQuantizationConfig):
self.ov_config["DYNAMIC_QUANTIZATION_GROUP_SIZE"] = str(q_config.activations_group_size)
Review comment from AlexKoff88 (Collaborator), Apr 19, 2024:

@echarlaix, shall we turn on 8-bit KV-cache quantization as well? It is essentially per-token INT8 quantization, so it should be safe in terms of accuracy degradation.
@staticmethod
def _cached_file(
model_path: Union[Path, str],
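One detail of the `self.ov_config = {} if ov_config is None else {**ov_config}` refactor above is worth spelling out: because the dictionary is now shallow-copied, the defaults filled in by `_set_ov_config_parameters` no longer leak into the dictionary the caller passed in. A standalone illustration (not code from the PR):

```python
# With the previous `self.ov_config = ov_config`, the default below would
# have been written into the caller's own dictionary.
user_cfg = {"CACHE_DIR": "/tmp/ov_cache"}

ov_config = {} if user_cfg is None else {**user_cfg}  # new behavior: copy
if ov_config.get("PERFORMANCE_HINT") is None:
    ov_config["PERFORMANCE_HINT"] = "LATENCY"

assert "PERFORMANCE_HINT" not in user_cfg  # the caller's dict stays untouched
```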
7 changes: 2 additions & 5 deletions optimum/intel/openvino/modeling_base_seq2seq.py
@@ -66,11 +66,7 @@ def __init__(
self.model_save_dir = model_save_dir
self._device = device.upper()
self.is_dynamic = dynamic_shapes
self.ov_config = ov_config if ov_config is not None else {}

if self.ov_config.get("PERFORMANCE_HINT") is None:
self.ov_config["PERFORMANCE_HINT"] = "LATENCY"

self.ov_config = {} if ov_config is None else {**ov_config}
self.preprocessors = kwargs.get("preprocessors", [])

if self.is_dynamic:
@@ -84,6 +80,7 @@ def __init__(
self._openvino_config = None
if quantization_config:
self._openvino_config = OVConfig(quantization_config=quantization_config)
self._set_ov_config_parameters()

def _save_pretrained(self, save_directory: Union[str, Path]):
"""
13 changes: 4 additions & 9 deletions optimum/intel/openvino/modeling_decoder.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import logging
import os
from pathlib import Path
@@ -596,11 +595,10 @@ def _from_pretrained(
quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)

load_in_4bit = quantization_config.bits == 4 if quantization_config else False
calibration_dataset = kwargs.get("calibration_dataset", None)

Review comment from the PR author on lines -599 to +598:

Removed the `calibration_dataset` argument. @nikita-savelyevv
model = cls.load_model(
model_cache_path,
quantization_config=None if load_in_4bit else quantization_config,
calibration_dataset=calibration_dataset,
)

model_type = config.model_type.replace("_", "-")
@@ -637,18 +635,15 @@
f"For the given model, we recommend the following `quantization_config` : {default_config}"
)

if calibration_dataset is None and isinstance(quantization_config.dataset, str):
calibration_dataset = None
if isinstance(quantization_config.dataset, str):
tokenizer = quantization_config.tokenizer or AutoTokenizer.from_pretrained(model_id)

from optimum.gptq.data import get_dataset, prepare_dataset

# from optimum.gptq.utils import get_seqlen

# seqlen = get_seqlen(causal_model)
nsamples = quantization_config.num_samples if quantization_config.num_samples else 128
nsamples = quantization_config.num_samples or 128
dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples)
dataset = prepare_dataset(dataset)
quantization_config = copy.deepcopy(quantization_config)
calibration_dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x))

_weight_only_quantization(model, quantization_config, calibration_dataset)
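The hunk above is the path that now builds the calibration set internally whenever a dataset name is supplied for data-aware weight compression (the external `calibration_dataset` argument having been removed). A sketch of how a user triggers it; the model ID is a placeholder and `wikitext2` is one of the dataset names `optimum.gptq.data.get_dataset` understands:

```python
# Sketch: data-aware 4-bit weight compression. With `dataset` given as a
# string, _from_pretrained tokenizes it via get_dataset/prepare_dataset and
# wraps it in an nncf.Dataset, as in the diff above.
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

qconfig = OVWeightQuantizationConfig(
    bits=4,
    dataset="wikitext2",  # resolved by optimum.gptq.data.get_dataset
    num_samples=128,      # matches the fallback used when left unset
)

model = OVModelForCausalLM.from_pretrained(
    "gpt2",  # placeholder model ID
    export=True,
    quantization_config=qconfig,
)
```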
5 changes: 2 additions & 3 deletions optimum/intel/openvino/modeling_diffusion.py
@@ -100,9 +100,7 @@ def __init__(
self._internal_dict = config
self._device = device.upper()
self.is_dynamic = dynamic_shapes
self.ov_config = ov_config if ov_config is not None else {}
if self.ov_config.get("PERFORMANCE_HINT") is None:
self.ov_config["PERFORMANCE_HINT"] = "LATENCY"
self.ov_config = {} if ov_config is None else {**ov_config}

# This attribute is needed to keep one reference on the temporary directory, since garbage collecting
# would end-up removing the directory containing the underlying OpenVINO model
@@ -162,6 +160,7 @@ def __init__(
self._openvino_config = None
if quantization_config:
self._openvino_config = OVConfig(quantization_config=quantization_config)
self._set_ov_config_parameters()

def _save_pretrained(self, save_directory: Union[str, Path]):
"""
24 changes: 8 additions & 16 deletions optimum/intel/openvino/quantization.py
@@ -180,22 +180,15 @@ def __init__(self, model: transformers.PreTrainedModel, task: Optional[str] = No
"""
super().__init__()
self.model = model
feature = kwargs.pop("feature", None)
if feature is not None:
logger.warning("`feature` is deprecated and will be removed in a future version. Use `task` instead.")
if task is not None and task != feature:
logger.warning(
f"Both `feature` and `task` were specified. {task} will be used to define the model topology for the model ONNX export."
)
self.task = task or feature
self.task = task
self.seed = seed
# TODO : deprecate input_names
self.input_names = None
signature = inspect.signature(self.model.forward)
self._signature_columns = list(signature.parameters.keys())
self._export_input_names = [
column for column in self._signature_columns if column not in {"label", "labels", "label_ids"}
]

@property
def input_names(self):
logger.warning("The`input_names` attribute is deprecated and will be removed in v1.18.0")
return None

@classmethod
def from_pretrained(cls, model: PreTrainedModel, **kwargs):
@@ -265,9 +258,8 @@ def quantize(
# TODO: deprecate weights_only argument
if weights_only is not None:
logger.warning(
"`weights_only` argument is deprecated. In the future please provide `ov_config.quantization_config` "
"as an instance of OVWeightQuantizationConfig for weight-only compression or as an instance of "
"OVQuantizationConfig for full model quantization."
"`weights_only` argument is deprecated and will be removed in v1.18.0. In the future please provide `ov_config.quantization_config` "
"as an instance of `OVWeightQuantizationConfig` for weight-only compression or as an instance of `OVQuantizationConfig` for full model quantization."
)

if save_directory is None:
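Finally, a sketch of the migration the updated deprecation warning asks for: passing an explicit config object instead of `weights_only=True`. The model ID and save directory are placeholders:

```python
# Sketch: weight-only compression through an explicit quantization config,
# replacing the deprecated `weights_only` argument.
from transformers import AutoModelForSequenceClassification

from optimum.intel import OVConfig, OVQuantizer, OVWeightQuantizationConfig

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"  # placeholder model
)
quantizer = OVQuantizer.from_pretrained(model)

ov_config = OVConfig(quantization_config=OVWeightQuantizationConfig(bits=8))
quantizer.quantize(ov_config=ov_config, save_directory="ov_quantized")
```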