
Commit bf91033

add quantization_config argument for OVModel

1 parent: 925e56d

6 files changed: +111 −66 lines changed

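In short, the OVModel classes gain a `quantization_config` argument that flows from `from_pretrained` down to `load_model`, subsuming the old `load_in_8bit`/`load_in_4bit` flags. A minimal sketch of the resulting call pattern (the model id is illustrative; `OVWeightQuantizationConfig` comes from `optimum.intel.openvino.configuration` per the hunks below):

    from optimum.intel import OVModelForCausalLM
    from optimum.intel.openvino.configuration import OVWeightQuantizationConfig

    # Export a Transformers checkpoint to OpenVINO IR and compress weights to 8-bit.
    # Passing a config object is the new path; load_in_8bit=True still works.
    model = OVModelForCausalLM.from_pretrained(
        "gpt2",  # illustrative model id
        export=True,
        quantization_config=OVWeightQuantizationConfig(bits=8),
    )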

optimum/exporters/openvino/convert.py

+8 −20
@@ -120,11 +120,8 @@ def export(
         device (`str`, *optional*, defaults to `cpu`):
             The device on which the model will be exported. Either `cpu` or `cuda`. Only PyTorch is supported for
             export on CUDA devices.
-        compression_option (`Optional[str]`, defaults to `None`):
-            The weight compression option, e.g. `f16` stands for float16 weights, `i8` - INT8 weights, `int4_sym_g128` - INT4 symmetric weights w/ group size 128, `int4_asym_g128` - as previous but asymmetric w/ zero-point,
-            `int4_sym_g64` - INT4 symmetric weights w/ group size 64, "int4_asym_g64" - as previous but asymmetric w/ zero-point.
-        compression_ratio (`Optional[float]`, defaults to `None`):
-            Compression ratio between primary and backup precision (only relevant to INT4).
+        ov_config (`OVConfig`, *optional*):
+            The configuration containing the parameters related to quantization.
         input_shapes (`Optional[Dict]`, defaults to `None`):
             If specified, allows to use specific shapes for the example input provided to the exporter.
         stateful (`bool`, defaults to `True`):
@@ -233,11 +230,8 @@ def export_pytorch_via_onnx(
             If specified, allows to use specific shapes for the example input provided to the exporter.
         model_kwargs (optional[Dict[str, Any]], defaults to `None`):
             Additional kwargs for model export.
-        compression_option (`Optional[str]`, defaults to `None`):
-            The weight compression option, e.g. `f16` stands for float16 weights, `i8` - INT8 weights, `int4_sym_g128` - INT4 symmetric weights w/ group size 128, `int4_asym_g128` - as previous but asymmetric w/ zero-point,
-            `int4_sym_g64` - INT4 symmetric weights w/ group size 64, "int4_asym_g64" - as previous but asymmetric w/ zero-point.
-        compression_ratio (`Optional[float]`, defaults to `None`):
-            Compression ratio between primary and backup precision (only relevant to INT4).
+        ov_config (`OVConfig`, *optional*):
+            The configuration containing the parameters related to quantization.

     Returns:
         `Tuple[List[str], List[str], bool]`: A tuple with an ordered list of the model's inputs, and the named inputs from
@@ -290,11 +284,8 @@ def export_pytorch(
             If specified, allows to use specific shapes for the example input provided to the exporter.
         model_kwargs (optional[Dict[str, Any]], defaults to `None`):
             Additional kwargs for model export
-        compression_option (`Optional[str]`, defaults to `None`):
-            The weight compression option, e.g. `f16` stands for float16 weights, `i8` - INT8 weights, `int4_sym_g128` - INT4 symmetric weights w/ group size 128, `int4_asym_g128` - as previous but asymmetric w/ zero-point,
-            `int4_sym_g64` - INT4 symmetric weights w/ group size 64, "int4_asym_g64" - as previous but asymmetric w/ zero-point.
-        compression_ratio (`Optional[float]`, defaults to `None`):
-            Compression ratio between primary and backup precision (only relevant to INT4).
+        ov_config (`OVConfig`, *optional*):
+            The configuration containing the parameters related to quantization.
         stateful (`bool`, defaults to `False`):
             Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs. Applicable only for decoder models.

@@ -452,11 +443,8 @@ def export_models(
             export on CUDA devices.
         input_shapes (Optional[Dict], optional, Defaults to None):
             If specified, allows to use specific shapes for the example input provided to the exporter.
-        compression_option (`Optional[str]`, defaults to `None`):
-            The weight compression option, e.g. `f16` stands for float16 weights, `i8` - INT8 weights, `int4_sym_g128` - INT4 symmetric weights w/ group size 128, `int4_asym_g128` - as previous but asymmetric w/ zero-point,
-            `int4_sym_g64` - INT4 symmetric weights w/ group size 64, "int4_asym_g64" - as previous but asymmetric w/ zero-point.
-        compression_ratio (`Optional[int]`, defaults to `None`):
-            Compression ratio between primary and backup precision (only relevant to INT4).
+        ov_config (`OVConfig`, *optional*):
+            The configuration containing the parameters related to quantization.
         model_kwargs (Optional[Dict[str, Any]], optional):
             Additional kwargs for model export.
         stateful (`bool`, defaults to `True`)
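The `compression_option`/`compression_ratio` pair collapses into one `ov_config` argument. A hedged sketch of how an old option string might map onto the new objects (field names follow `OVWeightQuantizationConfig` as used elsewhere in this commit; the exact correspondence is an assumption):

    from optimum.intel.openvino.configuration import OVConfig, OVWeightQuantizationConfig

    # Before: compression_option="int4_sym_g128", compression_ratio=0.8
    # After (assumed equivalent): an OVConfig carrying a weight-quantization config
    ov_config = OVConfig(
        quantization_config=OVWeightQuantizationConfig(
            bits=4,          # INT4 weights
            sym=True,        # symmetric quantization, no zero-point
            group_size=128,  # weights quantized in groups of 128
            ratio=0.8,       # share of layers kept in the 4-bit primary precision
        )
    )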

optimum/intel/openvino/modeling.py

+16 −6
@@ -16,7 +16,7 @@
 import os
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import Optional, Union
+from typing import Dict, Optional, Union

 import numpy as np
 import openvino
@@ -53,6 +53,7 @@

 from ...exporters.openvino import main_export
 from ..utils.import_utils import is_timm_available, is_timm_version
+from .configuration import OVConfig, OVWeightQuantizationConfig
 from .modeling_base import OVBaseModel
 from .utils import _is_timm_ov_dir

@@ -427,14 +428,17 @@ def _from_transformers(
         task: Optional[str] = None,
         trust_remote_code: bool = False,
         load_in_8bit: Optional[bool] = None,
-        load_in_4bit: Optional[bool] = None,
+        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
         **kwargs,
     ):
         save_dir = TemporaryDirectory()
         save_dir_path = Path(save_dir.name)

-        # If load_in_8bit is not specified then compression_option should be set to None and will be set by default in main_export depending on the model size
-        compression_option = "fp32" if load_in_8bit is not None else None
+        # If load_in_8bit or quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
+        if load_in_8bit is None or not quantization_config:
+            ov_config = None
+        else:
+            ov_config = OVConfig(dtype="fp32")

         # OVModelForFeatureExtraction works with Transformers type of models, thus even sentence-transformers models are loaded as such.
         main_export(
@@ -448,12 +452,18 @@
             local_files_only=local_files_only,
             force_download=force_download,
             trust_remote_code=trust_remote_code,
-            compression_option=compression_option,
+            ov_config=ov_config,
             library_name="transformers",
         )

         config.save_pretrained(save_dir_path)
-        return cls._from_pretrained(model_id=save_dir_path, config=config, load_in_8bit=load_in_8bit, **kwargs)
+        return cls._from_pretrained(
+            model_id=save_dir_path,
+            config=config,
+            load_in_8bit=load_in_8bit,
+            quantization_config=quantization_config,
+            **kwargs,
+        )


 MASKED_LM_EXAMPLE = r"""
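The same plumbing reaches the OVModel subclasses defined in this file; per the retained comment, even sentence-transformers checkpoints are exported through the Transformers path. A sketch of the user-facing call (model id illustrative; a plain dict is accepted since `_from_pretrained` forwards it as-is):

    from optimum.intel import OVModelForFeatureExtraction

    model = OVModelForFeatureExtraction.from_pretrained(
        "sentence-transformers/all-MiniLM-L6-v2",  # illustrative model id
        export=True,
        quantization_config={"bits": 8},  # dict shorthand for an 8-bit config
    )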

optimum/intel/openvino/modeling_base.py

+27 −11
@@ -31,6 +31,7 @@

 from ...exporters.openvino import export, main_export
 from ..utils.import_utils import is_nncf_available
+from .configuration import OVConfig, OVWeightQuantizationConfig
 from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, _print_compiled_model_properties


@@ -91,7 +92,7 @@ def __init__(
         self.generation_config = GenerationConfig.from_model_config(config) if self.can_generate() else None

     @staticmethod
-    def load_model(file_name: Union[str, Path], load_in_8bit: bool = False):
+    def load_model(file_name: Union[str, Path], quantization_config: Union[OVWeightQuantizationConfig, Dict] = None):
        """
        Loads the model.

@@ -118,14 +119,15 @@ def fix_op_names_duplicates(model: openvino.runtime.Model):
         if file_name.suffix == ".onnx":
             model = fix_op_names_duplicates(model)  # should be called during model conversion to IR

-        if load_in_8bit:
+        if quantization_config:
             if not is_nncf_available():
                 raise ImportError(
                     "Quantization of the weights to int8 requires nncf, please install it with `pip install nncf`"
                 )
-            import nncf

-            model = nncf.compress_weights(model)
+            from optimum.intel.openvino.quantization import _weight_only_quantization
+
+            model = _weight_only_quantization(model, quantization_config)

         return model

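`_weight_only_quantization` itself is not part of this diff. As a rough, assumed reading of what `load_model` now delegates to — normalize a dict config, then forward to NNCF — the helper's real body may differ:

    import nncf
    from optimum.intel.openvino.configuration import OVWeightQuantizationConfig

    def _weight_only_quantization_sketch(model, quantization_config):
        # Accept either a dict such as {"bits": 8} or a ready config object.
        if isinstance(quantization_config, dict):
            quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)
        if quantization_config.bits == 8:
            # 8-bit weight compression, matching the old load_in_8bit behaviour.
            return nncf.compress_weights(model)
        # 4-bit modes map onto NNCF's CompressWeightsMode with group size and ratio.
        mode = (
            nncf.CompressWeightsMode.INT4_SYM
            if quantization_config.sym
            else nncf.CompressWeightsMode.INT4_ASYM
        )
        return nncf.compress_weights(
            model,
            mode=mode,
            ratio=quantization_config.ratio,
            group_size=quantization_config.group_size,
        )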
@@ -155,6 +157,7 @@ def _from_pretrained(
         from_onnx: bool = False,
         local_files_only: bool = False,
         load_in_8bit: bool = False,
+        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
         **kwargs,
     ):
         """
@@ -199,7 +202,12 @@
             subfolder=subfolder,
             local_files_only=local_files_only,
         )
-        model = cls.load_model(model_cache_path, load_in_8bit=load_in_8bit)
+
+        # Give default quantization config if not provided and load_in_8bit=True
+        if load_in_8bit:
+            quantization_config = quantization_config or {"bits": 8}
+
+        model = cls.load_model(model_cache_path, quantization_config=quantization_config)
         return cls(model, config=config, model_save_dir=model_cache_path.parent, **kwargs)

     @staticmethod
@@ -252,6 +260,7 @@ def _from_transformers(
         task: Optional[str] = None,
         trust_remote_code: bool = False,
         load_in_8bit: Optional[bool] = None,
+        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
         **kwargs,
     ):
         """
@@ -275,10 +284,11 @@
         save_dir = TemporaryDirectory()
         save_dir_path = Path(save_dir.name)

-        # If load_in_8bit is not specified then compression_option should be set to None and will be set by default in main_export depending on the model size
-        compression_option = None
-        if load_in_8bit is not None:
-            compression_option = "fp32"
+        # If load_in_8bit or quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
+        if load_in_8bit is None or not quantization_config:
+            ov_config = None
+        else:
+            ov_config = OVConfig(dtype="fp32")

         main_export(
             model_name_or_path=model_id,
@@ -291,11 +301,17 @@
             local_files_only=local_files_only,
             force_download=force_download,
             trust_remote_code=trust_remote_code,
-            compression_option=compression_option,
+            ov_config=ov_config,
         )

         config.save_pretrained(save_dir_path)
-        return cls._from_pretrained(model_id=save_dir_path, config=config, load_in_8bit=load_in_8bit, **kwargs)
+        return cls._from_pretrained(
+            model_id=save_dir_path,
+            config=config,
+            load_in_8bit=load_in_8bit,
+            quantization_config=quantization_config,
+            **kwargs,
+        )

     @classmethod
     def _to_load(
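The `{"bits": 8}` fallback keeps `load_in_8bit=True` working: a bare dict is treated as shorthand for an 8-bit weight-only config, normalized later via `from_dict` (as the `modeling_decoder.py` hunks below show). A small illustration of the assumed equivalence:

    from optimum.intel.openvino.configuration import OVWeightQuantizationConfig

    # Both should describe the same 8-bit weight-only quantization.
    from_shorthand = OVWeightQuantizationConfig.from_dict({"bits": 8})
    explicit = OVWeightQuantizationConfig(bits=8)
    assert from_shorthand.bits == explicit.bits == 8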

optimum/intel/openvino/modeling_base_seq2seq.py

+29 −12
@@ -25,6 +25,7 @@
 from transformers.file_utils import add_start_docstrings

 from ...exporters.openvino import main_export
+from .configuration import OVConfig, OVWeightQuantizationConfig
 from .modeling_base import OVBaseModel
 from .utils import (
     ONNX_DECODER_NAME,
@@ -111,6 +112,7 @@ def _from_pretrained(
         use_cache: bool = True,
         from_onnx: bool = False,
         load_in_8bit: bool = False,
+        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
         **kwargs,
     ):
         """
@@ -152,12 +154,19 @@
         decoder_file_name = decoder_file_name or default_decoder_file_name
         decoder_with_past_file_name = decoder_with_past_file_name or default_decoder_with_past_file_name
         decoder_with_past = None
+
+        # Give default quantization config if not provided and load_in_8bit=True
+        if load_in_8bit:
+            quantization_config = quantization_config or {"bits": 8}
+
         # Load model from a local directory
         if os.path.isdir(model_id):
-            encoder = cls.load_model(os.path.join(model_id, encoder_file_name), load_in_8bit)
-            decoder = cls.load_model(os.path.join(model_id, decoder_file_name), load_in_8bit)
+            encoder = cls.load_model(os.path.join(model_id, encoder_file_name), quantization_config)
+            decoder = cls.load_model(os.path.join(model_id, decoder_file_name), quantization_config)
             if use_cache:
-                decoder_with_past = cls.load_model(os.path.join(model_id, decoder_with_past_file_name), load_in_8bit)
+                decoder_with_past = cls.load_model(
+                    os.path.join(model_id, decoder_with_past_file_name), quantization_config
+                )

             model_save_dir = Path(model_id)

@@ -185,10 +194,10 @@
             file_names[name] = model_cache_path

         model_save_dir = Path(model_cache_path).parent
-        encoder = cls.load_model(file_names["encoder"], load_in_8bit)
-        decoder = cls.load_model(file_names["decoder"], load_in_8bit)
+        encoder = cls.load_model(file_names["encoder"], quantization_config)
+        decoder = cls.load_model(file_names["decoder"], quantization_config)
         if use_cache:
-            decoder_with_past = cls.load_model(file_names["decoder_with_past"], load_in_8bit)
+            decoder_with_past = cls.load_model(file_names["decoder_with_past"], quantization_config)

         return cls(
             encoder=encoder,
@@ -214,6 +223,7 @@ def _from_transformers(
         use_cache: bool = True,
         trust_remote_code: bool = False,
         load_in_8bit: Optional[bool] = None,
+        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
         **kwargs,
     ):
         """
@@ -240,13 +250,15 @@

         if task is None:
             task = cls.export_feature
-
         if use_cache:
             task = task + "-with-past"

-        compression_option = None
-        if load_in_8bit is not None:
-            compression_option = "fp32"
+        # If load_in_8bit or quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
+        if load_in_8bit is None or not quantization_config:
+            ov_config = None
+        else:
+            ov_config = OVConfig(dtype="fp32")
+
         main_export(
             model_name_or_path=model_id,
             output=save_dir_path,
@@ -258,12 +270,17 @@
             local_files_only=local_files_only,
             force_download=force_download,
             trust_remote_code=trust_remote_code,
-            compression_option=compression_option,
+            ov_config=ov_config,
         )

         config.save_pretrained(save_dir_path)
         return cls._from_pretrained(
-            model_id=save_dir_path, config=config, use_cache=use_cache, load_in_8bit=load_in_8bit, **kwargs
+            model_id=save_dir_path,
+            config=config,
+            use_cache=use_cache,
+            load_in_8bit=load_in_8bit,
+            quantization_config=quantization_config,
+            **kwargs,
         )

     def _reshape(self, model: openvino.runtime.Model, batch_size: int, sequence_length: int, is_decoder=True):
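For seq2seq models one config is applied uniformly to the encoder, the decoder, and (with `use_cache=True`) the decoder-with-past. Sketch of the user-facing call (model id illustrative):

    from optimum.intel import OVModelForSeq2SeqLM

    # load_in_8bit=True now expands internally to quantization_config={"bits": 8},
    # so all three submodels are compressed with the same settings.
    model = OVModelForSeq2SeqLM.from_pretrained(
        "t5-small",  # illustrative model id
        export=True,
        load_in_8bit=True,
    )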

optimum/intel/openvino/modeling_decoder.py

+13 −9
@@ -34,7 +34,7 @@
 from ...exporters.openvino.stateful import model_has_state
 from ..utils.import_utils import is_nncf_available
 from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS
-from .configuration import OVWeightQuantizationConfig, _check_default_4bit_configs
+from .configuration import OVConfig, OVWeightQuantizationConfig, _check_default_4bit_configs
 from .modeling import _TOKENIZER_FOR_DOC, INPUTS_DOCSTRING, MODEL_START_DOCSTRING, OVModel
 from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, STR_TO_OV_TYPE

@@ -252,14 +252,14 @@ def _from_transformers(

         if task is None:
             task = cls.export_feature
-
         if use_cache:
             task = task + "-with-past"

-        # If load_in_8bit is not specified then compression_option should be set to None and will be set by default in main_export depending on the model size
-        compression_option = None
-        if load_in_8bit is not None or quantization_config is not None:
-            compression_option = "fp32"
+        # If load_in_8bit or quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
+        if load_in_8bit is None or not quantization_config:
+            ov_config = None
+        else:
+            ov_config = OVConfig(dtype="fp32")

         stateful = kwargs.pop("stateful", ensure_stateful_is_available(warn=False) and use_cache)

@@ -274,7 +274,7 @@
             local_files_only=local_files_only,
             force_download=force_download,
             trust_remote_code=trust_remote_code,
-            compression_option=compression_option,
+            ov_config=ov_config,
             stateful=stateful,
         )

@@ -285,8 +285,8 @@
             model_id=save_dir_path,
             config=config,
             use_cache=use_cache,
-            load_in_8bit=load_in_8bit,
             stateful=None,
+            load_in_8bit=load_in_8bit,
             quantization_config=quantization_config,
             **kwargs,
         )
@@ -576,11 +576,15 @@ def _from_pretrained(
             local_files_only=local_files_only,
         )

+        # Give default quantization config if not provided and load_in_8bit=True
+        if load_in_8bit:
+            quantization_config = quantization_config or {"bits": 8}
+
         if isinstance(quantization_config, dict):
             quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)

         load_in_4bit = quantization_config.bits == 4 if quantization_config else False
-        model = cls.load_model(model_cache_path, load_in_8bit=False if load_in_4bit else load_in_8bit)
+        model = cls.load_model(model_cache_path, quantization_config=None if load_in_4bit else quantization_config)

         model_type = config.model_type.replace("_", "-")
         if model_type == "bloom":
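For decoder models a 4-bit config deliberately skips compression inside `load_model` (`quantization_config=None` is passed when `bits == 4`), the 4-bit pass being applied in a separate step. A hedged example of requesting 4-bit weights (model id illustrative):

    from optimum.intel import OVModelForCausalLM
    from optimum.intel.openvino.configuration import OVWeightQuantizationConfig

    # bits=4 makes load_in_4bit True above, so load_model() receives
    # quantization_config=None and 4-bit compression happens later.
    model = OVModelForCausalLM.from_pretrained(
        "some-org/some-llm",  # illustrative model id
        quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=128),
    )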
