diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index 29abd00034..c097562651 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -124,6 +124,7 @@ "OVModelForVision2Seq", "OVModelForSequenceClassification", "OVModelForTokenClassification", + "OVQuantizationConfig", "OVWeightQuantizationConfig", "OVConfig", ] @@ -243,6 +244,7 @@ OVModelForSpeechSeq2Seq, OVModelForTokenClassification, OVModelForVision2Seq, + OVQuantizationConfig, OVWeightQuantizationConfig, ) diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index 27a966865f..0cd7d8a029 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -43,7 +43,7 @@ from .trainer import OVTrainer -from .configuration import OVConfig, OVWeightQuantizationConfig +from .configuration import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig from .modeling import ( OVModelForAudioClassification, OVModelForAudioFrameClassification, diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 40a60bb58e..e75301729d 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -11,71 +11,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import copy +import inspect +import logging from dataclasses import dataclass +from enum import Enum from typing import Any, Dict, List, Optional, Union +import nncf import torch +from nncf.quantization.advanced_parameters import OverflowFix from transformers import PretrainedConfig -from transformers.utils.quantization_config import QuantizationConfigMixin +from transformers.utils.quantization_config import QuantizationConfigMixin, QuantizationMethod from optimum.configuration_utils import BaseConfig -DEFAULT_QUANTIZATION_CONFIG = { - "algorithm": "quantization", - "preset": "mixed", - "overflow_fix": "disable", - "initializer": { - "range": {"num_init_samples": 300, "type": "mean_min_max"}, - "batchnorm_adaptation": {"num_bn_adaptation_samples": 0}, - }, - "scope_overrides": {"activations": {"{re}.*matmul_0": {"mode": "symmetric"}}}, - "ignored_scopes": [ - "{re}.*Embedding.*", - "{re}.*add___.*", - "{re}.*layer_norm_.*", - "{re}.*matmul_1", - "{re}.*__truediv__.*", - ], -} - -INT8_WEIGHT_COMPRESSION_CONFIG = { - "algorithm": "quantization", - "weights": { - "mode": "symmetric", - "bits": 8, - "target_scopes": [ - "{re}.*Embedding.*", - "{re}.*matmul_.*", - "{re}.*addmm_.*", - "{re}.*baddmm_.*", - "{re}.*linear_.*", - ], - "ignored_scopes": [ - "{re}.*conv_*", - ], - }, - "activations": { - "ignored_scopes": [ - "{re}.*add___.*", - "{re}.*__radd___.*", - "{re}.*layer_norm_.*", - "{re}.*__truediv__.*", - "{re}.*__mul___.*", - "{re}.*__rmul___.*", - "{re}.*tanh_.*", - "{re}.*pow_.*", - "{re}.*matmul_.*", - "{re}.*addmm_.*", - "{re}.*baddmm_.*", - "{re}.*linear_.*", - "{re}.*conv_.*", - ], - }, - "overflow_fix": "disable", -} - +logger = logging.getLogger(__name__) _DEFAULT_4BIT_CONFIGS = { "databricks/dolly-v2-3b": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.5}, @@ -100,31 +52,75 @@ } +@dataclass +class OVQuantizationConfigBase(QuantizationConfigMixin): + """ + Base configuration class for quantization parameters + """ + + def __init__( + self, + ignored_scope: Optional[dict] = None, + num_samples: Optional[int] = None, + weight_only: Optional[bool] = None, + **kwargs, + ): + """ + 
Args: + ignored_scope (`dict`, *optional*): + An ignored scope that defines a list of model nodes to be ignored during quantization. Dictionary + entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class. + num_samples (`int`, *optional*): + The maximum number of samples composing the calibration dataset. + weight_only (`bool`, *optional*): + Used to explicitly specify type of quantization (weight-only of full) to apply. + """ + if isinstance(ignored_scope, nncf.IgnoredScope): + ignored_scope = ignored_scope.__dict__ + self.ignored_scope = ignored_scope + self.num_samples = num_samples + self.weight_only = weight_only + + def post_init(self): + try: + self.get_ignored_scope_instance() + except Exception as e: + raise ValueError( + f"Can't create an `IgnoredScope` object from the provided ignored scope dict: {self.ignored_scope}.\n{e}" + ) + if not (self.num_samples is None or isinstance(self.num_samples, int) and self.num_samples > 0): + raise ValueError(f"`num_samples` is expected to be a positive integer, but found: {self.num_samples}") + + def get_ignored_scope_instance(self) -> nncf.IgnoredScope: + if self.ignored_scope is None: + return nncf.IgnoredScope() + return nncf.IgnoredScope(**copy.deepcopy(self.ignored_scope)) + + class OVConfig(BaseConfig): CONFIG_NAME = "openvino_config.json" FULL_CONFIGURATION_FILE = "openvino_config.json" def __init__( self, - compression: Union[List[Dict], Dict, None] = None, input_info: Optional[List] = None, save_onnx_model: bool = False, - quantization_config: Optional[Union[QuantizationConfigMixin, Dict]] = None, + quantization_config: Optional[Union[dict, OVQuantizationConfigBase]] = None, dtype: Optional[str] = None, **kwargs, ): super().__init__() - self.compression = compression self.input_info = input_info self.save_onnx_model = save_onnx_model - self._enable_standard_onnx_export_option() self.optimum_version = kwargs.pop("optimum_version", None) - self.quantization_config = quantization_config or {} + if isinstance(quantization_config, dict): + quantization_config = self._quantization_config_from_dict(quantization_config) + self.quantization_config = quantization_config + self.compression = None # A field for backward-compatability of training-time compression parameters - if isinstance(quantization_config, QuantizationConfigMixin): - bits = self.quantization_config.bits - else: - bits = self.quantization_config.get("bits", None) + bits = ( + self.quantization_config.bits if isinstance(self.quantization_config, OVWeightQuantizationConfig) else None + ) self.dtype = "int" + str(bits) if isinstance(bits, int) else dtype def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False): @@ -137,41 +133,68 @@ def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False): for name, value in model_inputs.items() ] - def save_pretrained(self, *args, **kwargs): - super().save_pretrained(*args, **kwargs) - - def _enable_standard_onnx_export_option(self): - # This method depends on self.save_onnx_model. - # save_onnx_model is defaulted to false so that the final model output is - # in OpenVINO IR to realize performance benefit in OpenVINO runtime. - # True value of save_onnx_model will save a model in onnx format. 
- if ( - isinstance(self.compression, dict) - and "algorithm" in self.compression - and self.compression["algorithm"] == "quantization" - ): - self.compression["export_to_onnx_standard_ops"] = self.save_onnx_model - elif isinstance(self.compression, list): - for i, algo_config in enumerate(self.compression): - if algo_config["algorithm"] == "quantization": - self.compression[i]["export_to_onnx_standard_ops"] = self.save_onnx_model + @staticmethod + def _quantization_config_from_dict(quantization_config: dict) -> OVQuantizationConfigBase: + wq_args = inspect.getfullargspec(OVWeightQuantizationConfig.__init__).args + q_args = inspect.getfullargspec(OVQuantizationConfig.__init__).args + config_keys = quantization_config.keys() + matches_wq_config_signature = all(arg_name in wq_args for arg_name in config_keys) + matches_q_config_signature = all(arg_name in q_args for arg_name in config_keys) + if matches_wq_config_signature == matches_q_config_signature: + weight_only = quantization_config.get("weight_only", None) + if weight_only is None: + logger.warning( + "Can't determine type of OV quantization config. Please specify explicitly whether you intend to " + "run weight-only quantization or not with `weight_only` parameter. Creating an instance of " + "OVWeightQuantizationConfig." + ) + return OVWeightQuantizationConfig.from_dict(quantization_config) + matches_wq_config_signature = weight_only + + config_type = OVWeightQuantizationConfig if matches_wq_config_signature else OVQuantizationConfig + return config_type.from_dict(quantization_config) + + def _to_dict_safe(self, to_diff_dict: bool = False) -> Dict[str, Any]: + class ConfigStub: + def to_dict(self): + return None + + def to_diff_dict(self): + return None + + if self.quantization_config is None: + # Parent to_dict() implementation does not support quantization_config being None + self_copy = copy.deepcopy(self) + self_copy.quantization_config = ConfigStub() + result = self_copy.to_diff_dict() if to_diff_dict else self_copy.to_dict() + else: + result = super().to_diff_dict() if to_diff_dict else super().to_dict() + return result + + def to_dict(self) -> Dict[str, Any]: + return self._to_dict_safe(to_diff_dict=False) + + def to_diff_dict(self) -> Dict[str, Any]: + return self._to_dict_safe(to_diff_dict=True) + + +class OVQuantizationMethod(str, Enum): + DEFAULT = "default" @dataclass -class OVWeightQuantizationConfig(QuantizationConfigMixin): +class OVWeightQuantizationConfig(OVQuantizationConfigBase): """ This is a wrapper class about all possible attributes and features that you can play with a model that has been - loaded using `optimum-intel` api for quantization with NNCF. - + loaded using `optimum-intel` api for weight-only quantization with NNCF. For full model quantization please see + OVQuantizationConfig. Args: - bits (`int`, defaults to 8): The number of bits to quantize to. sym (`bool`, defaults to `False`): - Whether to use symetric quantization. - tokenizer (`str` or `PreTrainedTokenizerBase`, *optional*): + Whether to use symmetric quantization. + tokenizer (`str`, *optional*): The tokenizer used to process the dataset. You can pass either: - - A custom tokenizer object. - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. 
@@ -179,30 +202,37 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin): using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. dataset (`str or List[str]`, *optional*): The dataset used for data-aware compression or quantization with NNCF. You can provide your own dataset - in a list of strings or just use the one from the list ['wikitext2','c4','c4-new','ptb','ptb-new'] for LLLMs + in a list of strings or just use the one from the list ['wikitext','c4','c4-new','ptb','ptb-new'] for LLLMs or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models. + Alternatively, you can provide data objects via `calibration_dataset` argument + of `OVQuantizer.quantize()` method. ratio (`float`, defaults to 1.0): The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM and the rest to INT8_ASYM). group_size (`int`, *optional*): The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization. all_layers (`bool`, *optional*): - Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit presicion. + Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit precision. sensitivity_metric (`str`, *optional*): The sensitivity metric for assigning quantization precision to layers. In order to preserve the accuracy of the model, the more sensitive layers receives a higher precision. ignored_scope (`dict`, *optional*): - An ignored scope that defined the list of model control flow graph nodes to be ignored during quantization. + An ignored scope that defines the list of model nodes to be ignored during quantization. Dictionary + entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class. num_samples (`int`, *optional*): The maximum number of samples composing the calibration dataset. - + quant_method (`str`, defaults of OVQuantizationMethod.DEFAULT): + Weight compression method to apply. + weight_only (`bool`, *optional*): + Used to explicitly specify type of quantization (weight-only of full) to apply. Useful when building + the config from dictionary. """ def __init__( self, bits: int = 8, sym: bool = False, - tokenizer: Optional[Any] = None, + tokenizer: Optional[str] = None, dataset: Optional[Union[str, List[str]]] = None, ratio: float = 1.0, group_size: Optional[int] = None, @@ -210,8 +240,16 @@ def __init__( sensitivity_metric: Optional[str] = None, ignored_scope: Optional[dict] = None, num_samples: Optional[int] = None, + quant_method: Optional[Union[QuantizationMethod, OVQuantizationMethod]] = OVQuantizationMethod.DEFAULT, + weight_only: Optional[bool] = True, **kwargs, ): + if weight_only is False: + logger.warning( + "Trying to create an instance of `OVWeightQuantizationConfig` with `weight_only` being " + "False. Please check your configuration." 
+ ) + super().__init__(ignored_scope, num_samples, True) self.bits = bits self.sym = sym self.tokenizer = tokenizer @@ -220,21 +258,25 @@ def __init__( self.ratio = ratio self.all_layers = all_layers self.sensitivity_metric = sensitivity_metric - self.ignored_scope = ignored_scope - self.num_samples = num_samples - self.quant_method = "default" # TODO : enable AWQ after nncf v2.9.0 release + self.quant_method = quant_method self.post_init() def post_init(self): r""" Safety checker that arguments are correct """ + super().post_init() if self.ratio is not None and not (0 <= self.ratio <= 1): raise ValueError("`ratio` must between 0 and 1.") if self.group_size is not None and self.group_size != -1 and self.group_size <= 0: raise ValueError("`group_size` must be greater than 0 or equal to -1") + if not (self.dataset is None or isinstance(self.dataset, (str, list))): + raise ValueError( + f"Dataset must be a instance of either string or list of strings, but found {type(self.dataset)}. " + f"If you wish to provide a custom dataset please pass it via `calibration_dataset` argument." + ) if self.dataset is not None and isinstance(self.dataset, str): - llm_datasets = ["wikitext2", "c4", "c4-new", "ptb", "ptb-new"] + llm_datasets = ["wikitext", "c4", "c4-new", "ptb", "ptb-new"] stable_diffusion_datasets = [ "conceptual_captions", "laion/220k-GPT4Vision-captions-from-LIVIS", @@ -259,6 +301,87 @@ def post_init(self): f"For 8-bit quantization, `group_size` is expected to be set to -1, but was set to {self.group_size}" ) + if self.tokenizer is not None and not isinstance(self.tokenizer, str): + raise ValueError(f"Tokenizer is expected to be a string, but found {self.tokenizer}") + + +@dataclass +class OVQuantizationConfig(OVQuantizationConfigBase): + def __init__( + self, + ignored_scope: Optional[dict] = None, + num_samples: Optional[int] = 300, + preset: nncf.QuantizationPreset = None, + model_type: nncf.ModelType = nncf.ModelType.TRANSFORMER, + fast_bias_correction: bool = True, + overflow_fix: OverflowFix = OverflowFix.DISABLE, + weight_only: Optional[bool] = False, + **kwargs, + ): + """ + Configuration class containing parameters related to model quantization with NNCF. Compared to weight + compression, during quantization both weights and activations are converted to lower precision. + For weight-only model quantization please see OVWeightQuantizationConfig. + Args: + ignored_scope (`dict`, *optional*): + An ignored scope that defines the list of model nodes to be ignored during quantization. Dictionary + entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class. + num_samples (`int`, *optional*): + The maximum number of samples composing the calibration dataset. + preset (`nncf.QuantizationPreset`, *optional*): + A preset controls the quantization mode (symmetric and asymmetric). + It can take the following values: + - `performance`: Symmetric quantization of weights and activations. + - `mixed`: Symmetric quantization of weights and asymmetric quantization of activations. + Default value is None. In this case, `mixed` preset is used for `transformer` + model type otherwise `performance`. + model_type (`nncf.ModelType`, defaults to nncf.ModelType.TRANSFORMER): + Model type is needed to specify additional patterns in the model. Supported only `transformer` now. + fast_bias_correction (`bool`, defaults to True): + Whether to apply fast or full bias correction algorithm. 
+ overflow_fix (`nncf.OverflowFix`, default to OverflowFix.DISABLE): + Parameter for controlling overflow fix setting. + weight_only (`bool`, *optional*): + Used to explicitly specify type of quantization (weight-only of full) to apply. Useful when building + the config from dictionary. + """ + if weight_only is True: + logger.warning( + "Trying to create an instance of `OVQuantizationConfig` with `weight_only` being True. " + "Please check your configuration." + ) + super().__init__(ignored_scope, num_samples, False) + # TODO: remove checks below once NNCF is updated to 2.10 + if isinstance(overflow_fix, str): + overflow_fix = OverflowFix(overflow_fix) + if isinstance(preset, str): + preset = nncf.QuantizationPreset(preset) + + self.preset = preset + self.model_type = model_type + self.fast_bias_correction = fast_bias_correction + self.overflow_fix = overflow_fix + self.post_init() + + def to_dict(self) -> Dict[str, Any]: + # TODO: remove code below once NNCF is updated to 2.10 + if isinstance(self.overflow_fix, Enum) or isinstance(self.preset, Enum): + overflow_fix_value = ( + None + if self.overflow_fix is None + else self.overflow_fix + if isinstance(self.overflow_fix, str) + else self.overflow_fix.value + ) + preset_value = ( + None if self.preset is None else self.preset if isinstance(self.preset, str) else self.preset.value + ) + self_copy = copy.deepcopy(self) + self_copy.overflow_fix = overflow_fix_value + self_copy.preset = preset_value + return self_copy.to_dict() + return super().to_dict() + def _check_default_4bit_configs(config: PretrainedConfig): return _DEFAULT_4BIT_CONFIGS.get(config.name_or_path, None) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index a6b8aacf43..d5b19bb28c 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -100,13 +100,21 @@ def __init__( self._openvino_config = OVConfig(quantization_config=quantization_config) @staticmethod - def load_model(file_name: Union[str, Path], quantization_config: Union[OVWeightQuantizationConfig, Dict] = None): + def load_model( + file_name: Union[str, Path], + quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, + calibration_dataset: Optional = None, + ): """ Loads the model. Arguments: file_name (`str` or `Path`): The path of the model ONNX or XML file. + quantization_config (`OVWeightQuantizationConfig` or `Dict`, *optional*): + Quantization config to apply after model is loaded. + calibration_dataset (`nncf.Dataset`, *optional*): + Optional nncf.Dataset to feed to model weight compression when quantization config is provided. 
""" def fix_op_names_duplicates(model: openvino.runtime.Model): @@ -135,7 +143,7 @@ def fix_op_names_duplicates(model: openvino.runtime.Model): from optimum.intel.openvino.quantization import _weight_only_quantization - model = _weight_only_quantization(model, quantization_config) + model = _weight_only_quantization(model, quantization_config, calibration_dataset=calibration_dataset) return model diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 4b156eda9e..44137186e2 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -572,7 +572,7 @@ def _from_pretrained( from_onnx: bool = False, local_files_only: bool = False, load_in_8bit: bool = False, - quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, + quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None, **kwargs, ): model_path = Path(model_id) @@ -596,7 +596,12 @@ def _from_pretrained( quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) load_in_4bit = quantization_config.bits == 4 if quantization_config else False - model = cls.load_model(model_cache_path, quantization_config=None if load_in_4bit else quantization_config) + calibration_dataset = kwargs.get("calibration_dataset", None) + model = cls.load_model( + model_cache_path, + quantization_config=None if load_in_4bit else quantization_config, + calibration_dataset=calibration_dataset, + ) model_type = config.model_type.replace("_", "-") if model_type == "bloom": @@ -632,7 +637,7 @@ def _from_pretrained( f"For the given model, we recommend the following `quantization_config` : {default_config}" ) - if isinstance(quantization_config.dataset, str): + if calibration_dataset is None and isinstance(quantization_config.dataset, str): tokenizer = quantization_config.tokenizer or AutoTokenizer.from_pretrained(model_id) from optimum.gptq.data import get_dataset, prepare_dataset @@ -644,9 +649,9 @@ def _from_pretrained( dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples) dataset = prepare_dataset(dataset) quantization_config = copy.deepcopy(quantization_config) - quantization_config.dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x)) + calibration_dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x)) - _weight_only_quantization(model, quantization_config) + _weight_only_quantization(model, quantization_config, calibration_dataset) return causal_model diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index a2579611a4..33985dbe6e 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -18,13 +18,14 @@ import os from collections import deque from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union +import datasets import nncf import openvino import torch import transformers -from nncf import CompressWeightsMode, IgnoredScope, SensitivityMetric +from nncf import CompressWeightsMode, SensitivityMetric from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters from nncf.torch import register_module from nncf.torch.initialization import PTInitializingDataLoader @@ -46,7 +47,7 @@ from ..utils.constant import _TASK_ALIASES from ..utils.import_utils import DATASETS_IMPORT_ERROR, is_datasets_available from 
..utils.modeling_utils import get_model_device -from .configuration import OVConfig, OVWeightQuantizationConfig +from .configuration import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig from .modeling_base import OVBaseModel from .utils import ( MAX_ONNX_OPSET, @@ -203,39 +204,52 @@ def from_pretrained(cls, model: PreTrainedModel, **kwargs): def quantize( self, - calibration_dataset: "Dataset" = None, + calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None, save_directory: Union[str, Path] = None, ov_config: OVConfig = None, file_name: Optional[str] = None, batch_size: int = 1, data_collator: Optional[DataCollator] = None, remove_unused_columns: bool = True, - weights_only: bool = False, + weights_only: bool = None, **kwargs, ): """ Quantize a model given the optimization specifications defined in `quantization_config`. Args: - calibration_dataset (`datasets.Dataset`): - The dataset to use for the calibration step. + calibration_dataset (`datasets.Dataset` or `nncf.Dataset` or `Iterable`, *optional*): + A collection of data samples to use for quantization calibration. Is optional for weight-only + quantization and is required for full quantization. save_directory (`Union[str, Path]`): The directory where the quantized model should be saved. - quantization_config (`OVConfig`, *optional*): - The configuration containing the parameters related to quantization. + ov_config (`OVConfig`, *optional*): + The configuration containing the parameters related to quantization. If not provided, 8-bit symmetric + weight-only quantization will be applied. file_name (`str`, *optional*): The model file name to use when saving the model. Overwrites the default file name `"model.onnx"`. - batch_size (`int`, defaults to 8): + batch_size (`int`, defaults to 1): The number of calibration samples to load per batch. data_collator (`DataCollator`, *optional*): The function to use to form a batch from a list of elements of the calibration dataset. remove_unused_columns (`bool`, defaults to `True`): - Whether or not to remove the columns unused by the model forward method. - weights_only (`bool`, defaults to `False`): + Whether to remove the columns unused by the model forward method. + weights_only (`bool`, *optional*): + Being deprecated. Compress weights to integer precision (8-bit by default) while keeping activations floating-point. Fits best for LLM footprint reduction and performance acceleration. 
Examples: + ```python + >>> from optimum.intel.openvino import OVQuantizer, OVModelForCausalLM + >>> from transformers import AutoModelForCausalLM + >>> model = AutoModelForCausalLM.from_pretrained("databricks/dolly-v2-3b") + >>> quantizer = OVQuantizer.from_pretrained(model, task="text-generation") + >>> ov_config = OVConfig(quantization_config=OVWeightQuantizationConfig()) + >>> quantizer.quantize(ov_config=ov_config, save_directory="./quantized_model") + >>> optimized_model = OVModelForCausalLM.from_pretrained("./quantized_model") + ``` + ```python >>> from optimum.intel.openvino import OVQuantizer, OVModelForSequenceClassification >>> from transformers import AutoModelForSequenceClassification @@ -243,53 +257,46 @@ def quantize( >>> # or >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english") >>> quantizer = OVQuantizer.from_pretrained(model, task="text-classification") - >>> quantizer.quantize(calibration_dataset=calibration_dataset, save_directory="./quantized_model") + >>> ov_config = OVConfig(quantization_config=OVQuantizationConfig()) + >>> quantizer.quantize(calibration_dataset=dataset, ov_config=ov_config, save_directory="./quantized_model") >>> optimized_model = OVModelForSequenceClassification.from_pretrained("./quantized_model") ``` - - ```python - >>> from optimum.intel.openvino import OVQuantizer, OVModelForCausalLM - >>> from transformers import AutoModelForCausalLM - >>> model = AutoModelForCausalLM.from_pretrained("databricks/dolly-v2-3b") - >>> quantizer = OVQuantizer.from_pretrained(model, task="text-generation") - >>> quantizer.quantize(save_directory="./quantized_model", weights_only=True) - >>> optimized_model = OVModelForCausalLM.from_pretrained("./quantized_model") - ``` """ + # TODO: deprecate weights_only argument + if weights_only is not None: + logger.warning( + "`weights_only` argument is deprecated. In the future please provide `ov_config.quantization_config` " + "as an instance of OVWeightQuantizationConfig for weight-only compression or as an instance of " + "OVQuantizationConfig for full model quantization." + ) + if save_directory is None: # TODO : can be set to self.model.config.name_or_path for OVModels when not provided raise ValueError("`save_directory` needs to be specified") - if weights_only: - if calibration_dataset is not None: - logger.warning( - "`calibration_dataset` was provided but will not be used as `weights_only` is set to `True`." - ) - else: - if calibration_dataset is None: - raise ValueError( - "`calibration_dataset` is needed to compute the activations range during the calibration step and was not provided. " - "In case you only want to apply quantization on the weights, please set `weights_only=True`." 
- ) - quantization_config = kwargs.pop("quantization_config", None) - if quantization_config is not None: - logger.warning( - "The argument `quantization_config` is deprecated, and will be removed in optimum-intel v1.6.0, please use `ov_config` instead" - ) - ov_config = ov_config or quantization_config - if ov_config is not None: - if not isinstance(ov_config, OVConfig): - raise TypeError(f"`ov_config` should be an `OVConfig`, but got: {type(ov_config)} instead.") + if ov_config is None: + ov_config = OVConfig() + if not isinstance(ov_config, OVConfig): + raise TypeError(f"`ov_config` should be an `OVConfig`, but got: {type(ov_config)} instead.") + quantization_config = ov_config.quantization_config + if quantization_config is None: + if weights_only is None or weights_only is True: + if weights_only is None: + logger.info( + "`quantization_config` was not provided, 8-bit asymmetric weight quantization will be applied." + ) + ov_config.quantization_config = OVWeightQuantizationConfig(bits=8) + else: + ov_config.quantization_config = OVQuantizationConfig() if isinstance(self.model, OVBaseModel): self._quantize_ovbasemodel( - calibration_dataset, + ov_config, save_directory, + calibration_dataset, batch_size, data_collator, remove_unused_columns, - weights_only, - ov_config, **kwargs, ) @@ -299,84 +306,97 @@ def quantize( "To convert a PyTorch model to OpenVINO, you can set `export=True` when loading your model as `OVModelForXxx.from_pretrained(..., export=True)`" ) self._quantize_torchmodel( - calibration_dataset, + ov_config, save_directory, + calibration_dataset, file_name, batch_size, data_collator, remove_unused_columns, - weights_only, + **kwargs, ) else: raise TypeError(f"Unsupported model type: {type(self.model)}") def _quantize_ovbasemodel( self, - calibration_dataset: "Dataset", + ov_config: OVConfig, save_directory: Union[str, Path], + calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None, batch_size: int = 1, data_collator: Optional[DataCollator] = None, remove_unused_columns: bool = True, - weights_only: bool = False, - ov_config: OVConfig = None, **kwargs, ): save_directory = Path(save_directory) save_directory.mkdir(parents=True, exist_ok=True) - if weights_only: - q_config = getattr(ov_config, "quantization_config", None) - # Use default 8-bit compression if not provided - q_config = q_config or OVWeightQuantizationConfig(bits=8, sym=True) - _weight_only_quantization(self.model.model, q_config) - + quantization_config = ov_config.quantization_config + if isinstance(quantization_config, OVWeightQuantizationConfig): + _weight_only_quantization(self.model.model, quantization_config, calibration_dataset) self.model.save_pretrained(save_directory) + ov_config.save_pretrained(save_directory) return + if not isinstance(quantization_config, OVQuantizationConfig): + raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}") - calibration_dataloader = self._get_calibration_dataloader( - calibration_dataset=calibration_dataset, - batch_size=batch_size, - remove_unused_columns=remove_unused_columns, - data_collator=data_collator, - ) + if isinstance(calibration_dataset, nncf.Dataset): + quantization_dataset = calibration_dataset + elif isinstance(calibration_dataset, datasets.Dataset): + calibration_dataloader = self._get_calibration_dataloader( + calibration_dataset=calibration_dataset, + batch_size=batch_size, + remove_unused_columns=remove_unused_columns, + data_collator=data_collator, + ) - if self.model.export_feature 
== "text-generation" and self.model.use_cache: - # Prefeth past_key_values - self.model.update_pkv_precision(True) - self.model.compile() - subset_size = kwargs.get("subset_size", 300) - collected_inputs = [] - - self.model.request = InferRequestWrapper(self.model.request, collected_inputs) - for _, data in enumerate(calibration_dataloader): - self.model.generate(**data, max_new_tokens=1) - if len(collected_inputs) >= subset_size: - break - self.model.request = self.model.request.request - calibration_dataloader = collected_inputs + if self.model.export_feature == "text-generation" and self.model.use_cache: + # Prefetch past_key_values + self.model.update_pkv_precision(True) + self.model.compile() + collected_inputs = [] + + self.model.request = InferRequestWrapper(self.model.request, collected_inputs) + try: + for data in calibration_dataloader: + self.model.generate(**data, max_new_tokens=1) + if len(collected_inputs) >= quantization_config.num_samples: + break + finally: + self.model.request = self.model.request.request + quantization_dataset = nncf.Dataset(collected_inputs) + else: + quantization_dataset = nncf.Dataset(calibration_dataloader) + else: + if calibration_dataset is None: + raise ValueError("Calibration dataset is required to run quantization.") + quantization_dataset = nncf.Dataset(calibration_dataset) # Actual model quantization - quantization_dataset = nncf.Dataset(calibration_dataloader) quantized_model = nncf.quantize( self.model.model, quantization_dataset, - model_type=nncf.ModelType.TRANSFORMER if not kwargs.get("model_type") else kwargs.get("model_type"), - fast_bias_correction=kwargs.get("fast_bias_correction", True), + subset_size=quantization_config.num_samples, + ignored_scope=quantization_config.get_ignored_scope_instance(), + model_type=quantization_config.model_type, + preset=quantization_config.preset, + fast_bias_correction=quantization_config.fast_bias_correction, + advanced_parameters=nncf.AdvancedQuantizationParameters(overflow_fix=quantization_config.overflow_fix), **kwargs, ) self.model.model = quantized_model self.model.save_pretrained(save_directory) + ov_config.save_pretrained(save_directory) def _quantize_torchmodel( self, - calibration_dataset: "Dataset", + ov_config: OVConfig, save_directory: Union[str, Path], + calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None, file_name: Optional[str] = None, batch_size: int = 1, data_collator: Optional[DataCollator] = None, remove_unused_columns: bool = True, - weights_only: bool = False, - save_onnx_model: bool = False, **kwargs, ): self._set_task() @@ -394,6 +414,7 @@ def _quantize_torchmodel( model_type=model_type, ) + save_onnx_model = ov_config.save_onnx_model onnx_file_name = ( ONNX_WEIGHTS_NAME if file_name is None and save_onnx_model else Path(ov_file_name).with_suffix(".onnx") ) @@ -412,7 +433,8 @@ def _quantize_torchmodel( stateful = ensure_stateful_is_available() and ensure_export_task_support_stateful(task) - if weights_only: + quantization_config = ov_config.quantization_config + if isinstance(quantization_config, OVWeightQuantizationConfig): if stateful: # patch model before weight compression model = patch_model_with_bettertransformer(model) @@ -426,6 +448,8 @@ def _quantize_torchmodel( nncf.compress_weights(model, dataset=nncf.Dataset([dummy_inputs])) else: + if not isinstance(quantization_config, OVQuantizationConfig): + raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}") if stateful: logger.warn( "Quantization 
algorithm does not support optimized stateful models. " @@ -433,19 +457,29 @@ def _quantize_torchmodel( ) stateful = False - calibration_dataloader = self._get_calibration_dataloader( - calibration_dataset=calibration_dataset, - batch_size=batch_size, - remove_unused_columns=remove_unused_columns, - data_collator=data_collator, - ) - - quantization_dataset = nncf.Dataset(calibration_dataloader) + if isinstance(calibration_dataset, nncf.Dataset): + quantization_dataset = calibration_dataset + elif isinstance(calibration_dataset, datasets.Dataset): + calibration_dataloader = self._get_calibration_dataloader( + calibration_dataset=calibration_dataset, + batch_size=batch_size, + remove_unused_columns=remove_unused_columns, + data_collator=data_collator, + ) + quantization_dataset = nncf.Dataset(calibration_dataloader) + else: + if calibration_dataset is None: + raise ValueError("Calibration dataset is required to run quantization.") + quantization_dataset = nncf.Dataset(calibration_dataset) model = nncf.quantize( model, quantization_dataset, - model_type=nncf.ModelType.TRANSFORMER if not kwargs.get("model_type") else kwargs.get("model_type"), - fast_bias_correction=kwargs.get("fast_bias_correction", True), + subset_size=quantization_config.num_samples, + ignored_scope=quantization_config.get_ignored_scope_instance(), + model_type=quantization_config.model_type, + preset=quantization_config.preset, + fast_bias_correction=quantization_config.fast_bias_correction, + advanced_parameters=nncf.AdvancedQuantizationParameters(overflow_fix=quantization_config.overflow_fix), **kwargs, ) @@ -472,6 +506,8 @@ def _quantize_torchmodel( except FileNotFoundError: pass + ov_config.save_pretrained(save_directory) + @staticmethod def _save_pretrained(model: openvino.runtime.Model, output_path: str): compress_quantize_weights_transformation(model) @@ -503,7 +539,7 @@ def get_calibration_dataset( preprocess_batch: bool = True, use_auth_token: bool = False, cache_dir: Optional[str] = None, - ) -> "Dataset": + ) -> datasets.Dataset: """ Create the calibration `datasets.Dataset` to use for the post-training static quantization calibration step. @@ -580,18 +616,33 @@ def _remove_unused_columns(self, dataset: "Dataset"): def _weight_only_quantization( - model: openvino.runtime.Model, quantization_config: Union[OVWeightQuantizationConfig, Dict] + model: openvino.runtime.Model, + quantization_config: Union[OVWeightQuantizationConfig, Dict], + calibration_dataset: Optional[Union[nncf.Dataset, Iterable]] = None, ) -> openvino.runtime.Model: config = quantization_config if isinstance(config, dict): config = OVWeightQuantizationConfig.from_dict(quantization_config) - dataset = config.dataset - - if config.dataset is not None and isinstance(config.dataset, str): - tokenizer = config.tokenizer - if isinstance(tokenizer, str): - tokenizer = AutoTokenizer.from_pretrained(tokenizer) + if config.dataset is not None and calibration_dataset is not None: + logger.info( + "Both `quantization_config.dataset` and `calibration_dataset` were provided for weight only " + "quantization. Will rely on `calibration_dataset`." + ) + dataset = None + if calibration_dataset is not None: + if isinstance(calibration_dataset, datasets.Dataset): + raise ValueError( + "Providing calibration dataset as an instance of `datasets.Dataset` for OV weight-only " + "quantization is not supported. Please provide it as `nncf.Dataset` or as iterable of " + "model inputs." 
+ ) + elif isinstance(calibration_dataset, nncf.Dataset): + dataset = calibration_dataset + else: + dataset = nncf.Dataset(calibration_dataset) + elif config.dataset is not None and isinstance(config.dataset, str): + tokenizer = AutoTokenizer.from_pretrained(config.tokenizer) from optimum.gptq.data import get_dataset, prepare_dataset @@ -603,10 +654,6 @@ def _weight_only_quantization( if isinstance(config.sensitivity_metric, str): sensitivity_metric = getattr(SensitivityMetric, config.sensitivity_metric.upper()) - ignored_scope = None - if isinstance(config.ignored_scope, dict): - ignored_scope = IgnoredScope(**config.ignored_scope) - if config.bits == 8: mode = CompressWeightsMode.INT8_SYM if config.sym else CompressWeightsMode.INT8_ASYM else: @@ -619,10 +666,10 @@ def _weight_only_quantization( group_size=config.group_size, all_layers=config.all_layers, sensitivity_metric=sensitivity_metric, - # awq=config.quant_method == "awq", # TODO : remove and add it back once nncf v2.9.0 - ignored_scope=ignored_scope, + # awq=config.quant_method == QuantizationMethod.AWQ, # TODO : enable from nncf v2.9.0 + ignored_scope=config.get_ignored_scope_instance(), dataset=dataset, - # subset_size=config.num_samples if config.num_samples else 128, # TODO : enable from nncf v2.9.0 + # subset_size=config.num_samples if config.num_samples else 128, # TODO : enable from nncf v2.9.0 ) @@ -691,23 +738,23 @@ def _hybrid_quantization( """ ops_to_compress = _collect_ops_with_weights(model) - ignored_scope = quantization_config.ignored_scope if isinstance(quantization_config.ignored_scope, dict) else {} - ptq_ignored_scope = nncf.IgnoredScope(**ignored_scope) - ptq_ignored_scope.names += ops_to_compress - - wc_quantization_config = copy.deepcopy(quantization_config) - wc_quantization_config.ignored_scope = ignored_scope - wc_quantization_config.ignored_scope["types"] = ignored_scope.get("types", []) + ["Convolution"] - compressed_model = _weight_only_quantization(model, wc_quantization_config) + wc_config = copy.deepcopy(quantization_config) + wc_config.ignored_scope = wc_config.ignored_scope or {} + wc_config.ignored_scope["types"] = wc_config.ignored_scope.get("types", []) + ["Convolution"] + compressed_model = _weight_only_quantization(model, wc_config) + ptq_ignored_scope = quantization_config.get_ignored_scope_instance() + ptq_ignored_scope.names += ops_to_compress subset_size = quantization_config.num_samples if quantization_config.num_samples else 200 quantized_model = nncf.quantize( model=compressed_model, calibration_dataset=nncf.Dataset(dataset), model_type=nncf.ModelType.TRANSFORMER, ignored_scope=ptq_ignored_scope, - # The SQ algo should be disabled for MatMul nodes because their weights are already compressed - advanced_parameters=nncf.AdvancedQuantizationParameters(AdvancedSmoothQuantParameters(matmul=-1)), + # SQ algo should be disabled for MatMul nodes because their weights are already compressed + advanced_parameters=nncf.AdvancedQuantizationParameters( + smooth_quant_alphas=AdvancedSmoothQuantParameters(matmul=-1) + ), subset_size=subset_size, ) return quantized_model diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py index b7d110c96a..4f5ac5f178 100644 --- a/optimum/intel/openvino/trainer.py +++ b/optimum/intel/openvino/trainer.py @@ -89,7 +89,7 @@ from ..utils.constant import _TASK_ALIASES from ..utils.import_utils import is_transformers_version -from .configuration import DEFAULT_QUANTIZATION_CONFIG, OVConfig +from .configuration import OVConfig from .quantization 
import OVDataLoader from .training_args import OVTrainingArguments from .utils import ( @@ -136,6 +136,25 @@ NNCF_LOG_FILE_NAME = "nncf_output.log" +DEFAULT_QUANTIZATION_CONFIG = { + "algorithm": "quantization", + "preset": "mixed", + "overflow_fix": "disable", + "initializer": { + "range": {"num_init_samples": 300, "type": "mean_min_max"}, + "batchnorm_adaptation": {"num_bn_adaptation_samples": 0}, + }, + "scope_overrides": {"activations": {"{re}.*matmul_0": {"mode": "symmetric"}}}, + "ignored_scopes": [ + "{re}.*Embedding.*", + "{re}.*add___.*", + "{re}.*layer_norm_.*", + "{re}.*matmul_1", + "{re}.*__truediv__.*", + ], +} + + def _onnx_export_nncf_model(model: NNCFNetwork, config: OnnxConfig, output: Union[str, io.BytesIO], opset: int = None): # TODO: remove it when fix controller.strip(copy=True) behavior signature = inspect.signature(model.forward) @@ -228,6 +247,16 @@ def __init__( if self.ov_config is not None: if self.ov_config.compression is None: self.ov_config.compression = DEFAULT_QUANTIZATION_CONFIG + if ( + isinstance(self.ov_config.compression, dict) + and "algorithm" in self.ov_config.compression + and self.ov_config.compression["algorithm"] == "quantization" + ): + self.ov_config.compression["export_to_onnx_standard_ops"] = self.ov_config.save_onnx_model + elif isinstance(self.ov_config.compression, list): + for i, algo_config in enumerate(self.ov_config.compression): + if algo_config["algorithm"] == "quantization": + self.ov_config.compression[i]["export_to_onnx_standard_ops"] = self.ov_config.save_onnx_model if self.args.do_train: self._set_task() diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 0e307fb036..b22d5e3955 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -15,15 +15,19 @@ # ruff: noqa import itertools +import logging import tempfile import unittest from collections import defaultdict +from enum import Enum from functools import partial +from typing import List, Union import evaluate import numpy as np import torch from datasets import load_dataset +from nncf.quantization.advanced_parameters import OverflowFix from parameterized import parameterized import openvino.runtime as ov import nncf @@ -37,6 +41,7 @@ TrainingArguments, default_data_collator, ) +from transformers.utils.quantization_config import QuantizationMethod from optimum.intel import ( OVConfig, @@ -55,8 +60,10 @@ OVStableDiffusionXLPipeline, OVQuantizer, OVTrainer, + OVQuantizationConfig, OVWeightQuantizationConfig, ) +from optimum.intel.openvino.configuration import OVQuantizationMethod, OVQuantizationConfigBase from optimum.intel.openvino.quantization import InferRequestWrapper from optimum.intel.utils.import_utils import is_openvino_version @@ -98,7 +105,13 @@ def preprocess_function(examples, tokenizer): num_samples=10, dataset_split="train", ) - quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset, file_name=file_name) + ov_config = OVConfig(quantization_config=OVQuantizationConfig()) + quantizer.quantize( + save_directory=tmp_dir, + calibration_dataset=calibration_dataset, + file_name=file_name, + ov_config=ov_config, + ) model = model_cls.from_pretrained(tmp_dir, file_name=file_name) # TODO: uncomment once move to a newer version of NNCF which has some fixes (addmm, baddmm) @@ -110,6 +123,10 @@ def preprocess_function(examples, tokenizer): outputs = model(**tokens) self.assertTrue("logits" in outputs) + # Verify that the configuration is correctly saved and loaded + 
loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) def test_ovmodel_static_quantization(self, model_cls, model_name, expected_fake_quantize, expected_int8): task = model_cls.export_feature @@ -134,7 +151,8 @@ def preprocess_function(examples, tokenizer): num_samples=10, dataset_split="train", ) - quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset) + ov_config = OVConfig(quantization_config=OVQuantizationConfig()) + quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset, ov_config=ov_config) model = model_cls.from_pretrained(tmp_dir) @@ -146,6 +164,10 @@ def preprocess_function(examples, tokenizer): outputs = model(**tokens) self.assertTrue("logits" in outputs) + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) + class OVWeightCompressionTest(unittest.TestCase): # TODO : add models @@ -210,7 +232,7 @@ class OVWeightCompressionTest(unittest.TestCase): ratio=0.8, sensitivity_metric="mean_activation_magnitude", dataset="ptb", - awq=True, + quant_method=QuantizationMethod.AWQ, ), 14, ), @@ -251,7 +273,7 @@ def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_i tokenizer.pad_token = tokenizer.eos_token quantizer = OVQuantizer.from_pretrained(transformers_model, task=task) - quantizer.quantize(save_directory=tmp_dir, weights_only=True) + quantizer.quantize(save_directory=tmp_dir) model = model_cls.from_pretrained(tmp_dir) _, num_int8, _ = get_num_quantized_nodes(model) @@ -261,6 +283,15 @@ def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_i outputs = model(**tokens) self.assertTrue("logits" in outputs) + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + original_config_as_dict = OVWeightQuantizationConfig().to_dict() + for k in original_config_as_dict.keys(): + v = original_config_as_dict[k] + if isinstance(v, Enum): + original_config_as_dict[k] = v.value + self.assertEqual(original_config_as_dict, loaded_config.quantization_config.to_dict()) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS) def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_pt_int8, expected_ov_int8): task = model_cls.export_feature @@ -272,7 +303,7 @@ def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_p tokenizer.pad_token = tokenizer.eos_token quantizer = OVQuantizer.from_pretrained(transformers_model, task=task) - quantizer.quantize(save_directory=tmp_dir, weights_only=True) + quantizer.quantize(save_directory=tmp_dir) model = model_cls.from_pretrained(tmp_dir) _, num_int8, _ = get_num_quantized_nodes(model) @@ -282,6 +313,10 @@ def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_p outputs = model(**tokens) self.assertTrue("logits" in outputs) + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(OVWeightQuantizationConfig().to_dict(), loaded_config.quantization_config.to_dict()) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS) def 
test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_int8, expected_int4): task = model_cls.export_feature @@ -297,7 +332,6 @@ def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_i ov_config = OVConfig(quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, ratio=0.8)) quantizer.quantize( save_directory=tmp_dir, - weights_only=True, ov_config=ov_config, ) model = model_cls.from_pretrained(tmp_dir) @@ -310,6 +344,10 @@ def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_i outputs = model(**tokens) self.assertTrue("logits" in outputs) + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) + @parameterized.expand(SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS) @unittest.skipIf(not IS_SUPPORT_STATEFUL, "Stateful models supported only in 2023.3 and above") def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_id, expected_pt_int8, expected_ov_int8): @@ -322,7 +360,7 @@ def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_id, exp tokenizer.pad_token = tokenizer.eos_token quantizer = OVQuantizer.from_pretrained(transformers_model, task=task) - quantizer.quantize(save_directory=tmp_dir, weights_only=True) + quantizer.quantize(save_directory=tmp_dir) model = model_cls.from_pretrained(tmp_dir) _, num_int8, _ = get_num_quantized_nodes(model) @@ -332,6 +370,10 @@ def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_id, exp outputs = model(**tokens) self.assertTrue("logits" in outputs) + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(OVWeightQuantizationConfig().to_dict(), loaded_config.quantization_config.to_dict()) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION) def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type): model = model_cls.from_pretrained(MODEL_NAMES[model_type], export=True, load_in_8bit=True, stateful=False) @@ -401,17 +443,18 @@ def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_ model.save_pretrained(tmp_dir) openvino_config = OVConfig.from_pretrained(tmp_dir) - self.assertEqual(openvino_config.quantization_config["bits"], 4) + self.assertEqual(openvino_config.quantization_config.bits, 4) self.assertEqual(openvino_config.dtype, "int4") if model_id == "facebook/opt-125m": for key, value in self.DEFAULT_INT4_CONFIG.items(): - self.assertEqual(value, openvino_config.quantization_config[key]) + self.assertEqual(value, getattr(openvino_config.quantization_config, key)) @parameterized.expand(LOAD_IN_4_BITS_SCOPE) def test_ovmodel_4bit_auto_compression_with_config( self, model_cls, model_id, quantization_config, expected_ov_int4 ): with tempfile.TemporaryDirectory() as tmp_dir: + quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config) model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config) tokenizer = AutoTokenizer.from_pretrained(model_id) if tokenizer.pad_token is None: @@ -422,7 +465,7 @@ def test_ovmodel_4bit_auto_compression_with_config( model.save_pretrained(tmp_dir) openvino_config = OVConfig.from_pretrained(tmp_dir) - self.assertEqual(openvino_config.quantization_config["bits"], 4) + 
self.assertEqual(openvino_config.quantization_config.bits, 4) self.assertEqual(openvino_config.dtype, "int4") @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTO_COMPRESSED_MATMULS) @@ -453,9 +496,8 @@ def transform_fn(data, tokenizer): model = model_cls.from_pretrained( model_id, export=True, - quantization_config=OVWeightQuantizationConfig( - bits=4, sym=True, group_size=-1, ratio=0.8, dataset=quantization_dataset - ), + quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=-1, ratio=0.8), + calibration_dataset=quantization_dataset, ) _, num_int8, num_int4 = get_num_quantized_nodes(model) @@ -545,7 +587,7 @@ def test_ovmodel_load_large_model_with_additional_quantization_config(self): "all_layers": None, "sensitivity_metric": None, "dataset": None, - "ignored_scope": None, + "ignored_scope": nncf.IgnoredScope(), } compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params) @@ -571,7 +613,8 @@ def preprocess_function(examples, tokenizer): num_samples=10, dataset_split="test", ) - quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset) + ov_config = OVConfig(quantization_config=OVQuantizationConfig()) + quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset, ov_config=ov_config) # Test that inference on quantized model works model = OVModelForQuestionAnswering.from_pretrained(tmp_dir) @@ -586,6 +629,10 @@ def preprocess_function(examples, tokenizer): except RuntimeError: self.fail("Loading BERT QA model a second time failed") + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_ovmodel_static_quantization(self, model_name): def preprocess_function(examples, tokenizer): @@ -604,7 +651,8 @@ def preprocess_function(examples, tokenizer): num_samples=10, dataset_split="test", ) - quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset) + ov_config = OVConfig(quantization_config=OVQuantizationConfig()) + quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset, ov_config=ov_config) # Test that inference on quantized model works model = OVModelForQuestionAnswering.from_pretrained(tmp_dir) @@ -619,6 +667,10 @@ def preprocess_function(examples, tokenizer): except RuntimeError: self.fail("Loading BERT QA model a second time failed") + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) + class OVTrainerTest(unittest.TestCase): SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (("distilbert-base-uncased", 50, 38),) @@ -666,6 +718,150 @@ def compute_metrics(p): self.assertTrue("logits" in outputs) +class OVQuantizationConfigTest(unittest.TestCase): + QUANTIZATION_CONFIGS = ( + (None,), + (OVWeightQuantizationConfig(),), + ( + OVWeightQuantizationConfig( + bits=8, + sym=True, + ), + ), + ( + OVWeightQuantizationConfig( + dataset="wikitext", + bits=4, + ignored_scope={"names": ["op_name"]}, + sym=False, + tokenizer="dbmdz/bert-base-german-cased", + ratio=1.0, + group_size=128, + all_layers=True, + sensitivity_metric="mean_activation_magnitude", + num_samples=100, + quant_method=OVQuantizationMethod.DEFAULT, + ), + ), + 
(OVWeightQuantizationConfig(dataset=["hello world", "i'm alive"]),), + ( + OVQuantizationConfig( + ignored_scope={"names": ["op_name"]}, + num_samples=100, + preset=nncf.QuantizationPreset.MIXED, + model_type=nncf.ModelType.TRANSFORMER, + fast_bias_correction=True, + overflow_fix=OverflowFix.DISABLE, + ), + ), + (OVQuantizationConfig(ignored_scope=nncf.IgnoredScope(names=["op_name"])),), + ) + + QUANTIZATION_CONFIG_DICTS = ( + (dict(bits=8, sym=True), OVWeightQuantizationConfig, None), + ( + dict( + dataset="wikitext", + bits=4, + ignored_scope={"names": ["op_name"]}, + sym=False, + tokenizer="dbmdz/bert-base-german-cased", + ratio=1.0, + group_size=128, + all_layers=True, + sensitivity_metric="mean_activation_magnitude", + num_samples=100, + quant_method=OVQuantizationMethod.DEFAULT, + ), + OVWeightQuantizationConfig, + None, + ), + (dict(), OVWeightQuantizationConfig, "Can't determine type of OV quantization config"), + ( + dict(ignored_scope={"names": ["op_name"]}), + OVWeightQuantizationConfig, + "Can't determine type of OV quantization config", + ), + (dict(num_samples=100), OVWeightQuantizationConfig, "Can't determine type of OV quantization config"), + (dict(abc="def"), OVWeightQuantizationConfig, "Can't determine type of OV quantization config"), + ( + dict(bits=8, fast_bias_correction=True), + OVWeightQuantizationConfig, + "Can't determine type of OV quantization config", + ), + (dict(model_type=nncf.ModelType.TRANSFORMER), OVQuantizationConfig, None), + ( + dict( + ignored_scope={"names": ["op_name"]}, + num_samples=100, + preset=nncf.QuantizationPreset.MIXED, + model_type=nncf.ModelType.TRANSFORMER, + fast_bias_correction=True, + overflow_fix=OverflowFix.DISABLE, + ), + OVQuantizationConfig, + None, + ), + (dict(weight_only=True), OVWeightQuantizationConfig, None), + (dict(weight_only=False), OVQuantizationConfig, None), + (dict(abc="def", weight_only=False), OVQuantizationConfig, None), + (dict(abc="def", weight_only=True), OVWeightQuantizationConfig, None), + (dict(bits=8, fast_bias_correction=True, weight_only=True), OVWeightQuantizationConfig, None), + (dict(bits=8, fast_bias_correction=True, weight_only=False), OVQuantizationConfig, None), + (dict(bits=8, sym=True, weight_only=False), OVWeightQuantizationConfig, "Please check your configuration"), + ( + dict(model_type=nncf.ModelType.TRANSFORMER, weight_only=True), + OVQuantizationConfig, + "Please check your configuration", + ), + ) + + @parameterized.expand(QUANTIZATION_CONFIGS) + def test_config_serialization(self, quantization_config: OVQuantizationConfigBase): + def str_to_enum(enum_cls, value): + for k, v in enum_cls.__members__.items(): + if getattr(enum_cls, k).value == value: + return v + raise ValueError(f"Could not convert string {value} to enum value of type {enum_cls}") + + ov_config = OVConfig(quantization_config=quantization_config) + with tempfile.TemporaryDirectory() as tmp_dir: + ov_config.save_pretrained(tmp_dir) + loaded_ov_config = OVConfig.from_pretrained(tmp_dir) + + if quantization_config is None: + self.assertEqual(loaded_ov_config.quantization_config, None) + return + for key, value in loaded_ov_config.quantization_config.to_dict().items(): + initial_value = getattr(ov_config.quantization_config, key) + if key == "preset" or key == "overflow_fix": + # TODO: remove once NNCF is updated to 2.10 + if getattr(quantization_config, key) is not None: + self.assertTrue(isinstance(value, str)) + if key == "preset": + value = str_to_enum(nncf.QuantizationPreset, value) + else: + value = 
str_to_enum(OverflowFix, value) + self.assertEqual(value, initial_value) + + @parameterized.expand(QUANTIZATION_CONFIG_DICTS) + def test_config_from_dict(self, quantization_config: dict, config_type: type, warning_log: Union[str, None]): + from optimum.intel.openvino.configuration import logger as configuration_logger + + if warning_log is not None: + with self.assertLogs(configuration_logger, logging.WARN) as cm: + ov_config = OVConfig(quantization_config=quantization_config) + self.assertTrue(any(warning_log in log for log in cm.output)) + else: + ov_config = OVConfig(quantization_config=quantization_config) + self.assertIsInstance(ov_config.quantization_config, config_type) + for k, v in quantization_config.items(): + if k == "weight_only" and warning_log == "Please check your configuration": + continue + if hasattr(ov_config.quantization_config, k): + self.assertEqual(getattr(ov_config.quantization_config, k), v) + + class InferRequestWrapperTest(unittest.TestCase): MODEL_ID = ("openai/whisper-tiny.en",) APPLY_CACHING = (False, True) diff --git a/tests/openvino/test_training.py b/tests/openvino/test_training.py index 80298faf2b..db443c6de2 100644 --- a/tests/openvino/test_training.py +++ b/tests/openvino/test_training.py @@ -45,14 +45,14 @@ from transformers.utils import WEIGHTS_NAME from optimum.intel.openvino import OVTrainingArguments -from optimum.intel.openvino.configuration import DEFAULT_QUANTIZATION_CONFIG, OVConfig +from optimum.intel.openvino.configuration import OVConfig from optimum.intel.openvino.modeling import ( OVModel, OVModelForAudioClassification, OVModelForImageClassification, OVModelForSequenceClassification, ) -from optimum.intel.openvino.trainer import OVTrainer +from optimum.intel.openvino.trainer import DEFAULT_QUANTIZATION_CONFIG, OVTrainer from optimum.intel.openvino.utils import OV_XML_FILE_NAME
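
For context, a minimal sketch of the weight-only compression flow that this patch keeps as the default path, assuming the `optimum.intel` API as modified above; the model id and the 4-bit parameter values are illustrative, not prescriptive:

```python
# Hypothetical usage of the weight-only path introduced above; values are illustrative.
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

# 4-bit weight-only compression applied while exporting the model to OpenVINO IR.
quantization_config = OVWeightQuantizationConfig(bits=4, sym=True, group_size=128, ratio=0.8)
model = OVModelForCausalLM.from_pretrained(
    "databricks/dolly-v2-3b",
    export=True,
    quantization_config=quantization_config,
)
# Saving also writes openvino_config.json carrying the quantization config.
model.save_pretrained("./dolly-v2-3b-int4")
```

When no `quantization_config` is given and 8-bit loading is requested, 8-bit weight compression is still applied by default, and `OVConfig.dtype` is derived from `bits` ("int4" in the sketch above), which is what the updated `test_ovmodel_4bit_auto_compression` assertions rely on.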
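
Similarly, a sketch of full (weights and activations) quantization through `OVQuantizer` with the new `OVQuantizationConfig`, mirroring the docstring example added in `quantization.py`; the GLUE/SST-2 calibration data and the preprocessing function are assumptions made for illustration:

```python
from functools import partial

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from optimum.intel import OVConfig, OVModelForSequenceClassification, OVQuantizationConfig, OVQuantizer

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)


def preprocess_fn(examples, tokenizer):
    # Tokenization details are an assumption; adapt to the calibration dataset used.
    return tokenizer(examples["sentence"], padding="max_length", max_length=128, truncation=True)


quantizer = OVQuantizer.from_pretrained(model, task="text-classification")
calibration_dataset = quantizer.get_calibration_dataset(
    "glue",
    dataset_config_name="sst2",
    preprocess_function=partial(preprocess_fn, tokenizer=tokenizer),
    num_samples=100,
    dataset_split="train",
)
# Full quantization is requested by wrapping an OVQuantizationConfig in OVConfig;
# a calibration dataset is required for this path.
ov_config = OVConfig(quantization_config=OVQuantizationConfig(num_samples=100))
quantizer.quantize(calibration_dataset=calibration_dataset, ov_config=ov_config, save_directory="./quantized_model")
optimized_model = OVModelForSequenceClassification.from_pretrained("./quantized_model")
```

The resulting `openvino_config.json` can be reloaded with `OVConfig.from_pretrained("./quantized_model")`; the updated tests verify this round trip by comparing `quantization_config.to_dict()` before and after saving.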
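
Lastly, a small sketch of how `OVConfig` now resolves a plain dict passed as `quantization_config` via the new `_quantization_config_from_dict` helper; the dict contents are illustrative and follow the new `OVQuantizationConfigTest` cases:

```python
from optimum.intel import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig

# Keys that only match the weight-only signature -> OVWeightQuantizationConfig.
config = OVConfig(quantization_config={"bits": 8, "sym": True})
assert isinstance(config.quantization_config, OVWeightQuantizationConfig)

# Keys that only match the full-quantization signature -> OVQuantizationConfig.
config = OVConfig(quantization_config={"fast_bias_correction": False, "weight_only": False})
assert isinstance(config.quantization_config, OVQuantizationConfig)

# Ambiguous keys are disambiguated by `weight_only`; if it is missing, a warning is
# logged and OVWeightQuantizationConfig is assumed.
config = OVConfig(quantization_config={"ignored_scope": {"names": ["op_name"]}, "weight_only": False})
assert isinstance(config.quantization_config, OVQuantizationConfig)
```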