From 22d2d6aaad4a6057ce81e02b0ad83d183431e337 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 27 Mar 2024 16:27:51 +0100 Subject: [PATCH 01/15] Introduce OVQuantizationConfig for nncf.quantize() parameters --- optimum/intel/__init__.py | 2 + optimum/intel/openvino/__init__.py | 2 +- optimum/intel/openvino/configuration.py | 197 +++++++++---------- optimum/intel/openvino/modeling_decoder.py | 2 +- optimum/intel/openvino/modeling_diffusion.py | 2 +- optimum/intel/openvino/quantization.py | 173 +++++++--------- optimum/intel/openvino/trainer.py | 31 ++- tests/openvino/test_quantization.py | 32 +-- tests/openvino/test_training.py | 4 +- 9 files changed, 224 insertions(+), 221 deletions(-) diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index 59059d688d..62bb2b9171 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -123,6 +123,7 @@ "OVModelForSpeechSeq2Seq", "OVModelForSequenceClassification", "OVModelForTokenClassification", + "OVQuantizationConfig", "OVWeightQuantizationConfig", "OVConfig", ] @@ -241,6 +242,7 @@ OVModelForSequenceClassification, OVModelForSpeechSeq2Seq, OVModelForTokenClassification, + OVQuantizationConfig, OVWeightQuantizationConfig, ) diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index 1df932771a..33bceebed7 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -43,7 +43,7 @@ from .trainer import OVTrainer -from .configuration import OVConfig, OVWeightQuantizationConfig +from .configuration import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig from .modeling import ( OVModelForAudioClassification, OVModelForAudioFrameClassification, diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 40a60bb58e..0c0891064b 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -13,70 +13,19 @@ # limitations under the License. 
from dataclasses import dataclass +from enum import Enum from typing import Any, Dict, List, Optional, Union +import datasets +import nncf import torch +from nncf.quantization.advanced_parameters import OverflowFix from transformers import PretrainedConfig -from transformers.utils.quantization_config import QuantizationConfigMixin +from transformers.utils.quantization_config import QuantizationConfigMixin, QuantizationMethod from optimum.configuration_utils import BaseConfig -DEFAULT_QUANTIZATION_CONFIG = { - "algorithm": "quantization", - "preset": "mixed", - "overflow_fix": "disable", - "initializer": { - "range": {"num_init_samples": 300, "type": "mean_min_max"}, - "batchnorm_adaptation": {"num_bn_adaptation_samples": 0}, - }, - "scope_overrides": {"activations": {"{re}.*matmul_0": {"mode": "symmetric"}}}, - "ignored_scopes": [ - "{re}.*Embedding.*", - "{re}.*add___.*", - "{re}.*layer_norm_.*", - "{re}.*matmul_1", - "{re}.*__truediv__.*", - ], -} - -INT8_WEIGHT_COMPRESSION_CONFIG = { - "algorithm": "quantization", - "weights": { - "mode": "symmetric", - "bits": 8, - "target_scopes": [ - "{re}.*Embedding.*", - "{re}.*matmul_.*", - "{re}.*addmm_.*", - "{re}.*baddmm_.*", - "{re}.*linear_.*", - ], - "ignored_scopes": [ - "{re}.*conv_*", - ], - }, - "activations": { - "ignored_scopes": [ - "{re}.*add___.*", - "{re}.*__radd___.*", - "{re}.*layer_norm_.*", - "{re}.*__truediv__.*", - "{re}.*__mul___.*", - "{re}.*__rmul___.*", - "{re}.*tanh_.*", - "{re}.*pow_.*", - "{re}.*matmul_.*", - "{re}.*addmm_.*", - "{re}.*baddmm_.*", - "{re}.*linear_.*", - "{re}.*conv_.*", - ], - }, - "overflow_fix": "disable", -} - - _DEFAULT_4BIT_CONFIGS = { "databricks/dolly-v2-3b": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.5}, "EleutherAI/gpt-j-6b": {"bits": 4, "sym": False, "group_size": 64}, @@ -100,31 +49,55 @@ } +@dataclass +class OVQuantizationConfigBase(QuantizationConfigMixin): + def __init__( + self, + dataset: Optional[Union[str, List[str], nncf.Dataset, datasets.Dataset]] = None, + ignored_scope: Optional[Union[dict, nncf.IgnoredScope]] = None, + subset_size: Optional[int] = None, + ): + self.dataset = dataset + self.ignored_scope = ignored_scope + self.subset_size = subset_size + + def post_init(self): + if self.dataset is not None and isinstance(self.dataset, str): + llm_datasets = ["wikitext2", "c4", "c4-new", "ptb", "ptb-new"] + stable_diffusion_datasets = [ + "conceptual_captions", + "laion/220k-GPT4Vision-captions-from-LIVIS", + "laion/filtered-wit", + ] + if self.dataset not in llm_datasets + stable_diffusion_datasets: + raise ValueError( + f"""You have entered a string value for dataset. 
You can only choose between + {llm_datasets} for LLLMs or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}""" + ) + + class OVConfig(BaseConfig): CONFIG_NAME = "openvino_config.json" FULL_CONFIGURATION_FILE = "openvino_config.json" def __init__( self, - compression: Union[List[Dict], Dict, None] = None, input_info: Optional[List] = None, save_onnx_model: bool = False, - quantization_config: Optional[Union[QuantizationConfigMixin, Dict]] = None, + quantization_config: Optional[Union[Dict, OVQuantizationConfigBase]] = None, dtype: Optional[str] = None, **kwargs, ): super().__init__() - self.compression = compression self.input_info = input_info self.save_onnx_model = save_onnx_model - self._enable_standard_onnx_export_option() self.optimum_version = kwargs.pop("optimum_version", None) - self.quantization_config = quantization_config or {} + self.quantization_config = quantization_config + self.compression = None # A backward-compatability field for training-time compression parameters - if isinstance(quantization_config, QuantizationConfigMixin): - bits = self.quantization_config.bits - else: - bits = self.quantization_config.get("bits", None) + bits = ( + self.quantization_config.bits if isinstance(self.quantization_config, OVWeightQuantizationConfig) else None + ) self.dtype = "int" + str(bits) if isinstance(bits, int) else dtype def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False): @@ -137,28 +110,21 @@ def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False): for name, value in model_inputs.items() ] - def save_pretrained(self, *args, **kwargs): - super().save_pretrained(*args, **kwargs) + def to_dict(self) -> Dict[str, Any]: + # Parent to_dict() implementation does not support quantization_config being None + if self.quantization_config is None: + self.quantization_config = OVQuantizationConfigBase() + result = super().to_dict() + del result["quantization_config"] + return result + - def _enable_standard_onnx_export_option(self): - # This method depends on self.save_onnx_model. - # save_onnx_model is defaulted to false so that the final model output is - # in OpenVINO IR to realize performance benefit in OpenVINO runtime. - # True value of save_onnx_model will save a model in onnx format. - if ( - isinstance(self.compression, dict) - and "algorithm" in self.compression - and self.compression["algorithm"] == "quantization" - ): - self.compression["export_to_onnx_standard_ops"] = self.save_onnx_model - elif isinstance(self.compression, list): - for i, algo_config in enumerate(self.compression): - if algo_config["algorithm"] == "quantization": - self.compression[i]["export_to_onnx_standard_ops"] = self.save_onnx_model +class OVQuantizationMethod(str, Enum): + DEFAULT = "default" @dataclass -class OVWeightQuantizationConfig(QuantizationConfigMixin): +class OVWeightQuantizationConfig(OVQuantizationConfigBase): """ This is a wrapper class about all possible attributes and features that you can play with a model that has been loaded using `optimum-intel` api for quantization with NNCF. @@ -168,7 +134,7 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin): bits (`int`, defaults to 8): The number of bits to quantize to. sym (`bool`, defaults to `False`): - Whether to use symetric quantization. + Whether to use symmetric quantization. tokenizer (`str` or `PreTrainedTokenizerBase`, *optional*): The tokenizer used to process the dataset. You can pass either: - A custom tokenizer object. 
@@ -187,64 +153,52 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin): group_size (`int`, *optional*): The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization. all_layers (`bool`, *optional*): - Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit presicion. + Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit precision. sensitivity_metric (`str`, *optional*): The sensitivity metric for assigning quantization precision to layers. In order to preserve the accuracy of the model, the more sensitive layers receives a higher precision. ignored_scope (`dict`, *optional*): An ignored scope that defined the list of model control flow graph nodes to be ignored during quantization. - num_samples (`int`, *optional*): + subset_size (`int`, *optional*): The maximum number of samples composing the calibration dataset. """ def __init__( self, + dataset: Optional[Union[str, List[str], nncf.Dataset, datasets.Dataset]] = None, bits: int = 8, + ignored_scope: Optional[Union[dict, nncf.IgnoredScope]] = None, sym: bool = False, tokenizer: Optional[Any] = None, - dataset: Optional[Union[str, List[str]]] = None, ratio: float = 1.0, group_size: Optional[int] = None, all_layers: Optional[bool] = None, sensitivity_metric: Optional[str] = None, - ignored_scope: Optional[dict] = None, - num_samples: Optional[int] = None, - **kwargs, + subset_size: Optional[int] = None, + quant_method: Optional[Union[QuantizationMethod, OVQuantizationMethod]] = OVQuantizationMethod.DEFAULT, ): + super().__init__(dataset, ignored_scope, subset_size) self.bits = bits self.sym = sym self.tokenizer = tokenizer - self.dataset = dataset self.group_size = group_size or (-1 if bits == 8 else 128) self.ratio = ratio self.all_layers = all_layers self.sensitivity_metric = sensitivity_metric - self.ignored_scope = ignored_scope - self.num_samples = num_samples - self.quant_method = "default" # TODO : enable AWQ after nncf v2.9.0 release + self.subset_size = subset_size + self.quant_method = quant_method self.post_init() def post_init(self): r""" Safety checker that arguments are correct """ + super().post_init() if self.ratio is not None and not (0 <= self.ratio <= 1): raise ValueError("`ratio` must between 0 and 1.") if self.group_size is not None and self.group_size != -1 and self.group_size <= 0: raise ValueError("`group_size` must be greater than 0 or equal to -1") - if self.dataset is not None and isinstance(self.dataset, str): - llm_datasets = ["wikitext2", "c4", "c4-new", "ptb", "ptb-new"] - stable_diffusion_datasets = [ - "conceptual_captions", - "laion/220k-GPT4Vision-captions-from-LIVIS", - "laion/filtered-wit", - ] - if self.dataset not in llm_datasets + stable_diffusion_datasets: - raise ValueError( - f"""You have entered a string value for dataset. 
You can only choose between - {llm_datasets} for LLLMs or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}""" - ) if self.bits not in [4, 8]: raise ValueError(f"Only support quantization to [4,8] bits but found {self.bits}") @@ -260,5 +214,36 @@ def post_init(self): ) +@dataclass +class OVQuantizationConfig(OVQuantizationConfigBase): + def __init__( + self, + dataset: Union[str, List[str], nncf.Dataset, datasets.Dataset], + ignored_scope: Optional[nncf.IgnoredScope] = None, + subset_size: Optional[int] = 300, + preset: nncf.QuantizationPreset = nncf.QuantizationPreset.MIXED, + model_type: nncf.ModelType = nncf.ModelType.TRANSFORMER, + fast_bias_correction: bool = True, + overflow_fix: OverflowFix = OverflowFix.DISABLE, + ): + super().__init__(dataset, ignored_scope, subset_size) + self.preset = preset + self.model_type = model_type + self.fast_bias_correction = fast_bias_correction + self.overflow_fix = overflow_fix + self.post_init() + + def post_init(self): + """ + Safety checker that arguments are correct + """ + super().post_init() + # if self.dataset is None: + # raise ValueError( + # "`dataset` is needed to compute the activations range during the calibration step and was not provided." + # " In case you only want to apply quantization on the weights, please set `weights_only=True`." + # ) + + def _check_default_4bit_configs(config: PretrainedConfig): return _DEFAULT_4BIT_CONFIGS.get(config.name_or_path, None) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 10f0359a24..02c3bd3d73 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -616,7 +616,7 @@ def _from_pretrained( # from optimum.gptq.utils import get_seqlen # seqlen = get_seqlen(causal_model) - nsamples = quantization_config.num_samples if quantization_config.num_samples else 128 + nsamples = quantization_config.subset_size if quantization_config.subset_size else 128 dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples) dataset = prepare_dataset(dataset) quantization_config = copy.deepcopy(quantization_config) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index eb407b4cd1..f6f13482ce 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -321,7 +321,7 @@ def _from_pretrained( if not isinstance(sd_model, supported_pipelines): raise NotImplementedError(f"Quantization in hybrid mode is not supported for {cls.__name__}") - nsamples = quantization_config.num_samples if quantization_config.num_samples else 200 + nsamples = quantization_config.subset_size if quantization_config.subset_size else 200 unet_inputs = sd_model._prepare_unet_inputs(quantization_config.dataset, nsamples) from .quantization import _hybrid_quantization diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 3a40ff4f3e..f63cbc7fc3 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -46,7 +46,7 @@ from ..utils.constant import _TASK_ALIASES from ..utils.import_utils import DATASETS_IMPORT_ERROR, is_datasets_available from ..utils.modeling_utils import get_model_device -from .configuration import OVConfig, OVWeightQuantizationConfig +from .configuration import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig from .modeling_base import OVBaseModel from .utils import ( 
MAX_ONNX_OPSET, @@ -162,22 +162,17 @@ def from_pretrained(cls, model: PreTrainedModel, **kwargs): def quantize( self, - calibration_dataset: "Dataset" = None, save_directory: Union[str, Path] = None, ov_config: OVConfig = None, file_name: Optional[str] = None, batch_size: int = 1, data_collator: Optional[DataCollator] = None, remove_unused_columns: bool = True, - weights_only: bool = False, - **kwargs, ): """ Quantize a model given the optimization specifications defined in `quantization_config`. Args: - calibration_dataset (`datasets.Dataset`): - The dataset to use for the calibration step. save_directory (`Union[str, Path]`): The directory where the quantized model should be saved. quantization_config (`OVConfig`, *optional*): @@ -189,10 +184,7 @@ def quantize( data_collator (`DataCollator`, *optional*): The function to use to form a batch from a list of elements of the calibration dataset. remove_unused_columns (`bool`, defaults to `True`): - Whether or not to remove the columns unused by the model forward method. - weights_only (`bool`, defaults to `False`): - Compress weights to integer precision (8-bit by default) while keeping activations - floating-point. Fits best for LLM footprint reduction and performance acceleration. + Whether to remove the columns unused by the model forward method. Examples: ```python @@ -218,39 +210,19 @@ def quantize( if save_directory is None: # TODO : can be set to self.model.config.name_or_path for OVModels when not provided raise ValueError("`save_directory` needs to be specified") - if weights_only: - if calibration_dataset is not None: - logger.warning( - "`calibration_dataset` was provided but will not be used as `weights_only` is set to `True`." - ) - else: - if calibration_dataset is None: - raise ValueError( - "`calibration_dataset` is needed to compute the activations range during the calibration step and was not provided. " - "In case you only want to apply quantization on the weights, please set `weights_only=True`." 
- ) - quantization_config = kwargs.pop("quantization_config", None) - if quantization_config is not None: - logger.warning( - "The argument `quantization_config` is deprecated, and will be removed in optimum-intel v1.6.0, please use `ov_config` instead" - ) - ov_config = ov_config or quantization_config + if ov_config is None: + ov_config = OVConfig() if ov_config is not None: if not isinstance(ov_config, OVConfig): raise TypeError(f"`ov_config` should be an `OVConfig`, but got: {type(ov_config)} instead.") + quantization_config = ov_config.quantization_config + if quantization_config is None: + ov_config.quantization_config = OVWeightQuantizationConfig(bits=8, sym=True) + logger.warning("`quantization_config` was not provided, a default weight quantization will be applied") if isinstance(self.model, OVBaseModel): - self._quantize_ovbasemodel( - calibration_dataset, - save_directory, - batch_size, - data_collator, - remove_unused_columns, - weights_only, - ov_config, - **kwargs, - ) + self._quantize_ovbasemodel(ov_config, save_directory, batch_size, data_collator, remove_unused_columns) elif isinstance(self.model, torch.nn.Module): logger.warning( @@ -258,85 +230,81 @@ def quantize( "To convert a PyTorch model to OpenVINO, you can set `export=True` when loading your model as `OVModelForXxx.from_pretrained(..., export=True)`" ) self._quantize_torchmodel( - calibration_dataset, - save_directory, - file_name, - batch_size, - data_collator, - remove_unused_columns, - weights_only, + ov_config, save_directory, file_name, batch_size, data_collator, remove_unused_columns ) else: raise TypeError(f"Unsupported model type: {type(self.model)}") def _quantize_ovbasemodel( self, - calibration_dataset: "Dataset", + ov_config: OVConfig, save_directory: Union[str, Path], batch_size: int = 1, data_collator: Optional[DataCollator] = None, remove_unused_columns: bool = True, - weights_only: bool = False, - ov_config: OVConfig = None, - **kwargs, ): save_directory = Path(save_directory) save_directory.mkdir(parents=True, exist_ok=True) - if weights_only: - q_config = getattr(ov_config, "quantization_config", None) - # Use default 8-bit compression if not provided - q_config = q_config or OVWeightQuantizationConfig(bits=8, sym=True) - _weight_only_quantization(self.model.model, q_config) - + quantization_config = ov_config.quantization_config + if isinstance(quantization_config, OVWeightQuantizationConfig): + _weight_only_quantization(self.model.model, quantization_config) self.model.save_pretrained(save_directory) return + if not isinstance(quantization_config, OVQuantizationConfig): + raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}") - calibration_dataloader = self._get_calibration_dataloader( - calibration_dataset=calibration_dataset, - batch_size=batch_size, - remove_unused_columns=remove_unused_columns, - data_collator=data_collator, - ) + calibration_dataset = quantization_config.dataset + if isinstance(calibration_dataset, nncf.Dataset): + quantization_dataset = calibration_dataset + else: + calibration_dataloader = self._get_calibration_dataloader( + calibration_dataset=quantization_config.dataset, + batch_size=batch_size, + remove_unused_columns=remove_unused_columns, + data_collator=data_collator, + ) - if self.model.export_feature == "text-generation" and self.model.use_cache: - # Prefeth past_key_values - self.model.update_pkv_precision(True) - self.model.compile() - subset_size = kwargs.get("subset_size", 300) - data_cache = [] + if self.model.export_feature 
== "text-generation" and self.model.use_cache: + # Prefetch past_key_values + self.model.update_pkv_precision(True) + self.model.compile() + data_cache = [] - self.model.request = InferRequestWrapper(self.model.request, data_cache) - for _, data in enumerate(calibration_dataloader): - self.model.generate(**data, max_new_tokens=1) - if len(data_cache) >= subset_size: - break - self.model.request = self.model.request.request - calibration_dataloader = data_cache + self.model.request = InferRequestWrapper(self.model.request, data_cache) + try: + for data in calibration_dataloader: + self.model.generate(**data, max_new_tokens=1) + if len(data_cache) >= quantization_config.subset_size: + break + finally: + self.model.request = self.model.request.request + quantization_dataset = nncf.Dataset(data_cache) + else: + quantization_dataset = nncf.Dataset(calibration_dataloader) # Actual model quantization - quantization_dataset = nncf.Dataset(calibration_dataloader) quantized_model = nncf.quantize( self.model.model, quantization_dataset, - model_type=nncf.ModelType.TRANSFORMER if not kwargs.get("model_type") else kwargs.get("model_type"), - fast_bias_correction=kwargs.get("fast_bias_correction", True), - **kwargs, + subset_size=quantization_config.subset_size, + ignored_scope=quantization_config.ignored_scope, + model_type=quantization_config.model_type, + preset=quantization_config.preset, + fast_bias_correction=quantization_config.fast_bias_correction, + advanced_parameters=nncf.AdvancedQuantizationParameters(overflow_fix=quantization_config.overflow_fix), ) self.model.model = quantized_model self.model.save_pretrained(save_directory) def _quantize_torchmodel( self, - calibration_dataset: "Dataset", + ov_config: OVConfig, save_directory: Union[str, Path], file_name: Optional[str] = None, batch_size: int = 1, data_collator: Optional[DataCollator] = None, remove_unused_columns: bool = True, - weights_only: bool = False, - save_onnx_model: bool = False, - **kwargs, ): self._set_task() save_directory = Path(save_directory) @@ -353,6 +321,7 @@ def _quantize_torchmodel( model_type=model_type, ) + save_onnx_model = ov_config.save_onnx_model onnx_file_name = ( ONNX_WEIGHTS_NAME if file_name is None and save_onnx_model else Path(ov_file_name).with_suffix(".onnx") ) @@ -371,7 +340,8 @@ def _quantize_torchmodel( stateful = ensure_stateful_is_available() and ensure_export_task_support_stateful(task) - if weights_only: + quantization_config = ov_config.quantization_config + if isinstance(quantization_config, OVWeightQuantizationConfig): if stateful: # patch model before weight compression model = patch_model_with_bettertransformer(model) @@ -385,6 +355,8 @@ def _quantize_torchmodel( nncf.compress_weights(model, dataset=nncf.Dataset([dummy_inputs])) else: + if not isinstance(quantization_config, OVQuantizationConfig): + raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}") if stateful: logger.warn( "Quantization algorithm does not support optimized stateful models. 
" @@ -392,20 +364,25 @@ def _quantize_torchmodel( ) stateful = False - calibration_dataloader = self._get_calibration_dataloader( - calibration_dataset=calibration_dataset, - batch_size=batch_size, - remove_unused_columns=remove_unused_columns, - data_collator=data_collator, - ) - - quantization_dataset = nncf.Dataset(calibration_dataloader) + if isinstance(quantization_config.dataset, nncf.Dataset): + quantization_dataset = quantization_config.dataset + else: + calibration_dataloader = self._get_calibration_dataloader( + calibration_dataset=quantization_config.dataset, + batch_size=batch_size, + remove_unused_columns=remove_unused_columns, + data_collator=data_collator, + ) + quantization_dataset = nncf.Dataset(calibration_dataloader) model = nncf.quantize( model, quantization_dataset, - model_type=nncf.ModelType.TRANSFORMER if not kwargs.get("model_type") else kwargs.get("model_type"), - fast_bias_correction=kwargs.get("fast_bias_correction", True), - **kwargs, + subset_size=quantization_config.subset_size, + ignored_scope=quantization_config.ignored_scope, + model_type=quantization_config.model_type, + preset=quantization_config.preset, + fast_bias_correction=quantization_config.fast_bias_correction, + advanced_parameters=nncf.AdvancedQuantizationParameters(overflow_fix=quantization_config.overflow_fix), ) model_path = save_directory / (onnx_file_name if save_onnx_model else ov_file_name) @@ -510,7 +487,7 @@ def get_calibration_dataset( def _get_calibration_dataloader( self, - calibration_dataset: "Dataset", + calibration_dataset: Union[Dataset, nncf.Dataset], batch_size: int, remove_unused_columns: bool, data_collator: Optional[DataCollator] = None, @@ -554,7 +531,7 @@ def _weight_only_quantization( from optimum.gptq.data import get_dataset, prepare_dataset - nsamples = config.num_samples if config.num_samples else 128 + nsamples = config.subset_size if config.subset_size else 128 dataset = get_dataset(config.dataset, tokenizer, seqlen=32, nsamples=nsamples) dataset = prepare_dataset(dataset) @@ -578,10 +555,10 @@ def _weight_only_quantization( group_size=config.group_size, all_layers=config.all_layers, sensitivity_metric=sensitivity_metric, - # awq=config.quant_method == "awq", # TODO : remove and add it back once nncf v2.9.0 + # awq=config.quant_method == QuantizationMethod.AWQ, ignored_scope=ignored_scope, dataset=dataset, - # subset_size=config.num_samples if config.num_samples else 128, # TODO : enable from nncf v2.9.0 + # subset_size=config.subset_size if config.subset_size else 128, ) @@ -659,7 +636,7 @@ def _hybrid_quantization( wc_quantization_config.ignored_scope["types"] = ignored_scope.get("types", []) + ["Convolution"] compressed_model = _weight_only_quantization(model, wc_quantization_config) - subset_size = quantization_config.num_samples if quantization_config.num_samples else 200 + subset_size = quantization_config.subset_size if quantization_config.subset_size else 200 quantized_model = nncf.quantize( model=compressed_model, calibration_dataset=nncf.Dataset(dataset), diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py index b7d110c96a..4f5ac5f178 100644 --- a/optimum/intel/openvino/trainer.py +++ b/optimum/intel/openvino/trainer.py @@ -89,7 +89,7 @@ from ..utils.constant import _TASK_ALIASES from ..utils.import_utils import is_transformers_version -from .configuration import DEFAULT_QUANTIZATION_CONFIG, OVConfig +from .configuration import OVConfig from .quantization import OVDataLoader from .training_args import OVTrainingArguments from 
.utils import ( @@ -136,6 +136,25 @@ NNCF_LOG_FILE_NAME = "nncf_output.log" +DEFAULT_QUANTIZATION_CONFIG = { + "algorithm": "quantization", + "preset": "mixed", + "overflow_fix": "disable", + "initializer": { + "range": {"num_init_samples": 300, "type": "mean_min_max"}, + "batchnorm_adaptation": {"num_bn_adaptation_samples": 0}, + }, + "scope_overrides": {"activations": {"{re}.*matmul_0": {"mode": "symmetric"}}}, + "ignored_scopes": [ + "{re}.*Embedding.*", + "{re}.*add___.*", + "{re}.*layer_norm_.*", + "{re}.*matmul_1", + "{re}.*__truediv__.*", + ], +} + + def _onnx_export_nncf_model(model: NNCFNetwork, config: OnnxConfig, output: Union[str, io.BytesIO], opset: int = None): # TODO: remove it when fix controller.strip(copy=True) behavior signature = inspect.signature(model.forward) @@ -228,6 +247,16 @@ def __init__( if self.ov_config is not None: if self.ov_config.compression is None: self.ov_config.compression = DEFAULT_QUANTIZATION_CONFIG + if ( + isinstance(self.ov_config.compression, dict) + and "algorithm" in self.ov_config.compression + and self.ov_config.compression["algorithm"] == "quantization" + ): + self.ov_config.compression["export_to_onnx_standard_ops"] = self.ov_config.save_onnx_model + elif isinstance(self.ov_config.compression, list): + for i, algo_config in enumerate(self.ov_config.compression): + if algo_config["algorithm"] == "quantization": + self.ov_config.compression[i]["export_to_onnx_standard_ops"] = self.ov_config.save_onnx_model if self.args.do_train: self._set_task() diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 8c166a5e8c..74bc098e34 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -36,6 +36,7 @@ TrainingArguments, default_data_collator, ) +from transformers.utils.quantization_config import QuantizationMethod from optimum.intel import ( OVConfig, @@ -54,6 +55,7 @@ OVStableDiffusionXLPipeline, OVQuantizer, OVTrainer, + OVQuantizationConfig, OVWeightQuantizationConfig, ) @@ -97,7 +99,9 @@ def preprocess_function(examples, tokenizer): num_samples=10, dataset_split="train", ) - quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset, file_name=file_name) + quantization_config = OVQuantizationConfig(dataset=calibration_dataset) + ov_config = OVConfig(quantization_config=quantization_config) + quantizer.quantize(save_directory=tmp_dir, ov_config=ov_config, file_name=file_name) model = model_cls.from_pretrained(tmp_dir, file_name=file_name) # TODO: uncomment once move to a newer version of NNCF which has some fixes (addmm, baddmm) @@ -133,7 +137,9 @@ def preprocess_function(examples, tokenizer): num_samples=10, dataset_split="train", ) - quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset) + quantization_config = OVQuantizationConfig(dataset=calibration_dataset) + ov_config = OVConfig(quantization_config=quantization_config) + quantizer.quantize(save_directory=tmp_dir, ov_config=ov_config) model = model_cls.from_pretrained(tmp_dir) @@ -209,7 +215,7 @@ class OVWeightCompressionTest(unittest.TestCase): ratio=0.8, sensitivity_metric="mean_activation_magnitude", dataset="ptb", - awq=True, + quant_method=QuantizationMethod.AWQ, ), 14, ), @@ -250,7 +256,7 @@ def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_i tokenizer.pad_token = tokenizer.eos_token quantizer = OVQuantizer.from_pretrained(transformers_model, task=task) - quantizer.quantize(save_directory=tmp_dir, weights_only=True) + 
quantizer.quantize(save_directory=tmp_dir) model = model_cls.from_pretrained(tmp_dir) _, num_int8, _ = get_num_quantized_nodes(model) @@ -271,7 +277,7 @@ def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_p tokenizer.pad_token = tokenizer.eos_token quantizer = OVQuantizer.from_pretrained(transformers_model, task=task) - quantizer.quantize(save_directory=tmp_dir, weights_only=True) + quantizer.quantize(save_directory=tmp_dir) model = model_cls.from_pretrained(tmp_dir) _, num_int8, _ = get_num_quantized_nodes(model) @@ -296,7 +302,6 @@ def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_i ov_config = OVConfig(quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, ratio=0.8)) quantizer.quantize( save_directory=tmp_dir, - weights_only=True, ov_config=ov_config, ) model = model_cls.from_pretrained(tmp_dir) @@ -321,7 +326,7 @@ def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_id, exp tokenizer.pad_token = tokenizer.eos_token quantizer = OVQuantizer.from_pretrained(transformers_model, task=task) - quantizer.quantize(save_directory=tmp_dir, weights_only=True) + quantizer.quantize(save_directory=tmp_dir) model = model_cls.from_pretrained(tmp_dir) _, num_int8, _ = get_num_quantized_nodes(model) @@ -353,7 +358,7 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type): @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION) def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_num_fake_quantize, expected_ov_int8): model_id = MODEL_NAMES[model_type] - quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", num_samples=2) + quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", subset_size=2) with tempfile.TemporaryDirectory() as tmp_dir: model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config) @@ -375,7 +380,7 @@ def test_ovmodel_hybrid_quantization_with_custom_dataset( model = model_cls.from_pretrained( model_id, export=True, - quantization_config=OVWeightQuantizationConfig(bits=8, dataset=dataset, num_samples=3), + quantization_config=OVWeightQuantizationConfig(bits=8, dataset=dataset, subset_size=3), ) num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(model.unet) self.assertEqual(expected_num_fake_quantize, num_fake_quantize) @@ -411,6 +416,7 @@ def test_ovmodel_4bit_auto_compression_with_config( self, model_cls, model_id, quantization_config, expected_ov_int4 ): with tempfile.TemporaryDirectory() as tmp_dir: + quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config) model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config) tokenizer = AutoTokenizer.from_pretrained(model_id) if tokenizer.pad_token is None: @@ -570,7 +576,9 @@ def preprocess_function(examples, tokenizer): num_samples=10, dataset_split="test", ) - quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset) + quantization_config = OVQuantizationConfig(dataset=calibration_dataset) + ov_config = OVConfig(quantization_config=quantization_config) + quantizer.quantize(save_directory=tmp_dir, ov_config=ov_config) # Test that inference on quantized model works model = OVModelForQuestionAnswering.from_pretrained(tmp_dir) @@ -603,7 +611,9 @@ def preprocess_function(examples, tokenizer): num_samples=10, dataset_split="test", ) - quantizer.quantize(save_directory=tmp_dir, 
calibration_dataset=calibration_dataset) + quantization_config = OVQuantizationConfig(dataset=calibration_dataset) + ov_config = OVConfig(quantization_config=quantization_config) + quantizer.quantize(save_directory=tmp_dir, ov_config=ov_config) # Test that inference on quantized model works model = OVModelForQuestionAnswering.from_pretrained(tmp_dir) diff --git a/tests/openvino/test_training.py b/tests/openvino/test_training.py index 80298faf2b..db443c6de2 100644 --- a/tests/openvino/test_training.py +++ b/tests/openvino/test_training.py @@ -45,14 +45,14 @@ from transformers.utils import WEIGHTS_NAME from optimum.intel.openvino import OVTrainingArguments -from optimum.intel.openvino.configuration import DEFAULT_QUANTIZATION_CONFIG, OVConfig +from optimum.intel.openvino.configuration import OVConfig from optimum.intel.openvino.modeling import ( OVModel, OVModelForAudioClassification, OVModelForImageClassification, OVModelForSequenceClassification, ) -from optimum.intel.openvino.trainer import OVTrainer +from optimum.intel.openvino.trainer import DEFAULT_QUANTIZATION_CONFIG, OVTrainer from optimum.intel.openvino.utils import OV_XML_FILE_NAME From 350bfa9af7117899b5779e4d11be16480e2328d1 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 28 Mar 2024 11:39:25 +0100 Subject: [PATCH 02/15] Ignored scope tweaks --- optimum/intel/openvino/configuration.py | 8 +++++--- optimum/intel/openvino/quantization.py | 27 ++++++++++++++----------- tests/openvino/test_quantization.py | 2 +- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 0c0891064b..f52e70df48 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -54,10 +54,12 @@ class OVQuantizationConfigBase(QuantizationConfigMixin): def __init__( self, dataset: Optional[Union[str, List[str], nncf.Dataset, datasets.Dataset]] = None, - ignored_scope: Optional[Union[dict, nncf.IgnoredScope]] = None, + ignored_scope: Optional[dict] = None, subset_size: Optional[int] = None, ): self.dataset = dataset + if ignored_scope is None: + ignored_scope = {} self.ignored_scope = ignored_scope self.subset_size = subset_size @@ -168,7 +170,7 @@ def __init__( self, dataset: Optional[Union[str, List[str], nncf.Dataset, datasets.Dataset]] = None, bits: int = 8, - ignored_scope: Optional[Union[dict, nncf.IgnoredScope]] = None, + ignored_scope: Optional[dict] = None, sym: bool = False, tokenizer: Optional[Any] = None, ratio: float = 1.0, @@ -219,7 +221,7 @@ class OVQuantizationConfig(OVQuantizationConfigBase): def __init__( self, dataset: Union[str, List[str], nncf.Dataset, datasets.Dataset], - ignored_scope: Optional[nncf.IgnoredScope] = None, + ignored_scope: Optional[dict] = None, subset_size: Optional[int] = 300, preset: nncf.QuantizationPreset = nncf.QuantizationPreset.MIXED, model_type: nncf.ModelType = nncf.ModelType.TRANSFORMER, diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index f63cbc7fc3..032d43495c 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -283,12 +283,14 @@ def _quantize_ovbasemodel( else: quantization_dataset = nncf.Dataset(calibration_dataloader) + ignored_scope = IgnoredScope(**quantization_config.ignored_scope) + # Actual model quantization quantized_model = nncf.quantize( self.model.model, quantization_dataset, subset_size=quantization_config.subset_size, - 
ignored_scope=quantization_config.ignored_scope, + ignored_scope=ignored_scope, model_type=quantization_config.model_type, preset=quantization_config.preset, fast_bias_correction=quantization_config.fast_bias_correction, @@ -374,11 +376,13 @@ def _quantize_torchmodel( data_collator=data_collator, ) quantization_dataset = nncf.Dataset(calibration_dataloader) + + ignored_scope = IgnoredScope(**quantization_config.ignored_scope) model = nncf.quantize( model, quantization_dataset, subset_size=quantization_config.subset_size, - ignored_scope=quantization_config.ignored_scope, + ignored_scope=ignored_scope, model_type=quantization_config.model_type, preset=quantization_config.preset, fast_bias_correction=quantization_config.fast_bias_correction, @@ -487,7 +491,7 @@ def get_calibration_dataset( def _get_calibration_dataloader( self, - calibration_dataset: Union[Dataset, nncf.Dataset], + calibration_dataset: "Dataset", batch_size: int, remove_unused_columns: bool, data_collator: Optional[DataCollator] = None, @@ -539,9 +543,7 @@ def _weight_only_quantization( if isinstance(config.sensitivity_metric, str): sensitivity_metric = getattr(SensitivityMetric, config.sensitivity_metric.upper()) - ignored_scope = None - if isinstance(config.ignored_scope, dict): - ignored_scope = IgnoredScope(**config.ignored_scope) + ignored_scope = IgnoredScope(**config.ignored_scope) if config.bits == 8: mode = CompressWeightsMode.INT8_SYM if config.sym else CompressWeightsMode.INT8_ASYM @@ -555,10 +557,10 @@ def _weight_only_quantization( group_size=config.group_size, all_layers=config.all_layers, sensitivity_metric=sensitivity_metric, - # awq=config.quant_method == QuantizationMethod.AWQ, + # awq=config.quant_method == QuantizationMethod.AWQ, # TODO : remove and add it back once nncf 2.9.0 ignored_scope=ignored_scope, dataset=dataset, - # subset_size=config.subset_size if config.subset_size else 128, + # subset_size=config.subset_size if config.subset_size else 128, # TODO : remove and add it back once nncf 2.9.0 ) @@ -627,13 +629,14 @@ def _hybrid_quantization( """ ops_to_compress = _collect_ops_with_weights(model) - ignored_scope = quantization_config.ignored_scope if isinstance(quantization_config.ignored_scope, dict) else {} - ptq_ignored_scope = nncf.IgnoredScope(**ignored_scope) + ignored_scope = IgnoredScope(**quantization_config.ignored_scope) + ptq_ignored_scope = copy.deepcopy(ignored_scope) ptq_ignored_scope.names += ops_to_compress wc_quantization_config = copy.deepcopy(quantization_config) - wc_quantization_config.ignored_scope = ignored_scope - wc_quantization_config.ignored_scope["types"] = ignored_scope.get("types", []) + ["Convolution"] + wc_quantization_config.ignored_scope["types"] = wc_quantization_config.ignored_scope.get("types", []) + [ + "Convolution" + ] compressed_model = _weight_only_quantization(model, wc_quantization_config) subset_size = quantization_config.subset_size if quantization_config.subset_size else 200 diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 74bc098e34..16f1218eab 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -550,7 +550,7 @@ def test_ovmodel_load_large_model_with_additional_quantization_config(self): "all_layers": None, "sensitivity_metric": None, "dataset": None, - "ignored_scope": None, + "ignored_scope": nncf.IgnoredScope(), } compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params) From b90fd42a85bccd6c564ed37882499a4c9b83b5ae Mon Sep 17 00:00:00 
2001 From: Nikita Savelyev Date: Tue, 2 Apr 2024 17:28:13 +0200 Subject: [PATCH 03/15] Added **kwargs to quantization call. Added config serialization test. --- optimum/intel/openvino/configuration.py | 78 +++++++++++-- optimum/intel/openvino/quantization.py | 8 +- tests/openvino/test_quantization.py | 142 ++++++++++++++++++++++++ 3 files changed, 217 insertions(+), 11 deletions(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index f52e70df48..2d1a306022 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -11,10 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import json +import logging from dataclasses import dataclass from enum import Enum -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union, Iterable, Tuple import datasets import nncf @@ -25,6 +26,8 @@ from optimum.configuration_utils import BaseConfig +logger = logging.getLogger(__name__) + _DEFAULT_4BIT_CONFIGS = { "databricks/dolly-v2-3b": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.5}, @@ -49,6 +52,32 @@ } +class replace_properties_values: + def __init__(self, obj, property_names, property_values): + self.obj = obj + self.property_names = property_names + self.new_property_values = property_values + self.old_property_values = [None] * len(property_names) + for i, property_name in enumerate(self.property_names): + self.old_property_values[i] = getattr(obj, property_name) + + def __enter__(self): + for property_name, new_property_value in zip(self.property_names, self.new_property_values): + setattr(self.obj, property_name, new_property_value) + + def __exit__(self, exc_type, exc_val, exc_tb): + for property_name, old_property_value in zip(self.property_names, self.old_property_values): + setattr(self.obj, property_name, old_property_value) + + +def is_serializable(obj): + try: + json.dumps(obj) + return True + except: + return False + + @dataclass class OVQuantizationConfigBase(QuantizationConfigMixin): def __init__( @@ -65,7 +94,7 @@ def __init__( def post_init(self): if self.dataset is not None and isinstance(self.dataset, str): - llm_datasets = ["wikitext2", "c4", "c4-new", "ptb", "ptb-new"] + llm_datasets = ["wikitext", "c4", "c4-new", "ptb", "ptb-new"] stable_diffusion_datasets = [ "conceptual_captions", "laion/220k-GPT4Vision-captions-from-LIVIS", @@ -77,6 +106,16 @@ def post_init(self): {llm_datasets} for LLLMs or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}""" ) + def to_dict_without_properties(self, property_names: Union[List[str], Tuple[str]]) -> Dict[str, Any]: + with replace_properties_values(self, property_names, [None] * len(property_names)): + result = super().to_dict() + return result + + def to_dict(self) -> Dict[str, Any]: + if not is_serializable(self.dataset): + return self.to_dict_without_properties(("dataset",)) + return super().to_dict() + class OVConfig(BaseConfig): CONFIG_NAME = "openvino_config.json" @@ -112,14 +151,22 @@ def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False): for name, value in model_inputs.items() ] - def to_dict(self) -> Dict[str, Any]: - # Parent to_dict() implementation does not support quantization_config being None + def to_dict_safe(self, to_diff_dict: bool = False) -> Dict[str, Any]: if self.quantization_config is None: - 
self.quantization_config = OVQuantizationConfigBase() - result = super().to_dict() - del result["quantization_config"] + # Parent to_dict() implementation does not support quantization_config being None + with replace_properties_values(self, ("quantization_config",), (OVQuantizationConfigBase(),)): + result = super().to_diff_dict() if to_diff_dict else super().to_dict() + del result["quantization_config"] + else: + result = super().to_diff_dict() if to_diff_dict else super().to_dict() return result + def to_dict(self) -> Dict[str, Any]: + return self.to_dict_safe(to_diff_dict=False) + + def to_diff_dict(self) -> Dict[str, Any]: + return self.to_dict_safe(to_diff_dict=True) + class OVQuantizationMethod(str, Enum): DEFAULT = "default" @@ -147,7 +194,7 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase): using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. dataset (`str or List[str]`, *optional*): The dataset used for data-aware compression or quantization with NNCF. You can provide your own dataset - in a list of strings or just use the one from the list ['wikitext2','c4','c4-new','ptb','ptb-new'] for LLLMs + in a list of strings or just use the one from the list ['wikitext','c4','c4-new','ptb','ptb-new'] for LLLMs or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models. ratio (`float`, defaults to 1.0): The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM @@ -165,7 +212,6 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase): The maximum number of samples composing the calibration dataset. """ - def __init__( self, dataset: Optional[Union[str, List[str], nncf.Dataset, datasets.Dataset]] = None, @@ -215,6 +261,11 @@ def post_init(self): f"For 8-bit quantization, `group_size` is expected to be set to -1, but was set to {self.group_size}" ) + def to_dict(self) -> Dict[str, Any]: + if not is_serializable(self.tokenizer): + return self.to_dict_without_properties(("tokenizer",)) + return super().to_dict() + @dataclass class OVQuantizationConfig(OVQuantizationConfigBase): @@ -228,6 +279,7 @@ def __init__( fast_bias_correction: bool = True, overflow_fix: OverflowFix = OverflowFix.DISABLE, ): + super().__init__(dataset, ignored_scope, subset_size) self.preset = preset self.model_type = model_type @@ -240,12 +292,18 @@ def post_init(self): Safety checker that arguments are correct """ super().post_init() + # if self.dataset is None: # raise ValueError( # "`dataset` is needed to compute the activations range during the calibration step and was not provided." # " In case you only want to apply quantization on the weights, please set `weights_only=True`." 
# ) + def to_dict(self) -> Dict[str, Any]: + # TODO: remove once NNCF is updated to 2.10 + with replace_properties_values(self, ("overflow_fix", "preset"), (self.overflow_fix.value, self.preset.value)): + return super().to_dict() + def _check_default_4bit_configs(config: PretrainedConfig): return _DEFAULT_4BIT_CONFIGS.get(config.name_or_path, None) diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 9188c93938..24a145de71 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -209,6 +209,7 @@ def quantize( batch_size: int = 1, data_collator: Optional[DataCollator] = None, remove_unused_columns: bool = True, + **kwargs, ): """ Quantize a model given the optimization specifications defined in `quantization_config`. @@ -263,7 +264,8 @@ def quantize( logger.warning("`quantization_config` was not provided, a default weight quantization will be applied") if isinstance(self.model, OVBaseModel): - self._quantize_ovbasemodel(ov_config, save_directory, batch_size, data_collator, remove_unused_columns) + self._quantize_ovbasemodel(ov_config, save_directory, batch_size, data_collator, remove_unused_columns, + **kwargs) elif isinstance(self.model, torch.nn.Module): logger.warning( @@ -283,6 +285,7 @@ def _quantize_ovbasemodel( batch_size: int = 1, data_collator: Optional[DataCollator] = None, remove_unused_columns: bool = True, + **kwargs, ): save_directory = Path(save_directory) save_directory.mkdir(parents=True, exist_ok=True) @@ -334,6 +337,7 @@ def _quantize_ovbasemodel( preset=quantization_config.preset, fast_bias_correction=quantization_config.fast_bias_correction, advanced_parameters=nncf.AdvancedQuantizationParameters(overflow_fix=quantization_config.overflow_fix), + **kwargs, ) self.model.model = quantized_model self.model.save_pretrained(save_directory) @@ -346,6 +350,7 @@ def _quantize_torchmodel( batch_size: int = 1, data_collator: Optional[DataCollator] = None, remove_unused_columns: bool = True, + **kwargs, ): self._set_task() save_directory = Path(save_directory) @@ -424,6 +429,7 @@ def _quantize_torchmodel( preset=quantization_config.preset, fast_bias_correction=quantization_config.fast_bias_correction, advanced_parameters=nncf.AdvancedQuantizationParameters(overflow_fix=quantization_config.overflow_fix), + **kwargs, ) model_path = save_directory / (onnx_file_name if save_onnx_model else ov_file_name) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 7e65433298..f909bdb172 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -19,11 +19,13 @@ import unittest from collections import defaultdict from functools import partial +from typing import List import evaluate import numpy as np import torch from datasets import load_dataset +from nncf.quantization.advanced_parameters import OverflowFix from parameterized import parameterized import openvino.runtime as ov import nncf @@ -59,6 +61,7 @@ OVQuantizationConfig, OVWeightQuantizationConfig, ) +from optimum.intel.openvino.configuration import OVQuantizationMethod, OVQuantizationConfigBase from optimum.intel.openvino.quantization import InferRequestWrapper from optimum.intel.utils.import_utils import is_openvino_version @@ -676,6 +679,145 @@ def compute_metrics(p): self.assertTrue("logits" in outputs) +class OVQuantizationConfigTest(unittest.TestCase): + QUANTIZATION_CONFIGS = ( + ( + None, + [], + ), + ( + OVWeightQuantizationConfig(), + [] + ), + ( + 
OVWeightQuantizationConfig( + bits=8, + sym=True, + ), + [] + ), + ( + { + "bits": 8, + "sym": True, + }, + [] + ), + ( + OVWeightQuantizationConfig( + dataset="wikitext", + bits=4, + ignored_scope={"names": ["op_name"]}, + sym=False, + tokenizer="dbmdz/bert-base-german-cased", + ratio=1.0, + group_size=128, + all_layers=True, + sensitivity_metric="mean_activation_magnitude", + subset_size=100, + quant_method=OVQuantizationMethod.DEFAULT, + ), + [] + ), + ( + OVWeightQuantizationConfig( + dataset=["wikitext", "c4"] + ), + [] + ), + ( + OVWeightQuantizationConfig( + dataset=load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + ), + ["dataset"] + ), + ( + OVWeightQuantizationConfig( + dataset=nncf.Dataset([np.zeros((1, 10))]) + ), + ["dataset"] + ), + ( + OVWeightQuantizationConfig( + tokenizer=AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased") + ), + ["tokenizer"] + ), + ( + OVQuantizationConfig( + dataset="wikitext" + ), + [] + ), + ( + { + "dataset": "wikitext" + }, + [] + ), + ( + OVQuantizationConfig( + dataset="wikitext", + ignored_scope={"names": ["op_name"]}, + subset_size=100, + preset=nncf.QuantizationPreset.MIXED, + model_type=nncf.ModelType.TRANSFORMER, + fast_bias_correction=True, + overflow_fix=OverflowFix.DISABLE + ), + [] + ), + ( + OVQuantizationConfig( + dataset=["wikitext", "c4"] + ), + [] + ), + ( + OVQuantizationConfig( + dataset=load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + ), + ["dataset"] + ), + ( + OVQuantizationConfig( + dataset=nncf.Dataset([np.zeros((1, 10))]) + ), + ["dataset"] + ), + ) + + @parameterized.expand(QUANTIZATION_CONFIGS) + def test_config_serialization(self, quantization_config: OVQuantizationConfigBase, non_equal_property_names: List[str]): + def str_to_enum(enum_cls, value): + for k, v in enum_cls.__members__.items(): + if getattr(enum_cls, k).value == value: + return v + raise ValueError(f"Could not convert string {value} to enum value of type {enum_cls}") + + ov_config = OVConfig(quantization_config=quantization_config) + with tempfile.TemporaryDirectory() as tmp_dir: + ov_config.save_pretrained(tmp_dir) + loaded_ov_config = OVConfig.from_pretrained(tmp_dir) + + if quantization_config is None: + self.assertEqual(loaded_ov_config.quantization_config, None) + return + for key, value in loaded_ov_config.quantization_config.items(): + initial_value = quantization_config[key] if isinstance(quantization_config, dict) else getattr(ov_config.quantization_config, key) + if key == "preset" or key == "overflow_fix": + # TODO: remove once NNCF is updated to 2.10 + self.assertTrue(isinstance(value, str)) + if key == "preset": + value = str_to_enum(nncf.QuantizationPreset, value) + else: + value = str_to_enum(OverflowFix, value) + if key in non_equal_property_names: + self.assertNotEqual(value, initial_value) + else: + self.assertEqual(value, initial_value) + + class InferRequestWrapperTest(unittest.TestCase): MODEL_ID = ("openai/whisper-tiny.en",) APPLY_CACHING = (False, True) From bfe982c896f93dc07cdd8a671f3a0040b8a4eb68 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 3 Apr 2024 13:31:59 +0200 Subject: [PATCH 04/15] Ignored scope changes. Tests pass. 
--- optimum/intel/openvino/configuration.py | 20 ++++++++++++-------- optimum/intel/openvino/quantization.py | 13 +++++-------- tests/openvino/test_quantization.py | 19 ++++++++++++++++--- 3 files changed, 33 insertions(+), 19 deletions(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 2d1a306022..7f16d39943 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -83,12 +83,12 @@ class OVQuantizationConfigBase(QuantizationConfigMixin): def __init__( self, dataset: Optional[Union[str, List[str], nncf.Dataset, datasets.Dataset]] = None, - ignored_scope: Optional[dict] = None, + ignored_scope: Optional[Union[dict, nncf.IgnoredScope]] = None, subset_size: Optional[int] = None, ): self.dataset = dataset - if ignored_scope is None: - ignored_scope = {} + if isinstance(ignored_scope, dict): + ignored_scope = nncf.IgnoredScope(**ignored_scope) self.ignored_scope = ignored_scope self.subset_size = subset_size @@ -112,9 +112,13 @@ def to_dict_without_properties(self, property_names: Union[List[str], Tuple[str] return result def to_dict(self) -> Dict[str, Any]: - if not is_serializable(self.dataset): - return self.to_dict_without_properties(("dataset",)) - return super().to_dict() + properties_to_omit = [] if is_serializable(self.dataset) else ["dataset"] + if isinstance(self.ignored_scope, nncf.IgnoredScope): + ignored_scope_as_dict = dict(names=self.ignored_scope.names, types=self.ignored_scope.types, + patterns=self.ignored_scope.patterns, validate=self.ignored_scope.validate) + with replace_properties_values(self, ["ignored_scope"], [ignored_scope_as_dict]): + return self.to_dict_without_properties(properties_to_omit) + return self.to_dict_without_properties(properties_to_omit) class OVConfig(BaseConfig): @@ -216,7 +220,7 @@ def __init__( self, dataset: Optional[Union[str, List[str], nncf.Dataset, datasets.Dataset]] = None, bits: int = 8, - ignored_scope: Optional[dict] = None, + ignored_scope: Optional[Union[dict, nncf.IgnoredScope]] = None, sym: bool = False, tokenizer: Optional[Any] = None, ratio: float = 1.0, @@ -272,7 +276,7 @@ class OVQuantizationConfig(OVQuantizationConfigBase): def __init__( self, dataset: Union[str, List[str], nncf.Dataset, datasets.Dataset], - ignored_scope: Optional[dict] = None, + ignored_scope: Optional[Union[dict, nncf.IgnoredScope]] = None, subset_size: Optional[int] = 300, preset: nncf.QuantizationPreset = nncf.QuantizationPreset.MIXED, model_type: nncf.ModelType = nncf.ModelType.TRANSFORMER, diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 24a145de71..9af851f70e 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -586,10 +586,6 @@ def _weight_only_quantization( if isinstance(config.sensitivity_metric, str): sensitivity_metric = getattr(SensitivityMetric, config.sensitivity_metric.upper()) - ignored_scope = None - if isinstance(config.ignored_scope, dict): - ignored_scope = IgnoredScope(**config.ignored_scope) - if config.bits == 8: mode = CompressWeightsMode.INT8_SYM if config.sym else CompressWeightsMode.INT8_ASYM else: @@ -603,7 +599,7 @@ def _weight_only_quantization( all_layers=config.all_layers, sensitivity_metric=sensitivity_metric, # awq=config.quant_method == QuantizationMethod.AWQ, - ignored_scope=ignored_scope, + ignored_scope=config.ignored_scope, dataset=dataset, # subset_size=config.subset_size if config.subset_size else 128, ) @@ -674,13 +670,14 @@ def 
_hybrid_quantization( """ ops_to_compress = _collect_ops_with_weights(model) - ignored_scope = quantization_config.ignored_scope if isinstance(quantization_config.ignored_scope, dict) else {} - ptq_ignored_scope = nncf.IgnoredScope(**ignored_scope) + ignored_scope: Union[nncf.IgnoredScope, None] = quantization_config.ignored_scope + ignored_scope = ignored_scope or nncf.IgnoredScope() + ptq_ignored_scope = copy.deepcopy(ignored_scope) ptq_ignored_scope.names += ops_to_compress wc_quantization_config = copy.deepcopy(quantization_config) wc_quantization_config.ignored_scope = ignored_scope - wc_quantization_config.ignored_scope["types"] = ignored_scope.get("types", []) + ["Convolution"] + wc_quantization_config.ignored_scope.types.append("Convolution") compressed_model = _weight_only_quantization(model, wc_quantization_config) subset_size = quantization_config.subset_size if quantization_config.subset_size else 200 diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index f909bdb172..63ab2e83d7 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -554,7 +554,7 @@ def test_ovmodel_load_large_model_with_additional_quantization_config(self): "all_layers": None, "sensitivity_metric": None, "dataset": None, - "ignored_scope": nncf.IgnoredScope(), + "ignored_scope": None, } compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params) @@ -717,7 +717,7 @@ class OVQuantizationConfigTest(unittest.TestCase): subset_size=100, quant_method=OVQuantizationMethod.DEFAULT, ), - [] + ["ignored_scope"] ), ( OVWeightQuantizationConfig( @@ -743,6 +743,12 @@ class OVQuantizationConfigTest(unittest.TestCase): ), ["tokenizer"] ), + ( + OVWeightQuantizationConfig( + ignored_scope=nncf.IgnoredScope(names=["op_name"]) + ), + ["ignored_scope"] + ), ( OVQuantizationConfig( dataset="wikitext" @@ -765,7 +771,7 @@ class OVQuantizationConfigTest(unittest.TestCase): fast_bias_correction=True, overflow_fix=OverflowFix.DISABLE ), - [] + ["ignored_scope"] ), ( OVQuantizationConfig( @@ -785,6 +791,13 @@ class OVQuantizationConfigTest(unittest.TestCase): ), ["dataset"] ), + ( + OVQuantizationConfig( + dataset=["wikitext", "c4"], + ignored_scope=nncf.IgnoredScope(names=["op_name"]) + ), + ["ignored_scope"] + ), ) @parameterized.expand(QUANTIZATION_CONFIGS) From e7d0d1451ac67afb9a6339607810b9b9d77f43cf Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 3 Apr 2024 15:44:09 +0200 Subject: [PATCH 05/15] Added documentation --- optimum/intel/openvino/configuration.py | 107 +++++++++++++++++------- optimum/intel/openvino/quantization.py | 23 +++-- tests/openvino/test_quantization.py | 25 ++++-- 3 files changed, 112 insertions(+), 43 deletions(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 7f16d39943..faea459dcd 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
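# [Editor's illustration, not part of the patch] A minimal sketch of how the hybrid quantization
# path changed above splits a user-provided ignored scope; the helper name is an assumption and
# the logic mirrors the diff in optimum/intel/openvino/quantization.py.
import copy
import nncf

def split_ignored_scope(user_scope, ops_to_compress):
    """Return (scope for weight compression, scope for full quantization)."""
    user_scope = user_scope or nncf.IgnoredScope()
    # Full quantization skips the weight-carrying ops, since their weights get compressed separately.
    ptq_scope = copy.deepcopy(user_scope)
    ptq_scope.names += ops_to_compress
    # Weight compression skips Convolution nodes, leaving them to the full quantization pass.
    wc_scope = copy.deepcopy(user_scope)
    wc_scope.types.append("Convolution")
    return wc_scope, ptq_scope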
import json -import logging from dataclasses import dataclass from enum import Enum -from typing import Any, Dict, List, Optional, Union, Iterable, Tuple +from typing import Any, Dict, List, Optional, Union, Tuple import datasets import nncf @@ -26,8 +25,6 @@ from optimum.configuration_utils import BaseConfig -logger = logging.getLogger(__name__) - _DEFAULT_4BIT_CONFIGS = { "databricks/dolly-v2-3b": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.5}, @@ -53,6 +50,9 @@ class replace_properties_values: + """ + A context manager for temporarily overriding an object's properties + """ def __init__(self, obj, property_names, property_values): self.obj = obj self.property_names = property_names @@ -80,12 +80,25 @@ def is_serializable(obj): @dataclass class OVQuantizationConfigBase(QuantizationConfigMixin): + """ + Base configuration class for quantization parameters + """ def __init__( self, dataset: Optional[Union[str, List[str], nncf.Dataset, datasets.Dataset]] = None, ignored_scope: Optional[Union[dict, nncf.IgnoredScope]] = None, subset_size: Optional[int] = None, ): + """ + + Args: + dataset (`str or List[str] or nncf.Dataset or datasets.Dataset`, *optional*): + The dataset used for data-aware weight compression or quantization with NNCF. + ignored_scope (`dict or nncf.IgnoredScope`, *optional*): + An ignored scope that defines the list of model nodes to be ignored during quantization. + subset_size (`int`, *optional*): + The maximum number of samples composing the calibration dataset. + """ self.dataset = dataset if isinstance(ignored_scope, dict): ignored_scope = nncf.IgnoredScope(**ignored_scope) @@ -93,20 +106,17 @@ def __init__( self.subset_size = subset_size def post_init(self): - if self.dataset is not None and isinstance(self.dataset, str): - llm_datasets = ["wikitext", "c4", "c4-new", "ptb", "ptb-new"] - stable_diffusion_datasets = [ - "conceptual_captions", - "laion/220k-GPT4Vision-captions-from-LIVIS", - "laion/filtered-wit", - ] - if self.dataset not in llm_datasets + stable_diffusion_datasets: - raise ValueError( - f"""You have entered a string value for dataset. You can only choose between - {llm_datasets} for LLLMs or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}""" - ) + if not (self.dataset is None or isinstance(self.dataset, (str, list, nncf.Dataset, datasets.Dataset))): + raise ValueError("Dataset must be a instance of either string, list of strings, nncf.Dataset or " + f"dataset.Dataset, but found {type(self.dataset)}") + if not(self.ignored_scope is None or isinstance(self.ignored_scope, nncf.IgnoredScope)): + raise ValueError("Ignored scope must be a instance of either dict, or nncf.IgnoredScope but found " + f"{type(self.dataset)}") def to_dict_without_properties(self, property_names: Union[List[str], Tuple[str]]) -> Dict[str, Any]: + """ + Call to_dict() with given properties overwritten with None. Useful for hiding non-serializable properties. + """ with replace_properties_values(self, property_names, [None] * len(property_names)): result = super().to_dict() return result @@ -180,10 +190,9 @@ class OVQuantizationMethod(str, Enum): class OVWeightQuantizationConfig(OVQuantizationConfigBase): """ This is a wrapper class about all possible attributes and features that you can play with a model that has been - loaded using `optimum-intel` api for quantization with NNCF. + loaded using `optimum-intel` api for weights compression with NNCF. Args: - bits (`int`, defaults to 8): The number of bits to quantize to. 
sym (`bool`, defaults to `False`): @@ -214,21 +223,24 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase): An ignored scope that defined the list of model control flow graph nodes to be ignored during quantization. subset_size (`int`, *optional*): The maximum number of samples composing the calibration dataset. + quant_method (`str`, defaults of OVQuantizationMethod.DEFAULT): + Weight compression method to apply. """ def __init__( self, - dataset: Optional[Union[str, List[str], nncf.Dataset, datasets.Dataset]] = None, bits: int = 8, - ignored_scope: Optional[Union[dict, nncf.IgnoredScope]] = None, sym: bool = False, tokenizer: Optional[Any] = None, + dataset: Optional[Union[str, List[str], nncf.Dataset, datasets.Dataset]] = None, ratio: float = 1.0, group_size: Optional[int] = None, all_layers: Optional[bool] = None, sensitivity_metric: Optional[str] = None, + ignored_scope: Optional[Union[dict, nncf.IgnoredScope]] = None, subset_size: Optional[int] = None, quant_method: Optional[Union[QuantizationMethod, OVQuantizationMethod]] = OVQuantizationMethod.DEFAULT, + **kwargs, ): super().__init__(dataset, ignored_scope, subset_size) self.bits = bits @@ -251,6 +263,18 @@ def post_init(self): raise ValueError("`ratio` must between 0 and 1.") if self.group_size is not None and self.group_size != -1 and self.group_size <= 0: raise ValueError("`group_size` must be greater than 0 or equal to -1") + if self.dataset is not None and isinstance(self.dataset, str): + llm_datasets = ["wikitext", "c4", "c4-new", "ptb", "ptb-new"] + stable_diffusion_datasets = [ + "conceptual_captions", + "laion/220k-GPT4Vision-captions-from-LIVIS", + "laion/filtered-wit", + ] + if self.dataset not in llm_datasets + stable_diffusion_datasets: + raise ValueError( + f"""You have entered a string value for dataset. You can only choose between + {llm_datasets} for LLLMs or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}""" + ) if self.bits not in [4, 8]: raise ValueError(f"Only support quantization to [4,8] bits but found {self.bits}") @@ -278,12 +302,36 @@ def __init__( dataset: Union[str, List[str], nncf.Dataset, datasets.Dataset], ignored_scope: Optional[Union[dict, nncf.IgnoredScope]] = None, subset_size: Optional[int] = 300, - preset: nncf.QuantizationPreset = nncf.QuantizationPreset.MIXED, + preset: nncf.QuantizationPreset = None, model_type: nncf.ModelType = nncf.ModelType.TRANSFORMER, fast_bias_correction: bool = True, overflow_fix: OverflowFix = OverflowFix.DISABLE, + **kwargs, ): - + """ + Configuration class containing parameters related to model quantization with NNCF. Compared to weight + compression, during quantization both weights and activations are converted to lower precision. + Args: + dataset (`str or List[str] or nncf.Dataset or datasets.Dataset`): + A dataset used for quantization parameters calibration. Required parameter. + ignored_scope (`dict or nncf.IgnoredScope`, *optional*): + An ignored scope that defines the list of model nodes to be ignored during quantization. + subset_size (`int`, *optional*): + The maximum number of samples composing the calibration dataset. + preset (`nncf.QuantizationPreset`, *optional*): + A preset controls the quantization mode (symmetric and asymmetric). + It can take the following values: + - `performance`: Symmetric quantization of weights and activations. + - `mixed`: Symmetric quantization of weights and asymmetric quantization of activations. + Default value is None. 
In this case, `mixed` preset is used for `transformer` + model type otherwise `performance`. + model_type (`nncf.ModelType`, defaults to nncf.ModelType.TRANSFORMER): + Model type is needed to specify additional patterns in the model. Supported only `transformer` now. + fast_bias_correction (`bool`, defaults to True): + Whether to apply fast or full bias correction algorithm. + overflow_fix (`bool`, default to OverflowFix.DISABLE): + Parameter for controlling overflow fix setting. + """ super().__init__(dataset, ignored_scope, subset_size) self.preset = preset self.model_type = model_type @@ -296,16 +344,17 @@ def post_init(self): Safety checker that arguments are correct """ super().post_init() - - # if self.dataset is None: - # raise ValueError( - # "`dataset` is needed to compute the activations range during the calibration step and was not provided." - # " In case you only want to apply quantization on the weights, please set `weights_only=True`." - # ) + if self.dataset is None: + raise ValueError( + "`dataset` is needed to compute the activations range during the calibration step and was not provided." + " In case you only want to apply quantization on the weights, please run weight-only quantization." + ) def to_dict(self) -> Dict[str, Any]: # TODO: remove once NNCF is updated to 2.10 - with replace_properties_values(self, ("overflow_fix", "preset"), (self.overflow_fix.value, self.preset.value)): + overflow_fix_value = None if self.overflow_fix is None else self.overflow_fix.value + preset_value = None if self.preset is None else self.preset.value + with replace_properties_values(self, ("overflow_fix", "preset"), (overflow_fix_value, preset_value)): return super().to_dict() diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 9af851f70e..d6328165a9 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -24,7 +24,7 @@ import openvino import torch import transformers -from nncf import CompressWeightsMode, IgnoredScope, SensitivityMetric +from nncf import CompressWeightsMode, SensitivityMetric from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters from nncf.torch import register_module from nncf.torch.initialization import PTInitializingDataLoader @@ -217,7 +217,7 @@ def quantize( Args: save_directory (`Union[str, Path]`): The directory where the quantized model should be saved. - quantization_config (`OVConfig`, *optional*): + ov_config (`OVConfig`, *optional*): The configuration containing the parameters related to quantization. file_name (`str`, *optional*): The model file name to use when saving the model. Overwrites the default file name `"model.onnx"`. 
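# [Editor's illustration, not part of the patch] A small sketch of the enum handling performed by
# OVQuantizationConfig.to_dict() above, which keeps the serialized config JSON-friendly; the literal
# string values are assumptions based on the NNCF enums used in this file.
import nncf
from nncf.quantization.advanced_parameters import OverflowFix

preset = nncf.QuantizationPreset.MIXED
overflow_fix = OverflowFix.DISABLE
# Enum members are replaced by their string values before serialization.
serialized = {"preset": preset.value, "overflow_fix": overflow_fix.value}
assert serialized == {"preset": "mixed", "overflow_fix": "disable"}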
@@ -236,7 +236,8 @@ def quantize( >>> # or >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english") >>> quantizer = OVQuantizer.from_pretrained(model, task="text-classification") - >>> quantizer.quantize(calibration_dataset=calibration_dataset, save_directory="./quantized_model") + >>> ov_config = OVConfig(quantization_config=OVQuantizationConfig(dataset=calibration_dataset)) + >>> quantizer.quantize(ov_config=ov_config, save_directory="./quantized_model") >>> optimized_model = OVModelForSequenceClassification.from_pretrained("./quantized_model") ``` @@ -245,7 +246,8 @@ def quantize( >>> from transformers import AutoModelForCausalLM >>> model = AutoModelForCausalLM.from_pretrained("databricks/dolly-v2-3b") >>> quantizer = OVQuantizer.from_pretrained(model, task="text-generation") - >>> quantizer.quantize(save_directory="./quantized_model", weights_only=True) + >>> ov_config = OVConfig(quantization_config=OVWeightQuantizationConfig(bits=8, sym=True)) + >>> quantizer.quantize(ov_config=ov_config, save_directory="./quantized_model") >>> optimized_model = OVModelForCausalLM.from_pretrained("./quantized_model") ``` """ @@ -455,6 +457,8 @@ def _quantize_torchmodel( except FileNotFoundError: pass + ov_config.save_pretrained(save_directory) + @staticmethod def _save_pretrained(model: openvino.runtime.Model, output_path: str): compress_quantize_weights_transformation(model) @@ -534,7 +538,7 @@ def get_calibration_dataset( def _get_calibration_dataloader( self, - calibration_dataset: Union[Dataset, nncf.Dataset], + calibration_dataset: "Dataset", batch_size: int, remove_unused_columns: bool, data_collator: Optional[DataCollator] = None, @@ -598,10 +602,10 @@ def _weight_only_quantization( group_size=config.group_size, all_layers=config.all_layers, sensitivity_metric=sensitivity_metric, - # awq=config.quant_method == QuantizationMethod.AWQ, + # awq=config.quant_method == QuantizationMethod.AWQ, # TODO : enable from nncf v2.9.0 ignored_scope=config.ignored_scope, dataset=dataset, - # subset_size=config.subset_size if config.subset_size else 128, + # subset_size=config.subset_size if config.subset_size else 128, # TODO : enable from nncf v2.9.0 ) @@ -686,8 +690,9 @@ def _hybrid_quantization( calibration_dataset=nncf.Dataset(dataset), model_type=nncf.ModelType.TRANSFORMER, ignored_scope=ptq_ignored_scope, - # The SQ algo should be disabled for MatMul nodes because their weights are already compressed - advanced_parameters=nncf.AdvancedQuantizationParameters(AdvancedSmoothQuantParameters(matmul=-1)), + # SQ algo should be disabled for MatMul nodes because their weights are already compressed + advanced_parameters=nncf.AdvancedQuantizationParameters( + smooth_quant_alphas=AdvancedSmoothQuantParameters(matmul=-1)), subset_size=subset_size, ) return quantized_model diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 63ab2e83d7..ba90eb85bc 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -18,6 +18,7 @@ import tempfile import unittest from collections import defaultdict +from enum import Enum from functools import partial from typing import List @@ -117,6 +118,10 @@ def preprocess_function(examples, tokenizer): outputs = model(**tokens) self.assertTrue("logits" in outputs) + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(ov_config.quantization_config.to_dict(), 
loaded_config.quantization_config) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) def test_ovmodel_static_quantization(self, model_cls, model_name, expected_fake_quantize, expected_int8): task = model_cls.export_feature @@ -270,6 +275,15 @@ def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_i outputs = model(**tokens) self.assertTrue("logits" in outputs) + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + original_config_as_dict = OVWeightQuantizationConfig(bits=8, sym=True).to_dict() + for k in original_config_as_dict.keys(): + v = original_config_as_dict[k] + if isinstance(v, Enum): + original_config_as_dict[k] = v.value + self.assertEqual(original_config_as_dict, loaded_config.quantization_config) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS) def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_pt_int8, expected_ov_int8): task = model_cls.export_feature @@ -820,11 +834,12 @@ def str_to_enum(enum_cls, value): initial_value = quantization_config[key] if isinstance(quantization_config, dict) else getattr(ov_config.quantization_config, key) if key == "preset" or key == "overflow_fix": # TODO: remove once NNCF is updated to 2.10 - self.assertTrue(isinstance(value, str)) - if key == "preset": - value = str_to_enum(nncf.QuantizationPreset, value) - else: - value = str_to_enum(OverflowFix, value) + if getattr(quantization_config, key) is not None: + self.assertTrue(isinstance(value, str)) + if key == "preset": + value = str_to_enum(nncf.QuantizationPreset, value) + else: + value = str_to_enum(OverflowFix, value) if key in non_equal_property_names: self.assertNotEqual(value, initial_value) else: From 431775e3156193d6ce36f6b78e91a0a302b48f85 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 3 Apr 2024 15:45:28 +0200 Subject: [PATCH 06/15] Linters --- optimum/intel/openvino/configuration.py | 27 +++++-- optimum/intel/openvino/quantization.py | 8 +- tests/openvino/test_quantization.py | 101 +++++++----------------- 3 files changed, 51 insertions(+), 85 deletions(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index faea459dcd..ffd4a5503d 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -14,7 +14,7 @@ import json from dataclasses import dataclass from enum import Enum -from typing import Any, Dict, List, Optional, Union, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union import datasets import nncf @@ -53,6 +53,7 @@ class replace_properties_values: """ A context manager for temporarily overriding an object's properties """ + def __init__(self, obj, property_names, property_values): self.obj = obj self.property_names = property_names @@ -83,6 +84,7 @@ class OVQuantizationConfigBase(QuantizationConfigMixin): """ Base configuration class for quantization parameters """ + def __init__( self, dataset: Optional[Union[str, List[str], nncf.Dataset, datasets.Dataset]] = None, @@ -107,11 +109,15 @@ def __init__( def post_init(self): if not (self.dataset is None or isinstance(self.dataset, (str, list, nncf.Dataset, datasets.Dataset))): - raise ValueError("Dataset must be a instance of either string, list of strings, nncf.Dataset or " - f"dataset.Dataset, but found {type(self.dataset)}") - if not(self.ignored_scope is None or isinstance(self.ignored_scope, nncf.IgnoredScope)): - raise 
ValueError("Ignored scope must be a instance of either dict, or nncf.IgnoredScope but found " - f"{type(self.dataset)}") + raise ValueError( + "Dataset must be a instance of either string, list of strings, nncf.Dataset or " + f"dataset.Dataset, but found {type(self.dataset)}" + ) + if not (self.ignored_scope is None or isinstance(self.ignored_scope, nncf.IgnoredScope)): + raise ValueError( + "Ignored scope must be a instance of either dict, or nncf.IgnoredScope but found " + f"{type(self.dataset)}" + ) def to_dict_without_properties(self, property_names: Union[List[str], Tuple[str]]) -> Dict[str, Any]: """ @@ -124,8 +130,12 @@ def to_dict_without_properties(self, property_names: Union[List[str], Tuple[str] def to_dict(self) -> Dict[str, Any]: properties_to_omit = [] if is_serializable(self.dataset) else ["dataset"] if isinstance(self.ignored_scope, nncf.IgnoredScope): - ignored_scope_as_dict = dict(names=self.ignored_scope.names, types=self.ignored_scope.types, - patterns=self.ignored_scope.patterns, validate=self.ignored_scope.validate) + ignored_scope_as_dict = dict( + names=self.ignored_scope.names, + types=self.ignored_scope.types, + patterns=self.ignored_scope.patterns, + validate=self.ignored_scope.validate, + ) with replace_properties_values(self, ["ignored_scope"], [ignored_scope_as_dict]): return self.to_dict_without_properties(properties_to_omit) return self.to_dict_without_properties(properties_to_omit) @@ -227,6 +237,7 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase): Weight compression method to apply. """ + def __init__( self, bits: int = 8, diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index d6328165a9..47ca4511e1 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -266,8 +266,9 @@ def quantize( logger.warning("`quantization_config` was not provided, a default weight quantization will be applied") if isinstance(self.model, OVBaseModel): - self._quantize_ovbasemodel(ov_config, save_directory, batch_size, data_collator, remove_unused_columns, - **kwargs) + self._quantize_ovbasemodel( + ov_config, save_directory, batch_size, data_collator, remove_unused_columns, **kwargs + ) elif isinstance(self.model, torch.nn.Module): logger.warning( @@ -692,7 +693,8 @@ def _hybrid_quantization( ignored_scope=ptq_ignored_scope, # SQ algo should be disabled for MatMul nodes because their weights are already compressed advanced_parameters=nncf.AdvancedQuantizationParameters( - smooth_quant_alphas=AdvancedSmoothQuantParameters(matmul=-1)), + smooth_quant_alphas=AdvancedSmoothQuantParameters(matmul=-1) + ), subset_size=subset_size, ) return quantized_model diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index ba90eb85bc..927b89fc0a 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -699,23 +699,20 @@ class OVQuantizationConfigTest(unittest.TestCase): None, [], ), - ( - OVWeightQuantizationConfig(), - [] - ), + (OVWeightQuantizationConfig(), []), ( OVWeightQuantizationConfig( bits=8, sym=True, ), - [] + [], ), ( { "bits": 8, "sym": True, }, - [] + [], ), ( OVWeightQuantizationConfig( @@ -731,50 +728,18 @@ class OVQuantizationConfigTest(unittest.TestCase): subset_size=100, quant_method=OVQuantizationMethod.DEFAULT, ), - ["ignored_scope"] - ), - ( - OVWeightQuantizationConfig( - dataset=["wikitext", "c4"] - ), - [] + ["ignored_scope"], ), + (OVWeightQuantizationConfig(dataset=["wikitext", "c4"]), []), + 
(OVWeightQuantizationConfig(dataset=load_dataset("wikitext", "wikitext-2-raw-v1", split="test")), ["dataset"]), + (OVWeightQuantizationConfig(dataset=nncf.Dataset([np.zeros((1, 10))])), ["dataset"]), ( - OVWeightQuantizationConfig( - dataset=load_dataset("wikitext", "wikitext-2-raw-v1", split="test") - ), - ["dataset"] - ), - ( - OVWeightQuantizationConfig( - dataset=nncf.Dataset([np.zeros((1, 10))]) - ), - ["dataset"] - ), - ( - OVWeightQuantizationConfig( - tokenizer=AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased") - ), - ["tokenizer"] - ), - ( - OVWeightQuantizationConfig( - ignored_scope=nncf.IgnoredScope(names=["op_name"]) - ), - ["ignored_scope"] - ), - ( - OVQuantizationConfig( - dataset="wikitext" - ), - [] - ), - ( - { - "dataset": "wikitext" - }, - [] + OVWeightQuantizationConfig(tokenizer=AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")), + ["tokenizer"], ), + (OVWeightQuantizationConfig(ignored_scope=nncf.IgnoredScope(names=["op_name"])), ["ignored_scope"]), + (OVQuantizationConfig(dataset="wikitext"), []), + ({"dataset": "wikitext"}, []), ( OVQuantizationConfig( dataset="wikitext", @@ -783,39 +748,23 @@ class OVQuantizationConfigTest(unittest.TestCase): preset=nncf.QuantizationPreset.MIXED, model_type=nncf.ModelType.TRANSFORMER, fast_bias_correction=True, - overflow_fix=OverflowFix.DISABLE - ), - ["ignored_scope"] - ), - ( - OVQuantizationConfig( - dataset=["wikitext", "c4"] + overflow_fix=OverflowFix.DISABLE, ), - [] + ["ignored_scope"], ), + (OVQuantizationConfig(dataset=["wikitext", "c4"]), []), + (OVQuantizationConfig(dataset=load_dataset("wikitext", "wikitext-2-raw-v1", split="test")), ["dataset"]), + (OVQuantizationConfig(dataset=nncf.Dataset([np.zeros((1, 10))])), ["dataset"]), ( - OVQuantizationConfig( - dataset=load_dataset("wikitext", "wikitext-2-raw-v1", split="test") - ), - ["dataset"] - ), - ( - OVQuantizationConfig( - dataset=nncf.Dataset([np.zeros((1, 10))]) - ), - ["dataset"] - ), - ( - OVQuantizationConfig( - dataset=["wikitext", "c4"], - ignored_scope=nncf.IgnoredScope(names=["op_name"]) - ), - ["ignored_scope"] + OVQuantizationConfig(dataset=["wikitext", "c4"], ignored_scope=nncf.IgnoredScope(names=["op_name"])), + ["ignored_scope"], ), ) @parameterized.expand(QUANTIZATION_CONFIGS) - def test_config_serialization(self, quantization_config: OVQuantizationConfigBase, non_equal_property_names: List[str]): + def test_config_serialization( + self, quantization_config: OVQuantizationConfigBase, non_equal_property_names: List[str] + ): def str_to_enum(enum_cls, value): for k, v in enum_cls.__members__.items(): if getattr(enum_cls, k).value == value: @@ -831,7 +780,11 @@ def str_to_enum(enum_cls, value): self.assertEqual(loaded_ov_config.quantization_config, None) return for key, value in loaded_ov_config.quantization_config.items(): - initial_value = quantization_config[key] if isinstance(quantization_config, dict) else getattr(ov_config.quantization_config, key) + initial_value = ( + quantization_config[key] + if isinstance(quantization_config, dict) + else getattr(ov_config.quantization_config, key) + ) if key == "preset" or key == "overflow_fix": # TODO: remove once NNCF is updated to 2.10 if getattr(quantization_config, key) is not None: From c8edf994b2ac21ae0fd7d1f269aa1bb805335710 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 3 Apr 2024 16:08:12 +0200 Subject: [PATCH 07/15] Linters --- optimum/intel/openvino/configuration.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git 
a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index ffd4a5503d..f6c78a5e1b 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -75,7 +75,7 @@ def is_serializable(obj): try: json.dumps(obj) return True - except: + except Exception: return False @@ -130,12 +130,12 @@ def to_dict_without_properties(self, property_names: Union[List[str], Tuple[str] def to_dict(self) -> Dict[str, Any]: properties_to_omit = [] if is_serializable(self.dataset) else ["dataset"] if isinstance(self.ignored_scope, nncf.IgnoredScope): - ignored_scope_as_dict = dict( - names=self.ignored_scope.names, - types=self.ignored_scope.types, - patterns=self.ignored_scope.patterns, - validate=self.ignored_scope.validate, - ) + ignored_scope_as_dict = { + "names": self.ignored_scope.names, + "types": self.ignored_scope.types, + "patterns": self.ignored_scope.patterns, + "validate": self.ignored_scope.validate, + } with replace_properties_values(self, ["ignored_scope"], [ignored_scope_as_dict]): return self.to_dict_without_properties(properties_to_omit) return self.to_dict_without_properties(properties_to_omit) From 77faf7fe43d5f9def6d36b68b54a1cf06efc9b49 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 3 Apr 2024 16:50:20 +0200 Subject: [PATCH 08/15] Tweak ignored scope serialization --- optimum/intel/openvino/configuration.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index f6c78a5e1b..947aa7ca82 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -130,13 +130,7 @@ def to_dict_without_properties(self, property_names: Union[List[str], Tuple[str] def to_dict(self) -> Dict[str, Any]: properties_to_omit = [] if is_serializable(self.dataset) else ["dataset"] if isinstance(self.ignored_scope, nncf.IgnoredScope): - ignored_scope_as_dict = { - "names": self.ignored_scope.names, - "types": self.ignored_scope.types, - "patterns": self.ignored_scope.patterns, - "validate": self.ignored_scope.validate, - } - with replace_properties_values(self, ["ignored_scope"], [ignored_scope_as_dict]): + with replace_properties_values(self, ["ignored_scope"], [self.ignored_scope.__dict__]): return self.to_dict_without_properties(properties_to_omit) return self.to_dict_without_properties(properties_to_omit) From 123e2271f69b3c9769e2dc231afc466e8dd38bf6 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 4 Apr 2024 11:12:35 +0200 Subject: [PATCH 09/15] Added deprecation errors, tweak docs --- optimum/intel/openvino/configuration.py | 25 ++++++++---- optimum/intel/openvino/quantization.py | 52 +++++++++++++++++-------- tests/openvino/test_quantization.py | 24 ++++++++++++ 3 files changed, 78 insertions(+), 23 deletions(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 947aa7ca82..75b5ae4521 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
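# [Editor's illustration, not part of the patch] The ignored-scope serialization tweak above relies on
# nncf.IgnoredScope being a plain dataclass, so its __dict__ (the names/types/patterns/validate fields
# referenced earlier in this file) is already JSON-friendly. A minimal sketch:
import json
import nncf

scope = nncf.IgnoredScope(names=["op_name"], types=["Convolution"])
print(json.dumps(scope.__dict__))  # e.g. {"names": ["op_name"], ..., "validate": true}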
import json +import logging from dataclasses import dataclass from enum import Enum from typing import Any, Dict, List, Optional, Tuple, Union @@ -26,6 +27,8 @@ from optimum.configuration_utils import BaseConfig +logger = logging.getLogger(__name__) + _DEFAULT_4BIT_CONFIGS = { "databricks/dolly-v2-3b": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.5}, "EleutherAI/gpt-j-6b": {"bits": 4, "sym": False, "group_size": 64}, @@ -92,7 +95,6 @@ def __init__( subset_size: Optional[int] = None, ): """ - Args: dataset (`str or List[str] or nncf.Dataset or datasets.Dataset`, *optional*): The dataset used for data-aware weight compression or quantization with NNCF. @@ -121,8 +123,10 @@ def post_init(self): def to_dict_without_properties(self, property_names: Union[List[str], Tuple[str]]) -> Dict[str, Any]: """ - Call to_dict() with given properties overwritten with None. Useful for hiding non-serializable properties. + Calls to_dict() with given properties overwritten with None. Useful for hiding non-serializable properties. """ + if len(property_names) == 0: + return super().to_dict() with replace_properties_values(self, property_names, [None] * len(property_names)): result = super().to_dict() return result @@ -143,7 +147,7 @@ def __init__( self, input_info: Optional[List] = None, save_onnx_model: bool = False, - quantization_config: Optional[Union[Dict, OVQuantizationConfigBase]] = None, + quantization_config: Optional[Union[dict, OVQuantizationConfigBase]] = None, dtype: Optional[str] = None, **kwargs, ): @@ -154,6 +158,13 @@ def __init__( self.quantization_config = quantization_config self.compression = None # A backward-compatability field for training-time compression parameters + if isinstance(self.quantization_config, dict): + # Config is loaded as dict during deserialization + logger.info( + "`quantization_config` was provided as a dict, in this form it can't be used for quantization. " + "Please provide config as an instance of OVWeightQuantizationConfig or OVQuantizationConfig" + ) + bits = ( self.quantization_config.bits if isinstance(self.quantization_config, OVWeightQuantizationConfig) else None ) @@ -194,8 +205,8 @@ class OVQuantizationMethod(str, Enum): class OVWeightQuantizationConfig(OVQuantizationConfigBase): """ This is a wrapper class about all possible attributes and features that you can play with a model that has been - loaded using `optimum-intel` api for weights compression with NNCF. - + loaded using `optimum-intel` api for weight-only quantization with NNCF. For full model quantization please see + OVQuantizationConfig. Args: bits (`int`, defaults to 8): The number of bits to quantize to. @@ -229,7 +240,6 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase): The maximum number of samples composing the calibration dataset. quant_method (`str`, defaults of OVQuantizationMethod.DEFAULT): Weight compression method to apply. - """ def __init__( @@ -316,6 +326,7 @@ def __init__( """ Configuration class containing parameters related to model quantization with NNCF. Compared to weight compression, during quantization both weights and activations are converted to lower precision. + For weight-only model quantization please see OVWeightQuantizationConfig. Args: dataset (`str or List[str] or nncf.Dataset or datasets.Dataset`): A dataset used for quantization parameters calibration. Required parameter. 
@@ -356,7 +367,7 @@ def post_init(self): ) def to_dict(self) -> Dict[str, Any]: - # TODO: remove once NNCF is updated to 2.10 + # TODO: remove code below once NNCF is updated to 2.10 overflow_fix_value = None if self.overflow_fix is None else self.overflow_fix.value preset_value = None if self.preset is None else self.preset.value with replace_properties_values(self, ("overflow_fix", "preset"), (overflow_fix_value, preset_value)): diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 47ca4511e1..5e11dfbba2 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -218,10 +218,11 @@ def quantize( save_directory (`Union[str, Path]`): The directory where the quantized model should be saved. ov_config (`OVConfig`, *optional*): - The configuration containing the parameters related to quantization. + The configuration containing the parameters related to quantization. If not provided, 8-bit symmetric + weight-only quantization will be applied. file_name (`str`, *optional*): The model file name to use when saving the model. Overwrites the default file name `"model.onnx"`. - batch_size (`int`, defaults to 8): + batch_size (`int`, defaults to 1): The number of calibration samples to load per batch. data_collator (`DataCollator`, *optional*): The function to use to form a batch from a list of elements of the calibration dataset. @@ -251,6 +252,17 @@ def quantize( >>> optimized_model = OVModelForCausalLM.from_pretrained("./quantized_model") ``` """ + if "calibration_dataset" in kwargs: + raise ValueError( + "`calibration_dataset` argument is deprecated. Please provide calibration dataset " + "with `ov_config.quantization_config.dataset`." + ) + if "weights_only" in kwargs: + raise ValueError( + "`weights_only` argument is deprecated. Please provide `ov_config.quantization_config` " + "as an instance of OVWeightQuantizationConfig for weight-only compression." 
+ ) + if save_directory is None: # TODO : can be set to self.model.config.name_or_path for OVModels when not provided raise ValueError("`save_directory` needs to be specified") @@ -263,7 +275,7 @@ def quantize( quantization_config = ov_config.quantization_config if quantization_config is None: ov_config.quantization_config = OVWeightQuantizationConfig(bits=8, sym=True) - logger.warning("`quantization_config` was not provided, a default weight quantization will be applied") + logger.info("`quantization_config` was not provided, 8-bit symmetric weight quantization will be applied.") if isinstance(self.model, OVBaseModel): self._quantize_ovbasemodel( @@ -276,7 +288,7 @@ def quantize( "To convert a PyTorch model to OpenVINO, you can set `export=True` when loading your model as `OVModelForXxx.from_pretrained(..., export=True)`" ) self._quantize_torchmodel( - ov_config, save_directory, file_name, batch_size, data_collator, remove_unused_columns + ov_config, save_directory, file_name, batch_size, data_collator, remove_unused_columns, **kwargs ) else: raise TypeError(f"Unsupported model type: {type(self.model)}") @@ -297,6 +309,7 @@ def _quantize_ovbasemodel( if isinstance(quantization_config, OVWeightQuantizationConfig): _weight_only_quantization(self.model.model, quantization_config) self.model.save_pretrained(save_directory) + ov_config.save_pretrained(save_directory) return if not isinstance(quantization_config, OVQuantizationConfig): raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}") @@ -344,6 +357,7 @@ def _quantize_ovbasemodel( ) self.model.model = quantized_model self.model.save_pretrained(save_directory) + ov_config.save_pretrained(save_directory) def _quantize_torchmodel( self, @@ -391,18 +405,24 @@ def _quantize_torchmodel( quantization_config = ov_config.quantization_config if isinstance(quantization_config, OVWeightQuantizationConfig): - if stateful: - # patch model before weight compression - model = patch_model_with_bettertransformer(model) - - dummy_inputs = onnx_config.generate_dummy_inputs(framework="pt") - device = get_model_device(model) - dummy_inputs = tree_map( - lambda value: value.to(device) if isinstance(value, torch.Tensor) else value, dummy_inputs - ) - check_dummy_inputs_are_allowed(model, dummy_inputs) - - nncf.compress_weights(model, dataset=nncf.Dataset([dummy_inputs])) + dataset = quantization_config.dataset + if not isinstance(dataset, nncf.Dataset): + if dataset is not None: + raise ValueError( + "Please provide `dataset` for weight compression as an instance of `nncf.Dataset`." 
+ ) + if stateful: + # patch model before weight compression + model = patch_model_with_bettertransformer(model) + + dummy_inputs = onnx_config.generate_dummy_inputs(framework="pt") + device = get_model_device(model) + dummy_inputs = tree_map( + lambda value: value.to(device) if isinstance(value, torch.Tensor) else value, dummy_inputs + ) + check_dummy_inputs_are_allowed(model, dummy_inputs) + dataset = nncf.Dataset([dummy_inputs]) + nncf.compress_weights(model, dataset=dataset) else: if not isinstance(quantization_config, OVQuantizationConfig): raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}") diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 927b89fc0a..be99e6054d 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -160,6 +160,10 @@ def preprocess_function(examples, tokenizer): outputs = model(**tokens) self.assertTrue("logits" in outputs) + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config) + class OVWeightCompressionTest(unittest.TestCase): # TODO : add models @@ -305,6 +309,10 @@ def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_p outputs = model(**tokens) self.assertTrue("logits" in outputs) + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(OVWeightQuantizationConfig(bits=8, sym=True).to_dict(), loaded_config.quantization_config) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS) def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_int8, expected_int4): task = model_cls.export_feature @@ -332,6 +340,10 @@ def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_i outputs = model(**tokens) self.assertTrue("logits" in outputs) + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config) + @parameterized.expand(SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS) @unittest.skipIf(not IS_SUPPORT_STATEFUL, "Stateful models supported only in 2023.3 and above") def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_id, expected_pt_int8, expected_ov_int8): @@ -354,6 +366,10 @@ def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_id, exp outputs = model(**tokens) self.assertTrue("logits" in outputs) + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(OVWeightQuantizationConfig(bits=8, sym=True).to_dict(), loaded_config.quantization_config) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION) def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type): model = model_cls.from_pretrained(MODEL_NAMES[model_type], export=True, load_in_8bit=True, stateful=False) @@ -611,6 +627,10 @@ def preprocess_function(examples, tokenizer): except RuntimeError: self.fail("Loading BERT QA model a second time failed") + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config) 
+ @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_ovmodel_static_quantization(self, model_name): def preprocess_function(examples, tokenizer): @@ -646,6 +666,10 @@ def preprocess_function(examples, tokenizer): except RuntimeError: self.fail("Loading BERT QA model a second time failed") + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config) + class OVTrainerTest(unittest.TestCase): SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (("distilbert-base-uncased", 50, 38),) From 20fd76191787f44111428c367709d4296ed39265 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 10 Apr 2024 13:21:01 +0200 Subject: [PATCH 10/15] Addressed minor comments --- optimum/intel/openvino/configuration.py | 51 ++++++++++---------- optimum/intel/openvino/modeling_decoder.py | 2 +- optimum/intel/openvino/modeling_diffusion.py | 2 +- optimum/intel/openvino/quantization.py | 33 ++++++++----- tests/openvino/test_quantization.py | 8 +-- 5 files changed, 53 insertions(+), 43 deletions(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 75b5ae4521..030490061e 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -52,7 +52,7 @@ } -class replace_properties_values: +class _replace_properties_values: """ A context manager for temporarily overriding an object's properties """ @@ -74,7 +74,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): setattr(self.obj, property_name, old_property_value) -def is_serializable(obj): +def _is_serializable(obj): try: json.dumps(obj) return True @@ -92,7 +92,7 @@ def __init__( self, dataset: Optional[Union[str, List[str], nncf.Dataset, datasets.Dataset]] = None, ignored_scope: Optional[Union[dict, nncf.IgnoredScope]] = None, - subset_size: Optional[int] = None, + num_samples: Optional[int] = None, ): """ Args: @@ -100,14 +100,14 @@ def __init__( The dataset used for data-aware weight compression or quantization with NNCF. ignored_scope (`dict or nncf.IgnoredScope`, *optional*): An ignored scope that defines the list of model nodes to be ignored during quantization. - subset_size (`int`, *optional*): + num_samples (`int`, *optional*): The maximum number of samples composing the calibration dataset. """ self.dataset = dataset if isinstance(ignored_scope, dict): ignored_scope = nncf.IgnoredScope(**ignored_scope) self.ignored_scope = ignored_scope - self.subset_size = subset_size + self.num_samples = num_samples def post_init(self): if not (self.dataset is None or isinstance(self.dataset, (str, list, nncf.Dataset, datasets.Dataset))): @@ -121,22 +121,22 @@ def post_init(self): f"{type(self.dataset)}" ) - def to_dict_without_properties(self, property_names: Union[List[str], Tuple[str]]) -> Dict[str, Any]: + def _to_dict_without_properties(self, property_names: Union[List[str], Tuple[str]]) -> Dict[str, Any]: """ Calls to_dict() with given properties overwritten with None. Useful for hiding non-serializable properties. 
""" if len(property_names) == 0: return super().to_dict() - with replace_properties_values(self, property_names, [None] * len(property_names)): + with _replace_properties_values(self, property_names, [None] * len(property_names)): result = super().to_dict() return result def to_dict(self) -> Dict[str, Any]: - properties_to_omit = [] if is_serializable(self.dataset) else ["dataset"] + properties_to_omit = [] if _is_serializable(self.dataset) else ["dataset"] if isinstance(self.ignored_scope, nncf.IgnoredScope): - with replace_properties_values(self, ["ignored_scope"], [self.ignored_scope.__dict__]): - return self.to_dict_without_properties(properties_to_omit) - return self.to_dict_without_properties(properties_to_omit) + with _replace_properties_values(self, ["ignored_scope"], [self.ignored_scope.__dict__]): + return self._to_dict_without_properties(properties_to_omit) + return self._to_dict_without_properties(properties_to_omit) class OVConfig(BaseConfig): @@ -180,10 +180,10 @@ def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False): for name, value in model_inputs.items() ] - def to_dict_safe(self, to_diff_dict: bool = False) -> Dict[str, Any]: + def _to_dict_safe(self, to_diff_dict: bool = False) -> Dict[str, Any]: if self.quantization_config is None: # Parent to_dict() implementation does not support quantization_config being None - with replace_properties_values(self, ("quantization_config",), (OVQuantizationConfigBase(),)): + with _replace_properties_values(self, ("quantization_config",), (OVQuantizationConfigBase(),)): result = super().to_diff_dict() if to_diff_dict else super().to_dict() del result["quantization_config"] else: @@ -191,10 +191,10 @@ def to_dict_safe(self, to_diff_dict: bool = False) -> Dict[str, Any]: return result def to_dict(self) -> Dict[str, Any]: - return self.to_dict_safe(to_diff_dict=False) + return self._to_dict_safe(to_diff_dict=False) def to_diff_dict(self) -> Dict[str, Any]: - return self.to_dict_safe(to_diff_dict=True) + return self._to_dict_safe(to_diff_dict=True) class OVQuantizationMethod(str, Enum): @@ -236,7 +236,7 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase): preserve the accuracy of the model, the more sensitive layers receives a higher precision. ignored_scope (`dict`, *optional*): An ignored scope that defined the list of model control flow graph nodes to be ignored during quantization. - subset_size (`int`, *optional*): + num_samples (`int`, *optional*): The maximum number of samples composing the calibration dataset. quant_method (`str`, defaults of OVQuantizationMethod.DEFAULT): Weight compression method to apply. 
@@ -253,11 +253,11 @@ def __init__( all_layers: Optional[bool] = None, sensitivity_metric: Optional[str] = None, ignored_scope: Optional[Union[dict, nncf.IgnoredScope]] = None, - subset_size: Optional[int] = None, + num_samples: Optional[int] = None, quant_method: Optional[Union[QuantizationMethod, OVQuantizationMethod]] = OVQuantizationMethod.DEFAULT, **kwargs, ): - super().__init__(dataset, ignored_scope, subset_size) + super().__init__(dataset, ignored_scope, num_samples) self.bits = bits self.sym = sym self.tokenizer = tokenizer @@ -265,7 +265,6 @@ def __init__( self.ratio = ratio self.all_layers = all_layers self.sensitivity_metric = sensitivity_metric - self.subset_size = subset_size self.quant_method = quant_method self.post_init() @@ -305,8 +304,8 @@ def post_init(self): ) def to_dict(self) -> Dict[str, Any]: - if not is_serializable(self.tokenizer): - return self.to_dict_without_properties(("tokenizer",)) + if not _is_serializable(self.tokenizer): + return self._to_dict_without_properties(("tokenizer",)) return super().to_dict() @@ -316,7 +315,7 @@ def __init__( self, dataset: Union[str, List[str], nncf.Dataset, datasets.Dataset], ignored_scope: Optional[Union[dict, nncf.IgnoredScope]] = None, - subset_size: Optional[int] = 300, + num_samples: Optional[int] = 300, preset: nncf.QuantizationPreset = None, model_type: nncf.ModelType = nncf.ModelType.TRANSFORMER, fast_bias_correction: bool = True, @@ -332,7 +331,7 @@ def __init__( A dataset used for quantization parameters calibration. Required parameter. ignored_scope (`dict or nncf.IgnoredScope`, *optional*): An ignored scope that defines the list of model nodes to be ignored during quantization. - subset_size (`int`, *optional*): + num_samples (`int`, *optional*): The maximum number of samples composing the calibration dataset. preset (`nncf.QuantizationPreset`, *optional*): A preset controls the quantization mode (symmetric and asymmetric). @@ -345,10 +344,10 @@ def __init__( Model type is needed to specify additional patterns in the model. Supported only `transformer` now. fast_bias_correction (`bool`, defaults to True): Whether to apply fast or full bias correction algorithm. - overflow_fix (`bool`, default to OverflowFix.DISABLE): + overflow_fix (`nncf.OverflowFix`, default to OverflowFix.DISABLE): Parameter for controlling overflow fix setting. 
""" - super().__init__(dataset, ignored_scope, subset_size) + super().__init__(dataset, ignored_scope, num_samples) self.preset = preset self.model_type = model_type self.fast_bias_correction = fast_bias_correction @@ -370,7 +369,7 @@ def to_dict(self) -> Dict[str, Any]: # TODO: remove code below once NNCF is updated to 2.10 overflow_fix_value = None if self.overflow_fix is None else self.overflow_fix.value preset_value = None if self.preset is None else self.preset.value - with replace_properties_values(self, ("overflow_fix", "preset"), (overflow_fix_value, preset_value)): + with _replace_properties_values(self, ("overflow_fix", "preset"), (overflow_fix_value, preset_value)): return super().to_dict() diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 9a91b02f0c..4b156eda9e 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -640,7 +640,7 @@ def _from_pretrained( # from optimum.gptq.utils import get_seqlen # seqlen = get_seqlen(causal_model) - nsamples = quantization_config.subset_size if quantization_config.subset_size else 128 + nsamples = quantization_config.num_samples if quantization_config.num_samples else 128 dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples) dataset = prepare_dataset(dataset) quantization_config = copy.deepcopy(quantization_config) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index f6f13482ce..eb407b4cd1 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -321,7 +321,7 @@ def _from_pretrained( if not isinstance(sd_model, supported_pipelines): raise NotImplementedError(f"Quantization in hybrid mode is not supported for {cls.__name__}") - nsamples = quantization_config.subset_size if quantization_config.subset_size else 200 + nsamples = quantization_config.num_samples if quantization_config.num_samples else 200 unet_inputs = sd_model._prepare_unet_inputs(quantization_config.dataset, nsamples) from .quantization import _hybrid_quantization diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 5e11dfbba2..3962a88226 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -209,6 +209,7 @@ def quantize( batch_size: int = 1, data_collator: Optional[DataCollator] = None, remove_unused_columns: bool = True, + weights_only: bool = None, **kwargs, ): """ @@ -228,6 +229,10 @@ def quantize( The function to use to form a batch from a list of elements of the calibration dataset. remove_unused_columns (`bool`, defaults to `True`): Whether to remove the columns unused by the model forward method. + weights_only (`bool`, *optional*): + Being deprecated. + Compress weights to integer precision (8-bit by default) while keeping activations + floating-point. Fits best for LLM footprint reduction and performance acceleration. Examples: ```python @@ -257,9 +262,9 @@ def quantize( "`calibration_dataset` argument is deprecated. Please provide calibration dataset " "with `ov_config.quantization_config.dataset`." ) - if "weights_only" in kwargs: - raise ValueError( - "`weights_only` argument is deprecated. Please provide `ov_config.quantization_config` " + if weights_only is not None: + logger.warning( + "`weights_only` argument is deprecated. 
In the future please provide `ov_config.quantization_config` " "as an instance of OVWeightQuantizationConfig for weight-only compression." ) @@ -274,8 +279,14 @@ def quantize( raise TypeError(f"`ov_config` should be an `OVConfig`, but got: {type(ov_config)} instead.") quantization_config = ov_config.quantization_config if quantization_config is None: - ov_config.quantization_config = OVWeightQuantizationConfig(bits=8, sym=True) - logger.info("`quantization_config` was not provided, 8-bit symmetric weight quantization will be applied.") + if weights_only is None or weights_only is True: + if weights_only is None: + logger.info( + "`quantization_config` was not provided, 8-bit symmetric weight quantization will be applied." + ) + ov_config.quantization_config = OVWeightQuantizationConfig(bits=8, sym=True) + else: + ov_config.quantization_config = OVQuantizationConfig() if isinstance(self.model, OVBaseModel): self._quantize_ovbasemodel( @@ -335,7 +346,7 @@ def _quantize_ovbasemodel( try: for data in calibration_dataloader: self.model.generate(**data, max_new_tokens=1) - if len(collected_inputs) >= quantization_config.subset_size: + if len(collected_inputs) >= quantization_config.num_samples: break finally: self.model.request = self.model.request.request @@ -347,7 +358,7 @@ def _quantize_ovbasemodel( quantized_model = nncf.quantize( self.model.model, quantization_dataset, - subset_size=quantization_config.subset_size, + subset_size=quantization_config.num_samples, ignored_scope=quantization_config.ignored_scope, model_type=quantization_config.model_type, preset=quantization_config.preset, @@ -446,7 +457,7 @@ def _quantize_torchmodel( model = nncf.quantize( model, quantization_dataset, - subset_size=quantization_config.subset_size, + subset_size=quantization_config.num_samples, ignored_scope=quantization_config.ignored_scope, model_type=quantization_config.model_type, preset=quantization_config.preset, @@ -603,7 +614,7 @@ def _weight_only_quantization( from optimum.gptq.data import get_dataset, prepare_dataset - nsamples = config.subset_size if config.subset_size else 128 + nsamples = config.num_samples if config.num_samples else 128 dataset = get_dataset(config.dataset, tokenizer, seqlen=32, nsamples=nsamples) dataset = prepare_dataset(dataset) @@ -626,7 +637,7 @@ def _weight_only_quantization( # awq=config.quant_method == QuantizationMethod.AWQ, # TODO : enable from nncf v2.9.0 ignored_scope=config.ignored_scope, dataset=dataset, - # subset_size=config.subset_size if config.subset_size else 128, # TODO : enable from nncf v2.9.0 + # subset_size=config.num_samples if config.num_samples else 128, # TODO : enable from nncf v2.9.0 ) @@ -705,7 +716,7 @@ def _hybrid_quantization( wc_quantization_config.ignored_scope.types.append("Convolution") compressed_model = _weight_only_quantization(model, wc_quantization_config) - subset_size = quantization_config.subset_size if quantization_config.subset_size else 200 + subset_size = quantization_config.num_samples if quantization_config.num_samples else 200 quantized_model = nncf.quantize( model=compressed_model, calibration_dataset=nncf.Dataset(dataset), diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index be99e6054d..8be2bce769 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -392,7 +392,7 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type): @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION) def 
test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_num_fake_quantize, expected_ov_int8): model_id = MODEL_NAMES[model_type] - quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", subset_size=2) + quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", num_samples=2) with tempfile.TemporaryDirectory() as tmp_dir: model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config) @@ -414,7 +414,7 @@ def test_ovmodel_hybrid_quantization_with_custom_dataset( model = model_cls.from_pretrained( model_id, export=True, - quantization_config=OVWeightQuantizationConfig(bits=8, dataset=dataset, subset_size=3), + quantization_config=OVWeightQuantizationConfig(bits=8, dataset=dataset, num_samples=3), ) num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(model.unet) self.assertEqual(expected_num_fake_quantize, num_fake_quantize) @@ -749,7 +749,7 @@ class OVQuantizationConfigTest(unittest.TestCase): group_size=128, all_layers=True, sensitivity_metric="mean_activation_magnitude", - subset_size=100, + num_samples=100, quant_method=OVQuantizationMethod.DEFAULT, ), ["ignored_scope"], @@ -768,7 +768,7 @@ class OVQuantizationConfigTest(unittest.TestCase): OVQuantizationConfig( dataset="wikitext", ignored_scope={"names": ["op_name"]}, - subset_size=100, + num_samples=100, preset=nncf.QuantizationPreset.MIXED, model_type=nncf.ModelType.TRANSFORMER, fast_bias_correction=True, From f7fa3a18074c81acd5d1d7d0d9bef2fd17188650 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 11 Apr 2024 11:32:53 +0200 Subject: [PATCH 11/15] Make quantization config contain only serializable properties. --- Makefile | 2 +- optimum/intel/openvino/configuration.py | 228 +++++++++++---------- optimum/intel/openvino/modeling_base.py | 9 +- optimum/intel/openvino/modeling_decoder.py | 16 +- optimum/intel/openvino/quantization.py | 162 +++++++++------ tests/openvino/test_quantization.py | 181 +++++++++------- 6 files changed, 343 insertions(+), 255 deletions(-) diff --git a/Makefile b/Makefile index 83035cf467..2a72d9d4c6 100644 --- a/Makefile +++ b/Makefile @@ -21,7 +21,7 @@ REAL_CLONE_URL = $(if $(CLONE_URL),$(CLONE_URL),$(DEFAULT_CLONE_URL)) # Run code quality checks style_check: - black --check . + black . ruff check . style: diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 030490061e..df580bb93b 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -11,13 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
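# [Editor's illustration, not part of the patch] With the change introduced below, quantization configs
# keep `ignored_scope` as a plain (serializable) dict and only build the NNCF object on demand via
# `get_ignored_scope_instance()`; a minimal sketch of that round trip.
import nncf
from optimum.intel import OVWeightQuantizationConfig

config = OVWeightQuantizationConfig(bits=8, ignored_scope={"names": ["op_name"]})
assert isinstance(config.ignored_scope, dict)  # stored as-is, JSON-friendly
scope = config.get_ignored_scope_instance()    # converted to nncf.IgnoredScope when needed
assert isinstance(scope, nncf.IgnoredScope)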
-import json +import copy +import inspect import logging from dataclasses import dataclass from enum import Enum -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Union -import datasets import nncf import torch from nncf.quantization.advanced_parameters import OverflowFix @@ -52,36 +52,6 @@ } -class _replace_properties_values: - """ - A context manager for temporarily overriding an object's properties - """ - - def __init__(self, obj, property_names, property_values): - self.obj = obj - self.property_names = property_names - self.new_property_values = property_values - self.old_property_values = [None] * len(property_names) - for i, property_name in enumerate(self.property_names): - self.old_property_values[i] = getattr(obj, property_name) - - def __enter__(self): - for property_name, new_property_value in zip(self.property_names, self.new_property_values): - setattr(self.obj, property_name, new_property_value) - - def __exit__(self, exc_type, exc_val, exc_tb): - for property_name, old_property_value in zip(self.property_names, self.old_property_values): - setattr(self.obj, property_name, old_property_value) - - -def _is_serializable(obj): - try: - json.dumps(obj) - return True - except Exception: - return False - - @dataclass class OVQuantizationConfigBase(QuantizationConfigMixin): """ @@ -90,53 +60,41 @@ class OVQuantizationConfigBase(QuantizationConfigMixin): def __init__( self, - dataset: Optional[Union[str, List[str], nncf.Dataset, datasets.Dataset]] = None, - ignored_scope: Optional[Union[dict, nncf.IgnoredScope]] = None, + ignored_scope: Optional[dict] = None, num_samples: Optional[int] = None, + weight_only: Optional[bool] = None, + **kwargs, ): """ Args: - dataset (`str or List[str] or nncf.Dataset or datasets.Dataset`, *optional*): - The dataset used for data-aware weight compression or quantization with NNCF. - ignored_scope (`dict or nncf.IgnoredScope`, *optional*): - An ignored scope that defines the list of model nodes to be ignored during quantization. + ignored_scope (`dict`, *optional*): + An ignored scope that defines a list of model nodes to be ignored during quantization. Dictionary + entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class. num_samples (`int`, *optional*): The maximum number of samples composing the calibration dataset. + weight_only (`bool`, *optional*): + Used to explicitly specify type of quantization (weight-only of full) to apply. 
""" - self.dataset = dataset - if isinstance(ignored_scope, dict): - ignored_scope = nncf.IgnoredScope(**ignored_scope) + if isinstance(ignored_scope, nncf.IgnoredScope): + ignored_scope = ignored_scope.__dict__ self.ignored_scope = ignored_scope self.num_samples = num_samples + self.weight_only = weight_only def post_init(self): - if not (self.dataset is None or isinstance(self.dataset, (str, list, nncf.Dataset, datasets.Dataset))): + try: + self.get_ignored_scope_instance() + except Exception as e: raise ValueError( - "Dataset must be a instance of either string, list of strings, nncf.Dataset or " - f"dataset.Dataset, but found {type(self.dataset)}" - ) - if not (self.ignored_scope is None or isinstance(self.ignored_scope, nncf.IgnoredScope)): - raise ValueError( - "Ignored scope must be a instance of either dict, or nncf.IgnoredScope but found " - f"{type(self.dataset)}" + f"Can't create an `IgnoredScope` object from the provided ignored scope dict: {self.ignored_scope}.\n{e}" ) + if not (self.num_samples is None or isinstance(self.num_samples, int) and self.num_samples > 0): + raise ValueError(f"`num_samples` is expected to be a positive integer, but found: {self.num_samples}") - def _to_dict_without_properties(self, property_names: Union[List[str], Tuple[str]]) -> Dict[str, Any]: - """ - Calls to_dict() with given properties overwritten with None. Useful for hiding non-serializable properties. - """ - if len(property_names) == 0: - return super().to_dict() - with _replace_properties_values(self, property_names, [None] * len(property_names)): - result = super().to_dict() - return result - - def to_dict(self) -> Dict[str, Any]: - properties_to_omit = [] if _is_serializable(self.dataset) else ["dataset"] - if isinstance(self.ignored_scope, nncf.IgnoredScope): - with _replace_properties_values(self, ["ignored_scope"], [self.ignored_scope.__dict__]): - return self._to_dict_without_properties(properties_to_omit) - return self._to_dict_without_properties(properties_to_omit) + def get_ignored_scope_instance(self) -> nncf.IgnoredScope: + if self.ignored_scope is None: + return nncf.IgnoredScope() + return nncf.IgnoredScope(**copy.deepcopy(self.ignored_scope)) class OVConfig(BaseConfig): @@ -155,16 +113,11 @@ def __init__( self.input_info = input_info self.save_onnx_model = save_onnx_model self.optimum_version = kwargs.pop("optimum_version", None) + if isinstance(quantization_config, dict): + quantization_config = self._quantization_config_from_dict(quantization_config) self.quantization_config = quantization_config self.compression = None # A backward-compatability field for training-time compression parameters - if isinstance(self.quantization_config, dict): - # Config is loaded as dict during deserialization - logger.info( - "`quantization_config` was provided as a dict, in this form it can't be used for quantization. 
" - "Please provide config as an instance of OVWeightQuantizationConfig or OVQuantizationConfig" - ) - bits = ( self.quantization_config.bits if isinstance(self.quantization_config, OVWeightQuantizationConfig) else None ) @@ -180,12 +133,40 @@ def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False): for name, value in model_inputs.items() ] + @staticmethod + def _quantization_config_from_dict(quantization_config: dict) -> OVQuantizationConfigBase: + wq_args = inspect.getfullargspec(OVWeightQuantizationConfig.__init__).args + q_args = inspect.getfullargspec(OVQuantizationConfig.__init__).args + config_keys = quantization_config.keys() + matches_wq_config_signature = all(arg_name in wq_args for arg_name in config_keys) + matches_q_config_signature = all(arg_name in q_args for arg_name in config_keys) + if matches_wq_config_signature == matches_q_config_signature: + weight_only = quantization_config.get("weight_only", None) + if weight_only is None: + logger.warning( + "Can't determine type of OV quantization config. Please specify explicitly whether you intend to " + "run weight-only quantization or not with `weight_only` parameter. Creating an instance of " + "OVWeightQuantizationConfig." + ) + return OVWeightQuantizationConfig.from_dict(quantization_config) + matches_wq_config_signature = weight_only + + config_type = OVWeightQuantizationConfig if matches_wq_config_signature else OVQuantizationConfig + return config_type.from_dict(quantization_config) + def _to_dict_safe(self, to_diff_dict: bool = False) -> Dict[str, Any]: + class ConfigStub: + def to_dict(self): + return None + + def to_diff_dict(self): + return None + if self.quantization_config is None: # Parent to_dict() implementation does not support quantization_config being None - with _replace_properties_values(self, ("quantization_config",), (OVQuantizationConfigBase(),)): - result = super().to_diff_dict() if to_diff_dict else super().to_dict() - del result["quantization_config"] + self_copy = copy.deepcopy(self) + self_copy.quantization_config = ConfigStub() + result = self_copy.to_diff_dict() if to_diff_dict else self_copy.to_dict() else: result = super().to_diff_dict() if to_diff_dict else super().to_dict() return result @@ -212,9 +193,8 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase): The number of bits to quantize to. sym (`bool`, defaults to `False`): Whether to use symmetric quantization. - tokenizer (`str` or `PreTrainedTokenizerBase`, *optional*): + tokenizer (`str`, *optional*): The tokenizer used to process the dataset. You can pass either: - - A custom tokenizer object. - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. @@ -224,6 +204,8 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase): The dataset used for data-aware compression or quantization with NNCF. You can provide your own dataset in a list of strings or just use the one from the list ['wikitext','c4','c4-new','ptb','ptb-new'] for LLLMs or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models. + Alternatively, you can provide data objects via `calibration_dataset` argument + of `OVQuantizer.quantize()` method. ratio (`float`, defaults to 1.0): The ratio between baseline and backup precisions (e.g. 
0.9 means 90% of layers quantized to INT4_ASYM and the rest to INT8_ASYM). @@ -235,32 +217,44 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase): The sensitivity metric for assigning quantization precision to layers. In order to preserve the accuracy of the model, the more sensitive layers receives a higher precision. ignored_scope (`dict`, *optional*): - An ignored scope that defined the list of model control flow graph nodes to be ignored during quantization. + An ignored scope that defines the list of model nodes to be ignored during quantization. Dictionary + entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class. num_samples (`int`, *optional*): The maximum number of samples composing the calibration dataset. quant_method (`str`, defaults of OVQuantizationMethod.DEFAULT): Weight compression method to apply. + weight_only (`bool`, *optional*): + Used to explicitly specify type of quantization to apply. + weight_only (`bool`, *optional*): + Used to explicitly specify type of quantization (weight-only of full) to apply. """ def __init__( self, bits: int = 8, sym: bool = False, - tokenizer: Optional[Any] = None, - dataset: Optional[Union[str, List[str], nncf.Dataset, datasets.Dataset]] = None, + tokenizer: Optional[str] = None, + dataset: Optional[Union[str, List[str]]] = None, ratio: float = 1.0, group_size: Optional[int] = None, all_layers: Optional[bool] = None, sensitivity_metric: Optional[str] = None, - ignored_scope: Optional[Union[dict, nncf.IgnoredScope]] = None, + ignored_scope: Optional[dict] = None, num_samples: Optional[int] = None, quant_method: Optional[Union[QuantizationMethod, OVQuantizationMethod]] = OVQuantizationMethod.DEFAULT, + weight_only: Optional[bool] = True, **kwargs, ): - super().__init__(dataset, ignored_scope, num_samples) + if weight_only is False: + logger.warning( + "Trying to create an instance of `OVWeightQuantizationConfig` with `weight_only` being " + "False. Please check your configuration." + ) + super().__init__(ignored_scope, num_samples, True) self.bits = bits self.sym = sym self.tokenizer = tokenizer + self.dataset = dataset self.group_size = group_size or (-1 if bits == 8 else 128) self.ratio = ratio self.all_layers = all_layers @@ -277,6 +271,11 @@ def post_init(self): raise ValueError("`ratio` must between 0 and 1.") if self.group_size is not None and self.group_size != -1 and self.group_size <= 0: raise ValueError("`group_size` must be greater than 0 or equal to -1") + if not (self.dataset is None or isinstance(self.dataset, (str, list))): + raise ValueError( + f"Dataset must be a instance of either string or list of strings, but found {type(self.dataset)}. " + f"If you wish to provide a custom dataset please pass it via `calibration_dataset` argument." 
+ ) if self.dataset is not None and isinstance(self.dataset, str): llm_datasets = ["wikitext", "c4", "c4-new", "ptb", "ptb-new"] stable_diffusion_datasets = [ @@ -303,23 +302,21 @@ def post_init(self): f"For 8-bit quantization, `group_size` is expected to be set to -1, but was set to {self.group_size}" ) - def to_dict(self) -> Dict[str, Any]: - if not _is_serializable(self.tokenizer): - return self._to_dict_without_properties(("tokenizer",)) - return super().to_dict() + if self.tokenizer is not None and not isinstance(self.tokenizer, str): + raise ValueError(f"Tokenizer is expected to be a string, but found {self.tokenizer}") @dataclass class OVQuantizationConfig(OVQuantizationConfigBase): def __init__( self, - dataset: Union[str, List[str], nncf.Dataset, datasets.Dataset], - ignored_scope: Optional[Union[dict, nncf.IgnoredScope]] = None, + ignored_scope: Optional[dict] = None, num_samples: Optional[int] = 300, preset: nncf.QuantizationPreset = None, model_type: nncf.ModelType = nncf.ModelType.TRANSFORMER, fast_bias_correction: bool = True, overflow_fix: OverflowFix = OverflowFix.DISABLE, + weight_only: Optional[bool] = False, **kwargs, ): """ @@ -327,10 +324,9 @@ def __init__( compression, during quantization both weights and activations are converted to lower precision. For weight-only model quantization please see OVWeightQuantizationConfig. Args: - dataset (`str or List[str] or nncf.Dataset or datasets.Dataset`): - A dataset used for quantization parameters calibration. Required parameter. - ignored_scope (`dict or nncf.IgnoredScope`, *optional*): - An ignored scope that defines the list of model nodes to be ignored during quantization. + ignored_scope (`dict`, *optional*): + An ignored scope that defines the list of model nodes to be ignored during quantization. Dictionary + entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class. num_samples (`int`, *optional*): The maximum number of samples composing the calibration dataset. preset (`nncf.QuantizationPreset`, *optional*): @@ -346,31 +342,45 @@ def __init__( Whether to apply fast or full bias correction algorithm. overflow_fix (`nncf.OverflowFix`, default to OverflowFix.DISABLE): Parameter for controlling overflow fix setting. + weight_only (`bool`, *optional*): + Used to explicitly specify type of quantization (weight-only of full) to apply. """ - super().__init__(dataset, ignored_scope, num_samples) + if weight_only is True: + logger.warning( + "Trying to create an instance of `OVQuantizationConfig` with `weight_only` being True. " + "Please check your configuration." + ) + super().__init__(ignored_scope, num_samples, False) + # TODO: remove checks below once NNCF is updated to 2.10 + if isinstance(overflow_fix, str): + overflow_fix = OverflowFix(overflow_fix) + if isinstance(preset, str): + preset = nncf.QuantizationPreset(preset) + self.preset = preset self.model_type = model_type self.fast_bias_correction = fast_bias_correction self.overflow_fix = overflow_fix self.post_init() - def post_init(self): - """ - Safety checker that arguments are correct - """ - super().post_init() - if self.dataset is None: - raise ValueError( - "`dataset` is needed to compute the activations range during the calibration step and was not provided." - " In case you only want to apply quantization on the weights, please run weight-only quantization." 
- ) - def to_dict(self) -> Dict[str, Any]: # TODO: remove code below once NNCF is updated to 2.10 - overflow_fix_value = None if self.overflow_fix is None else self.overflow_fix.value - preset_value = None if self.preset is None else self.preset.value - with _replace_properties_values(self, ("overflow_fix", "preset"), (overflow_fix_value, preset_value)): - return super().to_dict() + if isinstance(self.overflow_fix, Enum) or isinstance(self.preset, Enum): + overflow_fix_value = ( + None + if self.overflow_fix is None + else self.overflow_fix + if isinstance(self.overflow_fix, str) + else self.overflow_fix.value + ) + preset_value = ( + None if self.preset is None else self.preset if isinstance(self.preset, str) else self.preset.value + ) + self_copy = copy.deepcopy(self) + self_copy.overflow_fix = overflow_fix_value + self_copy.preset = preset_value + return self_copy.to_dict() + return super().to_dict() def _check_default_4bit_configs(config: PretrainedConfig): diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index a6b8aacf43..88c455bc65 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -18,6 +18,7 @@ from tempfile import TemporaryDirectory, gettempdir from typing import Dict, Optional, Union +import nncf import openvino from huggingface_hub import hf_hub_download from openvino import Core, convert_model @@ -100,7 +101,11 @@ def __init__( self._openvino_config = OVConfig(quantization_config=quantization_config) @staticmethod - def load_model(file_name: Union[str, Path], quantization_config: Union[OVWeightQuantizationConfig, Dict] = None): + def load_model( + file_name: Union[str, Path], + quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, + calibration_dataset: Optional[nncf.Dataset] = None, + ): """ Loads the model. 
@@ -135,7 +140,7 @@ def fix_op_names_duplicates(model: openvino.runtime.Model): from optimum.intel.openvino.quantization import _weight_only_quantization - model = _weight_only_quantization(model, quantization_config) + model = _weight_only_quantization(model, quantization_config, calibration_dataset=calibration_dataset) return model diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 4b156eda9e..c5964a5d46 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -19,6 +19,7 @@ from tempfile import TemporaryDirectory from typing import Dict, Optional, Tuple, Union +import nncf import numpy as np import openvino import torch @@ -572,7 +573,8 @@ def _from_pretrained( from_onnx: bool = False, local_files_only: bool = False, load_in_8bit: bool = False, - quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, + quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None, + calibration_dataset: Optional[nncf.Dataset] = None, **kwargs, ): model_path = Path(model_id) @@ -596,7 +598,11 @@ def _from_pretrained( quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) load_in_4bit = quantization_config.bits == 4 if quantization_config else False - model = cls.load_model(model_cache_path, quantization_config=None if load_in_4bit else quantization_config) + model = cls.load_model( + model_cache_path, + quantization_config=None if load_in_4bit else quantization_config, + calibration_dataset=calibration_dataset, + ) model_type = config.model_type.replace("_", "-") if model_type == "bloom": @@ -632,7 +638,7 @@ def _from_pretrained( f"For the given model, we recommend the following `quantization_config` : {default_config}" ) - if isinstance(quantization_config.dataset, str): + if calibration_dataset is None and isinstance(quantization_config.dataset, str): tokenizer = quantization_config.tokenizer or AutoTokenizer.from_pretrained(model_id) from optimum.gptq.data import get_dataset, prepare_dataset @@ -644,9 +650,9 @@ def _from_pretrained( dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples) dataset = prepare_dataset(dataset) quantization_config = copy.deepcopy(quantization_config) - quantization_config.dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x)) + calibration_dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x)) - _weight_only_quantization(model, quantization_config) + _weight_only_quantization(model, quantization_config, calibration_dataset) return causal_model diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 3962a88226..dcdf6d8ffe 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -18,8 +18,9 @@ import os from collections import deque from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union +import datasets import nncf import openvino import torch @@ -203,6 +204,7 @@ def from_pretrained(cls, model: PreTrainedModel, **kwargs): def quantize( self, + calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None, save_directory: Union[str, Path] = None, ov_config: OVConfig = None, file_name: Optional[str] = None, @@ -216,6 +218,9 @@ def quantize( Quantize a model given the optimization specifications 
defined in `quantization_config`. Args: + calibration_dataset (`datasets.Dataset` or `nncf.Dataset` or `Iterable`, *optional*): + A collection of data samples to use for quantization calibration. Is optional for weight-only + quantization and is required for full quantization. save_directory (`Union[str, Path]`): The directory where the quantized model should be saved. ov_config (`OVConfig`, *optional*): @@ -235,6 +240,16 @@ def quantize( floating-point. Fits best for LLM footprint reduction and performance acceleration. Examples: + ```python + >>> from optimum.intel.openvino import OVQuantizer, OVModelForCausalLM + >>> from transformers import AutoModelForCausalLM + >>> model = AutoModelForCausalLM.from_pretrained("databricks/dolly-v2-3b") + >>> quantizer = OVQuantizer.from_pretrained(model, task="text-generation") + >>> ov_config = OVConfig(quantization_config=OVWeightQuantizationConfig(bits=8, sym=True)) + >>> quantizer.quantize(ov_config=ov_config, save_directory="./quantized_model") + >>> optimized_model = OVModelForCausalLM.from_pretrained("./quantized_model") + ``` + ```python >>> from optimum.intel.openvino import OVQuantizer, OVModelForSequenceClassification >>> from transformers import AutoModelForSequenceClassification @@ -243,25 +258,10 @@ def quantize( >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english") >>> quantizer = OVQuantizer.from_pretrained(model, task="text-classification") >>> ov_config = OVConfig(quantization_config=OVQuantizationConfig(dataset=calibration_dataset)) - >>> quantizer.quantize(ov_config=ov_config, save_directory="./quantized_model") + >>> quantizer.quantize(calibration_dataset=dataset, ov_config=ov_config, save_directory="./quantized_model") >>> optimized_model = OVModelForSequenceClassification.from_pretrained("./quantized_model") ``` - - ```python - >>> from optimum.intel.openvino import OVQuantizer, OVModelForCausalLM - >>> from transformers import AutoModelForCausalLM - >>> model = AutoModelForCausalLM.from_pretrained("databricks/dolly-v2-3b") - >>> quantizer = OVQuantizer.from_pretrained(model, task="text-generation") - >>> ov_config = OVConfig(quantization_config=OVWeightQuantizationConfig(bits=8, sym=True)) - >>> quantizer.quantize(ov_config=ov_config, save_directory="./quantized_model") - >>> optimized_model = OVModelForCausalLM.from_pretrained("./quantized_model") - ``` """ - if "calibration_dataset" in kwargs: - raise ValueError( - "`calibration_dataset` argument is deprecated. Please provide calibration dataset " - "with `ov_config.quantization_config.dataset`." - ) if weights_only is not None: logger.warning( "`weights_only` argument is deprecated. In the future please provide `ov_config.quantization_config` " @@ -282,15 +282,21 @@ def quantize( if weights_only is None or weights_only is True: if weights_only is None: logger.info( - "`quantization_config` was not provided, 8-bit symmetric weight quantization will be applied." + "`quantization_config` was not provided, 8-bit asymmetric weight quantization will be applied." 
) - ov_config.quantization_config = OVWeightQuantizationConfig(bits=8, sym=True) + ov_config.quantization_config = OVWeightQuantizationConfig(bits=8) else: ov_config.quantization_config = OVQuantizationConfig() if isinstance(self.model, OVBaseModel): self._quantize_ovbasemodel( - ov_config, save_directory, batch_size, data_collator, remove_unused_columns, **kwargs + ov_config, + save_directory, + calibration_dataset, + batch_size, + data_collator, + remove_unused_columns, + **kwargs, ) elif isinstance(self.model, torch.nn.Module): @@ -299,7 +305,14 @@ def quantize( "To convert a PyTorch model to OpenVINO, you can set `export=True` when loading your model as `OVModelForXxx.from_pretrained(..., export=True)`" ) self._quantize_torchmodel( - ov_config, save_directory, file_name, batch_size, data_collator, remove_unused_columns, **kwargs + ov_config, + save_directory, + calibration_dataset, + file_name, + batch_size, + data_collator, + remove_unused_columns, + **kwargs, ) else: raise TypeError(f"Unsupported model type: {type(self.model)}") @@ -308,6 +321,7 @@ def _quantize_ovbasemodel( self, ov_config: OVConfig, save_directory: Union[str, Path], + calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None, batch_size: int = 1, data_collator: Optional[DataCollator] = None, remove_unused_columns: bool = True, @@ -318,19 +332,18 @@ def _quantize_ovbasemodel( quantization_config = ov_config.quantization_config if isinstance(quantization_config, OVWeightQuantizationConfig): - _weight_only_quantization(self.model.model, quantization_config) + _weight_only_quantization(self.model.model, quantization_config, calibration_dataset) self.model.save_pretrained(save_directory) ov_config.save_pretrained(save_directory) return if not isinstance(quantization_config, OVQuantizationConfig): raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}") - calibration_dataset = quantization_config.dataset if isinstance(calibration_dataset, nncf.Dataset): quantization_dataset = calibration_dataset - else: + elif isinstance(calibration_dataset, datasets.Dataset): calibration_dataloader = self._get_calibration_dataloader( - calibration_dataset=quantization_config.dataset, + calibration_dataset=calibration_dataset, batch_size=batch_size, remove_unused_columns=remove_unused_columns, data_collator=data_collator, @@ -353,13 +366,17 @@ def _quantize_ovbasemodel( quantization_dataset = nncf.Dataset(collected_inputs) else: quantization_dataset = nncf.Dataset(calibration_dataloader) + else: + if calibration_dataset is None: + raise ValueError("Calibration dataset is required to run quantization.") + quantization_dataset = nncf.Dataset(calibration_dataset) # Actual model quantization quantized_model = nncf.quantize( self.model.model, quantization_dataset, subset_size=quantization_config.num_samples, - ignored_scope=quantization_config.ignored_scope, + ignored_scope=quantization_config.get_ignored_scope_instance(), model_type=quantization_config.model_type, preset=quantization_config.preset, fast_bias_correction=quantization_config.fast_bias_correction, @@ -374,6 +391,7 @@ def _quantize_torchmodel( self, ov_config: OVConfig, save_directory: Union[str, Path], + calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None, file_name: Optional[str] = None, batch_size: int = 1, data_collator: Optional[DataCollator] = None, @@ -416,24 +434,18 @@ def _quantize_torchmodel( quantization_config = ov_config.quantization_config if 
isinstance(quantization_config, OVWeightQuantizationConfig): - dataset = quantization_config.dataset - if not isinstance(dataset, nncf.Dataset): - if dataset is not None: - raise ValueError( - "Please provide `dataset` for weight compression as an instance of `nncf.Dataset`." - ) - if stateful: - # patch model before weight compression - model = patch_model_with_bettertransformer(model) - - dummy_inputs = onnx_config.generate_dummy_inputs(framework="pt") - device = get_model_device(model) - dummy_inputs = tree_map( - lambda value: value.to(device) if isinstance(value, torch.Tensor) else value, dummy_inputs - ) - check_dummy_inputs_are_allowed(model, dummy_inputs) - dataset = nncf.Dataset([dummy_inputs]) - nncf.compress_weights(model, dataset=dataset) + if stateful: + # patch model before weight compression + model = patch_model_with_bettertransformer(model) + + dummy_inputs = onnx_config.generate_dummy_inputs(framework="pt") + device = get_model_device(model) + dummy_inputs = tree_map( + lambda value: value.to(device) if isinstance(value, torch.Tensor) else value, dummy_inputs + ) + check_dummy_inputs_are_allowed(model, dummy_inputs) + + nncf.compress_weights(model, dataset=nncf.Dataset([dummy_inputs])) else: if not isinstance(quantization_config, OVQuantizationConfig): raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}") @@ -444,21 +456,25 @@ def _quantize_torchmodel( ) stateful = False - if isinstance(quantization_config.dataset, nncf.Dataset): - quantization_dataset = quantization_config.dataset - else: + if isinstance(calibration_dataset, nncf.Dataset): + quantization_dataset = calibration_dataset + elif isinstance(calibration_dataset, datasets.Dataset): calibration_dataloader = self._get_calibration_dataloader( - calibration_dataset=quantization_config.dataset, + calibration_dataset=calibration_dataset, batch_size=batch_size, remove_unused_columns=remove_unused_columns, data_collator=data_collator, ) quantization_dataset = nncf.Dataset(calibration_dataloader) + else: + if calibration_dataset is None: + raise ValueError("Calibration dataset is required to run quantization.") + quantization_dataset = nncf.Dataset(calibration_dataset) model = nncf.quantize( model, quantization_dataset, subset_size=quantization_config.num_samples, - ignored_scope=quantization_config.ignored_scope, + ignored_scope=quantization_config.get_ignored_scope_instance(), model_type=quantization_config.model_type, preset=quantization_config.preset, fast_bias_correction=quantization_config.fast_bias_correction, @@ -522,7 +538,7 @@ def get_calibration_dataset( preprocess_batch: bool = True, use_auth_token: bool = False, cache_dir: Optional[str] = None, - ) -> "Dataset": + ) -> datasets.Dataset: """ Create the calibration `datasets.Dataset` to use for the post-training static quantization calibration step. 
@@ -599,18 +615,33 @@ def _remove_unused_columns(self, dataset: "Dataset"): def _weight_only_quantization( - model: openvino.runtime.Model, quantization_config: Union[OVWeightQuantizationConfig, Dict] + model: openvino.runtime.Model, + quantization_config: Union[OVWeightQuantizationConfig, Dict], + calibration_dataset: Optional[Union[nncf.Dataset, Iterable]] = None, ) -> openvino.runtime.Model: config = quantization_config if isinstance(config, dict): config = OVWeightQuantizationConfig.from_dict(quantization_config) - dataset = config.dataset - - if config.dataset is not None and isinstance(config.dataset, str): - tokenizer = config.tokenizer - if isinstance(tokenizer, str): - tokenizer = AutoTokenizer.from_pretrained(tokenizer) + if config.dataset is not None and calibration_dataset is not None: + logger.info( + "Both `quantization_config.dataset` and `calibration_dataset` were provided for weight only " + "quantization. Will rely on `calibration_dataset`." + ) + dataset = None + if calibration_dataset is not None: + if isinstance(calibration_dataset, datasets.Dataset): + raise ValueError( + "Providing calibration dataset as an instance of `datasets.Dataset` for OV weight-only " + "quantization is not supported. Please provide it as `nncf.Dataset` or as iterable of " + "model inputs." + ) + elif isinstance(calibration_dataset, nncf.Dataset): + dataset = calibration_dataset + else: + dataset = nncf.Dataset(calibration_dataset) + elif config.dataset is not None and isinstance(config.dataset, str): + tokenizer = AutoTokenizer.from_pretrained(config.tokenizer) from optimum.gptq.data import get_dataset, prepare_dataset @@ -635,7 +666,7 @@ def _weight_only_quantization( all_layers=config.all_layers, sensitivity_metric=sensitivity_metric, # awq=config.quant_method == QuantizationMethod.AWQ, # TODO : enable from nncf v2.9.0 - ignored_scope=config.ignored_scope, + ignored_scope=config.get_ignored_scope_instance(), dataset=dataset, # subset_size=config.num_samples if config.num_samples else 128, # TODO : enable from nncf v2.9.0 ) @@ -706,16 +737,13 @@ def _hybrid_quantization( """ ops_to_compress = _collect_ops_with_weights(model) - ignored_scope: Union[nncf.IgnoredScope, None] = quantization_config.ignored_scope - ignored_scope = ignored_scope or nncf.IgnoredScope() - ptq_ignored_scope = copy.deepcopy(ignored_scope) - ptq_ignored_scope.names += ops_to_compress - - wc_quantization_config = copy.deepcopy(quantization_config) - wc_quantization_config.ignored_scope = ignored_scope - wc_quantization_config.ignored_scope.types.append("Convolution") - compressed_model = _weight_only_quantization(model, wc_quantization_config) + wc_config = copy.deepcopy(quantization_config) + wc_config.ignored_scope = wc_config.ignored_scope or {} + wc_config.ignored_scope["types"] = wc_config.ignored_scope.get("types", []) + ["Convolution"] + compressed_model = _weight_only_quantization(model, wc_config) + ptq_ignored_scope = quantization_config.get_ignored_scope_instance() + ptq_ignored_scope.names += ops_to_compress subset_size = quantization_config.num_samples if quantization_config.num_samples else 200 quantized_model = nncf.quantize( model=compressed_model, diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 8be2bce769..b22d5e3955 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -15,12 +15,13 @@ # ruff: noqa import itertools +import logging import tempfile import unittest from collections import defaultdict from enum import Enum 
from functools import partial -from typing import List +from typing import List, Union import evaluate import numpy as np @@ -104,9 +105,13 @@ def preprocess_function(examples, tokenizer): num_samples=10, dataset_split="train", ) - quantization_config = OVQuantizationConfig(dataset=calibration_dataset) - ov_config = OVConfig(quantization_config=quantization_config) - quantizer.quantize(save_directory=tmp_dir, ov_config=ov_config, file_name=file_name) + ov_config = OVConfig(quantization_config=OVQuantizationConfig()) + quantizer.quantize( + save_directory=tmp_dir, + calibration_dataset=calibration_dataset, + file_name=file_name, + ov_config=ov_config, + ) model = model_cls.from_pretrained(tmp_dir, file_name=file_name) # TODO: uncomment once move to a newer version of NNCF which has some fixes (addmm, baddmm) @@ -120,7 +125,7 @@ def preprocess_function(examples, tokenizer): # Verify that the configuration is correctly saved and loaded loaded_config = OVConfig.from_pretrained(tmp_dir) - self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config) + self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) def test_ovmodel_static_quantization(self, model_cls, model_name, expected_fake_quantize, expected_int8): @@ -146,9 +151,8 @@ def preprocess_function(examples, tokenizer): num_samples=10, dataset_split="train", ) - quantization_config = OVQuantizationConfig(dataset=calibration_dataset) - ov_config = OVConfig(quantization_config=quantization_config) - quantizer.quantize(save_directory=tmp_dir, ov_config=ov_config) + ov_config = OVConfig(quantization_config=OVQuantizationConfig()) + quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset, ov_config=ov_config) model = model_cls.from_pretrained(tmp_dir) @@ -162,7 +166,7 @@ def preprocess_function(examples, tokenizer): # Verify that the configuration is correctly saved and loaded loaded_config = OVConfig.from_pretrained(tmp_dir) - self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config) + self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) class OVWeightCompressionTest(unittest.TestCase): @@ -281,12 +285,12 @@ def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_i # Verify that the configuration is correctly saved and loaded loaded_config = OVConfig.from_pretrained(tmp_dir) - original_config_as_dict = OVWeightQuantizationConfig(bits=8, sym=True).to_dict() + original_config_as_dict = OVWeightQuantizationConfig().to_dict() for k in original_config_as_dict.keys(): v = original_config_as_dict[k] if isinstance(v, Enum): original_config_as_dict[k] = v.value - self.assertEqual(original_config_as_dict, loaded_config.quantization_config) + self.assertEqual(original_config_as_dict, loaded_config.quantization_config.to_dict()) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS) def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_pt_int8, expected_ov_int8): @@ -311,7 +315,7 @@ def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_p # Verify that the configuration is correctly saved and loaded loaded_config = OVConfig.from_pretrained(tmp_dir) - self.assertEqual(OVWeightQuantizationConfig(bits=8, sym=True).to_dict(), loaded_config.quantization_config) + 
self.assertEqual(OVWeightQuantizationConfig().to_dict(), loaded_config.quantization_config.to_dict()) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS) def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_int8, expected_int4): @@ -342,7 +346,7 @@ def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_i # Verify that the configuration is correctly saved and loaded loaded_config = OVConfig.from_pretrained(tmp_dir) - self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config) + self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) @parameterized.expand(SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS) @unittest.skipIf(not IS_SUPPORT_STATEFUL, "Stateful models supported only in 2023.3 and above") @@ -368,7 +372,7 @@ def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_id, exp # Verify that the configuration is correctly saved and loaded loaded_config = OVConfig.from_pretrained(tmp_dir) - self.assertEqual(OVWeightQuantizationConfig(bits=8, sym=True).to_dict(), loaded_config.quantization_config) + self.assertEqual(OVWeightQuantizationConfig().to_dict(), loaded_config.quantization_config.to_dict()) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION) def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type): @@ -439,11 +443,11 @@ def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_ model.save_pretrained(tmp_dir) openvino_config = OVConfig.from_pretrained(tmp_dir) - self.assertEqual(openvino_config.quantization_config["bits"], 4) + self.assertEqual(openvino_config.quantization_config.bits, 4) self.assertEqual(openvino_config.dtype, "int4") if model_id == "facebook/opt-125m": for key, value in self.DEFAULT_INT4_CONFIG.items(): - self.assertEqual(value, openvino_config.quantization_config[key]) + self.assertEqual(value, getattr(openvino_config.quantization_config, key)) @parameterized.expand(LOAD_IN_4_BITS_SCOPE) def test_ovmodel_4bit_auto_compression_with_config( @@ -461,7 +465,7 @@ def test_ovmodel_4bit_auto_compression_with_config( model.save_pretrained(tmp_dir) openvino_config = OVConfig.from_pretrained(tmp_dir) - self.assertEqual(openvino_config.quantization_config["bits"], 4) + self.assertEqual(openvino_config.quantization_config.bits, 4) self.assertEqual(openvino_config.dtype, "int4") @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTO_COMPRESSED_MATMULS) @@ -492,9 +496,8 @@ def transform_fn(data, tokenizer): model = model_cls.from_pretrained( model_id, export=True, - quantization_config=OVWeightQuantizationConfig( - bits=4, sym=True, group_size=-1, ratio=0.8, dataset=quantization_dataset - ), + quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=-1, ratio=0.8), + calibration_dataset=quantization_dataset, ) _, num_int8, num_int4 = get_num_quantized_nodes(model) @@ -584,7 +587,7 @@ def test_ovmodel_load_large_model_with_additional_quantization_config(self): "all_layers": None, "sensitivity_metric": None, "dataset": None, - "ignored_scope": None, + "ignored_scope": nncf.IgnoredScope(), } compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params) @@ -610,9 +613,8 @@ def preprocess_function(examples, tokenizer): num_samples=10, dataset_split="test", ) - quantization_config = OVQuantizationConfig(dataset=calibration_dataset) - ov_config = 
OVConfig(quantization_config=quantization_config) - quantizer.quantize(save_directory=tmp_dir, ov_config=ov_config) + ov_config = OVConfig(quantization_config=OVQuantizationConfig()) + quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset, ov_config=ov_config) # Test that inference on quantized model works model = OVModelForQuestionAnswering.from_pretrained(tmp_dir) @@ -629,7 +631,7 @@ def preprocess_function(examples, tokenizer): # Verify that the configuration is correctly saved and loaded loaded_config = OVConfig.from_pretrained(tmp_dir) - self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config) + self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_ovmodel_static_quantization(self, model_name): @@ -649,9 +651,8 @@ def preprocess_function(examples, tokenizer): num_samples=10, dataset_split="test", ) - quantization_config = OVQuantizationConfig(dataset=calibration_dataset) - ov_config = OVConfig(quantization_config=quantization_config) - quantizer.quantize(save_directory=tmp_dir, ov_config=ov_config) + ov_config = OVConfig(quantization_config=OVQuantizationConfig()) + quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset, ov_config=ov_config) # Test that inference on quantized model works model = OVModelForQuestionAnswering.from_pretrained(tmp_dir) @@ -668,7 +669,7 @@ def preprocess_function(examples, tokenizer): # Verify that the configuration is correctly saved and loaded loaded_config = OVConfig.from_pretrained(tmp_dir) - self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config) + self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) class OVTrainerTest(unittest.TestCase): @@ -719,24 +720,13 @@ def compute_metrics(p): class OVQuantizationConfigTest(unittest.TestCase): QUANTIZATION_CONFIGS = ( - ( - None, - [], - ), - (OVWeightQuantizationConfig(), []), + (None,), + (OVWeightQuantizationConfig(),), ( OVWeightQuantizationConfig( bits=8, sym=True, ), - [], - ), - ( - { - "bits": 8, - "sym": True, - }, - [], ), ( OVWeightQuantizationConfig( @@ -752,21 +742,56 @@ class OVQuantizationConfigTest(unittest.TestCase): num_samples=100, quant_method=OVQuantizationMethod.DEFAULT, ), - ["ignored_scope"], ), - (OVWeightQuantizationConfig(dataset=["wikitext", "c4"]), []), - (OVWeightQuantizationConfig(dataset=load_dataset("wikitext", "wikitext-2-raw-v1", split="test")), ["dataset"]), - (OVWeightQuantizationConfig(dataset=nncf.Dataset([np.zeros((1, 10))])), ["dataset"]), + (OVWeightQuantizationConfig(dataset=["hello world", "i'm alive"]),), ( - OVWeightQuantizationConfig(tokenizer=AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")), - ["tokenizer"], + OVQuantizationConfig( + ignored_scope={"names": ["op_name"]}, + num_samples=100, + preset=nncf.QuantizationPreset.MIXED, + model_type=nncf.ModelType.TRANSFORMER, + fast_bias_correction=True, + overflow_fix=OverflowFix.DISABLE, + ), ), - (OVWeightQuantizationConfig(ignored_scope=nncf.IgnoredScope(names=["op_name"])), ["ignored_scope"]), - (OVQuantizationConfig(dataset="wikitext"), []), - ({"dataset": "wikitext"}, []), + (OVQuantizationConfig(ignored_scope=nncf.IgnoredScope(names=["op_name"])),), + ) + + QUANTIZATION_CONFIG_DICTS = ( + (dict(bits=8, sym=True), OVWeightQuantizationConfig, None), ( - OVQuantizationConfig( + dict( dataset="wikitext", + bits=4, + 
ignored_scope={"names": ["op_name"]}, + sym=False, + tokenizer="dbmdz/bert-base-german-cased", + ratio=1.0, + group_size=128, + all_layers=True, + sensitivity_metric="mean_activation_magnitude", + num_samples=100, + quant_method=OVQuantizationMethod.DEFAULT, + ), + OVWeightQuantizationConfig, + None, + ), + (dict(), OVWeightQuantizationConfig, "Can't determine type of OV quantization config"), + ( + dict(ignored_scope={"names": ["op_name"]}), + OVWeightQuantizationConfig, + "Can't determine type of OV quantization config", + ), + (dict(num_samples=100), OVWeightQuantizationConfig, "Can't determine type of OV quantization config"), + (dict(abc="def"), OVWeightQuantizationConfig, "Can't determine type of OV quantization config"), + ( + dict(bits=8, fast_bias_correction=True), + OVWeightQuantizationConfig, + "Can't determine type of OV quantization config", + ), + (dict(model_type=nncf.ModelType.TRANSFORMER), OVQuantizationConfig, None), + ( + dict( ignored_scope={"names": ["op_name"]}, num_samples=100, preset=nncf.QuantizationPreset.MIXED, @@ -774,21 +799,25 @@ class OVQuantizationConfigTest(unittest.TestCase): fast_bias_correction=True, overflow_fix=OverflowFix.DISABLE, ), - ["ignored_scope"], + OVQuantizationConfig, + None, ), - (OVQuantizationConfig(dataset=["wikitext", "c4"]), []), - (OVQuantizationConfig(dataset=load_dataset("wikitext", "wikitext-2-raw-v1", split="test")), ["dataset"]), - (OVQuantizationConfig(dataset=nncf.Dataset([np.zeros((1, 10))])), ["dataset"]), + (dict(weight_only=True), OVWeightQuantizationConfig, None), + (dict(weight_only=False), OVQuantizationConfig, None), + (dict(abc="def", weight_only=False), OVQuantizationConfig, None), + (dict(abc="def", weight_only=True), OVWeightQuantizationConfig, None), + (dict(bits=8, fast_bias_correction=True, weight_only=True), OVWeightQuantizationConfig, None), + (dict(bits=8, fast_bias_correction=True, weight_only=False), OVQuantizationConfig, None), + (dict(bits=8, sym=True, weight_only=False), OVWeightQuantizationConfig, "Please check your configuration"), ( - OVQuantizationConfig(dataset=["wikitext", "c4"], ignored_scope=nncf.IgnoredScope(names=["op_name"])), - ["ignored_scope"], + dict(model_type=nncf.ModelType.TRANSFORMER, weight_only=True), + OVQuantizationConfig, + "Please check your configuration", ), ) @parameterized.expand(QUANTIZATION_CONFIGS) - def test_config_serialization( - self, quantization_config: OVQuantizationConfigBase, non_equal_property_names: List[str] - ): + def test_config_serialization(self, quantization_config: OVQuantizationConfigBase): def str_to_enum(enum_cls, value): for k, v in enum_cls.__members__.items(): if getattr(enum_cls, k).value == value: @@ -803,12 +832,8 @@ def str_to_enum(enum_cls, value): if quantization_config is None: self.assertEqual(loaded_ov_config.quantization_config, None) return - for key, value in loaded_ov_config.quantization_config.items(): - initial_value = ( - quantization_config[key] - if isinstance(quantization_config, dict) - else getattr(ov_config.quantization_config, key) - ) + for key, value in loaded_ov_config.quantization_config.to_dict().items(): + initial_value = getattr(ov_config.quantization_config, key) if key == "preset" or key == "overflow_fix": # TODO: remove once NNCF is updated to 2.10 if getattr(quantization_config, key) is not None: @@ -817,10 +842,24 @@ def str_to_enum(enum_cls, value): value = str_to_enum(nncf.QuantizationPreset, value) else: value = str_to_enum(OverflowFix, value) - if key in non_equal_property_names: - self.assertNotEqual(value, 
initial_value) - else: - self.assertEqual(value, initial_value) + self.assertEqual(value, initial_value) + + @parameterized.expand(QUANTIZATION_CONFIG_DICTS) + def test_config_from_dict(self, quantization_config: dict, config_type: type, warning_log: Union[str, None]): + from optimum.intel.openvino.configuration import logger as configuration_logger + + if warning_log is not None: + with self.assertLogs(configuration_logger, logging.WARN) as cm: + ov_config = OVConfig(quantization_config=quantization_config) + self.assertTrue(any(warning_log in log for log in cm.output)) + else: + ov_config = OVConfig(quantization_config=quantization_config) + self.assertIsInstance(ov_config.quantization_config, config_type) + for k, v in quantization_config.items(): + if k == "weight_only" and warning_log == "Please check your configuration": + continue + if hasattr(ov_config.quantization_config, k): + self.assertEqual(getattr(ov_config.quantization_config, k), v) class InferRequestWrapperTest(unittest.TestCase): From 0e79c097084ccc9dc96722cf3836f1e6fe7585bd Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 12 Apr 2024 11:30:59 +0200 Subject: [PATCH 12/15] Small tweaks --- Makefile | 2 +- optimum/intel/openvino/configuration.py | 2 +- optimum/intel/openvino/quantization.py | 13 +++++++------ 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 2a72d9d4c6..83035cf467 100644 --- a/Makefile +++ b/Makefile @@ -21,7 +21,7 @@ REAL_CLONE_URL = $(if $(CLONE_URL),$(CLONE_URL),$(DEFAULT_CLONE_URL)) # Run code quality checks style_check: - black . + black --check . ruff check . style: diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index df580bb93b..6fa5823582 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -116,7 +116,7 @@ def __init__( if isinstance(quantization_config, dict): quantization_config = self._quantization_config_from_dict(quantization_config) self.quantization_config = quantization_config - self.compression = None # A backward-compatability field for training-time compression parameters + self.compression = None # A field for backward-compatability of training-time compression parameters bits = ( self.quantization_config.bits if isinstance(self.quantization_config, OVWeightQuantizationConfig) else None diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index dcdf6d8ffe..33985dbe6e 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -245,7 +245,7 @@ def quantize( >>> from transformers import AutoModelForCausalLM >>> model = AutoModelForCausalLM.from_pretrained("databricks/dolly-v2-3b") >>> quantizer = OVQuantizer.from_pretrained(model, task="text-generation") - >>> ov_config = OVConfig(quantization_config=OVWeightQuantizationConfig(bits=8, sym=True)) + >>> ov_config = OVConfig(quantization_config=OVWeightQuantizationConfig()) >>> quantizer.quantize(ov_config=ov_config, save_directory="./quantized_model") >>> optimized_model = OVModelForCausalLM.from_pretrained("./quantized_model") ``` @@ -257,15 +257,17 @@ def quantize( >>> # or >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english") >>> quantizer = OVQuantizer.from_pretrained(model, task="text-classification") - >>> ov_config = OVConfig(quantization_config=OVQuantizationConfig(dataset=calibration_dataset)) + >>> ov_config = 
OVConfig(quantization_config=OVQuantizationConfig()) >>> quantizer.quantize(calibration_dataset=dataset, ov_config=ov_config, save_directory="./quantized_model") >>> optimized_model = OVModelForSequenceClassification.from_pretrained("./quantized_model") ``` """ + # TODO: deprecate weights_only argument if weights_only is not None: logger.warning( "`weights_only` argument is deprecated. In the future please provide `ov_config.quantization_config` " - "as an instance of OVWeightQuantizationConfig for weight-only compression." + "as an instance of OVWeightQuantizationConfig for weight-only compression or as an instance of " + "OVQuantizationConfig for full model quantization." ) if save_directory is None: @@ -274,9 +276,8 @@ def quantize( if ov_config is None: ov_config = OVConfig() - if ov_config is not None: - if not isinstance(ov_config, OVConfig): - raise TypeError(f"`ov_config` should be an `OVConfig`, but got: {type(ov_config)} instead.") + if not isinstance(ov_config, OVConfig): + raise TypeError(f"`ov_config` should be an `OVConfig`, but got: {type(ov_config)} instead.") quantization_config = ov_config.quantization_config if quantization_config is None: if weights_only is None or weights_only is True: From 13b2350c880405d2f29919c3c16449530f7d3319 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 12 Apr 2024 17:07:36 +0200 Subject: [PATCH 13/15] Address comments --- optimum/intel/openvino/configuration.py | 8 ++++---- optimum/intel/openvino/modeling_base.py | 3 +-- optimum/intel/openvino/modeling_decoder.py | 3 +-- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 6fa5823582..e75301729d 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -223,10 +223,9 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase): The maximum number of samples composing the calibration dataset. quant_method (`str`, defaults of OVQuantizationMethod.DEFAULT): Weight compression method to apply. - weight_only (`bool`, *optional*): - Used to explicitly specify type of quantization to apply. weight_only (`bool`, *optional*): - Used to explicitly specify type of quantization (weight-only of full) to apply. + Used to explicitly specify type of quantization (weight-only of full) to apply. Useful when building + the config from dictionary. """ def __init__( @@ -343,7 +342,8 @@ def __init__( overflow_fix (`nncf.OverflowFix`, default to OverflowFix.DISABLE): Parameter for controlling overflow fix setting. weight_only (`bool`, *optional*): - Used to explicitly specify type of quantization (weight-only of full) to apply. + Used to explicitly specify type of quantization (weight-only of full) to apply. Useful when building + the config from dictionary. 
""" if weight_only is True: logger.warning( diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 88c455bc65..dbc26a58b4 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -18,7 +18,6 @@ from tempfile import TemporaryDirectory, gettempdir from typing import Dict, Optional, Union -import nncf import openvino from huggingface_hub import hf_hub_download from openvino import Core, convert_model @@ -104,7 +103,7 @@ def __init__( def load_model( file_name: Union[str, Path], quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, - calibration_dataset: Optional[nncf.Dataset] = None, + calibration_dataset: Optional["nncf.Dataset"] = None, ): """ Loads the model. diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index c5964a5d46..44137186e2 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -19,7 +19,6 @@ from tempfile import TemporaryDirectory from typing import Dict, Optional, Tuple, Union -import nncf import numpy as np import openvino import torch @@ -574,7 +573,6 @@ def _from_pretrained( local_files_only: bool = False, load_in_8bit: bool = False, quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None, - calibration_dataset: Optional[nncf.Dataset] = None, **kwargs, ): model_path = Path(model_id) @@ -598,6 +596,7 @@ def _from_pretrained( quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) load_in_4bit = quantization_config.bits == 4 if quantization_config else False + calibration_dataset = kwargs.get("calibration_dataset", None) model = cls.load_model( model_cache_path, quantization_config=None if load_in_4bit else quantization_config, From f314ba06b32e800f8143d7bc0458b27f8b9a0de4 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 15 Apr 2024 09:22:19 +0200 Subject: [PATCH 14/15] Fix ruff --- optimum/intel/openvino/modeling_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index dbc26a58b4..4dab093bef 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -103,7 +103,7 @@ def __init__( def load_model( file_name: Union[str, Path], quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, - calibration_dataset: Optional["nncf.Dataset"] = None, + calibration_dataset: Optional["Dataset"] = None, ): """ Loads the model. From 70ee0ef36482e376d014c5a52433f7aae5c36ae5 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 15 Apr 2024 09:38:22 +0200 Subject: [PATCH 15/15] Fix ruff 2 --- optimum/intel/openvino/modeling_base.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 4dab093bef..d5b19bb28c 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -103,7 +103,7 @@ def __init__( def load_model( file_name: Union[str, Path], quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, - calibration_dataset: Optional["Dataset"] = None, + calibration_dataset: Optional = None, ): """ Loads the model. @@ -111,6 +111,10 @@ def load_model( Arguments: file_name (`str` or `Path`): The path of the model ONNX or XML file. 
+            quantization_config (`OVWeightQuantizationConfig` or `Dict`, *optional*):
+                Quantization config to apply after the model is loaded.
+            calibration_dataset (`nncf.Dataset`, *optional*):
+                Optional `nncf.Dataset` passed to weight compression when a quantization config is provided.
             """

         def fix_op_names_duplicates(model: openvino.runtime.Model):
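
For reference, below is a minimal sketch of how the reworked API introduced by this series would be used end to end: calibration data is now passed to `OVQuantizer.quantize()` through the new `calibration_dataset` argument instead of `quantization_config.dataset`, and the quantization mode is selected by instantiating either `OVQuantizationConfig` (full quantization) or `OVWeightQuantizationConfig` (weight-only compression). The model id, dataset name, and preprocessing follow the examples already used in the patched docstrings and tests; they are illustrative assumptions, not part of the diff.

```python
from functools import partial

from transformers import AutoModelForSequenceClassification, AutoTokenizer

from optimum.intel.openvino import (
    OVConfig,
    OVModelForSequenceClassification,
    OVQuantizationConfig,
    OVQuantizer,
)

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

quantizer = OVQuantizer.from_pretrained(model, task="text-classification")


def preprocess_function(examples, tokenizer):
    # Tokenize the raw text column of the calibration split.
    return tokenizer(examples["sentence"], padding="max_length", max_length=128, truncation=True)


# Calibration data is supplied to quantize() directly instead of via
# quantization_config.dataset.
calibration_dataset = quantizer.get_calibration_dataset(
    "glue",
    dataset_config_name="sst2",
    preprocess_function=partial(preprocess_function, tokenizer=tokenizer),
    num_samples=100,
    dataset_split="train",
)

# OVQuantizationConfig -> full (weights + activations) quantization;
# use OVWeightQuantizationConfig instead for weight-only compression.
ov_config = OVConfig(quantization_config=OVQuantizationConfig(num_samples=100))

quantizer.quantize(
    calibration_dataset=calibration_dataset,
    ov_config=ov_config,
    save_directory="./quantized_model",
)

optimized_model = OVModelForSequenceClassification.from_pretrained("./quantized_model")
```

When a config is later reloaded from its serialized dictionary form, the new `weight_only` key is what `OVConfig` uses to decide between the two config classes, as implemented in `_quantization_config_from_dict` above.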