From d9ef0e6458f32cbf7a6a6889b133bf3c24e61683 Mon Sep 17 00:00:00 2001 From: Nikita Malinin Date: Thu, 23 Jan 2025 17:30:53 +0100 Subject: [PATCH 01/20] Initial commit --- docs/source/openvino/export.mdx | 4 +- optimum/commands/export/openvino.py | 96 +++++-- optimum/intel/openvino/configuration.py | 350 +++++++++++++++++++++++- optimum/intel/openvino/quantization.py | 63 ++++- 4 files changed, 476 insertions(+), 37 deletions(-) diff --git a/docs/source/openvino/export.mdx b/docs/source/openvino/export.mdx index 1d0c534193..e25d50fa0c 100644 --- a/docs/source/openvino/export.mdx +++ b/docs/source/openvino/export.mdx @@ -31,7 +31,7 @@ Check out the help for more options: ```text usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,tf}] [--trust-remote-code] - [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8,f8e4m3,f8e5m2}] + [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3}] [--library {transformers,diffusers,timm,sentence_transformers,open_clip}] [--cache_dir CACHE_DIR] [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym] [--group-size GROUP_SIZE] [--backup-precision {none,int8_sym,int8_asym}] @@ -67,7 +67,7 @@ Optional arguments: on your local machine arbitrary code present in the model repository. --weight-format {fp32,fp16,int8,int4,mxfp4,nf4} The weight format of the exported model. - --quant-mode {int8,f8e4m3,f8e5m2} + --quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3} Quantization precision mode. This is used for applying full model quantization including activations. --library {transformers,diffusers,timm,sentence_transformers,open_clip} diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 8d272a693f..75b218677d 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -78,7 +78,7 @@ def parse_args_openvino(parser: "ArgumentParser"): optional_group.add_argument( "--quant-mode", type=str, - choices=["int8", "f8e4m3", "f8e5m2"], + choices=["int8", "f8e4m3", "f8e5m2", "nf4_f8e4m3"], default=None, help=( "Quantization precision mode. This is used for applying full model quantization including activations. 
" @@ -307,7 +307,14 @@ def parse_args(parser: "ArgumentParser"): def run(self): from ...exporters.openvino.__main__ import infer_task, main_export, maybe_convert_tokenizers from ...exporters.openvino.utils import save_preprocessors - from ...intel.openvino.configuration import _DEFAULT_4BIT_CONFIG, OVConfig, get_default_int4_config + from ...intel.openvino.configuration import ( + _DEFAULT_4BIT_CONFIG, + OVCompressWeightsOptions, + OVConfig, + OVGeneralQuantizationConfig, + OVQuantizeOptions, + get_default_int4_config, + ) if self.args.library is None: # TODO: add revision, subfolder and token to args @@ -342,43 +349,39 @@ def run(self): if no_compression_parameter_provided(self.args) and self.args.weight_format == "int4": quantization_config = get_default_int4_config(self.args.model) else: - is_int8 = self.args.weight_format == "int8" - quantization_config = { - "bits": 8 if is_int8 else 4, - "ratio": 1 if is_int8 else (self.args.ratio or _DEFAULT_4BIT_CONFIG["ratio"]), - "sym": self.args.sym or False, - "group_size": -1 if is_int8 else self.args.group_size, - "all_layers": None if is_int8 else self.args.all_layers, - "dataset": self.args.dataset, - "num_samples": self.args.num_samples, - "quant_method": "awq" if self.args.awq else "default", - "sensitivity_metric": self.args.sensitivity_metric, - "scale_estimation": self.args.scale_estimation, - "gptq": self.args.gptq, - "lora_correction": self.args.lora_correction, - "weight_format": self.args.weight_format, - "backup_precision": self.args.backup_precision, - } + quantization_config = prepare_for_wc_config(self.args, _DEFAULT_4BIT_CONFIG) if quantization_config.get("dataset", None) is not None: quantization_config["trust_remote_code"] = self.args.trust_remote_code ov_config = OVConfig(quantization_config=quantization_config) - else: + elif self.args.quant_mode is not None: if self.args.dataset is None: raise ValueError( "Dataset is required for full quantization. Please provide it with --dataset argument." 
) - quantization_config = { - "weight_format": self.args.quant_mode, - "activation_format": self.args.quant_mode, - "bits": 8, - "sym": self.args.sym or False, - "dataset": self.args.dataset, - "num_samples": self.args.num_samples, - "smooth_quant_alpha": self.args.smooth_quant_alpha, - "trust_remote_code": self.args.trust_remote_code, - } + if self.args.quant_mode == "nf4_f8e4m3": + wc_config = prepare_for_wc_config(self.args, _DEFAULT_4BIT_CONFIG) + wc_config["weight_format"] = "nf4" + cw_options = OVCompressWeightsOptions.init_with_format(**wc_config) + + q_config = prepare_for_q_config(self.args) + q_config["activation_format"] = "f8e4m3" + q_options = OVQuantizeOptions.init_with_format(**q_config) + + quantization_config = OVGeneralQuantizationConfig.init_with_format( + bits=8, + sym=self.args.sym, + ignored_scope=None, + num_samples=self.args.num_samples, + dataset=self.args.dataset, + trust_remote_code=self.args.trust_remote_code, + weight_format=self.args.weight_format, + ) + quantization_config.compress_weights_options = cw_options + quantization_config.quantize_options = q_options + else: + quantization_config = prepare_for_q_config(self.args) ov_config = OVConfig(quantization_config=quantization_config) quantization_config = ov_config.quantization_config if ov_config else None @@ -470,3 +473,36 @@ def run(self): library_name=library_name, # **input_shapes, ) + + +def prepare_for_wc_config(args, default_configs): + is_int8 = args.weight_format == "int8" + return { + "bits": 8 if is_int8 else 4, + "ratio": 1 if is_int8 else (args.ratio or default_configs["ratio"]), + "sym": args.sym or False, + "group_size": -1 if is_int8 else args.group_size, + "all_layers": None if is_int8 else args.all_layers, + "dataset": args.dataset, + "num_samples": args.num_samples, + "quant_method": "awq" if args.awq else "default", + "sensitivity_metric": args.sensitivity_metric, + "scale_estimation": args.scale_estimation, + "gptq": args.gptq, + "lora_correction": args.lora_correction, + "weight_format": args.weight_format, + "backup_precision": args.backup_precision, + } + + +def prepare_for_q_config(args): + return { + "weight_format": args.quant_mode, + "activation_format": args.quant_mode, + "bits": 8, + "sym": args.sym or False, + "dataset": args.dataset, + "num_samples": args.num_samples, + "smooth_quant_alpha": args.smooth_quant_alpha, + "trust_remote_code": args.trust_remote_code, + } diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 59b4b65ddd..76840686fb 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -717,10 +717,7 @@ def __init__( self.compression = kwargs.get( "compression", None ) # A field for backward-compatability of training-time compression parameters - if self.quantization_config is not None: - self.dtype = self.quantization_config.weight_format - else: - self.dtype = dtype + self.dtype = dtype def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False): self.input_info = [ @@ -775,3 +772,348 @@ def to_dict(self) -> Dict[str, Any]: def to_diff_dict(self) -> Dict[str, Any]: return self._to_dict_safe(to_diff_dict=True) + + +class OVCompressWeightsOptions: + def __init__( + self, + mode: str, + ratio: Optional[float] = None, + group_size: Optional[int] = None, + all_layers: Optional[bool] = None, + sensitivity_metric: Optional[str] = None, + awq: Optional[bool] = None, + scale_estimation: Optional[bool] = None, + gptq: Optional[bool] = None, + lora_correction: 
Optional[bool] = None, + backup_mode: Optional[str] = None, + advanced_parameters: Optional[Dict] = None, + ): + """ + Class containing specific nncf.compress_weights method's options. + Args: + mode (`str`): + Mode for weight compression. Possible values: ['int4_sym', 'int4_asym', 'int8_sym', 'int8_asym', 'e2m1', 'nf4']. + ratio (`float`, *optional*): + The ratio between baseline and backup precisions. + group_size (`int`, *optional*): + The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization. + all_layers (`bool`, *optional*): + Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit precision. + sensitivity_metric (`str`, *optional*): + The sensitivity metric for assigning quantization precision to layers. In order to + preserve the accuracy of the model, the more sensitive layers receives a higher precision. + awq (`bool`, *optional*): + Indicates whether to apply a AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but requires + additional time for tuning weights on a calibration dataset. To run AWQ, providing a dataset is + required. + scale_estimation (`bool`, *optional*): + Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between the original and + compressed layers. Providing a dataset is required to run scale estimation. + qptq (`bool`, *optional*): + Whether to apply GPTQ algorithm. GPTQ optimizes compressed weights in a layer-wise fashion to minimize the + difference between activations of a compressed and original layer. Dataset is required to run GPTQ. + lora_correction (`bool`, *optional*): + If True, apply LoRA Correction algorithm. When enabled, this algorithm introduces low-rank adaptation + layers in the model that can recover accuracy after weight compression at some cost of inference latency. + It calculates low-rank matrices via singular value decomposition (SVD) on the difference between the + original and quantized weights. These matrices are iteratively refined by solving a system of linear + equations to improve accuracy. + backup_precision (`str`, *optional*): + Defines a backup precision for mixed-precision weight compression. + - "none" stands for original floating-point precision of the model weights, in this case weights are + retained in their original precision without any quantization. + - "int8_sym" stands for 8-bit integer symmetric quantization without zero point. + - "int8_asym" stands for 8-bit integer asymmetric quantization with zero points per each quantization group. + advanced_parameters(`Dict`, *optional*) + Defines a dictionary with the advanced parameters. 
+ """ + self.mode = mode + self.ratio = ratio + self.group_size = group_size + self.all_layers = all_layers + self.sensitivity_metric = sensitivity_metric + self.awq = awq + self.scale_estimation = scale_estimation + self.gptq = gptq + self.lora_correction = lora_correction + self.backup_mode = backup_mode + + self._nncf_dict = None + + @staticmethod + def init_with_format( + bits: int = 8, + sym: bool = False, + group_size: Optional[int] = None, + ratio: float = 1.0, + all_layers: Optional[bool] = None, + sensitivity_metric: Optional[str] = None, + quant_method: Union[str, OVQuantizationMethod] = OVQuantizationMethod.DEFAULT, + scale_estimation: bool = None, + weight_format: Optional[str] = None, + gptq: bool = None, + lora_correction: bool = None, + backup_precision: Optional[str] = None, + **kwargs, + ): + """ + Method for the backwards-compatible OVCompressWeightsOptions initialization. + All options are the same as those in the OVWeightQuantizationConfig. + """ + signed_bitness = { + 4: "int4", + 8: "int8", + } + mode = weight_format if weight_format else signed_bitness[bits] + if mode in signed_bitness.values(): + mode += "_sym" if sym else "_asym" + mode = mode + + if isinstance(quant_method, str): + awq = quant_method == "awq" + elif isinstance(quant_method, OVQuantizationMethod): + awq = quant_method == OVQuantizationMethod.AWQ + + return OVCompressWeightsOptions( + mode=mode, + ratio=ratio, + group_size=group_size, + all_layers=all_layers, + sensitivity_metric=sensitivity_metric, + awq=awq, + scale_estimation=scale_estimation, + gptq=gptq, + backup_mode=backup_precision, + lora_correction=lora_correction, + ) + + def to_nncf_dict(self) -> Dict[str, Any]: + """ + Returns a dictionary with the NNCF-friendly variables that are ready to use. + """ + if self._nncf_dict: + return self._nncf_dict + + if is_nncf_available(): + mode = nncf.CompressWeightsMode(self.mode) + sensitivity_metric = nncf.SensitivityMetric(self.sensitivity_metric) if self.sensitivity_metric else None + backup_mode = nncf.BackupMode(self.backup_mode) if self.backup_mode else None + self._nncf_dict = { + "mode": mode, + "ratio": self.ratio, + "group_size": self.group_size, + "all_layers": self.all_layers, + "sensitivity_metric": sensitivity_metric, + "awq": self.awq, + "scale_estimation": self.scale_estimation, + "gptq": self.gptq, + "lora_correction": self.lora_correction, + "backup_mode": backup_mode, + } + return self._nncf_dict + + raise ImportError("NNCF is required to execute this method. Please install nncf first.") + + def to_dict(self) -> Dict[str, Any]: + return copy.deepcopy(self.__dict__) + + +class OVQuantizeOptions: + def __init__( + self, + mode: Optional[str] = None, + preset: Optional[str] = None, + target_device: str = "any", + fast_bias_correction: bool = True, + model_type: Optional[str] = None, + advanced_parameters: Optional[Dict] = None, + ): + """ + Class containing specific nncf.quantize method's options. + Args: + mode (`str`, *optional*): + Defines special quantization modes. Possible values: ['fp8_e4m3', 'fp8_e5m2']. + preset (`str`, *optional*): + Quantization presets, usually meaning to enable either a symmetrical or asymmetrical scheme. Possible values: ['performance', 'mixed']. + target_device (`str`, defaults to "any"): + Target device architecture for compression. Possible values: ['any', 'cpu', 'gpu', 'npu', 'cpu_spr']. + fast_bias_correction (`bool`, defaults to True): + Whether to apply fast or full bias correction algorithm. 
+ model_type (`str`, *optional*): + Model type is needed to specify additional patterns in the model. Supported only `transformer` now. + advanced_parameters(`Dict`, *optional*) + Defines a dictionary with the advanced parameters. + Examples of the values: + - overflow_fix (`str`): + Parameter for controlling overflow fix setting. + - smooth_quant_alphas (`dict`): + SmoothQuant alpha parameter that improves the distribution of activations before MatMul layers and + reduces quantization error. + Examples of the values: + - matmul (`float`) + - convolution (`float`) + """ + self.mode = mode + self.preset = preset + self.target_device = target_device + self.fast_bias_correction = fast_bias_correction + self.model_type = model_type + self.advanced_parameters = advanced_parameters + + self._nncf_dict = None + + @staticmethod + def init_with_format( + bits: int = 8, + sym: bool = False, + ignored_scope: Optional[dict] = None, + num_samples: Optional[int] = 300, + model_type: str = "transformer", + fast_bias_correction: bool = True, + overflow_fix: str = "disable", + dataset: Optional[str] = None, + tokenizer: Optional[str] = None, + processor: Optional[str] = None, + trust_remote_code: bool = False, + smooth_quant_alpha: Optional[float] = None, + weight_format: Optional[str] = "int8", + activation_format: Optional[str] = "int8", + **kwargs, + ): + """ + Method for the backwards-compatible OVQuantizeOptions initialization. + All options are the same as those in the OVQuantizationConfig. + """ + preset = "performance" if sym else "mixed" + advanced_parameters = {"overflow_fix": overflow_fix} + if smooth_quant_alpha: + advanced_parameters["smooth_quant_alphas"] = {"matmul": smooth_quant_alpha} + + mode = None + if activation_format: + mode_map = { + "f8e4m3": "fp8_e4m3", + "f8e5m2": "fp8_e5m2", + } + mode = mode_map[activation_format] + preset = "performance" + + return OVQuantizeOptions( + mode=mode, + preset=preset, + target_device="any", + fast_bias_correction=fast_bias_correction, + model_type=model_type, + advanced_parameters=advanced_parameters, + ) + + def to_nncf_dict(self) -> Dict[str, Any]: + """ + Returns a dictionary with the NNCF-friendly variables that are ready to use. + """ + if self._nncf_dict: + return self._nncf_dict + + if is_nncf_available(): + mode = nncf.QuantizationMode(self.mode) if self.mode else None + preset = nncf.QuantizationPreset(self.preset) + target_device = nncf.TargetDevice(self.target_device.upper()) + model_type = nncf.ModelType(self.model_type) if self.model_type else None + advanced_parameters = None + if self.advanced_parameters: + advanced_parameters = nncf.AdvancedQuantizationParameters( + overflow_fix=self.advanced_parameters["overflow_fix"], + ) + if "smooth_quant_alphas" in self.advanced_parameters: + advanced_parameters.smooth_quant_alphas = ( + nncf.AdvancedSmoothQuantParameters(**self.advanced_parameters["smooth_quant_alphas"]), + ) + + self._nncf_dict = { + "mode": mode, + "preset": preset, + "target_device": target_device, + "fast_bias_correction": self.fast_bias_correction, + "model_type": model_type, + "advanced_parameters": advanced_parameters, + } + return self._nncf_dict + + raise ImportError("NNCF is required to execute this method. 
Please install nncf first.") + + def to_dict(self) -> Dict: + return copy.deepcopy(self.__dict__) + + +class OVGeneralQuantizationConfig(QuantizationConfigMixin): + def __init__( + self, + ignored_scope: Optional[Dict] = None, + num_samples: Optional[int] = None, + compress_weights_options: Optional[OVCompressWeightsOptions] = None, + quantize_options: Optional[OVQuantizeOptions] = None, + ): + """ + Class containing general options for the NNCF-based quantization. + Args: + ignored_scope (`dict`, *optional*): + An ignored scope that defines the list of model nodes to be ignored during quantization. Dictionary + entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class. + num_samples (`int`, *optional*): + The maximum number of samples composing the calibration dataset. + compress_weights_options (`OVCompressWeightsOptions`, *optional*): + See OVCompressWeightsOptions instance. + quantize_options (`OVQuantizeOptions`, *optional*): + See OVQuantizeOptions instance. + """ + self.ignored_scope = ignored_scope + self.num_samples = num_samples + self.compress_weights_options = compress_weights_options + self.quantize_options = quantize_options + self.bits = None + self.sym = None + self.dataset = None + self.tokenizer = None + self.processor = None + self.trust_remote_code = None + self.weight_format = None + + @staticmethod + def init_with_format( + bits: int = 8, + sym: bool = False, + ignored_scope: Optional[dict] = None, + num_samples: Optional[int] = None, + dataset: Optional[Optional[Union[str, List[str]]]] = None, + tokenizer: Optional[str] = None, + processor: Optional[str] = None, + trust_remote_code: bool = False, + weight_format: Optional[str] = None, + ): + """ + Method for the backwards-compatible QuantizationConfigMixin initialization. + All options are the same as those in the QuantizationConfigMixin. 
+ """ + config = OVGeneralQuantizationConfig( + ignored_scope=ignored_scope, + num_samples=num_samples, + ) + config.bits = bits + config.sym = sym + config.dataset = dataset + config.tokenizer = tokenizer + config.processor = processor + config.trust_remote_code = trust_remote_code + config.weight_format = weight_format + return config + + def get_ignored_scope_instance(self) -> nncf.IgnoredScope: + ignored_scope = copy.deepcopy(self.ignored_scope) if self.ignored_scope else {} + return nncf.IgnoredScope(**ignored_scope) + + def to_dict(self): + result = copy.deepcopy(self.__dict__) + result["compress_weights_options"] = self.compress_weights_options.to_dict() + result["quantize_options"] = self.quantize_options.to_dict() + return result diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index f61c2b93ca..391643a0fe 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -60,6 +60,7 @@ from ..utils.modeling_utils import get_model_device from .configuration import ( OVConfig, + OVGeneralQuantizationConfig, OVQuantizationConfig, OVQuantizationConfigBase, OVQuantizationMethod, @@ -451,7 +452,7 @@ def _quantize_ovbasemodel( else: _weight_only_quantization(self.model.model, quantization_config, calibration_dataset, **kwargs) self.model.request = None - else: + elif isinstance(quantization_config, OVQuantizationConfig): if not isinstance(quantization_config, OVQuantizationConfig): raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}") @@ -467,6 +468,15 @@ def _quantize_ovbasemodel( ) self.model.model = quantized_model self.model.request = None + else: + if calibration_dataset is None: + raise ValueError("Calibration dataset is required to run quantization.") + + quantized_model = _general_quantization( + self.model.model, quantization_config, calibration_dataset, **kwargs + ) + self.model.model = quantized_model + self.model.request = None if save_directory is not None: self.model.save_pretrained(save_directory) @@ -1187,3 +1197,54 @@ def _hybrid_quantization( **kwargs, ) return quantized_model + + +def _general_quantization( + model: openvino.Model, + quantization_config: OVGeneralQuantizationConfig, + calibration_dataset: nncf.Dataset, + **kwargs, +) -> openvino.Model: + """ + Quantize a model with NNCF in two possible steps: + - weights-only quantization with nncf.compress_weights method. + - full quantization (excluding weights from previous step) with nncf.quantize method. + + Args: + model (`openvino.runtime.Model`): + The OpenVINO Runtime model for applying quantization. + quantization_config (`OVGeneralQuantizationConfig`): + The configuration containing the parameters related to quantization. + calibration_dataset (`nncf.Dataset`): + The dataset used for quantization. + Returns: + The OpenVINO Runtime model with applied quantization. 
+ """ + quantized_model = model + + ignored_scope = quantization_config.get_ignored_scope_instance() + + if quantization_config.compress_weights_options: + ops_with_weights = _collect_ops_with_weights(model) + wc_kwargs = copy.deepcopy(kwargs) + wc_kwargs.update(quantization_config.compress_weights_options.to_nncf_dict()) + quantized_model = nncf.compress_weights( + model, + ignored_scope=ignored_scope, + dataset=calibration_dataset, + subset_size=quantization_config.num_samples, + **wc_kwargs, + ) + ignored_scope.names += ops_with_weights + + if quantization_config.quantize_options: + q_kwargs = copy.deepcopy(kwargs) + q_kwargs.update(quantization_config.quantize_options.to_nncf_dict()) + quantized_model = nncf.quantize( + model, + calibration_dataset, + subset_size=quantization_config.num_samples, + ignored_scope=ignored_scope, + **q_kwargs, + ) + return quantized_model From 08f59921cfe5ce41c80dcc32af395ba52225ff89 Mon Sep 17 00:00:00 2001 From: Nikita Malinin Date: Fri, 24 Jan 2025 08:57:53 +0100 Subject: [PATCH 02/20] Fix tests --- optimum/intel/openvino/configuration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 76840686fb..80d062400e 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -1108,7 +1108,7 @@ def init_with_format( config.weight_format = weight_format return config - def get_ignored_scope_instance(self) -> nncf.IgnoredScope: + def get_ignored_scope_instance(self) -> "nncf.IgnoredScope": ignored_scope = copy.deepcopy(self.ignored_scope) if self.ignored_scope else {} return nncf.IgnoredScope(**ignored_scope) From 89b3afcf84d0ec45796f86310cde0698666849d2 Mon Sep 17 00:00:00 2001 From: Nikita Malinin Date: Fri, 24 Jan 2025 10:17:28 +0100 Subject: [PATCH 03/20] Add test --- optimum/intel/openvino/configuration.py | 4 ++-- tests/openvino/test_exporters_cli.py | 14 +++++++++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 80d062400e..b03aeab968 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -1026,8 +1026,8 @@ def to_nncf_dict(self) -> Dict[str, Any]: overflow_fix=self.advanced_parameters["overflow_fix"], ) if "smooth_quant_alphas" in self.advanced_parameters: - advanced_parameters.smooth_quant_alphas = ( - nncf.AdvancedSmoothQuantParameters(**self.advanced_parameters["smooth_quant_alphas"]), + advanced_parameters.smooth_quant_alphas = nncf.AdvancedSmoothQuantParameters( + **self.advanced_parameters["smooth_quant_alphas"] ) self._nncf_dict = { diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 7c64d84d3d..d9e20df772 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -131,6 +131,14 @@ class OVCLIExportTestCase(unittest.TestCase): (13,), (16,), ), + ( + "text-generation", + "llama", + "nf4_f8e4m3", + "--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 --group-size 16 --trust-remote-code", + (4,), + (14,), + ), ] TEST_4BIT_CONFIGURATIONS = [ @@ -446,7 +454,11 @@ def test_exporters_cli_full_quantization( for i, model in enumerate(models): num_fake_nodes, num_weight_nodes = get_num_quantized_nodes(model) self.assertEqual(expected_fake_nodes[i], num_fake_nodes) - self.assertEqual(expected_low_precision_nodes[i], num_weight_nodes[quant_mode]) + weight_types = 
quant_mode.split("_") + num_weights = 0 + for weight_type in weight_types: + num_weights += num_weight_nodes[weight_type] + self.assertEqual(expected_low_precision_nodes[i], num_weights) def test_exporters_cli_int4_with_local_model_and_default_config(self): with TemporaryDirectory() as tmpdir: From e3412a6ef3b023f5788f3d7167388f4f40758074 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 28 Jan 2025 19:33:19 +0100 Subject: [PATCH 04/20] OVMixedQuantizationConfig proposal --- optimum/commands/export/openvino.py | 21 +- optimum/intel/__init__.py | 4 + optimum/intel/openvino/__init__.py | 8 +- optimum/intel/openvino/configuration.py | 533 ++++++------------ optimum/intel/openvino/quantization.py | 53 +- .../utils/dummy_openvino_and_nncf_objects.py | 11 + tests/openvino/test_quantization.py | 24 +- 7 files changed, 239 insertions(+), 415 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 75b218677d..c4750604fb 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -307,14 +307,7 @@ def parse_args(parser: "ArgumentParser"): def run(self): from ...exporters.openvino.__main__ import infer_task, main_export, maybe_convert_tokenizers from ...exporters.openvino.utils import save_preprocessors - from ...intel.openvino.configuration import ( - _DEFAULT_4BIT_CONFIG, - OVCompressWeightsOptions, - OVConfig, - OVGeneralQuantizationConfig, - OVQuantizeOptions, - get_default_int4_config, - ) + from ...intel.openvino.configuration import _DEFAULT_4BIT_CONFIG, OVConfig, get_default_int4_config if self.args.library is None: # TODO: add revision, subfolder and token to args @@ -363,23 +356,17 @@ def run(self): if self.args.quant_mode == "nf4_f8e4m3": wc_config = prepare_for_wc_config(self.args, _DEFAULT_4BIT_CONFIG) wc_config["weight_format"] = "nf4" - cw_options = OVCompressWeightsOptions.init_with_format(**wc_config) q_config = prepare_for_q_config(self.args) q_config["activation_format"] = "f8e4m3" - q_options = OVQuantizeOptions.init_with_format(**q_config) - quantization_config = OVGeneralQuantizationConfig.init_with_format( - bits=8, - sym=self.args.sym, - ignored_scope=None, + quantization_config = dict( + weight_quantization_config=wc_config, + quantization_config=q_config, num_samples=self.args.num_samples, dataset=self.args.dataset, trust_remote_code=self.args.trust_remote_code, - weight_format=self.args.weight_format, ) - quantization_config.compress_weights_options = cw_options - quantization_config.quantize_options = q_options else: quantization_config = prepare_for_q_config(self.args) ov_config = OVConfig(quantization_config=quantization_config) diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index 91aaf57ae0..cb652b7d66 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -84,6 +84,7 @@ "OVQuantizationConfig", "OVWeightQuantizationConfig", "OVDynamicQuantizationConfig", + "OVMixedQuantizationConfig", ] ) else: @@ -94,6 +95,7 @@ "OVQuantizationConfig", "OVWeightQuantizationConfig", "OVDynamicQuantizationConfig", + "OVMixedQuantizationConfig", ] ) @@ -270,6 +272,7 @@ except OptionalDependencyNotAvailable: from .utils.dummy_openvino_and_nncf_objects import ( OVDynamicQuantizationConfig, + OVMixedQuantizationConfig, OVQuantizationConfig, OVQuantizer, OVTrainingArguments, @@ -278,6 +281,7 @@ else: from .openvino import ( OVDynamicQuantizationConfig, + OVMixedQuantizationConfig, OVQuantizationConfig, OVQuantizer, OVTrainingArguments, diff --git 
a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index 8945dc6382..32bc59690d 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -55,7 +55,13 @@ from .trainer import OVTrainer -from .configuration import OVConfig, OVDynamicQuantizationConfig, OVQuantizationConfig, OVWeightQuantizationConfig +from .configuration import ( + OVConfig, + OVDynamicQuantizationConfig, + OVMixedQuantizationConfig, + OVQuantizationConfig, + OVWeightQuantizationConfig, +) from .modeling import ( OVModelForAudioClassification, OVModelForAudioFrameClassification, diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index b03aeab968..785d72d747 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -263,26 +263,15 @@ class OVQuantizationConfigBase(QuantizationConfigMixin): def __init__( self, - bits: int = 8, - sym: bool = False, - ignored_scope: Optional[dict] = None, num_samples: Optional[int] = None, - dataset: Optional[Optional[Union[str, List[str]]]] = None, + dataset: Optional[Union[str, List[str]]] = None, tokenizer: Optional[str] = None, processor: Optional[str] = None, trust_remote_code: bool = False, - weight_format: Optional[str] = None, **kwargs, ): """ Args: - bits (`int`, defaults to 8): - The number of bits to quantize to. - sym (`bool`, defaults to `False`): - Whether to use symmetric quantization. - ignored_scope (`dict`, *optional*): - An ignored scope that defines a list of model nodes to be ignored during quantization. Dictionary - entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class. num_samples (`int`, *optional*): The maximum number of samples composing the calibration dataset. dataset (`str or List[str]`, *optional*): @@ -295,37 +284,17 @@ def __init__( Allows to use custom code for the modeling hosted in the model repository. This option should only be set for repositories you trust and in which you have read the code, as it will execute on your local machine arbitrary code present in the model repository. - weight_format (`str`, *optional*): - Data format weights are compressed to. 
""" - self.bits = bits - self.sym = sym self.num_samples = num_samples self.dataset = dataset self.tokenizer = tokenizer self.processor = processor self.trust_remote_code = trust_remote_code - self.weight_format = weight_format - - if isinstance(ignored_scope, nncf.IgnoredScope): - ignored_scope = ignored_scope.__dict__ - self.ignored_scope = ignored_scope def post_init(self): - try: - self.get_ignored_scope_instance() - except Exception as e: - raise ValueError( - f"Can't create an `IgnoredScope` object from the provided ignored scope dict: {self.ignored_scope}.\n{e}" - ) if not (self.num_samples is None or isinstance(self.num_samples, int) and self.num_samples > 0): raise ValueError(f"`num_samples` is expected to be a positive integer, but found: {self.num_samples}") - def get_ignored_scope_instance(self) -> "nncf.IgnoredScope": - if self.ignored_scope is None: - return nncf.IgnoredScope() - return nncf.IgnoredScope(**copy.deepcopy(self.ignored_scope)) - @dataclass class OVWeightQuantizationConfig(OVQuantizationConfigBase): @@ -436,16 +405,14 @@ def __init__( **kwargs, ): super().__init__( - bits=bits, - sym=sym, - ignored_scope=ignored_scope, num_samples=num_samples, dataset=dataset, tokenizer=tokenizer, processor=processor, trust_remote_code=trust_remote_code, - weight_format=weight_format, ) + self.bits = bits + self.sym = sym self.group_size = group_size or (-1 if bits == 8 else 128) self.ratio = ratio self.all_layers = all_layers @@ -455,6 +422,11 @@ def __init__( self.gptq = gptq self.lora_correction = lora_correction self.backup_precision = backup_precision + if isinstance(ignored_scope, nncf.IgnoredScope): + ignored_scope = ignored_scope.__dict__ + self.ignored_scope = ignored_scope + self.weight_format = weight_format + self._nncf_dict = None self.post_init() def post_init(self): @@ -493,10 +465,17 @@ def post_init(self): "quantization algorithm is selected and compression ratio is 1.0." ) + try: + self.get_ignored_scope_instance() + except Exception as e: + raise ValueError( + f"Can't create an `IgnoredScope` object from the provided ignored scope dict: {self.ignored_scope}.\n{e}" + ) + if self.bits not in [4, 8]: raise ValueError(f"Only support quantization to [4,8] bits but found {self.bits}") - if self.bits == 8: + if self.bits == 8 and self.weight_format: if self.ratio != 1: raise ValueError( f"For 8-bit quantization, `ratio` is expected to be set to 1.0, but was set to {self.ratio}" @@ -565,6 +544,52 @@ def post_init(self): if self.gptq and self.lora_correction: raise ValueError("The GPTQ and LoRA Correction algorithms can't be applied simultaneously") + def get_ignored_scope_instance(self) -> "nncf.IgnoredScope": + if self.ignored_scope is None: + return nncf.IgnoredScope() + return nncf.IgnoredScope(**copy.deepcopy(self.ignored_scope)) + + def to_nncf_dict(self) -> Dict[str, Any]: + """ + Returns a dictionary with the NNCF-friendly variables that are ready to use. 
+ """ + if self._nncf_dict: + return self._nncf_dict + + if is_nncf_available(): + signed_bitness = { + 4: "int4", + 8: "int8", + } + mode = self.weight_format if self.weight_format else signed_bitness[self.bits] + if mode in signed_bitness.values(): + mode += "_sym" if self.sym else "_asym" + + if isinstance(self.quant_method, str): + awq = self.quant_method == "awq" + else: + awq = self.quant_method == OVQuantizationMethod.AWQ + + mode = nncf.CompressWeightsMode(mode) + sensitivity_metric = nncf.SensitivityMetric(self.sensitivity_metric) if self.sensitivity_metric else None + backup_mode = nncf.BackupMode(self.backup_precision) if self.backup_precision else None + self._nncf_dict = { + "mode": mode, + "ratio": self.ratio, + "group_size": self.group_size, + "ignored_scope": self.get_ignored_scope_instance(), + "all_layers": self.all_layers, + "sensitivity_metric": sensitivity_metric, + "awq": awq, + "scale_estimation": self.scale_estimation, + "gptq": self.gptq, + "lora_correction": self.lora_correction, + "backup_mode": backup_mode, + } + return self._nncf_dict + + raise ImportError("NNCF is required to execute this method. Please install nncf first.") + @dataclass class OVDynamicQuantizationConfig(OVWeightQuantizationConfig): @@ -596,7 +621,6 @@ def __init__( processor: Optional[str] = None, trust_remote_code: bool = False, smooth_quant_alpha: Optional[float] = None, - weight_format: Optional[str] = "int8", activation_format: Optional[str] = "int8", **kwargs, ): @@ -642,34 +666,32 @@ def __init__( smooth_quant_alpha (`float`, *optional*): SmoothQuant alpha parameter that improves the distribution of activations before MatMul layers and reduces quantization error. - weight_format (`str`, defaults to "int8"): - Data format weights are quantized to. Possible values: ['int8', 'f8e4m3', 'f8e5m2']. activation_format (`str`, defaults to "int8"): Data format activations are compressed to. Possible values: ['int8', 'f8e4m3', 'f8e5m2']. """ super().__init__( - bits=bits, - sym=sym, - ignored_scope=ignored_scope, num_samples=num_samples, dataset=dataset, tokenizer=tokenizer, processor=processor, trust_remote_code=trust_remote_code, - weight_format=weight_format, ) + self.bits = bits + self.sym = sym self.model_type = model_type self.fast_bias_correction = fast_bias_correction self.overflow_fix = overflow_fix self.smooth_quant_alpha = smooth_quant_alpha self.activation_format = activation_format + if isinstance(ignored_scope, nncf.IgnoredScope): + ignored_scope = ignored_scope.__dict__ + self.ignored_scope = ignored_scope f8_formats = ["f8e4m3", "f8e5m2"] - if self.activation_format in f8_formats and self.weight_format in f8_formats: - logger.info( - f"{self.activation_format} for activations and {self.weight_format} weights were found. A symmetrical scheme will be used." - ) + if self.activation_format in f8_formats: + logger.info(f"{self.activation_format} for activations was found. A symmetrical scheme will be used.") self.sym = True + self._nncf_dict = None self.post_init() def post_init(self): @@ -694,6 +716,58 @@ def post_init(self): f"SmoothQuant alpha parameter must be in range [0, 1], but found {self.smooth_quant_alpha}" ) + def get_ignored_scope_instance(self) -> "nncf.IgnoredScope": + if self.ignored_scope is None: + return nncf.IgnoredScope() + return nncf.IgnoredScope(**copy.deepcopy(self.ignored_scope)) + + def to_nncf_dict(self) -> Dict[str, Any]: + """ + Returns a dictionary with the NNCF-friendly variables that are ready to use. 
+ """ + if self._nncf_dict: + return self._nncf_dict + + if is_nncf_available(): + preset = "performance" if self.sym else "mixed" + advanced_parameters_dict = {"overflow_fix": self.overflow_fix} + if self.smooth_quant_alpha: + advanced_parameters_dict["smooth_quant_alphas"] = {"matmul": self.smooth_quant_alpha} + + mode = None + if self.activation_format: + mode_map = { + "int8": "int8", + "f8e4m3": "fp8_e4m3", + "f8e5m2": "fp8_e5m2", + } + mode = mode_map[self.activation_format] + if mode == "int8": + mode += "_sym" if self.sym else "_asym" + preset = "performance" + + preset = nncf.QuantizationPreset(preset) + model_type = nncf.ModelType(self.model_type) if self.model_type else None + advanced_parameters = nncf.AdvancedQuantizationParameters( + overflow_fix=advanced_parameters_dict["overflow_fix"], + ) + if "smooth_quant_alphas" in advanced_parameters_dict: + advanced_parameters.smooth_quant_alphas = nncf.AdvancedSmoothQuantParameters( + **advanced_parameters_dict["smooth_quant_alphas"] + ) + + self._nncf_dict = { + "mode": mode, + "preset": preset, + "fast_bias_correction": self.fast_bias_correction, + "model_type": model_type, + "ignored_scope": self.get_ignored_scope_instance(), + "advanced_parameters": advanced_parameters, + } + return self._nncf_dict + + raise ImportError("NNCF is required to execute this method. Please install nncf first.") + class OVConfig(BaseConfig): CONFIG_NAME = "openvino_config.json" @@ -731,6 +805,8 @@ def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False): @staticmethod def _quantization_config_from_dict(quantization_config: dict) -> OVQuantizationConfigBase: + if "weight_quantization_config" in quantization_config and "quantization_config" in quantization_config: + return OVMixedQuantizationConfig.from_dict(quantization_config) wq_args = inspect.getfullargspec(OVWeightQuantizationConfig.__init__).args q_args = inspect.getfullargspec(OVQuantizationConfig.__init__).args weight_only = quantization_config.pop("weight_only", None) @@ -774,285 +850,17 @@ def to_diff_dict(self) -> Dict[str, Any]: return self._to_dict_safe(to_diff_dict=True) -class OVCompressWeightsOptions: - def __init__( - self, - mode: str, - ratio: Optional[float] = None, - group_size: Optional[int] = None, - all_layers: Optional[bool] = None, - sensitivity_metric: Optional[str] = None, - awq: Optional[bool] = None, - scale_estimation: Optional[bool] = None, - gptq: Optional[bool] = None, - lora_correction: Optional[bool] = None, - backup_mode: Optional[str] = None, - advanced_parameters: Optional[Dict] = None, - ): - """ - Class containing specific nncf.compress_weights method's options. - Args: - mode (`str`): - Mode for weight compression. Possible values: ['int4_sym', 'int4_asym', 'int8_sym', 'int8_asym', 'e2m1', 'nf4']. - ratio (`float`, *optional*): - The ratio between baseline and backup precisions. - group_size (`int`, *optional*): - The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization. - all_layers (`bool`, *optional*): - Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit precision. - sensitivity_metric (`str`, *optional*): - The sensitivity metric for assigning quantization precision to layers. In order to - preserve the accuracy of the model, the more sensitive layers receives a higher precision. - awq (`bool`, *optional*): - Indicates whether to apply a AWQ algorithm. 
AWQ improves generation quality of INT4-compressed LLMs, but requires - additional time for tuning weights on a calibration dataset. To run AWQ, providing a dataset is - required. - scale_estimation (`bool`, *optional*): - Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between the original and - compressed layers. Providing a dataset is required to run scale estimation. - qptq (`bool`, *optional*): - Whether to apply GPTQ algorithm. GPTQ optimizes compressed weights in a layer-wise fashion to minimize the - difference between activations of a compressed and original layer. Dataset is required to run GPTQ. - lora_correction (`bool`, *optional*): - If True, apply LoRA Correction algorithm. When enabled, this algorithm introduces low-rank adaptation - layers in the model that can recover accuracy after weight compression at some cost of inference latency. - It calculates low-rank matrices via singular value decomposition (SVD) on the difference between the - original and quantized weights. These matrices are iteratively refined by solving a system of linear - equations to improve accuracy. - backup_precision (`str`, *optional*): - Defines a backup precision for mixed-precision weight compression. - - "none" stands for original floating-point precision of the model weights, in this case weights are - retained in their original precision without any quantization. - - "int8_sym" stands for 8-bit integer symmetric quantization without zero point. - - "int8_asym" stands for 8-bit integer asymmetric quantization with zero points per each quantization group. - advanced_parameters(`Dict`, *optional*) - Defines a dictionary with the advanced parameters. - """ - self.mode = mode - self.ratio = ratio - self.group_size = group_size - self.all_layers = all_layers - self.sensitivity_metric = sensitivity_metric - self.awq = awq - self.scale_estimation = scale_estimation - self.gptq = gptq - self.lora_correction = lora_correction - self.backup_mode = backup_mode - - self._nncf_dict = None - - @staticmethod - def init_with_format( - bits: int = 8, - sym: bool = False, - group_size: Optional[int] = None, - ratio: float = 1.0, - all_layers: Optional[bool] = None, - sensitivity_metric: Optional[str] = None, - quant_method: Union[str, OVQuantizationMethod] = OVQuantizationMethod.DEFAULT, - scale_estimation: bool = None, - weight_format: Optional[str] = None, - gptq: bool = None, - lora_correction: bool = None, - backup_precision: Optional[str] = None, - **kwargs, - ): - """ - Method for the backwards-compatible OVCompressWeightsOptions initialization. - All options are the same as those in the OVWeightQuantizationConfig. - """ - signed_bitness = { - 4: "int4", - 8: "int8", - } - mode = weight_format if weight_format else signed_bitness[bits] - if mode in signed_bitness.values(): - mode += "_sym" if sym else "_asym" - mode = mode - - if isinstance(quant_method, str): - awq = quant_method == "awq" - elif isinstance(quant_method, OVQuantizationMethod): - awq = quant_method == OVQuantizationMethod.AWQ - - return OVCompressWeightsOptions( - mode=mode, - ratio=ratio, - group_size=group_size, - all_layers=all_layers, - sensitivity_metric=sensitivity_metric, - awq=awq, - scale_estimation=scale_estimation, - gptq=gptq, - backup_mode=backup_precision, - lora_correction=lora_correction, - ) - - def to_nncf_dict(self) -> Dict[str, Any]: - """ - Returns a dictionary with the NNCF-friendly variables that are ready to use. 
- """ - if self._nncf_dict: - return self._nncf_dict - - if is_nncf_available(): - mode = nncf.CompressWeightsMode(self.mode) - sensitivity_metric = nncf.SensitivityMetric(self.sensitivity_metric) if self.sensitivity_metric else None - backup_mode = nncf.BackupMode(self.backup_mode) if self.backup_mode else None - self._nncf_dict = { - "mode": mode, - "ratio": self.ratio, - "group_size": self.group_size, - "all_layers": self.all_layers, - "sensitivity_metric": sensitivity_metric, - "awq": self.awq, - "scale_estimation": self.scale_estimation, - "gptq": self.gptq, - "lora_correction": self.lora_correction, - "backup_mode": backup_mode, - } - return self._nncf_dict - - raise ImportError("NNCF is required to execute this method. Please install nncf first.") - - def to_dict(self) -> Dict[str, Any]: - return copy.deepcopy(self.__dict__) - - -class OVQuantizeOptions: +class OVMixedQuantizationConfig(OVQuantizationConfigBase): def __init__( self, - mode: Optional[str] = None, - preset: Optional[str] = None, - target_device: str = "any", - fast_bias_correction: bool = True, - model_type: Optional[str] = None, - advanced_parameters: Optional[Dict] = None, - ): - """ - Class containing specific nncf.quantize method's options. - Args: - mode (`str`, *optional*): - Defines special quantization modes. Possible values: ['fp8_e4m3', 'fp8_e5m2']. - preset (`str`, *optional*): - Quantization presets, usually meaning to enable either a symmetrical or asymmetrical scheme. Possible values: ['performance', 'mixed']. - target_device (`str`, defaults to "any"): - Target device architecture for compression. Possible values: ['any', 'cpu', 'gpu', 'npu', 'cpu_spr']. - fast_bias_correction (`bool`, defaults to True): - Whether to apply fast or full bias correction algorithm. - model_type (`str`, *optional*): - Model type is needed to specify additional patterns in the model. Supported only `transformer` now. - advanced_parameters(`Dict`, *optional*) - Defines a dictionary with the advanced parameters. - Examples of the values: - - overflow_fix (`str`): - Parameter for controlling overflow fix setting. - - smooth_quant_alphas (`dict`): - SmoothQuant alpha parameter that improves the distribution of activations before MatMul layers and - reduces quantization error. - Examples of the values: - - matmul (`float`) - - convolution (`float`) - """ - self.mode = mode - self.preset = preset - self.target_device = target_device - self.fast_bias_correction = fast_bias_correction - self.model_type = model_type - self.advanced_parameters = advanced_parameters - - self._nncf_dict = None - - @staticmethod - def init_with_format( - bits: int = 8, - sym: bool = False, - ignored_scope: Optional[dict] = None, - num_samples: Optional[int] = 300, - model_type: str = "transformer", - fast_bias_correction: bool = True, - overflow_fix: str = "disable", - dataset: Optional[str] = None, + weight_quantization_config: Union[OVWeightQuantizationConfig, dict], + quantization_config: Union[OVQuantizationConfig, dict], + num_samples: Optional[int] = None, + dataset: Optional[Union[str, List[str]]] = None, tokenizer: Optional[str] = None, processor: Optional[str] = None, trust_remote_code: bool = False, - smooth_quant_alpha: Optional[float] = None, - weight_format: Optional[str] = "int8", - activation_format: Optional[str] = "int8", **kwargs, - ): - """ - Method for the backwards-compatible OVQuantizeOptions initialization. - All options are the same as those in the OVQuantizationConfig. 
- """ - preset = "performance" if sym else "mixed" - advanced_parameters = {"overflow_fix": overflow_fix} - if smooth_quant_alpha: - advanced_parameters["smooth_quant_alphas"] = {"matmul": smooth_quant_alpha} - - mode = None - if activation_format: - mode_map = { - "f8e4m3": "fp8_e4m3", - "f8e5m2": "fp8_e5m2", - } - mode = mode_map[activation_format] - preset = "performance" - - return OVQuantizeOptions( - mode=mode, - preset=preset, - target_device="any", - fast_bias_correction=fast_bias_correction, - model_type=model_type, - advanced_parameters=advanced_parameters, - ) - - def to_nncf_dict(self) -> Dict[str, Any]: - """ - Returns a dictionary with the NNCF-friendly variables that are ready to use. - """ - if self._nncf_dict: - return self._nncf_dict - - if is_nncf_available(): - mode = nncf.QuantizationMode(self.mode) if self.mode else None - preset = nncf.QuantizationPreset(self.preset) - target_device = nncf.TargetDevice(self.target_device.upper()) - model_type = nncf.ModelType(self.model_type) if self.model_type else None - advanced_parameters = None - if self.advanced_parameters: - advanced_parameters = nncf.AdvancedQuantizationParameters( - overflow_fix=self.advanced_parameters["overflow_fix"], - ) - if "smooth_quant_alphas" in self.advanced_parameters: - advanced_parameters.smooth_quant_alphas = nncf.AdvancedSmoothQuantParameters( - **self.advanced_parameters["smooth_quant_alphas"] - ) - - self._nncf_dict = { - "mode": mode, - "preset": preset, - "target_device": target_device, - "fast_bias_correction": self.fast_bias_correction, - "model_type": model_type, - "advanced_parameters": advanced_parameters, - } - return self._nncf_dict - - raise ImportError("NNCF is required to execute this method. Please install nncf first.") - - def to_dict(self) -> Dict: - return copy.deepcopy(self.__dict__) - - -class OVGeneralQuantizationConfig(QuantizationConfigMixin): - def __init__( - self, - ignored_scope: Optional[Dict] = None, - num_samples: Optional[int] = None, - compress_weights_options: Optional[OVCompressWeightsOptions] = None, - quantize_options: Optional[OVQuantizeOptions] = None, ): """ Class containing general options for the NNCF-based quantization. @@ -1067,53 +875,48 @@ def __init__( quantize_options (`OVQuantizeOptions`, *optional*): See OVQuantizeOptions instance. """ - self.ignored_scope = ignored_scope - self.num_samples = num_samples - self.compress_weights_options = compress_weights_options - self.quantize_options = quantize_options - self.bits = None - self.sym = None - self.dataset = None - self.tokenizer = None - self.processor = None - self.trust_remote_code = None - self.weight_format = None - - @staticmethod - def init_with_format( - bits: int = 8, - sym: bool = False, - ignored_scope: Optional[dict] = None, - num_samples: Optional[int] = None, - dataset: Optional[Optional[Union[str, List[str]]]] = None, - tokenizer: Optional[str] = None, - processor: Optional[str] = None, - trust_remote_code: bool = False, - weight_format: Optional[str] = None, - ): - """ - Method for the backwards-compatible QuantizationConfigMixin initialization. - All options are the same as those in the QuantizationConfigMixin. 
- """ - config = OVGeneralQuantizationConfig( - ignored_scope=ignored_scope, + super().__init__( num_samples=num_samples, + dataset=dataset, + tokenizer=tokenizer, + processor=processor, + trust_remote_code=trust_remote_code, ) - config.bits = bits - config.sym = sym - config.dataset = dataset - config.tokenizer = tokenizer - config.processor = processor - config.trust_remote_code = trust_remote_code - config.weight_format = weight_format - return config + if isinstance(weight_quantization_config, dict): + weight_quantization_config = copy.deepcopy(weight_quantization_config) + base_config = { + "num_samples": num_samples, + "dataset": dataset, + "tokenizer": tokenizer, + "processor": processor, + "trust_remote_code": trust_remote_code, + } + base_config.update(weight_quantization_config) + weight_quantization_config = OVWeightQuantizationConfig(**base_config) + self.weight_quantization_config = weight_quantization_config - def get_ignored_scope_instance(self) -> "nncf.IgnoredScope": - ignored_scope = copy.deepcopy(self.ignored_scope) if self.ignored_scope else {} - return nncf.IgnoredScope(**ignored_scope) + if isinstance(quantization_config, dict): + quantization_config = copy.deepcopy(quantization_config) + base_config = { + "num_samples": num_samples, + "dataset": dataset, + "tokenizer": tokenizer, + "processor": processor, + "trust_remote_code": trust_remote_code, + } + base_config.update(quantization_config) + quantization_config = OVQuantizationConfig(**base_config) + self.quantization_config = quantization_config + + self.post_init() def to_dict(self): - result = copy.deepcopy(self.__dict__) - result["compress_weights_options"] = self.compress_weights_options.to_dict() - result["quantize_options"] = self.quantize_options.to_dict() + # TODO: prepare proper implementation + weight_quantization_config = self.weight_quantization_config + quantization_config = self.quantization_config + self.weight_quantization_config = self.weight_quantization_config.to_dict() + self.quantization_config = self.quantization_config.to_dict() + result = super().to_dict() + self.weight_quantization_config = weight_quantization_config + self.quantization_config = quantization_config return result diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 391643a0fe..f474f227d7 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -60,7 +60,7 @@ from ..utils.modeling_utils import get_model_device from .configuration import ( OVConfig, - OVGeneralQuantizationConfig, + OVMixedQuantizationConfig, OVQuantizationConfig, OVQuantizationConfigBase, OVQuantizationMethod, @@ -1201,7 +1201,7 @@ def _hybrid_quantization( def _general_quantization( model: openvino.Model, - quantization_config: OVGeneralQuantizationConfig, + quantization_config: OVMixedQuantizationConfig, calibration_dataset: nncf.Dataset, **kwargs, ) -> openvino.Model: @@ -1213,38 +1213,31 @@ def _general_quantization( Args: model (`openvino.runtime.Model`): The OpenVINO Runtime model for applying quantization. - quantization_config (`OVGeneralQuantizationConfig`): + quantization_config (`OVGeneralMixedConfig`): The configuration containing the parameters related to quantization. calibration_dataset (`nncf.Dataset`): The dataset used for quantization. Returns: The OpenVINO Runtime model with applied quantization. 
""" - quantized_model = model - - ignored_scope = quantization_config.get_ignored_scope_instance() - - if quantization_config.compress_weights_options: - ops_with_weights = _collect_ops_with_weights(model) - wc_kwargs = copy.deepcopy(kwargs) - wc_kwargs.update(quantization_config.compress_weights_options.to_nncf_dict()) - quantized_model = nncf.compress_weights( - model, - ignored_scope=ignored_scope, - dataset=calibration_dataset, - subset_size=quantization_config.num_samples, - **wc_kwargs, - ) - ignored_scope.names += ops_with_weights - - if quantization_config.quantize_options: - q_kwargs = copy.deepcopy(kwargs) - q_kwargs.update(quantization_config.quantize_options.to_nncf_dict()) - quantized_model = nncf.quantize( - model, - calibration_dataset, - subset_size=quantization_config.num_samples, - ignored_scope=ignored_scope, - **q_kwargs, - ) + + ops_with_weights = _collect_ops_with_weights(model) + wc_kwargs = copy.deepcopy(kwargs) + wc_kwargs.update(quantization_config.weight_quantization_config.to_nncf_dict()) + compressed_model = nncf.compress_weights( + model, + dataset=calibration_dataset, + subset_size=quantization_config.num_samples, + **wc_kwargs, + ) + + q_kwargs = copy.deepcopy(kwargs) + q_kwargs.update(quantization_config.quantization_config.to_nncf_dict()) + q_kwargs["ignored_scope"].names += ops_with_weights + quantized_model = nncf.quantize( + compressed_model, + calibration_dataset, + subset_size=quantization_config.num_samples, + **q_kwargs, + ) return quantized_model diff --git a/optimum/intel/utils/dummy_openvino_and_nncf_objects.py b/optimum/intel/utils/dummy_openvino_and_nncf_objects.py index e646074e1e..4b96d28589 100644 --- a/optimum/intel/utils/dummy_openvino_and_nncf_objects.py +++ b/optimum/intel/utils/dummy_openvino_and_nncf_objects.py @@ -68,3 +68,14 @@ def __init__(self, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["openvino", "nncf"]) + + +class OVMixedQuantizationConfig(metaclass=DummyObject): + _backends = ["openvino", "nncf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "nncf"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "nncf"]) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 1df43d5480..133bc5b0c0 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -61,6 +61,7 @@ OVQuantizer, OVTrainer, OVQuantizationConfig, + OVMixedQuantizationConfig, OVWeightQuantizationConfig, OVDynamicQuantizationConfig, OVModelOpenCLIPForZeroShotImageClassification, @@ -127,6 +128,18 @@ class OVQuantizerTest(unittest.TestCase): (13,), (16,), ), + ( + OVModelForCausalLM, + "llama", + OVMixedQuantizationConfig( + weight_quantization_config=OVWeightQuantizationConfig(bits=4, weight_format="nf4", group_size=16), + quantization_config=OVQuantizationConfig(activation_format="f8e4m3", smooth_quant_alpha=0.9), + dataset="wikitext2", + num_samples=1, + ), + (4,), + (14,), + ), ] @parameterized.expand(SUPPORTED_ARCHITECTURES_TORCH_MODEL) @@ -220,7 +233,10 @@ def test_ov_model_static_quantization_with_auto_dataset( self, model_cls, model_name, quantization_config, expected_fake_nodes, expected_low_precision_nodes ): model_id = MODEL_NAMES[model_name] - quant_mode = quantization_config.activation_format + if isinstance(quantization_config, OVMixedQuantizationConfig): + quant_mode = 
f"{quantization_config.weight_quantization_config.weight_format}_{quantization_config.quantization_config.activation_format}" + else: + quant_mode = quantization_config.activation_format with TemporaryDirectory() as tmp_dir: ov_model = model_cls.from_pretrained(model_id, quantization_config=quantization_config) @@ -245,7 +261,11 @@ def test_ov_model_static_quantization_with_auto_dataset( elif model_cls == OVModelForCausalLM: num_fake_nodes, num_weight_nodes = get_num_quantized_nodes(ov_model.model) self.assertEqual(expected_fake_nodes[0], num_fake_nodes) - self.assertEqual(expected_low_precision_nodes[0], num_weight_nodes[quant_mode]) + weight_types = quant_mode.split("_") + num_weights = 0 + for weight_type in weight_types: + num_weights += num_weight_nodes[weight_type] + self.assertEqual(expected_low_precision_nodes[0], num_weights) tokenizer = AutoTokenizer.from_pretrained(model_id) if tokenizer.pad_token is None: From 8be0df1c820d7bd390e5e6f882dd0d19183858e4 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 6 Feb 2025 14:17:56 +0100 Subject: [PATCH 05/20] Polishing changes --- optimum/commands/export/openvino.py | 17 +- optimum/intel/openvino/configuration.py | 299 +++++++++--------- optimum/intel/openvino/modeling_base.py | 13 +- .../intel/openvino/modeling_base_seq2seq.py | 2 +- optimum/intel/openvino/modeling_decoder.py | 2 +- optimum/intel/openvino/modeling_diffusion.py | 2 +- optimum/intel/openvino/modeling_seq2seq.py | 1 + .../openvino/modeling_visual_language.py | 2 +- optimum/intel/openvino/quantization.py | 115 ++----- tests/openvino/test_quantization.py | 27 +- 10 files changed, 207 insertions(+), 273 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index c4750604fb..9000459cc2 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -342,33 +342,33 @@ def run(self): if no_compression_parameter_provided(self.args) and self.args.weight_format == "int4": quantization_config = get_default_int4_config(self.args.model) else: - quantization_config = prepare_for_wc_config(self.args, _DEFAULT_4BIT_CONFIG) + quantization_config = prepare_wc_config(self.args, _DEFAULT_4BIT_CONFIG) if quantization_config.get("dataset", None) is not None: quantization_config["trust_remote_code"] = self.args.trust_remote_code ov_config = OVConfig(quantization_config=quantization_config) - elif self.args.quant_mode is not None: + else: if self.args.dataset is None: raise ValueError( "Dataset is required for full quantization. Please provide it with --dataset argument." 
) if self.args.quant_mode == "nf4_f8e4m3": - wc_config = prepare_for_wc_config(self.args, _DEFAULT_4BIT_CONFIG) + wc_config = prepare_wc_config(self.args, _DEFAULT_4BIT_CONFIG) wc_config["weight_format"] = "nf4" - q_config = prepare_for_q_config(self.args) + q_config = prepare_q_config(self.args) q_config["activation_format"] = "f8e4m3" quantization_config = dict( weight_quantization_config=wc_config, - quantization_config=q_config, + activation_quantization_config=q_config, num_samples=self.args.num_samples, dataset=self.args.dataset, trust_remote_code=self.args.trust_remote_code, ) else: - quantization_config = prepare_for_q_config(self.args) + quantization_config = prepare_q_config(self.args) ov_config = OVConfig(quantization_config=quantization_config) quantization_config = ov_config.quantization_config if ov_config else None @@ -462,7 +462,7 @@ def run(self): ) -def prepare_for_wc_config(args, default_configs): +def prepare_wc_config(args, default_configs): is_int8 = args.weight_format == "int8" return { "bits": 8 if is_int8 else 4, @@ -482,9 +482,8 @@ def prepare_for_wc_config(args, default_configs): } -def prepare_for_q_config(args): +def prepare_q_config(args): return { - "weight_format": args.quant_mode, "activation_format": args.quant_mode, "bits": 8, "sym": args.sym or False, diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 785d72d747..cc51790307 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import abc import copy import inspect import json @@ -295,9 +296,40 @@ def post_init(self): if not (self.num_samples is None or isinstance(self.num_samples, int) and self.num_samples > 0): raise ValueError(f"`num_samples` is expected to be a positive integer, but found: {self.num_samples}") + def clone(self): + return copy.deepcopy(self) + + +class _OVQuantizationConfigWithIgnoredScope(abc.ABC): + def __init__(self, ignored_scope: Optional[Union[dict, "nncf.IgnoredScope"]] = None): + """ + Base class for configs with ignored scope. + + Args: + ignored_scope (`dict`, *optional*): + An ignored scope that defines the list of model nodes to be ignored during quantization. Dictionary + entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class. + """ + if isinstance(ignored_scope, nncf.IgnoredScope): + ignored_scope = ignored_scope.__dict__ + self.ignored_scope = ignored_scope + + def post_init(self): + try: + self.get_ignored_scope_instance() + except Exception as e: + raise ValueError( + f"Can't create an `IgnoredScope` object from the provided ignored scope dict: {self.ignored_scope}.\n{e}" + ) + + def get_ignored_scope_instance(self) -> "nncf.IgnoredScope": + if self.ignored_scope is None: + return nncf.IgnoredScope() + return nncf.IgnoredScope(**copy.deepcopy(self.ignored_scope)) + @dataclass -class OVWeightQuantizationConfig(OVQuantizationConfigBase): +class OVWeightQuantizationConfig(OVQuantizationConfigBase, _OVQuantizationConfigWithIgnoredScope): """ This is a wrapper class about all possible attributes and features that you can play with a model that has been loaded using `optimum-intel` api for weight-only quantization with NNCF. 
For full model quantization please see @@ -393,7 +425,7 @@ def __init__( ratio: float = 1.0, all_layers: Optional[bool] = None, sensitivity_metric: Optional[str] = None, - ignored_scope: Optional[dict] = None, + ignored_scope: Optional[Union[dict, "nncf.IgnoredScope"]] = None, num_samples: Optional[int] = None, quant_method: Union[str, OVQuantizationMethod] = OVQuantizationMethod.DEFAULT, scale_estimation: bool = None, @@ -404,13 +436,15 @@ def __init__( backup_precision: Optional[str] = None, **kwargs, ): - super().__init__( + OVQuantizationConfigBase.__init__( + self, num_samples=num_samples, dataset=dataset, tokenizer=tokenizer, processor=processor, trust_remote_code=trust_remote_code, ) + _OVQuantizationConfigWithIgnoredScope.__init__(self, ignored_scope) self.bits = bits self.sym = sym self.group_size = group_size or (-1 if bits == 8 else 128) @@ -422,18 +456,15 @@ def __init__( self.gptq = gptq self.lora_correction = lora_correction self.backup_precision = backup_precision - if isinstance(ignored_scope, nncf.IgnoredScope): - ignored_scope = ignored_scope.__dict__ - self.ignored_scope = ignored_scope self.weight_format = weight_format - self._nncf_dict = None self.post_init() def post_init(self): r""" Safety checker that arguments are correct """ - super().post_init() + OVQuantizationConfigBase.post_init(self) + _OVQuantizationConfigWithIgnoredScope.post_init(self) if not (0 <= self.ratio <= 1): raise ValueError("`ratio` must between 0 and 1.") if self.group_size is not None and self.group_size != -1 and self.group_size <= 0: @@ -465,13 +496,6 @@ def post_init(self): "quantization algorithm is selected and compression ratio is 1.0." ) - try: - self.get_ignored_scope_instance() - except Exception as e: - raise ValueError( - f"Can't create an `IgnoredScope` object from the provided ignored scope dict: {self.ignored_scope}.\n{e}" - ) - if self.bits not in [4, 8]: raise ValueError(f"Only support quantization to [4,8] bits but found {self.bits}") @@ -544,51 +568,39 @@ def post_init(self): if self.gptq and self.lora_correction: raise ValueError("The GPTQ and LoRA Correction algorithms can't be applied simultaneously") - def get_ignored_scope_instance(self) -> "nncf.IgnoredScope": - if self.ignored_scope is None: - return nncf.IgnoredScope() - return nncf.IgnoredScope(**copy.deepcopy(self.ignored_scope)) - def to_nncf_dict(self) -> Dict[str, Any]: """ Returns a dictionary with the NNCF-friendly variables that are ready to use. 
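# Illustrative sketch, not part of the patch: the dictionary returned by to_nncf_dict() is meant
# to be splatted straight into the corresponding NNCF call, mirroring _weight_only_quantization
# elsewhere in this series. Assumes nncf is installed and `ov_model` is a loaded openvino.Model.
import nncf

wc_config = OVWeightQuantizationConfig(bits=4, sym=True)
compressed_model = nncf.compress_weights(ov_model, **wc_config.to_nncf_dict())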
""" - if self._nncf_dict: - return self._nncf_dict - - if is_nncf_available(): - signed_bitness = { - 4: "int4", - 8: "int8", - } - mode = self.weight_format if self.weight_format else signed_bitness[self.bits] - if mode in signed_bitness.values(): - mode += "_sym" if self.sym else "_asym" - if isinstance(self.quant_method, str): - awq = self.quant_method == "awq" - else: - awq = self.quant_method == OVQuantizationMethod.AWQ - - mode = nncf.CompressWeightsMode(mode) - sensitivity_metric = nncf.SensitivityMetric(self.sensitivity_metric) if self.sensitivity_metric else None - backup_mode = nncf.BackupMode(self.backup_precision) if self.backup_precision else None - self._nncf_dict = { - "mode": mode, - "ratio": self.ratio, - "group_size": self.group_size, - "ignored_scope": self.get_ignored_scope_instance(), - "all_layers": self.all_layers, - "sensitivity_metric": sensitivity_metric, - "awq": awq, - "scale_estimation": self.scale_estimation, - "gptq": self.gptq, - "lora_correction": self.lora_correction, - "backup_mode": backup_mode, - } - return self._nncf_dict - - raise ImportError("NNCF is required to execute this method. Please install nncf first.") + if not is_nncf_available(): + raise ImportError("NNCF is required to execute this method. Please install nncf first.") + + signed_bitness = {4: "int4", 8: "int8"} + mode = self.weight_format if self.weight_format else signed_bitness[self.bits] + if mode in signed_bitness.values(): + mode += "_sym" if self.sym else "_asym" + mode = nncf.CompressWeightsMode(mode) + + awq = self.quant_method == ("awq" if isinstance(self.quant_method, str) else OVQuantizationMethod.AWQ) + sensitivity_metric = nncf.SensitivityMetric(self.sensitivity_metric) if self.sensitivity_metric else None + backup_mode = nncf.BackupMode(self.backup_precision) if self.backup_precision else None + result = { + "mode": mode, + "ratio": self.ratio, + "group_size": self.group_size, + "ignored_scope": self.get_ignored_scope_instance(), + "all_layers": self.all_layers, + "sensitivity_metric": sensitivity_metric, + "awq": awq, + "scale_estimation": self.scale_estimation, + "gptq": self.gptq, + "lora_correction": self.lora_correction, + "backup_mode": backup_mode, + } + if self.num_samples is not None: + result["subset_size"] = self.num_samples + return result @dataclass @@ -606,12 +618,12 @@ def __init__( @dataclass -class OVQuantizationConfig(OVQuantizationConfigBase): +class OVQuantizationConfig(OVQuantizationConfigBase, _OVQuantizationConfigWithIgnoredScope): def __init__( self, bits: int = 8, sym: bool = False, - ignored_scope: Optional[dict] = None, + ignored_scope: Optional[Union[dict, "nncf.IgnoredScope"]] = None, num_samples: Optional[int] = 300, model_type: str = "transformer", fast_bias_correction: bool = True, @@ -669,13 +681,15 @@ def __init__( activation_format (`str`, defaults to "int8"): Data format activations are compressed to. Possible values: ['int8', 'f8e4m3', 'f8e5m2']. 
""" - super().__init__( + OVQuantizationConfigBase.__init__( + self, num_samples=num_samples, dataset=dataset, tokenizer=tokenizer, processor=processor, trust_remote_code=trust_remote_code, ) + _OVQuantizationConfigWithIgnoredScope.__init__(self, ignored_scope) self.bits = bits self.sym = sym self.model_type = model_type @@ -683,22 +697,19 @@ def __init__( self.overflow_fix = overflow_fix self.smooth_quant_alpha = smooth_quant_alpha self.activation_format = activation_format - if isinstance(ignored_scope, nncf.IgnoredScope): - ignored_scope = ignored_scope.__dict__ - self.ignored_scope = ignored_scope f8_formats = ["f8e4m3", "f8e5m2"] if self.activation_format in f8_formats: logger.info(f"{self.activation_format} for activations was found. A symmetrical scheme will be used.") self.sym = True - self._nncf_dict = None self.post_init() def post_init(self): r""" Safety checker that arguments are correct """ - super().post_init() + OVQuantizationConfigBase.post_init(self) + _OVQuantizationConfigWithIgnoredScope.post_init(self) if self.dataset is not None: speech_to_text_datasets = list(PREDEFINED_SPEECH_TO_TEXT_DATASETS.keys()) @@ -716,57 +727,41 @@ def post_init(self): f"SmoothQuant alpha parameter must be in range [0, 1], but found {self.smooth_quant_alpha}" ) - def get_ignored_scope_instance(self) -> "nncf.IgnoredScope": - if self.ignored_scope is None: - return nncf.IgnoredScope() - return nncf.IgnoredScope(**copy.deepcopy(self.ignored_scope)) - def to_nncf_dict(self) -> Dict[str, Any]: """ Returns a dictionary with the NNCF-friendly variables that are ready to use. """ - if self._nncf_dict: - return self._nncf_dict - - if is_nncf_available(): - preset = "performance" if self.sym else "mixed" - advanced_parameters_dict = {"overflow_fix": self.overflow_fix} - if self.smooth_quant_alpha: - advanced_parameters_dict["smooth_quant_alphas"] = {"matmul": self.smooth_quant_alpha} - - mode = None - if self.activation_format: - mode_map = { - "int8": "int8", - "f8e4m3": "fp8_e4m3", - "f8e5m2": "fp8_e5m2", - } - mode = mode_map[self.activation_format] - if mode == "int8": - mode += "_sym" if self.sym else "_asym" - preset = "performance" - - preset = nncf.QuantizationPreset(preset) - model_type = nncf.ModelType(self.model_type) if self.model_type else None - advanced_parameters = nncf.AdvancedQuantizationParameters( - overflow_fix=advanced_parameters_dict["overflow_fix"], - ) - if "smooth_quant_alphas" in advanced_parameters_dict: - advanced_parameters.smooth_quant_alphas = nncf.AdvancedSmoothQuantParameters( - **advanced_parameters_dict["smooth_quant_alphas"] - ) - self._nncf_dict = { - "mode": mode, - "preset": preset, - "fast_bias_correction": self.fast_bias_correction, - "model_type": model_type, - "ignored_scope": self.get_ignored_scope_instance(), - "advanced_parameters": advanced_parameters, - } - return self._nncf_dict + if not is_nncf_available(): + raise ImportError("NNCF is required to execute this method. Please install nncf first.") - raise ImportError("NNCF is required to execute this method. 
Please install nncf first.") + preset = "performance" if self.sym else "mixed" + advanced_parameters_dict = {"overflow_fix": self.overflow_fix} + if self.smooth_quant_alpha: + advanced_parameters_dict["smooth_quant_alphas"] = {"matmul": self.smooth_quant_alpha} + + mode_map = {"f8e4m3": "fp8_e4m3", "f8e5m2": "fp8_e5m2"} + mode = mode_map.get(self.activation_format) + + preset = nncf.QuantizationPreset(preset) + model_type = nncf.ModelType(self.model_type) + advanced_parameters = nncf.AdvancedQuantizationParameters( + overflow_fix=advanced_parameters_dict["overflow_fix"], + ) + if "smooth_quant_alphas" in advanced_parameters_dict: + advanced_parameters.smooth_quant_alphas = nncf.AdvancedSmoothQuantParameters( + **advanced_parameters_dict["smooth_quant_alphas"] + ) + + return { + "mode": mode, + "preset": preset, + "subset_size": self.num_samples, + "fast_bias_correction": self.fast_bias_correction, + "model_type": model_type, + "ignored_scope": self.get_ignored_scope_instance(), + "advanced_parameters": advanced_parameters, + } class OVConfig(BaseConfig): @@ -786,12 +781,24 @@ def __init__( self.save_onnx_model = save_onnx_model self.optimum_version = kwargs.pop("optimum_version", None) if isinstance(quantization_config, dict): - quantization_config = self._quantization_config_from_dict(quantization_config) + quantization_config = self.quantization_config_from_dict(quantization_config) self.quantization_config = quantization_config self.compression = kwargs.get( "compression", None ) # A field for backward-compatability of training-time compression parameters - self.dtype = dtype + if self.quantization_config is not None: + if isinstance(self.quantization_config, OVWeightQuantizationConfig): + self.dtype = self.quantization_config.weight_format + elif isinstance(self.quantization_config, OVQuantizationConfig): + self.dtype = self.quantization_config.activation_format + elif isinstance(self.quantization_config, OVMixedQuantizationConfig): + weight_format = self.quantization_config.weight_quantization_config.weight_format + activation_format = self.quantization_config.activation_quantization_config.activation_format + self.dtype = f"{weight_format}_{activation_format}" + else: + raise ValueError(f"Unsupported type of quantization config: {type(self.quantization_config)}") + else: + self.dtype = dtype def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False): self.input_info = [ @@ -804,8 +811,11 @@ def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False): ] @staticmethod - def _quantization_config_from_dict(quantization_config: dict) -> OVQuantizationConfigBase: - if "weight_quantization_config" in quantization_config and "quantization_config" in quantization_config: + def quantization_config_from_dict(quantization_config: dict) -> OVQuantizationConfigBase: + if ( + "weight_quantization_config" in quantization_config + and "activation_quantization_config" in quantization_config + ): return OVMixedQuantizationConfig.from_dict(quantization_config) wq_args = inspect.getfullargspec(OVWeightQuantizationConfig.__init__).args q_args = inspect.getfullargspec(OVQuantizationConfig.__init__).args @@ -854,7 +864,7 @@ class OVMixedQuantizationConfig(OVQuantizationConfigBase): def __init__( self, weight_quantization_config: Union[OVWeightQuantizationConfig, dict], - quantization_config: Union[OVQuantizationConfig, dict], + activation_quantization_config: Union[OVQuantizationConfig, dict], num_samples: Optional[int] = None, dataset: Optional[Union[str, List[str]]] = 
None, tokenizer: Optional[str] = None, @@ -862,19 +872,22 @@ def __init__( trust_remote_code: bool = False, **kwargs, ): - """ - Class containing general options for the NNCF-based quantization. - Args: - ignored_scope (`dict`, *optional*): - An ignored scope that defines the list of model nodes to be ignored during quantization. Dictionary - entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class. - num_samples (`int`, *optional*): - The maximum number of samples composing the calibration dataset. - compress_weights_options (`OVCompressWeightsOptions`, *optional*): - See OVCompressWeightsOptions instance. - quantize_options (`OVQuantizeOptions`, *optional*): - See OVQuantizeOptions instance. - """ + if isinstance(weight_quantization_config, dict): + weight_quantization_config = OVWeightQuantizationConfig.from_dict(weight_quantization_config) + self.weight_quantization_config = weight_quantization_config + + if isinstance(activation_quantization_config, dict): + activation_quantization_config = OVQuantizationConfig.from_dict(activation_quantization_config) + self.activation_quantization_config = activation_quantization_config + + # Pull dataset-related parameters from child configs. This is not the intended use case, but we process it just + # in case user sets those parameters inside child configs only. + wqc, aqc = self.weight_quantization_config, self.activation_quantization_config + num_samples = num_samples or self.wqc.num_samples or self.aqc.num_samples + dataset = dataset or self.wqc.dataset or self.aqc.dataset + tokenizer = tokenizer or self.wqc.tokenizer or self.aqc.tokenizer + processor = processor or self.wqc.processor or self.aqc.processor + trust_remote_code = trust_remote_code or self.wqc.trust_remote_code or self.aqc.trust_remote_code super().__init__( num_samples=num_samples, dataset=dataset, @@ -882,41 +895,11 @@ def __init__( processor=processor, trust_remote_code=trust_remote_code, ) - if isinstance(weight_quantization_config, dict): - weight_quantization_config = copy.deepcopy(weight_quantization_config) - base_config = { - "num_samples": num_samples, - "dataset": dataset, - "tokenizer": tokenizer, - "processor": processor, - "trust_remote_code": trust_remote_code, - } - base_config.update(weight_quantization_config) - weight_quantization_config = OVWeightQuantizationConfig(**base_config) - self.weight_quantization_config = weight_quantization_config - - if isinstance(quantization_config, dict): - quantization_config = copy.deepcopy(quantization_config) - base_config = { - "num_samples": num_samples, - "dataset": dataset, - "tokenizer": tokenizer, - "processor": processor, - "trust_remote_code": trust_remote_code, - } - base_config.update(quantization_config) - quantization_config = OVQuantizationConfig(**base_config) - self.quantization_config = quantization_config self.post_init() def to_dict(self): - # TODO: prepare proper implementation - weight_quantization_config = self.weight_quantization_config - quantization_config = self.quantization_config - self.weight_quantization_config = self.weight_quantization_config.to_dict() - self.quantization_config = self.quantization_config.to_dict() result = super().to_dict() - self.weight_quantization_config = weight_quantization_config - self.quantization_config = quantization_config + result["weight_quantization_config"] = self.weight_quantization_config.to_dict() + result["activation_quantization_config"] = self.activation_quantization_config.to_dict() return result diff --git 
a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 99422f1a54..69fdcdaaef 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -250,6 +250,13 @@ def fix_op_names_duplicates(model: openvino.runtime.Model): from optimum.intel.openvino.quantization import _weight_only_quantization + if not isinstance(quantization_config, dict) and not isinstance( + quantization_config, OVWeightQuantizationConfig + ): + raise RuntimeError( + "Expected quantization_config to be a dictionary or OVWeightQuantizationConfig object." + ) + model = _weight_only_quantization(model, quantization_config) return model @@ -378,7 +385,7 @@ def _from_pretrained( compile_only = kwargs.get("compile_only", False) - quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) + quantization_config = cls._prepare_quantization_config(quantization_config, load_in_8bit) model = None if not compile_only: @@ -481,14 +488,14 @@ def from_pretrained( ) @staticmethod - def _prepare_weight_quantization_config( + def _prepare_quantization_config( quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None, load_in_8bit: bool = False ): # Give default quantization config if not provided and load_in_8bit=True if not quantization_config and load_in_8bit: quantization_config = OVWeightQuantizationConfig(bits=8) elif isinstance(quantization_config, dict): - quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config) + quantization_config = OVConfig.quantization_config_from_dict(quantization_config) return quantization_config diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py index 11ee8f89a7..20185c3c87 100644 --- a/optimum/intel/openvino/modeling_base_seq2seq.py +++ b/optimum/intel/openvino/modeling_base_seq2seq.py @@ -196,7 +196,7 @@ def _from_pretrained( decoder_with_past_file_name = decoder_with_past_file_name or default_decoder_with_past_file_name decoder_with_past = None - quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) + quantization_config = cls._prepare_quantization_config(quantization_config, load_in_8bit) compile_only = kwargs.get("compile_only", False) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 4897db1459..adc9b968f2 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -840,7 +840,7 @@ def _from_pretrained( if quantization_config.get("dataset", None) is not None: quantization_config["trust_remote_code"] = kwargs.get("trust_remote_code", False) - quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) + quantization_config = cls._prepare_quantization_config(quantization_config, load_in_8bit) enable_compilation = kwargs.pop("compile", True) and not quantization_config diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index bc2f75e0ce..8f279c2e85 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -474,7 +474,7 @@ def _from_pretrained( kwargs[config_key] = value compile_only = kwargs.get("compile_only", False) - quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) + quantization_config = cls._prepare_quantization_config(quantization_config, load_in_8bit) if 
(quantization_config is None or quantization_config.dataset is None) and not compile_only: for name, path in models.items(): if name in kwargs: diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index 983f1f6850..c15003523d 100644 --- a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -1029,6 +1029,7 @@ def _from_pretrained( ): compile_only = kwargs.get("compile_only", False) + quantization_config = cls._prepare_quantization_config(quantization_config, load_in_8bit) if not compile_only and isinstance(quantization_config, OVQuantizationConfig): model = super(OVModelForSpeechSeq2Seq, cls)._from_pretrained( model_id, config, load_in_8bit=False, **kwargs diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 1c0e35cca2..8f1c32d4ac 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -545,7 +545,7 @@ def _from_pretrained( except Exception: pass - quantization_config = model_cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) + quantization_config = model_cls._prepare_quantization_config(quantization_config, load_in_8bit) to_quantize = not compile_only and quantization_config is not None if to_quantize: kwargs["compile"] = False diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index f474f227d7..162b90ed06 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -30,7 +30,6 @@ import torch import transformers from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE -from nncf import CompressWeightsMode, SensitivityMetric from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters, OverflowFix from nncf.torch import register_module from nncf.torch.initialization import PTInitializingDataLoader @@ -395,7 +394,7 @@ def _quantize_ovbasemodel( raise ValueError("Calibration dataset is required to run hybrid quantization.") if is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline): # Apply weight-only quantization to all SD submodels except UNet - quantization_config_copy = copy.deepcopy(quantization_config) + quantization_config_copy = quantization_config.clone() quantization_config_copy.dataset = None quantization_config_copy.quant_method = OVQuantizationMethod.DEFAULT sub_model_names = [ @@ -453,9 +452,6 @@ def _quantize_ovbasemodel( _weight_only_quantization(self.model.model, quantization_config, calibration_dataset, **kwargs) self.model.request = None elif isinstance(quantization_config, OVQuantizationConfig): - if not isinstance(quantization_config, OVQuantizationConfig): - raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}") - if calibration_dataset is None: raise ValueError("Calibration dataset is required to run quantization.") @@ -468,15 +464,15 @@ def _quantize_ovbasemodel( ) self.model.model = quantized_model self.model.request = None - else: + elif isinstance(quantization_config, OVMixedQuantizationConfig): if calibration_dataset is None: raise ValueError("Calibration dataset is required to run quantization.") - quantized_model = _general_quantization( - self.model.model, quantization_config, calibration_dataset, **kwargs - ) + quantized_model = _mixed_quantization(self.model.model, quantization_config, calibration_dataset, **kwargs) self.model.model = quantized_model 
self.model.request = None + else: + raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}") if save_directory is not None: self.model.save_pretrained(save_directory) @@ -983,7 +979,7 @@ def transform_fn(data_item): def _quantize_whisper_model(self, quantization_config, calibration_dataset, **kwargs): # Quantize encoder model # quantization_config.num_samples of audio samples result in more actual model inputs - config = copy.deepcopy(quantization_config) + config = quantization_config.clone() config.num_samples = calibration_dataset[0].get_length() quantized_encoder_model = _full_quantization( self.model.encoder_model, config, calibration_dataset[0], **kwargs @@ -993,7 +989,7 @@ def _quantize_whisper_model(self, quantization_config, calibration_dataset, **kw self.model.encoder.request = None # Quantize decoder model - config = copy.deepcopy(quantization_config) + config = quantization_config.clone() config.num_samples = calibration_dataset[1].get_length() quantized_decoder_model = _full_quantization( self.model.decoder_model, config, calibration_dataset[1], **kwargs @@ -1004,7 +1000,7 @@ def _quantize_whisper_model(self, quantization_config, calibration_dataset, **kw if self.model.decoder_with_past_model is not None: # Quantize decoder with past model - config = copy.deepcopy(quantization_config) + config = quantization_config.clone() config.num_samples = calibration_dataset[2].get_length() quantized_decoder_w_p_model = _full_quantization( self.model.decoder_with_past_model, config, calibration_dataset[2], **kwargs @@ -1037,36 +1033,12 @@ def _weight_only_quantization( else: dataset = nncf.Dataset(calibration_dataset) - sensitivity_metric = None - if isinstance(config.sensitivity_metric, str): - sensitivity_metric = getattr(SensitivityMetric, config.sensitivity_metric.upper()) - - if config.weight_format == "mxfp4": - mode = CompressWeightsMode.E2M1 - elif config.weight_format == "nf4": - mode = CompressWeightsMode.NF4 - else: - if config.bits == 8: - mode = CompressWeightsMode.INT8_SYM if config.sym else CompressWeightsMode.INT8_ASYM - else: - mode = CompressWeightsMode.INT4_SYM if config.sym else CompressWeightsMode.INT4_ASYM - + wc_kwargs = copy.deepcopy(kwargs) + wc_kwargs.update(config.to_nncf_dict()) return nncf.compress_weights( model, - mode=mode, - ratio=config.ratio, - group_size=config.group_size, - all_layers=config.all_layers, - sensitivity_metric=sensitivity_metric, - awq=getattr(config.quant_method, "name", "") == "AWQ" or None, - ignored_scope=config.get_ignored_scope_instance(), dataset=dataset, - subset_size=config.num_samples if config.num_samples else 128, - scale_estimation=config.scale_estimation, - gptq=config.gptq, - lora_correction=config.lora_correction, - backup_mode=None if config.backup_precision is None else nncf.BackupMode(config.backup_precision), - **kwargs, + **wc_kwargs, ) @@ -1076,37 +1048,14 @@ def _full_quantization( calibration_dataset: nncf.Dataset, **kwargs, ): - advanced_parameters_kwargs = {} - if quantization_config.smooth_quant_alpha is not None: - advanced_parameters_kwargs["smooth_quant_alphas"] = AdvancedSmoothQuantParameters( - matmul=quantization_config.smooth_quant_alpha - ) - - q_mode_map = { - "f8e4m3": nncf.QuantizationMode.FP8_E4M3, - "f8e5m2": nncf.QuantizationMode.FP8_E5M2, - } - - if quantization_config.activation_format in q_mode_map: - kwargs.update({"mode": q_mode_map[quantization_config.activation_format]}) - - quantized_model = nncf.quantize( + q_kwargs = copy.deepcopy(kwargs) + 
q_kwargs.update(quantization_config.to_nncf_dict()) + return nncf.quantize( model, - calibration_dataset, - subset_size=quantization_config.num_samples, - ignored_scope=quantization_config.get_ignored_scope_instance(), - model_type=nncf.ModelType(quantization_config.model_type), - preset=nncf.QuantizationPreset.PERFORMANCE if quantization_config.sym else nncf.QuantizationPreset.MIXED, - fast_bias_correction=quantization_config.fast_bias_correction, - advanced_parameters=nncf.AdvancedQuantizationParameters( - overflow_fix=OverflowFix(quantization_config.overflow_fix), - **advanced_parameters_kwargs, - ), - **kwargs, + calibration_dataset=calibration_dataset, + **q_kwargs, ) - return quantized_model - def _get_operation_const_op(operation, const_port_id: int): node = operation.input_value(const_port_id).get_node() @@ -1173,7 +1122,7 @@ def _hybrid_quantization( """ ops_to_compress = _collect_ops_with_weights(model) - wc_config = copy.deepcopy(quantization_config) + wc_config = quantization_config.clone() wc_config.ignored_scope = wc_config.ignored_scope or {} wc_ignored_types = ["Convolution"] if any(op.get_type_name() == "Convolution" for op in model.get_ops()) else [] @@ -1199,21 +1148,21 @@ def _hybrid_quantization( return quantized_model -def _general_quantization( +def _mixed_quantization( model: openvino.Model, quantization_config: OVMixedQuantizationConfig, calibration_dataset: nncf.Dataset, **kwargs, ) -> openvino.Model: """ - Quantize a model with NNCF in two possible steps: + Quantize a model with NNCF in two steps: - weights-only quantization with nncf.compress_weights method. - full quantization (excluding weights from previous step) with nncf.quantize method. Args: model (`openvino.runtime.Model`): The OpenVINO Runtime model for applying quantization. - quantization_config (`OVGeneralMixedConfig`): + quantization_config (`OVMixedQuantizationConfig`): The configuration containing the parameters related to quantization. calibration_dataset (`nncf.Dataset`): The dataset used for quantization. 
@@ -1222,22 +1171,16 @@ def _general_quantization( """ ops_with_weights = _collect_ops_with_weights(model) - wc_kwargs = copy.deepcopy(kwargs) - wc_kwargs.update(quantization_config.weight_quantization_config.to_nncf_dict()) - compressed_model = nncf.compress_weights( - model, - dataset=calibration_dataset, - subset_size=quantization_config.num_samples, - **wc_kwargs, + compressed_model = _weight_only_quantization( + model, quantization_config.weight_quantization_config, calibration_dataset, **kwargs ) - q_kwargs = copy.deepcopy(kwargs) - q_kwargs.update(quantization_config.quantization_config.to_nncf_dict()) - q_kwargs["ignored_scope"].names += ops_with_weights - quantized_model = nncf.quantize( - compressed_model, - calibration_dataset, - subset_size=quantization_config.num_samples, - **q_kwargs, + activation_quantization_config = quantization_config.activation_quantization_config.clone() + if activation_quantization_config.ignored_scope is None: + activation_quantization_config.ignored_scope = {} + ignored_names = activation_quantization_config.ignored_scope.get("names", []) + ops_with_weights + activation_quantization_config.ignored_scope["names"] = ignored_names + quantized_model = _full_quantization( + compressed_model, activation_quantization_config, calibration_dataset, **kwargs ) return quantized_model diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 133bc5b0c0..04da199e56 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -104,12 +104,11 @@ class OVQuantizerTest(unittest.TestCase): ( OVModelForSpeechSeq2Seq, "whisper", - OVQuantizationConfig( + dict( dataset="librispeech", num_samples=1, processor=MODEL_NAMES["whisper"], trust_remote_code=True, - weight_only=False, smooth_quant_alpha=0.95, ), (14, 22, 21) if is_transformers_version("<=", "4.42.4") else (14, 22, 25), @@ -118,11 +117,9 @@ class OVQuantizerTest(unittest.TestCase): ( OVModelForCausalLM, "llama", - OVQuantizationConfig( + dict( dataset="wikitext2", num_samples=1, - weight_only=False, - weight_format="f8e4m3", activation_format="f8e4m3", ), (13,), @@ -131,9 +128,9 @@ class OVQuantizerTest(unittest.TestCase): ( OVModelForCausalLM, "llama", - OVMixedQuantizationConfig( - weight_quantization_config=OVWeightQuantizationConfig(bits=4, weight_format="nf4", group_size=16), - quantization_config=OVQuantizationConfig(activation_format="f8e4m3", smooth_quant_alpha=0.9), + dict( + weight_quantization_config=dict(bits=4, weight_format="nf4", group_size=16), + activation_quantization_config=dict(activation_format="f8e4m3"), dataset="wikitext2", num_samples=1, ), @@ -233,15 +230,19 @@ def test_ov_model_static_quantization_with_auto_dataset( self, model_cls, model_name, quantization_config, expected_fake_nodes, expected_low_precision_nodes ): model_id = MODEL_NAMES[model_name] - if isinstance(quantization_config, OVMixedQuantizationConfig): - quant_mode = f"{quantization_config.weight_quantization_config.weight_format}_{quantization_config.quantization_config.activation_format}" - else: - quant_mode = quantization_config.activation_format with TemporaryDirectory() as tmp_dir: ov_model = model_cls.from_pretrained(model_id, quantization_config=quantization_config) ov_model.save_pretrained(tmp_dir) + # Convert dict config to class through OVConfig + if isinstance(quantization_config, dict): + quantization_config = OVConfig.quantization_config_from_dict(quantization_config) + if isinstance(quantization_config, OVMixedQuantizationConfig): + quant_mode = 
f"{quantization_config.weight_quantization_config.weight_format}_{quantization_config.activation_quantization_config.activation_format}" + else: + quant_mode = quantization_config.activation_format + if model_cls == OVModelForSpeechSeq2Seq: models = [ov_model.encoder.model, ov_model.decoder.model] @@ -1274,7 +1275,7 @@ def test_config_from_dict(self, quantization_config: dict, config_type: type, wa @parameterized.expand(DEFAULT_CONFIGURATIONS) def test_named_default_configurations(self, config_id: str): custom_configuration = self.DEFAULT_CONFIGURATIONS[config_id] - prepared_config = OVModelForCausalLM._prepare_weight_quantization_config(custom_configuration) + prepared_config = OVModelForCausalLM._prepare_quantization_config(custom_configuration) for field_name, reference_value in custom_configuration.items(): value = prepared_config.__getattribute__(field_name) self.assertEqual(value, reference_value) From 38d944ca24885e0e8cb5a56650f8898de496a21d Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 6 Feb 2025 14:36:42 +0100 Subject: [PATCH 06/20] Style --- optimum/commands/export/openvino.py | 14 +++++++------- optimum/intel/openvino/configuration.py | 10 +++++----- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 3b69814b38..5db0f03a60 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -366,13 +366,13 @@ def run(self): q_config = prepare_q_config(self.args) q_config["activation_format"] = "f8e4m3" - quantization_config = dict( - weight_quantization_config=wc_config, - activation_quantization_config=q_config, - num_samples=self.args.num_samples, - dataset=self.args.dataset, - trust_remote_code=self.args.trust_remote_code, - ) + quantization_config = { + "weight_quantization_config": wc_config, + "activation_quantization_config": q_config, + "num_samples": self.args.num_samples, + "dataset": self.args.dataset, + "trust_remote_code": self.args.trust_remote_code, + } else: quantization_config = prepare_q_config(self.args) ov_config = OVConfig(quantization_config=quantization_config) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 197ce3b2ab..c485246e01 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -890,11 +890,11 @@ def __init__( # Pull dataset-related parameters from child configs. This is not the intended use case, but we process it just # in case user sets those parameters inside child configs only. 
wqc, aqc = self.weight_quantization_config, self.activation_quantization_config - num_samples = num_samples or self.wqc.num_samples or self.aqc.num_samples - dataset = dataset or self.wqc.dataset or self.aqc.dataset - tokenizer = tokenizer or self.wqc.tokenizer or self.aqc.tokenizer - processor = processor or self.wqc.processor or self.aqc.processor - trust_remote_code = trust_remote_code or self.wqc.trust_remote_code or self.aqc.trust_remote_code + num_samples = num_samples or wqc.num_samples or aqc.num_samples + dataset = dataset or wqc.dataset or aqc.dataset + tokenizer = tokenizer or wqc.tokenizer or aqc.tokenizer + processor = processor or wqc.processor or aqc.processor + trust_remote_code = trust_remote_code or wqc.trust_remote_code or aqc.trust_remote_code super().__init__( num_samples=num_samples, dataset=dataset, From 1336d47f20dbf047586947b5f8ca03d217184f3a Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 6 Feb 2025 15:58:13 +0100 Subject: [PATCH 07/20] Fix --- optimum/intel/openvino/configuration.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index c485246e01..f9d4c1a153 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -580,9 +580,13 @@ def to_nncf_dict(self) -> Dict[str, Any]: mode = self.weight_format if self.weight_format else signed_bitness[self.bits] if mode in signed_bitness.values(): mode += "_sym" if self.sym else "_asym" + if mode == "mxfp4": + mode = "e2m1" mode = nncf.CompressWeightsMode(mode) - awq = self.quant_method == ("awq" if isinstance(self.quant_method, str) else OVQuantizationMethod.AWQ) + awq = None + if self.quant_method == "awq" or self.quant_method == OVQuantizationMethod.AWQ: + awq = True sensitivity_metric = nncf.SensitivityMetric(self.sensitivity_metric) if self.sensitivity_metric else None backup_mode = nncf.BackupMode(self.backup_precision) if self.backup_precision else None result = { From ee65304003d6da9c7d4a178cad800f0d9a7dca6a Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 6 Feb 2025 17:21:12 +0100 Subject: [PATCH 08/20] Hybrid quantization as mixed quantization --- optimum/intel/openvino/configuration.py | 11 +++-- optimum/intel/openvino/quantization.py | 61 +++++++++++-------------- 2 files changed, 33 insertions(+), 39 deletions(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index f9d4c1a153..a7473fba98 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -596,14 +596,13 @@ def to_nncf_dict(self) -> Dict[str, Any]: "ignored_scope": self.get_ignored_scope_instance(), "all_layers": self.all_layers, "sensitivity_metric": sensitivity_metric, + "subset_size": self.num_samples or 128, "awq": awq, "scale_estimation": self.scale_estimation, "gptq": self.gptq, "lora_correction": self.lora_correction, "backup_mode": backup_mode, } - if self.num_samples is not None: - result["subset_size"] = self.num_samples return result @@ -733,9 +732,11 @@ def post_init(self): if self.bits != 8: raise ValueError(f"Only support 8-bit for static quantization but found {self.bits}") - if self.smooth_quant_alpha is not None and not (0 <= self.smooth_quant_alpha <= 1): + if self.smooth_quant_alpha is not None and ( + self.smooth_quant_alpha != -1 and not (0 <= self.smooth_quant_alpha <= 1) + ): raise ValueError( - f"SmoothQuant alpha parameter must be in range [0, 1], but found 
{self.smooth_quant_alpha}" + f"SmoothQuant alpha parameter can equal -1 or be in range [0, 1], but found {self.smooth_quant_alpha}" ) def to_nncf_dict(self) -> Dict[str, Any]: @@ -894,7 +895,7 @@ def __init__( # Pull dataset-related parameters from child configs. This is not the intended use case, but we process it just # in case user sets those parameters inside child configs only. wqc, aqc = self.weight_quantization_config, self.activation_quantization_config - num_samples = num_samples or wqc.num_samples or aqc.num_samples + num_samples = max(num_samples or 0, max(wqc.num_samples, aqc.num_samples)) dataset = dataset or wqc.dataset or aqc.dataset tokenizer = tokenizer or wqc.tokenizer or aqc.tokenizer processor = processor or wqc.processor or aqc.processor diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 041269ca37..d02d8b1828 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -30,7 +30,7 @@ import torch import transformers from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE -from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters, OverflowFix +from nncf.quantization.advanced_parameters import OverflowFix from nncf.torch import register_module from nncf.torch.initialization import PTInitializingDataLoader from openvino._offline_transformations import compress_quantize_weights_transformation @@ -1056,9 +1056,11 @@ def _full_quantization( model: openvino.runtime.Model, quantization_config: OVQuantizationConfig, calibration_dataset: nncf.Dataset, + verify_not_optimized: bool = True, **kwargs, ): - _verify_not_optimized(model) + if verify_not_optimized: + _verify_not_optimized(model) q_kwargs = copy.deepcopy(kwargs) q_kwargs.update(quantization_config.to_nncf_dict()) return nncf.quantize( @@ -1131,38 +1133,32 @@ def _hybrid_quantization( Returns: The OpenVINO Runtime model with applied hybrid quantization. 
""" - ops_to_compress = _collect_ops_with_weights(model) wc_config = quantization_config.clone() wc_config.ignored_scope = wc_config.ignored_scope or {} - wc_ignored_types = ["Convolution"] if any(op.get_type_name() == "Convolution" for op in model.get_ops()) else [] wc_config.ignored_scope["types"] = wc_config.ignored_scope.get("types", []) + wc_ignored_types - compressed_model = _weight_only_quantization(model, wc_config, **kwargs) - - ptq_ignored_scope = quantization_config.get_ignored_scope_instance() - ptq_ignored_scope.names += ops_to_compress - - subset_size = quantization_config.num_samples if quantization_config.num_samples else 200 - quantized_model = nncf.quantize( - model=compressed_model, - calibration_dataset=dataset, - model_type=nncf.ModelType.TRANSFORMER, - ignored_scope=ptq_ignored_scope, - # SQ algo should be disabled for MatMul nodes because their weights are already compressed - advanced_parameters=nncf.AdvancedQuantizationParameters( - smooth_quant_alphas=AdvancedSmoothQuantParameters(matmul=-1) - ), - subset_size=subset_size, + + q_config = OVQuantizationConfig( + ignored_scope=quantization_config.ignored_scope, + num_samples=quantization_config.num_samples or 200, + smooth_quant_alpha=-1, + **kwargs, + ) + + mixed_quantization_config = OVMixedQuantizationConfig( + weight_quantization_config=wc_config, + activation_quantization_config=q_config, **kwargs, ) - return quantized_model + + return _mixed_quantization(model, mixed_quantization_config, dataset, **kwargs) def _mixed_quantization( model: openvino.Model, quantization_config: OVMixedQuantizationConfig, - calibration_dataset: nncf.Dataset, + dataset: nncf.Dataset, **kwargs, ) -> openvino.Model: """ @@ -1175,25 +1171,22 @@ def _mixed_quantization( The OpenVINO Runtime model for applying quantization. quantization_config (`OVMixedQuantizationConfig`): The configuration containing the parameters related to quantization. - calibration_dataset (`nncf.Dataset`): + dataset (`nncf.Dataset`): The dataset used for quantization. Returns: The OpenVINO Runtime model with applied quantization. 
""" + wc_config = quantization_config.weight_quantization_config + wc_dataset = dataset if wc_config.bits != 8 else None + + q_config = quantization_config.activation_quantization_config.clone() + q_config.ignored_scope = q_config.ignored_scope or {} ops_with_weights = _collect_ops_with_weights(model) - compressed_model = _weight_only_quantization( - model, quantization_config.weight_quantization_config, calibration_dataset, **kwargs - ) + q_config.ignored_scope["names"] = q_config.ignored_scope.get("names", []) + ops_with_weights - activation_quantization_config = quantization_config.activation_quantization_config.clone() - if activation_quantization_config.ignored_scope is None: - activation_quantization_config.ignored_scope = {} - ignored_names = activation_quantization_config.ignored_scope.get("names", []) + ops_with_weights - activation_quantization_config.ignored_scope["names"] = ignored_names - quantized_model = _full_quantization( - compressed_model, activation_quantization_config, calibration_dataset, **kwargs - ) + compressed_model = _weight_only_quantization(model, wc_config, wc_dataset, **kwargs) + quantized_model = _full_quantization(compressed_model, q_config, dataset, verify_not_optimized=False, **kwargs) return quantized_model From 2d5201a493775520d77a5dcd064d54bc372224c3 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 6 Feb 2025 17:36:35 +0100 Subject: [PATCH 09/20] Fix --- optimum/intel/openvino/configuration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index a7473fba98..ab6fbb0774 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -895,7 +895,7 @@ def __init__( # Pull dataset-related parameters from child configs. This is not the intended use case, but we process it just # in case user sets those parameters inside child configs only. 
wqc, aqc = self.weight_quantization_config, self.activation_quantization_config - num_samples = max(num_samples or 0, max(wqc.num_samples, aqc.num_samples)) + num_samples = max(num_samples or 0, max(wqc.num_samples or 0, aqc.num_samples)) dataset = dataset or wqc.dataset or aqc.dataset tokenizer = tokenizer or wqc.tokenizer or aqc.tokenizer processor = processor or wqc.processor or aqc.processor From 02098921834870581900f08eb4355e58e85712e0 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 7 Feb 2025 12:10:35 +0100 Subject: [PATCH 10/20] Fix --- optimum/intel/openvino/configuration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index ab6fbb0774..1add575c04 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -768,7 +768,7 @@ def to_nncf_dict(self) -> Dict[str, Any]: return { "mode": mode, "preset": preset, - "subset_size": self.num_samples, + "subset_size": self.num_samples or 128, "fast_bias_correction": self.fast_bias_correction, "model_type": model_type, "ignored_scope": self.get_ignored_scope_instance(), From 8b28044cf96b903171ed0d3206ad0c4c8a688060 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 7 Feb 2025 17:30:37 +0100 Subject: [PATCH 11/20] Renaming + docstrings --- optimum/commands/export/openvino.py | 2 +- optimum/intel/openvino/configuration.py | 69 ++++++++++++++++--------- optimum/intel/openvino/quantization.py | 14 +++-- tests/openvino/test_quantization.py | 7 ++- 4 files changed, 59 insertions(+), 33 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 5db0f03a60..1502d06428 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -368,7 +368,7 @@ def run(self): quantization_config = { "weight_quantization_config": wc_config, - "activation_quantization_config": q_config, + "full_quantization_config": q_config, "num_samples": self.args.num_samples, "dataset": self.args.dataset, "trust_remote_code": self.args.trust_remote_code, diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 1add575c04..bcfc713e57 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -303,7 +303,7 @@ def clone(self): class _OVQuantizationConfigWithIgnoredScope(abc.ABC): def __init__(self, ignored_scope: Optional[Union[dict, "nncf.IgnoredScope"]] = None): """ - Base class for configs with ignored scope. + Base class for configs with an ignored scope. Args: ignored_scope (`dict`, *optional*): @@ -570,12 +570,9 @@ def post_init(self): def to_nncf_dict(self) -> Dict[str, Any]: """ - Returns a dictionary with the NNCF-friendly variables that are ready to use. + Returns a dictionary with the variables that are ready to use for nncf.quantize() call. """ - if not is_nncf_available(): - raise ImportError("NNCF is required to execute this method. Please install nncf first.") - signed_bitness = {4: "int4", 8: "int8"} mode = self.weight_format if self.weight_format else signed_bitness[self.bits] if mode in signed_bitness.values(): @@ -741,12 +738,9 @@ def post_init(self): def to_nncf_dict(self) -> Dict[str, Any]: """ - Returns a dictionary with the NNCF-friendly variables that are ready to use. + Returns a dictionary with the variables that are ready to use for nncf.compress_weights() call. 
""" - if not is_nncf_available(): - raise ImportError("NNCF is required to execute this method. Please install nncf first.") - preset = "performance" if self.sym else "mixed" advanced_parameters_dict = {"overflow_fix": self.overflow_fix} if self.smooth_quant_alpha: @@ -805,7 +799,7 @@ def __init__( self.dtype = self.quantization_config.activation_format elif isinstance(self.quantization_config, OVMixedQuantizationConfig): weight_format = self.quantization_config.weight_quantization_config.weight_format - activation_format = self.quantization_config.activation_quantization_config.activation_format + activation_format = self.quantization_config.full_quantization_config.activation_format self.dtype = f"{weight_format}_{activation_format}" else: raise ValueError(f"Unsupported type of quantization config: {type(self.quantization_config)}") @@ -824,10 +818,7 @@ def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False): @staticmethod def quantization_config_from_dict(quantization_config: dict) -> OVQuantizationConfigBase: - if ( - "weight_quantization_config" in quantization_config - and "activation_quantization_config" in quantization_config - ): + if "weight_quantization_config" in quantization_config and "full_quantization_config" in quantization_config: return OVMixedQuantizationConfig.from_dict(quantization_config) wq_args = inspect.getfullargspec(OVWeightQuantizationConfig.__init__).args q_args = inspect.getfullargspec(OVQuantizationConfig.__init__).args @@ -876,7 +867,7 @@ class OVMixedQuantizationConfig(OVQuantizationConfigBase): def __init__( self, weight_quantization_config: Union[OVWeightQuantizationConfig, dict], - activation_quantization_config: Union[OVQuantizationConfig, dict], + full_quantization_config: Union[OVQuantizationConfig, dict], num_samples: Optional[int] = None, dataset: Optional[Union[str, List[str]]] = None, tokenizer: Optional[str] = None, @@ -884,22 +875,50 @@ def __init__( trust_remote_code: bool = False, **kwargs, ): + """ + Configuration class for mixed quantization where we separately quantize: + (1) weights of weighted layers to the precision given in the `weight_quantization_config`, and + (2) weights and activations of other possible layers; precision is given in the `full_quantization_config`. + + By default, all weighted layers are quantized in the first step. This leaves only non-weighted layers for the second step. + If some layers are instructed to be ignored in the first step with `weight_quantization_config.ignored_scope` parameter, + weights and activations of these layers are fully quantized to the precision given in the `full_quantization_config`. + + Args: + weight_quantization_config (`OVWeightQuantizationConfig` or `dict`): + Configuration related to weight quantization. + full_quantization_config (`OVQuantizationConfig` or `dict`): + Configuration related to full quantization. + num_samples (`int`, *optional*): + The maximum number of samples composing the calibration dataset. + dataset (`str or List[str]`, *optional*): + The dataset used for data-aware optimization with NNCF. + tokenizer (`str`, *optional*): + The tokenizer used to process the dataset. + processor (`str`, *optional*): + A transformers processor used to process the dataset inputs. + trust_remote_code (`bool`, defaults to `False`): + Allows to use custom code for the modeling hosted in the model repository. 
This option should only be + set for repositories you trust and in which you have read the code, as it will execute on your local + machine arbitrary code present in the model repository. + **kwargs: + """ if isinstance(weight_quantization_config, dict): weight_quantization_config = OVWeightQuantizationConfig.from_dict(weight_quantization_config) self.weight_quantization_config = weight_quantization_config - if isinstance(activation_quantization_config, dict): - activation_quantization_config = OVQuantizationConfig.from_dict(activation_quantization_config) - self.activation_quantization_config = activation_quantization_config + if isinstance(full_quantization_config, dict): + full_quantization_config = OVQuantizationConfig.from_dict(full_quantization_config) + self.full_quantization_config = full_quantization_config # Pull dataset-related parameters from child configs. This is not the intended use case, but we process it just # in case user sets those parameters inside child configs only. - wqc, aqc = self.weight_quantization_config, self.activation_quantization_config - num_samples = max(num_samples or 0, max(wqc.num_samples or 0, aqc.num_samples)) - dataset = dataset or wqc.dataset or aqc.dataset - tokenizer = tokenizer or wqc.tokenizer or aqc.tokenizer - processor = processor or wqc.processor or aqc.processor - trust_remote_code = trust_remote_code or wqc.trust_remote_code or aqc.trust_remote_code + wqc, fqc = self.weight_quantization_config, self.full_quantization_config + num_samples = max((num_samples or 0, wqc.num_samples or 0, fqc.num_samples or 0)) or None + dataset = dataset or wqc.dataset or fqc.dataset + tokenizer = tokenizer or wqc.tokenizer or fqc.tokenizer + processor = processor or wqc.processor or fqc.processor + trust_remote_code = trust_remote_code or wqc.trust_remote_code or fqc.trust_remote_code super().__init__( num_samples=num_samples, dataset=dataset, @@ -913,5 +932,5 @@ def __init__( def to_dict(self): result = super().to_dict() result["weight_quantization_config"] = self.weight_quantization_config.to_dict() - result["activation_quantization_config"] = self.activation_quantization_config.to_dict() + result["full_quantization_config"] = self.full_quantization_config.to_dict() return result diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index d02d8b1828..f8ddc1e2ac 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -1148,7 +1148,7 @@ def _hybrid_quantization( mixed_quantization_config = OVMixedQuantizationConfig( weight_quantization_config=wc_config, - activation_quantization_config=q_config, + full_quantization_config=q_config, **kwargs, ) @@ -1162,9 +1162,13 @@ def _mixed_quantization( **kwargs, ) -> openvino.Model: """ - Quantize a model with NNCF in two steps: - - weights-only quantization with nncf.compress_weights method. - - full quantization (excluding weights from previous step) with nncf.quantize method. + Perform mixed precision quantization where we separately quantize: + (1) weights of weighted layers to the precision given in the `quantization_config.weight_quantization_config`, and + (2) weights and activations of other possible layers; precision is given in the `quantization_config.full_quantization_config`. + + By default, all weighted layers are quantized in the first step. This leaves only non-weighted layers for the second step. 
+ If some weighted layers are instructed to be ignored in the first step with `weight_quantization_config.ignored_scope` parameter, + weights and activations of these layers are fully quantized to the precision given in the `quantization_config.full_quantization_config`. Args: model (`openvino.runtime.Model`): @@ -1180,7 +1184,7 @@ def _mixed_quantization( wc_config = quantization_config.weight_quantization_config wc_dataset = dataset if wc_config.bits != 8 else None - q_config = quantization_config.activation_quantization_config.clone() + q_config = quantization_config.full_quantization_config.clone() q_config.ignored_scope = q_config.ignored_scope or {} ops_with_weights = _collect_ops_with_weights(model) q_config.ignored_scope["names"] = q_config.ignored_scope.get("names", []) + ops_with_weights diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 976cde0609..9a963b511f 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -132,7 +132,7 @@ class OVQuantizerTest(unittest.TestCase): "llama", dict( weight_quantization_config=dict(bits=4, weight_format="nf4", group_size=16), - activation_quantization_config=dict(activation_format="f8e4m3"), + full_quantization_config=dict(activation_format="f8e4m3"), dataset="wikitext2", num_samples=1, ), @@ -244,7 +244,10 @@ def test_ov_model_static_quantization_with_auto_dataset( if isinstance(quantization_config, dict): quantization_config = OVConfig.quantization_config_from_dict(quantization_config) if isinstance(quantization_config, OVMixedQuantizationConfig): - quant_mode = f"{quantization_config.weight_quantization_config.weight_format}_{quantization_config.activation_quantization_config.activation_format}" + quant_mode = ( + f"{quantization_config.weight_quantization_config.weight_format}_" + f"{quantization_config.full_quantization_config.activation_format}" + ) else: quant_mode = quantization_config.activation_format From 2abb98950cba69cff2ada6723a671d70e01aff13 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 7 Feb 2025 18:13:05 +0100 Subject: [PATCH 12/20] Update num_samples default value --- optimum/intel/openvino/configuration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index bcfc713e57..868809f466 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -631,7 +631,7 @@ def __init__( bits: int = 8, sym: bool = False, ignored_scope: Optional[Union[dict, "nncf.IgnoredScope"]] = None, - num_samples: Optional[int] = 300, + num_samples: Optional[int] = 128, model_type: str = "transformer", fast_bias_correction: bool = True, overflow_fix: str = "disable", From 45241492405bc0500b02ba264635e356448e250d Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 10 Feb 2025 18:18:03 +0100 Subject: [PATCH 13/20] Removed ignored scope base class; fix how mixed precision quantization is applied --- optimum/intel/openvino/configuration.py | 59 +++++++-------- optimum/intel/openvino/quantization.py | 49 ++++++++---- tests/openvino/test_exporters_cli.py | 60 +++++++++------ tests/openvino/test_quantization.py | 99 ++++++++++++++++--------- tests/openvino/utils_tests.py | 35 +++++++-- 5 files changed, 187 insertions(+), 115 deletions(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 868809f466..810603e44c 100644 --- a/optimum/intel/openvino/configuration.py +++ 
b/optimum/intel/openvino/configuration.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import abc import copy import inspect import json @@ -264,6 +263,7 @@ class OVQuantizationConfigBase(QuantizationConfigMixin): def __init__( self, + ignored_scope: Optional[Union[dict, "nncf.IgnoredScope"]] = None, num_samples: Optional[int] = None, dataset: Optional[Union[str, List[str]]] = None, tokenizer: Optional[str] = None, @@ -273,6 +273,9 @@ def __init__( ): """ Args: + ignored_scope (`dict` or `nncf.IgnoredScope`, *optional*): + An ignored scope that defines the list of model nodes to be ignored during quantization. Dictionary + entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class. num_samples (`int`, *optional*): The maximum number of samples composing the calibration dataset. dataset (`str or List[str]`, *optional*): @@ -291,25 +294,6 @@ def __init__( self.tokenizer = tokenizer self.processor = processor self.trust_remote_code = trust_remote_code - - def post_init(self): - if not (self.num_samples is None or isinstance(self.num_samples, int) and self.num_samples > 0): - raise ValueError(f"`num_samples` is expected to be a positive integer, but found: {self.num_samples}") - - def clone(self): - return copy.deepcopy(self) - - -class _OVQuantizationConfigWithIgnoredScope(abc.ABC): - def __init__(self, ignored_scope: Optional[Union[dict, "nncf.IgnoredScope"]] = None): - """ - Base class for configs with an ignored scope. - - Args: - ignored_scope (`dict`, *optional*): - An ignored scope that defines the list of model nodes to be ignored during quantization. Dictionary - entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class. - """ if isinstance(ignored_scope, nncf.IgnoredScope): ignored_scope = ignored_scope.__dict__ self.ignored_scope = ignored_scope @@ -321,15 +305,20 @@ def post_init(self): raise ValueError( f"Can't create an `IgnoredScope` object from the provided ignored scope dict: {self.ignored_scope}.\n{e}" ) + if not (self.num_samples is None or isinstance(self.num_samples, int) and self.num_samples > 0): + raise ValueError(f"`num_samples` is expected to be a positive integer, but found: {self.num_samples}") def get_ignored_scope_instance(self) -> "nncf.IgnoredScope": if self.ignored_scope is None: return nncf.IgnoredScope() return nncf.IgnoredScope(**copy.deepcopy(self.ignored_scope)) + def clone(self): + return copy.deepcopy(self) + @dataclass -class OVWeightQuantizationConfig(OVQuantizationConfigBase, _OVQuantizationConfigWithIgnoredScope): +class OVWeightQuantizationConfig(OVQuantizationConfigBase): """ This is a wrapper class about all possible attributes and features that you can play with a model that has been loaded using `optimum-intel` api for weight-only quantization with NNCF. For full model quantization please see @@ -371,7 +360,7 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase, _OVQuantizationConfig sensitivity_metric (`str`, *optional*): The sensitivity metric for assigning quantization precision to layers. In order to preserve the accuracy of the model, the more sensitive layers receives a higher precision. - ignored_scope (`dict`, *optional*): + ignored_scope (`dict` or `nncf.IgnoredScope`, *optional*): An ignored scope that defines the list of model nodes to be ignored during quantization. 
Dictionary entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class. num_samples (`int`, *optional*): @@ -436,15 +425,14 @@ def __init__( backup_precision: Optional[str] = None, **kwargs, ): - OVQuantizationConfigBase.__init__( - self, + super().__init__( + ignored_scope=ignored_scope, num_samples=num_samples, dataset=dataset, tokenizer=tokenizer, processor=processor, trust_remote_code=trust_remote_code, ) - _OVQuantizationConfigWithIgnoredScope.__init__(self, ignored_scope) self.bits = bits self.sym = sym self.group_size = group_size or (-1 if bits == 8 else 128) @@ -463,8 +451,7 @@ def post_init(self): r""" Safety checker that arguments are correct """ - OVQuantizationConfigBase.post_init(self) - _OVQuantizationConfigWithIgnoredScope.post_init(self) + super().post_init() if not (0 <= self.ratio <= 1): raise ValueError("`ratio` must between 0 and 1.") if self.group_size is not None and self.group_size != -1 and self.group_size <= 0: @@ -625,7 +612,7 @@ def __init__( @dataclass -class OVQuantizationConfig(OVQuantizationConfigBase, _OVQuantizationConfigWithIgnoredScope): +class OVQuantizationConfig(OVQuantizationConfigBase): def __init__( self, bits: int = 8, @@ -652,7 +639,7 @@ def __init__( The number of bits to quantize to. sym (`bool`, defaults to `False`): Whether to use symmetric quantization on the activations. Symmetric quantization will be applied on the weights in any case. - ignored_scope (`dict`, *optional*): + ignored_scope (`dict` or `nncf.IgnoredScope`, *optional*): An ignored scope that defines the list of model nodes to be ignored during quantization. Dictionary entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class. num_samples (`int`, *optional*): @@ -688,15 +675,14 @@ def __init__( activation_format (`str`, defaults to "int8"): Data format activations are compressed to. Possible values: ['int8', 'f8e4m3', 'f8e5m2']. """ - OVQuantizationConfigBase.__init__( - self, + super().__init__( + ignored_scope=ignored_scope, num_samples=num_samples, dataset=dataset, tokenizer=tokenizer, processor=processor, trust_remote_code=trust_remote_code, ) - _OVQuantizationConfigWithIgnoredScope.__init__(self, ignored_scope) self.bits = bits self.sym = sym self.model_type = model_type @@ -715,8 +701,7 @@ def post_init(self): r""" Safety checker that arguments are correct """ - OVQuantizationConfigBase.post_init(self) - _OVQuantizationConfigWithIgnoredScope.post_init(self) + super().post_init() if self.dataset is not None: speech_to_text_datasets = list(PREDEFINED_SPEECH_TO_TEXT_DATASETS.keys()) @@ -868,6 +853,7 @@ def __init__( self, weight_quantization_config: Union[OVWeightQuantizationConfig, dict], full_quantization_config: Union[OVQuantizationConfig, dict], + ignored_scope: Optional[Union[dict, "nncf.IgnoredScope"]] = None, num_samples: Optional[int] = None, dataset: Optional[Union[str, List[str]]] = None, tokenizer: Optional[str] = None, @@ -889,6 +875,10 @@ def __init__( Configuration related to weight quantization. full_quantization_config (`OVQuantizationConfig` or `dict`): Configuration related to full quantization. + ignored_scope (`dict` or `nncf.IgnoredScope`, *optional*): + An ignored scope that defines the list of model nodes to be ignored during quantization. Dictionary + entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class. + Ignored scope provided here will be used for both weight and full quantization steps. 
num_samples (`int`, *optional*): The maximum number of samples composing the calibration dataset. dataset (`str or List[str]`, *optional*): @@ -920,6 +910,7 @@ def __init__( processor = processor or wqc.processor or fqc.processor trust_remote_code = trust_remote_code or wqc.trust_remote_code or fqc.trust_remote_code super().__init__( + ignored_scope=ignored_scope, num_samples=num_samples, dataset=dataset, tokenizer=tokenizer, diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index f8ddc1e2ac..74226aff21 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -1014,6 +1014,7 @@ def _weight_only_quantization( model: openvino.runtime.Model, quantization_config: Union[OVWeightQuantizationConfig, Dict], calibration_dataset: Optional[Union[nncf.Dataset, Iterable]] = None, + remove_kv_cache_precision_flag: Optional[bool] = True, **kwargs, ) -> openvino.runtime.Model: _verify_not_optimized(model) @@ -1042,12 +1043,13 @@ def _weight_only_quantization( **wc_kwargs, ) - # If KV cache compression was disabled, remove the disabling flag from the model - if compressed_model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]): - prev_rt_info = compressed_model.get_rt_info("runtime_options").value - if prev_rt_info["KV_CACHE_PRECISION"] == "f16": - prev_rt_info.pop("KV_CACHE_PRECISION") - compressed_model.set_rt_info(prev_rt_info, "runtime_options") + if remove_kv_cache_precision_flag: + # Remove the KV cache compression disabling flag from the model + if compressed_model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]): + prev_rt_info = compressed_model.get_rt_info("runtime_options").value + if prev_rt_info["KV_CACHE_PRECISION"] == "f16": + prev_rt_info.pop("KV_CACHE_PRECISION") + compressed_model.set_rt_info(prev_rt_info, "runtime_options") return compressed_model @@ -1135,12 +1137,13 @@ def _hybrid_quantization( """ wc_config = quantization_config.clone() - wc_config.ignored_scope = wc_config.ignored_scope or {} - wc_ignored_types = ["Convolution"] if any(op.get_type_name() == "Convolution" for op in model.get_ops()) else [] - wc_config.ignored_scope["types"] = wc_config.ignored_scope.get("types", []) + wc_ignored_types + wc_config.ignored_scope = {} + if any(op.get_type_name() == "Convolution" for op in model.get_ops()): + wc_config.ignored_scope["types"] = ["Convolution"] + q_config_ignored_scope = {"names": _collect_ops_with_weights(model)} q_config = OVQuantizationConfig( - ignored_scope=quantization_config.ignored_scope, + ignored_scope=q_config_ignored_scope, num_samples=quantization_config.num_samples or 200, smooth_quant_alpha=-1, **kwargs, @@ -1149,6 +1152,7 @@ def _hybrid_quantization( mixed_quantization_config = OVMixedQuantizationConfig( weight_quantization_config=wc_config, full_quantization_config=q_config, + ignored_scope=quantization_config.ignored_scope, **kwargs, ) @@ -1181,16 +1185,29 @@ def _mixed_quantization( The OpenVINO Runtime model with applied quantization. 
""" - wc_config = quantization_config.weight_quantization_config + def merge_ignored_scopes( + ignored_scope_1: Union[Dict[str, List[str]], None], ignored_scope_2: Union[Dict[str, List[str]], None] + ) -> Dict[str, List[str]]: + if ignored_scope_1 is None: + return copy.deepcopy(ignored_scope_2) if ignored_scope_2 is not None else None + if ignored_scope_2 is None: + return copy.deepcopy(ignored_scope_1) + merged_ignored_scope = {} + for key in set(ignored_scope_1) | set(ignored_scope_2): + merged_ignored_scope[key] = list(set(ignored_scope_1.get(key, []) + ignored_scope_2.get(key, []))) + return merged_ignored_scope + + wc_config = quantization_config.weight_quantization_config.clone() + wc_config.ignored_scope = merge_ignored_scopes(wc_config.ignored_scope, quantization_config.ignored_scope) wc_dataset = dataset if wc_config.bits != 8 else None + compressed_model = _weight_only_quantization( + model, wc_config, wc_dataset, remove_kv_cache_precision_flag=False, **kwargs + ) q_config = quantization_config.full_quantization_config.clone() - q_config.ignored_scope = q_config.ignored_scope or {} - ops_with_weights = _collect_ops_with_weights(model) - q_config.ignored_scope["names"] = q_config.ignored_scope.get("names", []) + ops_with_weights - - compressed_model = _weight_only_quantization(model, wc_config, wc_dataset, **kwargs) + q_config.ignored_scope = merge_ignored_scopes(q_config.ignored_scope, quantization_config.ignored_scope) quantized_model = _full_quantization(compressed_model, q_config, dataset, verify_not_optimized=False, **kwargs) + return quantized_model diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 4e9686a57e..9f137bf8a3 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -14,7 +14,7 @@ import subprocess import unittest from pathlib import Path -from typing import Dict, List, Tuple +from typing import Dict, List from parameterized import parameterized from transformers import AutoModelForCausalLM @@ -128,24 +128,34 @@ class OVCLIExportTestCase(unittest.TestCase): "whisper", "int8", "--dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code", - (14, 22, 21) if is_transformers_version("<=", "4.36.0") else (14, 22, 25), - (14, 21, 17) if is_transformers_version("<=", "4.36.0") else (14, 22, 18), + [14, 22, 21] if is_transformers_version("<=", "4.36.0") else [14, 22, 25], + [{"int8": 14}, {"int8": 21}, {"int8": 17}] + if is_transformers_version("<=", "4.36.0") + else [{"int8": 14}, {"int8": 22}, {"int8": 18}], ), ( "text-generation", "llama", "f8e4m3", "--dataset wikitext2 --smooth-quant-alpha 0.9 --trust-remote-code", - (13,), - (16,), + [ + 13, + ], + [ + {"f8e4m3": 16}, + ], ), ( "text-generation", "llama", "nf4_f8e4m3", "--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 --group-size 16 --trust-remote-code", - (4,), - (14,), + [ + 13, + ], + [ + {"int8": 4, "nf4": 14}, + ], ), ] @@ -446,8 +456,8 @@ def test_exporters_cli_full_quantization( model_type: str, quant_mode: str, option: str, - expected_fake_nodes: Tuple[int], - expected_low_precision_nodes: Tuple[int], + expected_fake_nodes_per_model: List[int], + expected_num_weight_nodes_per_model: List[Dict[str, int]], ): with TemporaryDirectory() as tmpdir: subprocess.run( @@ -457,22 +467,28 @@ def test_exporters_cli_full_quantization( ) model = eval(_HEAD_TO_AUTOMODELS[task]).from_pretrained(tmpdir) - models = [model] if task == "automatic-speech-recognition": - models = [model.encoder, model.decoder] + 
submodels = [model.encoder, model.decoder] if model.decoder_with_past is not None: - models.append(model.decoder_with_past) + submodels.append(model.decoder_with_past) + expected_kv_cache_precision_per_model = [None, None, None] else: - expected_fake_nodes = expected_fake_nodes[:-1] - self.assertEqual(len(expected_fake_nodes), len(models)) - for i, model in enumerate(models): - num_fake_nodes, num_weight_nodes = get_num_quantized_nodes(model) - self.assertEqual(expected_fake_nodes[i], num_fake_nodes) - weight_types = quant_mode.split("_") - num_weights = 0 - for weight_type in weight_types: - num_weights += num_weight_nodes[weight_type] - self.assertEqual(expected_low_precision_nodes[i], num_weights) + expected_num_weight_nodes_per_model = expected_num_weight_nodes_per_model[:-1] + expected_fake_nodes_per_model = expected_fake_nodes_per_model[:-1] + expected_kv_cache_precision_per_model = [None, "f16"] + elif "text-generation" in task: + submodels = [model] + expected_kv_cache_precision_per_model = ["f16"] + else: + raise Exception("Unexpected task.") + + check_compression_state_per_model( + self, + submodels, + expected_num_weight_nodes_per_model, + expected_fake_nodes_per_model, + expected_kv_cache_precision_per_model, + ) def test_exporters_cli_int4_with_local_model_and_default_config(self): with TemporaryDirectory() as tmpdir: diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 7ec6d3e8c4..2028f24248 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -113,8 +113,10 @@ class OVQuantizerTest(unittest.TestCase): trust_remote_code=True, smooth_quant_alpha=0.95, ), - (14, 22, 21) if is_transformers_version("<=", "4.42.4") else (14, 22, 25), - (14, 21, 17) if is_transformers_version("<=", "4.42.4") else (14, 22, 18), + [14, 22, 21] if is_transformers_version("<=", "4.36.0") else [14, 22, 25], + [{"int8": 14}, {"int8": 21}, {"int8": 17}] + if is_transformers_version("<=", "4.36.0") + else [{"int8": 14}, {"int8": 22}, {"int8": 18}], ), ( OVModelForCausalLM, @@ -124,8 +126,12 @@ class OVQuantizerTest(unittest.TestCase): num_samples=1, activation_format="f8e4m3", ), - (13,), - (16,), + [ + 13, + ], + [ + {"f8e4m3": 16}, + ], ), ( OVModelForCausalLM, @@ -136,8 +142,36 @@ class OVQuantizerTest(unittest.TestCase): dataset="wikitext2", num_samples=1, ), - (4,), - (14,), + [ + 13, + ], + [ + {"int8": 4, "nf4": 14}, + ], + ), + ( + OVModelForCausalLM, + "llama", + OVMixedQuantizationConfig( + weight_quantization_config=OVWeightQuantizationConfig( + bits=4, + weight_format="nf4", + group_size=16, + ignored_scope={"patterns": ["^__module.model.layers.0.self_attn"]}, + ), + full_quantization_config=OVQuantizationConfig( + activation_format="f8e4m3", ignored_scope={"patterns": ["^__module.model.layers.0.mlp"]} + ), + ignored_scope={"patterns": ["^__module.model.layers.1.self_attn"]}, + dataset="wikitext2", + num_samples=1, + ), + [ + 7, + ], + [ + {"int8": 4, "f8e4m3": 4, "nf4": 6}, + ], ), ] @@ -232,7 +266,12 @@ def preprocess_function(examples, tokenizer): @parameterized.expand(SUPPORTED_ARCHITECTURES_OV_MODEL_WITH_AUTO_DATASET) def test_ov_model_static_quantization_with_auto_dataset( - self, model_cls, model_name, quantization_config, expected_fake_nodes, expected_low_precision_nodes + self, + model_cls, + model_name, + quantization_config, + expected_fake_nodes_per_model, + expected_num_weight_nodes_per_model, ): model_id = MODEL_NAMES[model_name] @@ -240,41 +279,21 @@ def 
test_ov_model_static_quantization_with_auto_dataset( ov_model = model_cls.from_pretrained(model_id, quantization_config=quantization_config) ov_model.save_pretrained(tmp_dir) - # Convert dict config to class through OVConfig - if isinstance(quantization_config, dict): - quantization_config = OVConfig.quantization_config_from_dict(quantization_config) - if isinstance(quantization_config, OVMixedQuantizationConfig): - quant_mode = ( - f"{quantization_config.weight_quantization_config.weight_format}_" - f"{quantization_config.full_quantization_config.activation_format}" - ) - else: - quant_mode = quantization_config.activation_format - if model_cls == OVModelForSpeechSeq2Seq: - models = [ov_model.encoder.model, ov_model.decoder.model] - + submodels = [ov_model.encoder.model, ov_model.decoder.model] if ov_model.decoder_with_past is not None: - models.append(ov_model.decoder_with_past.model) - for model, expected_fake_nodes, expected_lp_nodes in zip( - models, - expected_fake_nodes, - expected_low_precision_nodes, - ): - num_fake_nodes, num_weight_nodes = get_num_quantized_nodes(model) - self.assertEqual(expected_fake_nodes, num_fake_nodes) - self.assertEqual(expected_lp_nodes, num_weight_nodes[quant_mode]) + submodels.append(ov_model.decoder_with_past.model) + expected_kv_cache_precision_per_model = [None, None, None] + else: + expected_num_weight_nodes_per_model = expected_num_weight_nodes_per_model[:-1] + expected_fake_nodes_per_model = expected_fake_nodes_per_model[:-1] + expected_kv_cache_precision_per_model = [None, "f16"] input_features = torch.randn((1, 128, 3000), dtype=torch.float32) ov_model.generate(input_features) elif model_cls == OVModelForCausalLM: - num_fake_nodes, num_weight_nodes = get_num_quantized_nodes(ov_model.model) - self.assertEqual(expected_fake_nodes[0], num_fake_nodes) - weight_types = quant_mode.split("_") - num_weights = 0 - for weight_type in weight_types: - num_weights += num_weight_nodes[weight_type] - self.assertEqual(expected_low_precision_nodes[0], num_weights) + submodels = [ov_model] + expected_kv_cache_precision_per_model = ["f16"] tokenizer = AutoTokenizer.from_pretrained(model_id) if tokenizer.pad_token is None: @@ -285,6 +304,14 @@ def test_ov_model_static_quantization_with_auto_dataset( else: raise Exception("Unexpected model class.") + check_compression_state_per_model( + self, + submodels, + expected_num_weight_nodes_per_model, + expected_fake_nodes_per_model, + expected_kv_cache_precision_per_model, + ) + class OVWeightCompressionTest(unittest.TestCase): SUPPORTED_ARCHITECTURES_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS = ( diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index e4c2ede8e9..7a51a6ce7e 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -13,7 +13,7 @@ # limitations under the License. 
import unittest from contextlib import contextmanager -from typing import Dict, List, Union +from typing import Dict, List, Optional, Union import numpy as np import openvino as ov @@ -293,14 +293,35 @@ def new_forward( def check_compression_state_per_model( test_case: unittest.TestCase, models: List[Union[ov.Model, OVBaseModel]], - expected_num_weight_nodes_per_model: List[Dict], + expected_num_weight_nodes_per_model: List[Dict[str, int]], + expected_num_fake_nodes_per_model: Optional[List[int]] = None, + expected_kv_cache_precision_per_model: Optional[List[Union[str, None]]] = None, ): test_case.assertEqual(len(models), len(expected_num_weight_nodes_per_model)) - actual_num_weights_per_model = [] - for submodel, expected_num_weight_nodes in zip(models, expected_num_weight_nodes_per_model): + actual_num_weights_per_model = [{}] * len(models) + actual_num_fake_nodes_per_model = [0] * len(models) + actual_kv_cache_precision_per_model = [None] * len(models) + for i, (submodel, expected_num_weight_nodes) in enumerate(zip(models, expected_num_weight_nodes_per_model)): ov_model = submodel if isinstance(submodel, ov.Model) else submodel.model - _, num_weight_nodes = get_num_quantized_nodes(ov_model) + num_fake_nodes, num_weight_nodes = get_num_quantized_nodes(ov_model) expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)}) - actual_num_weights_per_model.append(num_weight_nodes) - test_case.assertFalse(ov_model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) + + actual_num_weights_per_model[i] = num_weight_nodes + actual_num_fake_nodes_per_model[i] = num_fake_nodes + + if ov_model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]): + actual_kv_cache_precision = ov_model.get_rt_info(["runtime_options", "KV_CACHE_PRECISION"]).value + else: + actual_kv_cache_precision = None + actual_kv_cache_precision_per_model[i] = actual_kv_cache_precision + + # Check weight nodes test_case.assertEqual(expected_num_weight_nodes_per_model, actual_num_weights_per_model) + + # Check fake nodes + if expected_num_fake_nodes_per_model is not None: + test_case.assertEqual(expected_num_fake_nodes_per_model, actual_num_fake_nodes_per_model) + + # Check KV cache precision + expected_kv_cache_precision_per_model = expected_kv_cache_precision_per_model or ([None] * len(models)) + test_case.assertEqual(expected_kv_cache_precision_per_model, actual_kv_cache_precision_per_model) From 50c77bff7d751a6ea50eaa30c69f357ace6f75ef Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 10 Feb 2025 20:33:23 +0100 Subject: [PATCH 14/20] Rename weight & activation format to dtype --- optimum/commands/export/openvino.py | 8 ++-- optimum/intel/openvino/configuration.py | 63 ++++++++++++------------- tests/openvino/test_quantization.py | 19 ++++---- 3 files changed, 45 insertions(+), 45 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 1502d06428..21d957e949 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -361,10 +361,10 @@ def run(self): if self.args.quant_mode == "nf4_f8e4m3": wc_config = prepare_wc_config(self.args, _DEFAULT_4BIT_CONFIG) - wc_config["weight_format"] = "nf4" + wc_config["dtype"] = "nf4" q_config = prepare_q_config(self.args) - q_config["activation_format"] = "f8e4m3" + q_config["dtype"] = "f8e4m3" quantization_config = { "weight_quantization_config": wc_config, @@ -490,14 +490,14 @@ def prepare_wc_config(args, default_configs): "scale_estimation": 
args.scale_estimation, "gptq": args.gptq, "lora_correction": args.lora_correction, - "weight_format": args.weight_format, + "dtype": args.weight_format, "backup_precision": args.backup_precision, } def prepare_q_config(args): return { - "activation_format": args.quant_mode, + "dtype": args.quant_mode, "bits": 8, "sym": args.sym or False, "dataset": args.dataset, diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 810603e44c..21b08c543a 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -379,8 +379,8 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase): scale_estimation (`bool`, *optional*): Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between the original and compressed layers. Providing a dataset is required to run scale estimation. - weight_format (`str`, *optional*): - Data format weights are compressed to. Possible values: ['int4', 'int8', 'mxfp4', 'nf4']. + dtype (`str`, *optional*): + Data type weights are compressed to. Possible values: ['int4', 'int8', 'mxfp4', 'nf4']. qptq (`bool`, *optional*): Whether to apply GPTQ algorithm. GPTQ optimizes compressed weights in a layer-wise fashion to minimize the difference between activations of a compressed and original layer. Dataset is required to run GPTQ. @@ -418,7 +418,7 @@ def __init__( num_samples: Optional[int] = None, quant_method: Union[str, OVQuantizationMethod] = OVQuantizationMethod.DEFAULT, scale_estimation: bool = None, - weight_format: Optional[str] = None, + dtype: Optional[str] = None, gptq: bool = None, processor: Optional[str] = None, lora_correction: bool = None, @@ -444,7 +444,7 @@ def __init__( self.gptq = gptq self.lora_correction = lora_correction self.backup_precision = backup_precision - self.weight_format = weight_format + self.dtype = dtype self.post_init() def post_init(self): @@ -486,7 +486,7 @@ def post_init(self): if self.bits not in [4, 8]: raise ValueError(f"Only support quantization to [4,8] bits but found {self.bits}") - if self.bits == 8 and self.weight_format: + if self.bits == 8 and self.dtype: if self.ratio != 1: raise ValueError( f"For 8-bit quantization, `ratio` is expected to be set to 1.0, but was set to {self.ratio}" @@ -532,26 +532,26 @@ def post_init(self): if self.processor is not None and not isinstance(self.processor, str): raise ValueError(f"Processor is expected to be a string, but found {self.processor}") - if self.weight_format is None: - self.weight_format = "int4" if self.bits == 4 else "int8" - if self.weight_format not in ["int4", "int8", "mxfp4", "nf4"]: + if self.dtype is None: + self.dtype = "int4" if self.bits == 4 else "int8" + if self.dtype not in ["int4", "int8", "mxfp4", "nf4"]: raise ValueError( - f"Weight format must be one of the following: ['int4', 'int8', 'mxfp4', 'nf4'], but found: {self.weight_format}." + f"Weights quantization data type must be one of the following: ['int4', 'int8', 'mxfp4', 'nf4'], but found: {self.dtype}." 
) - if self.weight_format in ["mxfp4", "nf4"]: + if self.dtype in ["mxfp4", "nf4"]: if self.bits != 4: raise ValueError( - f"When applying weight compression with '{self.weight_format}' weight format, the `bits` parameter must be set to 4, but found {self.bits}" + f"When applying weight compression with '{self.dtype}' data type, the `bits` parameter must be set to 4, but found {self.bits}" ) - if self.weight_format == "mxfp4": + if self.dtype == "mxfp4": if self.quant_method == OVQuantizationMethod.AWQ: - raise ValueError("The AWQ algorithm is not supported for 'mxpf4' weight format") + raise ValueError("The AWQ algorithm is not supported for 'mxpf4' data type") if self.scale_estimation: - raise ValueError("The Scale Estimation algorithm is not supported for 'mxpf4' weight format") + raise ValueError("The Scale Estimation algorithm is not supported for 'mxpf4' data type") if self.gptq: - raise ValueError("The GPTQ algorithm is not supported for 'mxfp4' weight format") + raise ValueError("The GPTQ algorithm is not supported for 'mxfp4' data type") if self.lora_correction: - raise ValueError("The LoRA Correction algorithm is not supported for 'mxfp4' weight format") + raise ValueError("The LoRA Correction algorithm is not supported for 'mxfp4' data type") if self.gptq and self.lora_correction: raise ValueError("The GPTQ and LoRA Correction algorithms can't be applied simultaneously") @@ -561,7 +561,7 @@ def to_nncf_dict(self) -> Dict[str, Any]: """ signed_bitness = {4: "int4", 8: "int8"} - mode = self.weight_format if self.weight_format else signed_bitness[self.bits] + mode = self.dtype if self.dtype else signed_bitness[self.bits] if mode in signed_bitness.values(): mode += "_sym" if self.sym else "_asym" if mode == "mxfp4": @@ -627,7 +627,7 @@ def __init__( processor: Optional[str] = None, trust_remote_code: bool = False, smooth_quant_alpha: Optional[float] = None, - activation_format: Optional[str] = "int8", + dtype: Optional[str] = "int8", **kwargs, ): """ @@ -672,8 +672,8 @@ def __init__( smooth_quant_alpha (`float`, *optional*): SmoothQuant alpha parameter that improves the distribution of activations before MatMul layers and reduces quantization error. - activation_format (`str`, defaults to "int8"): - Data format activations are compressed to. Possible values: ['int8', 'f8e4m3', 'f8e5m2']. + dtype (`str`, defaults to "int8"): + Data type activations are compressed to. Possible values: ['int8', 'f8e4m3', 'f8e5m2']. """ super().__init__( ignored_scope=ignored_scope, @@ -689,11 +689,10 @@ def __init__( self.fast_bias_correction = fast_bias_correction self.overflow_fix = overflow_fix self.smooth_quant_alpha = smooth_quant_alpha - self.activation_format = activation_format + self.dtype = dtype - f8_formats = ["f8e4m3", "f8e5m2"] - if self.activation_format in f8_formats: - logger.info(f"{self.activation_format} for activations was found. 
A symmetrical scheme will be used.") + f8_dtypes = ["f8e4m3", "f8e5m2"] + if self.dtype in f8_dtypes: self.sym = True self.post_init() @@ -732,7 +731,7 @@ def to_nncf_dict(self) -> Dict[str, Any]: advanced_parameters_dict["smooth_quant_alphas"] = {"matmul": self.smooth_quant_alpha} mode_map = {"f8e4m3": "fp8_e4m3", "f8e5m2": "fp8_e5m2"} - mode = mode_map.get(self.activation_format) + mode = mode_map.get(self.dtype) preset = nncf.QuantizationPreset(preset) model_type = nncf.ModelType(self.model_type) @@ -778,14 +777,14 @@ def __init__( "compression", None ) # A field for backward-compatability of training-time compression parameters if self.quantization_config is not None: - if isinstance(self.quantization_config, OVWeightQuantizationConfig): - self.dtype = self.quantization_config.weight_format - elif isinstance(self.quantization_config, OVQuantizationConfig): - self.dtype = self.quantization_config.activation_format + if isinstance(self.quantization_config, OVWeightQuantizationConfig) or isinstance( + self.quantization_config, OVQuantizationConfig + ): + self.dtype = self.quantization_config.dtype elif isinstance(self.quantization_config, OVMixedQuantizationConfig): - weight_format = self.quantization_config.weight_quantization_config.weight_format - activation_format = self.quantization_config.full_quantization_config.activation_format - self.dtype = f"{weight_format}_{activation_format}" + wc_dtype = self.quantization_config.weight_quantization_config.dtype + q_dtype = self.quantization_config.full_quantization_config.dtype + self.dtype = f"{wc_dtype}_{q_dtype}" else: raise ValueError(f"Unsupported type of quantization config: {type(self.quantization_config)}") else: diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 2028f24248..023b6923bb 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -124,7 +124,8 @@ class OVQuantizerTest(unittest.TestCase): dict( dataset="wikitext2", num_samples=1, - activation_format="f8e4m3", + dtype="f8e4m3", + weight_only=False, ), [ 13, @@ -137,8 +138,8 @@ class OVQuantizerTest(unittest.TestCase): OVModelForCausalLM, "llama", dict( - weight_quantization_config=dict(bits=4, weight_format="nf4", group_size=16), - full_quantization_config=dict(activation_format="f8e4m3"), + weight_quantization_config=dict(bits=4, dtype="nf4", group_size=16, weight_only=True), + full_quantization_config=dict(dtype="f8e4m3", weight_only=False), dataset="wikitext2", num_samples=1, ), @@ -155,12 +156,12 @@ class OVQuantizerTest(unittest.TestCase): OVMixedQuantizationConfig( weight_quantization_config=OVWeightQuantizationConfig( bits=4, - weight_format="nf4", + dtype="nf4", group_size=16, ignored_scope={"patterns": ["^__module.model.layers.0.self_attn"]}, ), full_quantization_config=OVQuantizationConfig( - activation_format="f8e4m3", ignored_scope={"patterns": ["^__module.model.layers.0.mlp"]} + dtype="f8e4m3", ignored_scope={"patterns": ["^__module.model.layers.0.mlp"]} ), ignored_scope={"patterns": ["^__module.model.layers.1.self_attn"]}, dataset="wikitext2", @@ -335,14 +336,14 @@ class OVWeightCompressionTest(unittest.TestCase): OVModelForCausalLM, "gpt2", False, - dict(bits=4, weight_format="mxfp4", group_size=32), + dict(bits=4, dtype="mxfp4", group_size=32), [{"int8": 4, "f4e2m1": 20, "f8e8m0": 20}], ), ( OVModelForCausalLM, "gpt2", False, - dict(bits=4, weight_format="nf4", group_size=32), + dict(bits=4, dtype="nf4", group_size=32), [ { "int8": 4, @@ -905,7 +906,7 @@ def 
test_ovmodel_4bit_auto_compression_with_config( openvino_config = OVConfig.from_pretrained(tmp_dir) self.assertEqual(openvino_config.quantization_config.bits, 4) - self.assertEqual(openvino_config.dtype, quantization_config.weight_format) + self.assertEqual(openvino_config.dtype, quantization_config.dtype) @parameterized.expand(((OVModelForCausalLM, "gpt2"),)) def test_ovmodel_stateful_load_with_compressed_weights(self, model_cls, model_type): @@ -1062,7 +1063,7 @@ def test_ovmodel_4bit_dynamic_with_config( model.save_pretrained(tmp_dir) openvino_config = OVConfig.from_pretrained(tmp_dir) self.assertEqual(openvino_config.quantization_config.bits, 4) - self.assertEqual(openvino_config.dtype, quantization_config.weight_format) + self.assertEqual(openvino_config.dtype, quantization_config.dtype) class OVQuantizerQATest(unittest.TestCase): From 569fe61c19bd2532e0f294729633cee67afbe51b Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 10 Feb 2025 20:49:16 +0100 Subject: [PATCH 15/20] Add int4_f8e4m3 quant mode --- docs/source/openvino/export.mdx | 5 +++-- optimum/commands/export/openvino.py | 7 ++++--- tests/openvino/test_exporters_cli.py | 24 ++++++++++++++++++++++++ tests/openvino/test_quantization.py | 16 ++++++++++++++++ 4 files changed, 47 insertions(+), 5 deletions(-) diff --git a/docs/source/openvino/export.mdx b/docs/source/openvino/export.mdx index e25d50fa0c..0ba01e2842 100644 --- a/docs/source/openvino/export.mdx +++ b/docs/source/openvino/export.mdx @@ -31,7 +31,8 @@ Check out the help for more options: ```text usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,tf}] [--trust-remote-code] - [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3}] + [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] + [--quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3,int4_f8e4m3}] [--library {transformers,diffusers,timm,sentence_transformers,open_clip}] [--cache_dir CACHE_DIR] [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym] [--group-size GROUP_SIZE] [--backup-precision {none,int8_sym,int8_asym}] @@ -67,7 +68,7 @@ Optional arguments: on your local machine arbitrary code present in the model repository. --weight-format {fp32,fp16,int8,int4,mxfp4,nf4} The weight format of the exported model. - --quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3} + --quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3,int4_f8e4m3} Quantization precision mode. This is used for applying full model quantization including activations. --library {transformers,diffusers,timm,sentence_transformers,open_clip} diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 21d957e949..e2700b79ae 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -78,7 +78,7 @@ def parse_args_openvino(parser: "ArgumentParser"): optional_group.add_argument( "--quant-mode", type=str, - choices=["int8", "f8e4m3", "f8e5m2", "nf4_f8e4m3"], + choices=["int8", "f8e4m3", "f8e5m2", "nf4_f8e4m3", "int4_f8e4m3"], default=None, help=( "Quantization precision mode. This is used for applying full model quantization including activations. " @@ -359,9 +359,10 @@ def run(self): "Dataset is required for full quantization. Please provide it with --dataset argument." 
) - if self.args.quant_mode == "nf4_f8e4m3": + if self.args.quant_mode in ["nf4_f8e4m3", "int4_f8e4m3"]: wc_config = prepare_wc_config(self.args, _DEFAULT_4BIT_CONFIG) - wc_config["dtype"] = "nf4" + weight_dtype_map = {"nf4_f8e4m3": "nf4", "int4_f8e4m3": "int4"} + wc_config["dtype"] = weight_dtype_map[self.args.quant_mode] q_config = prepare_q_config(self.args) q_config["dtype"] = "f8e4m3" diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 9f137bf8a3..b0d1614b03 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -157,6 +157,30 @@ class OVCLIExportTestCase(unittest.TestCase): {"int8": 4, "nf4": 14}, ], ), + ( + "text-generation", + "llama", + "int4_f8e4m3", + "--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 --group-size 16 --trust-remote-code", + [ + 13, + ], + [ + {"int8": 4, "int4": 28}, + ], + ), + ( + "text-generation", + "llama", + "int4_f8e4m3", + "--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 --group-size 16 --trust-remote-code --sym", + [ + 13, + ], + [ + {"int8": 4, "int4": 14}, + ], + ), ] TEST_4BIT_CONFIGURATIONS = [ diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 023b6923bb..a9d758dfb3 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -174,6 +174,22 @@ class OVQuantizerTest(unittest.TestCase): {"int8": 4, "f8e4m3": 4, "nf4": 6}, ], ), + ( + OVModelForCausalLM, + "llama", + OVMixedQuantizationConfig( + weight_quantization_config=OVWeightQuantizationConfig(bits=4, group_size=16), + full_quantization_config=OVQuantizationConfig(dtype="f8e4m3"), + dataset="wikitext2", + num_samples=1, + ), + [ + 13, + ], + [ + {"int8": 4, "int4": 28}, + ], + ), ] @parameterized.expand(SUPPORTED_ARCHITECTURES_TORCH_MODEL) From fa63e4032e5db2bf803cbdecc72b3bb99b51bf4b Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 10 Feb 2025 21:00:06 +0100 Subject: [PATCH 16/20] Update description --- optimum/intel/openvino/configuration.py | 7 ++++--- optimum/intel/openvino/quantization.py | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 21b08c543a..8edda7c37e 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -865,9 +865,10 @@ def __init__( (1) weights of weighted layers to the precision given in the `weight_quantization_config`, and (2) weights and activations of other possible layers; precision is given in the `full_quantization_config`. - By default, all weighted layers are quantized in the first step. This leaves only non-weighted layers for the second step. - If some layers are instructed to be ignored in the first step with `weight_quantization_config.ignored_scope` parameter, - weights and activations of these layers are fully quantized to the precision given in the `full_quantization_config`. + By default, weights of all weighted layers are quantized in the first step. In the second step activations of + weighted and non-weighted layers are quantized. If some layers are instructed to be ignored in the first step + with `weight_quantization_config.ignored_scope` parameter, both weights and activations of these layers are + quantized to the precision given in the `full_quantization_config`. 
Args: weight_quantization_config (`OVWeightQuantizationConfig` or `dict`): diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 74226aff21..2acc8e333a 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -1170,9 +1170,10 @@ def _mixed_quantization( (1) weights of weighted layers to the precision given in the `quantization_config.weight_quantization_config`, and (2) weights and activations of other possible layers; precision is given in the `quantization_config.full_quantization_config`. - By default, all weighted layers are quantized in the first step. This leaves only non-weighted layers for the second step. - If some weighted layers are instructed to be ignored in the first step with `weight_quantization_config.ignored_scope` parameter, - weights and activations of these layers are fully quantized to the precision given in the `quantization_config.full_quantization_config`. + By default, weights of all weighted layers are quantized in the first step. In the second step activations of + weighted and non-weighted layers are quantized. If some layers are instructed to be ignored in the first step + with `weight_quantization_config.ignored_scope` parameter, both weights and activations of these layers are + quantized to the precision given in the `full_quantization_config`. Args: model (`openvino.runtime.Model`): From f61b7e86264c2719610338a56ac1b971c6047d2a Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 12 Feb 2025 18:10:52 +0100 Subject: [PATCH 17/20] Add 'nf4_f8e5m2', 'int4_f8e5m2'; add backup precision --- docs/source/openvino/export.mdx | 4 +- optimum/commands/export/openvino.py | 10 ++--- optimum/intel/openvino/configuration.py | 14 ++++++- tests/openvino/test_exporters_cli.py | 30 +++++++++----- tests/openvino/test_quantization.py | 54 ++++++++++++++++++++++--- 5 files changed, 89 insertions(+), 23 deletions(-) diff --git a/docs/source/openvino/export.mdx b/docs/source/openvino/export.mdx index 0ba01e2842..441614402e 100644 --- a/docs/source/openvino/export.mdx +++ b/docs/source/openvino/export.mdx @@ -32,7 +32,7 @@ Check out the help for more options: ```text usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,tf}] [--trust-remote-code] [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] - [--quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3,int4_f8e4m3}] + [--quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3,nf4_f8e5m2,int4_f8e4m3,int4_f8e5m2}] [--library {transformers,diffusers,timm,sentence_transformers,open_clip}] [--cache_dir CACHE_DIR] [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym] [--group-size GROUP_SIZE] [--backup-precision {none,int8_sym,int8_asym}] @@ -68,7 +68,7 @@ Optional arguments: on your local machine arbitrary code present in the model repository. --weight-format {fp32,fp16,int8,int4,mxfp4,nf4} The weight format of the exported model. - --quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3,int4_f8e4m3} + --quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3,nf4_f8e5m2,int4_f8e4m3,int4_f8e5m2} Quantization precision mode. This is used for applying full model quantization including activations. 
--library {transformers,diffusers,timm,sentence_transformers,open_clip} diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 7ffc3ebce9..443a8996ed 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -78,7 +78,7 @@ def parse_args_openvino(parser: "ArgumentParser"): optional_group.add_argument( "--quant-mode", type=str, - choices=["int8", "f8e4m3", "f8e5m2", "nf4_f8e4m3", "int4_f8e4m3"], + choices=["int8", "f8e4m3", "f8e5m2", "nf4_f8e4m3", "nf4_f8e5m2", "int4_f8e4m3", "int4_f8e5m2"], default=None, help=( "Quantization precision mode. This is used for applying full model quantization including activations. " @@ -363,13 +363,13 @@ def run(self): "Dataset is required for full quantization. Please provide it with --dataset argument." ) - if self.args.quant_mode in ["nf4_f8e4m3", "int4_f8e4m3"]: + if self.args.quant_mode in ["nf4_f8e4m3", "nf4_f8e5m2", "int4_f8e4m3", "int4_f8e5m2"]: wc_config = prepare_wc_config(self.args, _DEFAULT_4BIT_CONFIG) - weight_dtype_map = {"nf4_f8e4m3": "nf4", "int4_f8e4m3": "int4"} - wc_config["dtype"] = weight_dtype_map[self.args.quant_mode] + wc_dtype, q_dtype = self.args.quant_mode.split("_") + wc_config["dtype"] = wc_dtype q_config = prepare_q_config(self.args) - q_config["dtype"] = "f8e4m3" + q_config["dtype"] = q_dtype quantization_config = { "weight_quantization_config": wc_config, diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 8edda7c37e..7c2ae4ffcd 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -483,6 +483,9 @@ def post_init(self): "quantization algorithm is selected and compression ratio is 1.0." ) + if self.dtype in ["int4", "int8"]: + self.bits = 4 if self.dtype == "int4" else 8 + if self.bits not in [4, 8]: raise ValueError(f"Only support quantization to [4,8] bits but found {self.bits}") @@ -895,15 +898,24 @@ def __init__( """ if isinstance(weight_quantization_config, dict): weight_quantization_config = OVWeightQuantizationConfig.from_dict(weight_quantization_config) + else: + weight_quantization_config = weight_quantization_config.clone() self.weight_quantization_config = weight_quantization_config + wqc = self.weight_quantization_config if isinstance(full_quantization_config, dict): full_quantization_config = OVQuantizationConfig.from_dict(full_quantization_config) + else: + full_quantization_config = full_quantization_config.clone() self.full_quantization_config = full_quantization_config + fqc = self.full_quantization_config + + if fqc.dtype in ["f8e4m3", "f8e5m2"] and wqc.backup_precision is None: + # TODO: remove once there is support for FP8 weight compression in NNCF + wqc.backup_precision = "none" # Pull dataset-related parameters from child configs. This is not the intended use case, but we process it just # in case user sets those parameters inside child configs only. 
- wqc, fqc = self.weight_quantization_config, self.full_quantization_config num_samples = max((num_samples or 0, wqc.num_samples or 0, fqc.num_samples or 0)) or None dataset = dataset or wqc.dataset or fqc.dataset tokenizer = tokenizer or wqc.tokenizer or fqc.tokenizer diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index b0d1614b03..922bb7d007 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -149,36 +149,48 @@ class OVCLIExportTestCase(unittest.TestCase): "text-generation", "llama", "nf4_f8e4m3", - "--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 --group-size 16 --trust-remote-code", + "--dataset wikitext2 --num-samples 1 --group-size 16 --trust-remote-code --ratio 0.5", [ - 13, + 14, ], [ - {"int8": 4, "nf4": 14}, + {"f8e4m3": 11, "nf4": 5}, ], ), ( "text-generation", "llama", - "int4_f8e4m3", - "--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 --group-size 16 --trust-remote-code", + "nf4_f8e5m2", + "--dataset wikitext2 --num-samples 1 --group-size 16 --trust-remote-code --sym --ratio 0.5", [ - 13, + 14, ], [ - {"int8": 4, "int4": 28}, + {"f8e5m2": 11, "nf4": 5}, ], ), ( "text-generation", "llama", "int4_f8e4m3", - "--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 --group-size 16 --trust-remote-code --sym", + "--dataset wikitext2 --num-samples 1 --group-size 16 --trust-remote-code --sym --ratio 0.5", + [ + 14, + ], + [ + {"f8e4m3": 11, "int4": 5}, + ], + ), + ( + "text-generation", + "llama", + "int4_f8e5m2", + "--dataset wikitext2 --num-samples 1 --group-size 16 --trust-remote-code", [ 13, ], [ - {"int8": 4, "int4": 14}, + {"f8e5m2": 2, "int4": 28}, ], ), ] diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index a9d758dfb3..cec9db3ce6 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -138,16 +138,16 @@ class OVQuantizerTest(unittest.TestCase): OVModelForCausalLM, "llama", dict( - weight_quantization_config=dict(bits=4, dtype="nf4", group_size=16, weight_only=True), + weight_quantization_config=dict(bits=4, dtype="nf4", group_size=16, weight_only=True, ratio=0.5), full_quantization_config=dict(dtype="f8e4m3", weight_only=False), dataset="wikitext2", num_samples=1, ), [ - 13, + 14, ], [ - {"int8": 4, "nf4": 14}, + {"f8e4m3": 11, "nf4": 5}, ], ), ( @@ -158,6 +158,7 @@ class OVQuantizerTest(unittest.TestCase): bits=4, dtype="nf4", group_size=16, + ratio=0.5, ignored_scope={"patterns": ["^__module.model.layers.0.self_attn"]}, ), full_quantization_config=OVQuantizationConfig( @@ -171,23 +172,64 @@ class OVQuantizerTest(unittest.TestCase): 7, ], [ - {"int8": 4, "f8e4m3": 4, "nf4": 6}, + {"f8e4m3": 8, "nf4": 2}, ], ), ( OVModelForCausalLM, "llama", OVMixedQuantizationConfig( - weight_quantization_config=OVWeightQuantizationConfig(bits=4, group_size=16), + weight_quantization_config=OVWeightQuantizationConfig( + bits=4, + dtype="nf4", + group_size=16, + ratio=0.5, + ignored_scope={"patterns": ["^__module.model.layers.0.self_attn"]}, + ), + full_quantization_config=OVQuantizationConfig( + dtype="f8e5m2", ignored_scope={"patterns": ["^__module.model.layers.0.mlp"]} + ), + ignored_scope={"patterns": ["^__module.model.layers.1.self_attn"]}, + dataset="wikitext2", + num_samples=1, + ), + [ + 7, + ], + [ + {"f8e5m2": 8, "nf4": 2}, + ], + ), + ( + OVModelForCausalLM, + "llama", + OVMixedQuantizationConfig( + weight_quantization_config=OVWeightQuantizationConfig(bits=4, group_size=16, ratio=0.5), 
full_quantization_config=OVQuantizationConfig(dtype="f8e4m3"), dataset="wikitext2", num_samples=1, ), + [ + 14, + ], + [ + {"f8e4m3": 11, "int4": 10}, + ], + ), + ( + OVModelForCausalLM, + "llama", + OVMixedQuantizationConfig( + weight_quantization_config=OVWeightQuantizationConfig(bits=4, group_size=16), + full_quantization_config=OVQuantizationConfig(dtype="f8e5m2"), + dataset="wikitext2", + num_samples=1, + ), [ 13, ], [ - {"int8": 4, "int4": 28}, + {"f8e5m2": 2, "int4": 28}, ], ), ] From 73adf4a7dbba0555af8487108e4dad453c97cb79 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 14 Feb 2025 18:26:55 +0100 Subject: [PATCH 18/20] Address comments --- optimum/intel/openvino/configuration.py | 42 ++++++++++++++++--------- optimum/intel/openvino/quantization.py | 33 +++++++++---------- tests/openvino/test_exporters_cli.py | 4 --- tests/openvino/test_quantization.py | 4 --- tests/openvino/utils_tests.py | 12 +------ 5 files changed, 46 insertions(+), 49 deletions(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 7c2ae4ffcd..ff83d1e639 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -18,7 +18,7 @@ from dataclasses import dataclass from enum import Enum from pathlib import Path -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Type, Union import torch from transformers.utils.quantization_config import QuantizationConfigMixin @@ -571,9 +571,7 @@ def to_nncf_dict(self) -> Dict[str, Any]: mode = "e2m1" mode = nncf.CompressWeightsMode(mode) - awq = None - if self.quant_method == "awq" or self.quant_method == OVQuantizationMethod.AWQ: - awq = True + awq = True if self.quant_method == OVQuantizationMethod.AWQ else None sensitivity_metric = nncf.SensitivityMetric(self.sensitivity_metric) if self.sensitivity_metric else None backup_mode = nncf.BackupMode(self.backup_precision) if self.backup_precision else None result = { @@ -896,21 +894,22 @@ def __init__( machine arbitrary code present in the model repository. **kwargs: """ - if isinstance(weight_quantization_config, dict): - weight_quantization_config = OVWeightQuantizationConfig.from_dict(weight_quantization_config) - else: - weight_quantization_config = weight_quantization_config.clone() - self.weight_quantization_config = weight_quantization_config + self.weight_quantization_config = self._initialize_quantization_config( + weight_quantization_config, OVWeightQuantizationConfig + ) wqc = self.weight_quantization_config - if isinstance(full_quantization_config, dict): - full_quantization_config = OVQuantizationConfig.from_dict(full_quantization_config) - else: - full_quantization_config = full_quantization_config.clone() - self.full_quantization_config = full_quantization_config + self.full_quantization_config = self._initialize_quantization_config( + full_quantization_config, OVQuantizationConfig + ) fqc = self.full_quantization_config if fqc.dtype in ["f8e4m3", "f8e5m2"] and wqc.backup_precision is None: + # Here we simulate FP8 backup weight compression precision through full quantization: during weight + # compression step some weighted layers are kept in original precision and later are compressed to FP8 + # during full precision quantization step. + # The issue with current approach is that if one provides an ignored scope for the full quantization step, + # then the weights of the layers under this ignored scope won't be compressed to FP8. 
# TODO: remove once there is support for FP8 weight compression in NNCF wqc.backup_precision = "none" @@ -932,6 +931,21 @@ def __init__( self.post_init() + @staticmethod + def _initialize_quantization_config( + config: Union[dict, OVWeightQuantizationConfig, OVQuantizationConfig], + config_type: Type[Union[OVWeightQuantizationConfig, OVQuantizationConfig]], + ): + if isinstance(config, dict): + return config_type.from_dict(config) + elif isinstance(config, config_type): + return config.clone() + else: + raise ValueError( + f"Unsupported type of quantization config. Expected either a dictionary or an instance of " + f"{config_type}, but found: {type(config)}." + ) + def to_dict(self): result = super().to_dict() result["weight_quantization_config"] = self.weight_quantization_config.to_dict() diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 2acc8e333a..9c3838cfa9 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -1014,7 +1014,6 @@ def _weight_only_quantization( model: openvino.runtime.Model, quantization_config: Union[OVWeightQuantizationConfig, Dict], calibration_dataset: Optional[Union[nncf.Dataset, Iterable]] = None, - remove_kv_cache_precision_flag: Optional[bool] = True, **kwargs, ) -> openvino.runtime.Model: _verify_not_optimized(model) @@ -1043,13 +1042,7 @@ def _weight_only_quantization( **wc_kwargs, ) - if remove_kv_cache_precision_flag: - # Remove the KV cache compression disabling flag from the model - if compressed_model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]): - prev_rt_info = compressed_model.get_rt_info("runtime_options").value - if prev_rt_info["KV_CACHE_PRECISION"] == "f16": - prev_rt_info.pop("KV_CACHE_PRECISION") - compressed_model.set_rt_info(prev_rt_info, "runtime_options") + _remove_f16_kv_cache_precision_flag(compressed_model) return compressed_model @@ -1065,11 +1058,11 @@ def _full_quantization( _verify_not_optimized(model) q_kwargs = copy.deepcopy(kwargs) q_kwargs.update(quantization_config.to_nncf_dict()) - return nncf.quantize( - model, - calibration_dataset=calibration_dataset, - **q_kwargs, - ) + quantized_model = nncf.quantize(model, calibration_dataset=calibration_dataset, **q_kwargs) + + _remove_f16_kv_cache_precision_flag(quantized_model) + + return quantized_model def _get_operation_const_op(operation, const_port_id: int): @@ -1201,9 +1194,7 @@ def merge_ignored_scopes( wc_config = quantization_config.weight_quantization_config.clone() wc_config.ignored_scope = merge_ignored_scopes(wc_config.ignored_scope, quantization_config.ignored_scope) wc_dataset = dataset if wc_config.bits != 8 else None - compressed_model = _weight_only_quantization( - model, wc_config, wc_dataset, remove_kv_cache_precision_flag=False, **kwargs - ) + compressed_model = _weight_only_quantization(model, wc_config, wc_dataset, **kwargs) q_config = quantization_config.full_quantization_config.clone() q_config.ignored_scope = merge_ignored_scopes(q_config.ignored_scope, quantization_config.ignored_scope) @@ -1227,3 +1218,13 @@ def _verify_not_optimized(ov_model): raise RuntimeError(message_template.format(model_weight_compression_config)) elif model_quantization_config is not None: raise RuntimeError(message_template.format(model_quantization_config)) + + +def _remove_f16_kv_cache_precision_flag(model: openvino.Model) -> openvino.Model: + # Remove the KV cache compression disabling flag from the model + if model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]): + 
prev_rt_info = model.get_rt_info("runtime_options").value + if prev_rt_info["KV_CACHE_PRECISION"] == "f16": + prev_rt_info.pop("KV_CACHE_PRECISION") + model.set_rt_info(prev_rt_info, "runtime_options") + return model diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 922bb7d007..6ac70a47bf 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -507,14 +507,11 @@ def test_exporters_cli_full_quantization( submodels = [model.encoder, model.decoder] if model.decoder_with_past is not None: submodels.append(model.decoder_with_past) - expected_kv_cache_precision_per_model = [None, None, None] else: expected_num_weight_nodes_per_model = expected_num_weight_nodes_per_model[:-1] expected_fake_nodes_per_model = expected_fake_nodes_per_model[:-1] - expected_kv_cache_precision_per_model = [None, "f16"] elif "text-generation" in task: submodels = [model] - expected_kv_cache_precision_per_model = ["f16"] else: raise Exception("Unexpected task.") @@ -523,7 +520,6 @@ def test_exporters_cli_full_quantization( submodels, expected_num_weight_nodes_per_model, expected_fake_nodes_per_model, - expected_kv_cache_precision_per_model, ) def test_exporters_cli_int4_with_local_model_and_default_config(self): diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index cec9db3ce6..0e3e0212f2 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -342,17 +342,14 @@ def test_ov_model_static_quantization_with_auto_dataset( submodels = [ov_model.encoder.model, ov_model.decoder.model] if ov_model.decoder_with_past is not None: submodels.append(ov_model.decoder_with_past.model) - expected_kv_cache_precision_per_model = [None, None, None] else: expected_num_weight_nodes_per_model = expected_num_weight_nodes_per_model[:-1] expected_fake_nodes_per_model = expected_fake_nodes_per_model[:-1] - expected_kv_cache_precision_per_model = [None, "f16"] input_features = torch.randn((1, 128, 3000), dtype=torch.float32) ov_model.generate(input_features) elif model_cls == OVModelForCausalLM: submodels = [ov_model] - expected_kv_cache_precision_per_model = ["f16"] tokenizer = AutoTokenizer.from_pretrained(model_id) if tokenizer.pad_token is None: @@ -368,7 +365,6 @@ def test_ov_model_static_quantization_with_auto_dataset( submodels, expected_num_weight_nodes_per_model, expected_fake_nodes_per_model, - expected_kv_cache_precision_per_model, ) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 7a51a6ce7e..f8e7a4add1 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -295,12 +295,10 @@ def check_compression_state_per_model( models: List[Union[ov.Model, OVBaseModel]], expected_num_weight_nodes_per_model: List[Dict[str, int]], expected_num_fake_nodes_per_model: Optional[List[int]] = None, - expected_kv_cache_precision_per_model: Optional[List[Union[str, None]]] = None, ): test_case.assertEqual(len(models), len(expected_num_weight_nodes_per_model)) actual_num_weights_per_model = [{}] * len(models) actual_num_fake_nodes_per_model = [0] * len(models) - actual_kv_cache_precision_per_model = [None] * len(models) for i, (submodel, expected_num_weight_nodes) in enumerate(zip(models, expected_num_weight_nodes_per_model)): ov_model = submodel if isinstance(submodel, ov.Model) else submodel.model num_fake_nodes, num_weight_nodes = get_num_quantized_nodes(ov_model) @@ -309,11 +307,7 @@ def check_compression_state_per_model( 
actual_num_weights_per_model[i] = num_weight_nodes actual_num_fake_nodes_per_model[i] = num_fake_nodes - if ov_model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]): - actual_kv_cache_precision = ov_model.get_rt_info(["runtime_options", "KV_CACHE_PRECISION"]).value - else: - actual_kv_cache_precision = None - actual_kv_cache_precision_per_model[i] = actual_kv_cache_precision + test_case.assertFalse(ov_model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) # Check weight nodes test_case.assertEqual(expected_num_weight_nodes_per_model, actual_num_weights_per_model) @@ -321,7 +315,3 @@ def check_compression_state_per_model( # Check fake nodes if expected_num_fake_nodes_per_model is not None: test_case.assertEqual(expected_num_fake_nodes_per_model, actual_num_fake_nodes_per_model) - - # Check KV cache precision - expected_kv_cache_precision_per_model = expected_kv_cache_precision_per_model or ([None] * len(models)) - test_case.assertEqual(expected_kv_cache_precision_per_model, actual_kv_cache_precision_per_model) From b564e7d1d4aac68ef3d466c18aa33d443fdd5823 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Sun, 16 Feb 2025 17:36:02 +0100 Subject: [PATCH 19/20] Trigger Test From c259d4f2cbed444ee91281aa932c63128479ba56 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 18 Feb 2025 10:42:41 +0100 Subject: [PATCH 20/20] Address comments --- optimum/intel/openvino/configuration.py | 23 +++++++++++++++++++---- optimum/intel/openvino/modeling_base.py | 8 +++----- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index ff83d1e639..ad6b14aa2a 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -444,6 +444,12 @@ def __init__( self.gptq = gptq self.lora_correction = lora_correction self.backup_precision = backup_precision + if kwargs.get("weight_format") is not None: + logger.warning( + "The `weight_format` parameter is deprecated and will be removed in optimum-intel v1.24.0. " + "Please use `dtype` instead." + ) + dtype = kwargs.get("weight_format") self.dtype = dtype self.post_init() @@ -484,7 +490,12 @@ def post_init(self): ) if self.dtype in ["int4", "int8"]: - self.bits = 4 if self.dtype == "int4" else 8 + bits = 4 if self.dtype == "int4" else 8 + if self.bits is not None and self.bits != bits: + logger.warning( + f"Overriding `bits` parameter to the value `bits`={bits} to match the given {self.dtype} `dtype`." + ) + self.bits = bits if self.bits not in [4, 8]: raise ValueError(f"Only support quantization to [4,8] bits but found {self.bits}") @@ -690,6 +701,12 @@ def __init__( self.fast_bias_correction = fast_bias_correction self.overflow_fix = overflow_fix self.smooth_quant_alpha = smooth_quant_alpha + if kwargs.get("activation_format") is not None: + logger.warning( + "The `activation_format` parameter is deprecated and will be removed in optimum-intel v1.24.0. " + "Please use `dtype` instead." 
+ ) + dtype = kwargs.get("activation_format") self.dtype = dtype f8_dtypes = ["f8e4m3", "f8e5m2"] @@ -778,9 +795,7 @@ def __init__( "compression", None ) # A field for backward-compatability of training-time compression parameters if self.quantization_config is not None: - if isinstance(self.quantization_config, OVWeightQuantizationConfig) or isinstance( - self.quantization_config, OVQuantizationConfig - ): + if isinstance(self.quantization_config, (OVWeightQuantizationConfig, OVQuantizationConfig)): self.dtype = self.quantization_config.dtype elif isinstance(self.quantization_config, OVMixedQuantizationConfig): wc_dtype = self.quantization_config.weight_quantization_config.dtype diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index d7c3eeeb17..932b505b70 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -250,11 +250,9 @@ def fix_op_names_duplicates(model: openvino.runtime.Model): from optimum.intel.openvino.quantization import _weight_only_quantization - if not isinstance(quantization_config, dict) and not isinstance( - quantization_config, OVWeightQuantizationConfig - ): - raise RuntimeError( - "Expected quantization_config to be a dictionary or OVWeightQuantizationConfig object." + if not isinstance(quantization_config, (dict, OVWeightQuantizationConfig)): + raise TypeError( + f"Expected `quantization_config` to be either a dictionary or OVWeightQuantizationConfig object, got {type(quantization_config)}." ) model = _weight_only_quantization(model, quantization_config)
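The mixed-precision path exercised throughout this series pairs a weight-only pass with a full-quantization pass: weighted layers covered by the weight config are compressed to INT4 first, and the remaining weights together with all activations are then quantized to the FP8 dtype, which is also how an FP8 backup precision is emulated while NNCF has no native FP8 weight compression. Below is a minimal usage sketch, assuming `OVMixedQuantizationConfig` is re-exported from the package top level and that `from_pretrained` accepts it through the same `quantization_config` argument used for weight-only compression; the model id and output directory are placeholders.

```python
from optimum.intel import (
    OVMixedQuantizationConfig,
    OVModelForCausalLM,
    OVQuantizationConfig,
    OVWeightQuantizationConfig,
)

# INT4 weight compression for most weighted layers, f8e4m3 for the remaining
# weights and all activations; values mirror the test parametrization above.
quantization_config = OVMixedQuantizationConfig(
    weight_quantization_config=OVWeightQuantizationConfig(bits=4, group_size=16),
    full_quantization_config=OVQuantizationConfig(dtype="f8e4m3"),
    dataset="wikitext2",
    num_samples=128,  # the tests use 1 for speed; a real calibration run wants more
)

model_id = "hf-internal-testing/tiny-random-LlamaForCausalLM"  # placeholder checkpoint
model = OVModelForCausalLM.from_pretrained(
    model_id,
    export=True,
    quantization_config=quantization_config,
)
model.save_pretrained("llama-int4-f8e4m3-mixed")
```

Either sub-config may also be passed as a plain dictionary; `_initialize_quantization_config` converts it with `from_dict` and raises a `ValueError` for any other type.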
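The `weight_format` and `activation_format` arguments deprecated in PATCH 20/20 stay accepted until optimum-intel v1.24.0 and are simply mapped onto `dtype`; for an `int4`/`int8` dtype, `bits` is derived from the dtype and overridden (with a warning) if a conflicting value was given. A small sketch of the old and new spellings, assuming both configs are importable from the package top level and need no other constructor arguments:

```python
from optimum.intel import OVQuantizationConfig, OVWeightQuantizationConfig

# Deprecated spellings: still accepted, but emit a deprecation warning and are
# folded into `dtype` by the constructors.
legacy_weights = OVWeightQuantizationConfig(bits=4, weight_format="int4")
legacy_full = OVQuantizationConfig(activation_format="f8e4m3")

# Preferred spellings going forward.
weights = OVWeightQuantizationConfig(bits=4, dtype="int4")
full = OVQuantizationConfig(dtype="f8e4m3")

assert legacy_weights.dtype == weights.dtype == "int4"
assert legacy_full.dtype == full.dtype == "f8e4m3"
```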
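After the refactoring in PATCH 18/20, both the weight-only and the full quantization paths strip the `KV_CACHE_PRECISION: f16` runtime hint before returning the model, which is why the updated tests only assert that the key is absent. The check below mirrors that expectation for an already exported model, using only the rt_info calls that appear in the patch; the XML path is a placeholder.

```python
import openvino as ov


def kv_cache_precision_flag(model: ov.Model):
    """Return the KV_CACHE_PRECISION runtime option if present, else None."""
    if model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]):
        return model.get_rt_info(["runtime_options", "KV_CACHE_PRECISION"]).value
    return None


core = ov.Core()
model = core.read_model("openvino_model.xml")  # placeholder path to a quantized IR

# For a model produced by the quantization entry points patched above, the f16
# KV-cache flag should have been removed entirely.
assert kv_cache_precision_flag(model) is None
```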