From d78950f500cc49294e7a6ffe0422d48f0ef94ad4 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev <nikita.savelyev@intel.com>
Date: Mon, 22 Apr 2024 19:13:19 +0200
Subject: [PATCH 1/6] Move calibration dataset construction to WC function

---
 optimum/intel/openvino/modeling_decoder.py | 22 +++++++---------------
 optimum/intel/openvino/quantization.py     |  7 +++++--
 2 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index 39a7bee9a2..ca7352076c 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -624,29 +624,21 @@ def _from_pretrained(
                 raise ImportError(
                     "Quantization of the weights requires nncf, please install it with `pip install nncf`"
                 )
-            import nncf
 
-            from .quantization import _weight_only_quantization
+            from optimum.intel.openvino.quantization import _weight_only_quantization
 
             default_config = _check_default_4bit_configs(config)
-
             if default_config:
                 logger.info(
                     f"For the given model, we recommend the following `quantization_config` : {default_config}"
                 )
 
-            calibration_dataset = None
-            if isinstance(quantization_config.dataset, str):
-                tokenizer = quantization_config.tokenizer or AutoTokenizer.from_pretrained(model_id)
-
-                from optimum.gptq.data import get_dataset, prepare_dataset
-
-                nsamples = quantization_config.num_samples or 128
-                dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples)
-                dataset = prepare_dataset(dataset)
-                calibration_dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x))
-
-            _weight_only_quantization(model, quantization_config, calibration_dataset)
+            _weight_only_quantization(
+                model,
+                quantization_config,
+                tokenizer=AutoTokenizer.from_pretrained(quantization_config.tokenizer or model_id),
+                transform_fn=lambda x: causal_model.prepare_inputs(**x),
+            )
 
         return causal_model
 
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index 217e5e4056..eb55909a8a 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -33,7 +33,7 @@
 from openvino.runtime import Core, Tensor
 from torch.utils._pytree import tree_map
 from torch.utils.data import DataLoader, RandomSampler
-from transformers import AutoTokenizer, DataCollator, PreTrainedModel, default_data_collator
+from transformers import AutoTokenizer, DataCollator, PreTrainedModel, default_data_collator, PreTrainedTokenizer
 from transformers.pytorch_utils import Conv1D
 from transformers.utils import is_accelerate_available
 
@@ -622,6 +622,8 @@ def _weight_only_quantization(
     model: openvino.runtime.Model,
     quantization_config: Union[OVWeightQuantizationConfig, Dict],
     calibration_dataset: Optional[Union[nncf.Dataset, Iterable]] = None,
+    tokenizer: Optional[PreTrainedTokenizer] = None,
+    transform_fn: Optional[Callable] = None,
 ) -> openvino.runtime.Model:
     config = quantization_config
     if isinstance(config, dict):
@@ -645,13 +647,14 @@ def _weight_only_quantization(
         else:
             dataset = nncf.Dataset(calibration_dataset)
     elif config.dataset is not None and isinstance(config.dataset, str):
-        tokenizer = AutoTokenizer.from_pretrained(config.tokenizer)
+        tokenizer = tokenizer or AutoTokenizer.from_pretrained(config.tokenizer)
 
         from optimum.gptq.data import get_dataset, prepare_dataset
 
         nsamples = config.num_samples if config.num_samples else 128
         dataset = get_dataset(config.dataset, tokenizer, seqlen=32, nsamples=nsamples)
         dataset = prepare_dataset(dataset)
+        dataset = nncf.Dataset(dataset, transform_fn)
 
     sensitivity_metric = None
     if isinstance(config.sensitivity_metric, str):
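A minimal usage sketch of the helper as it stands after this first patch. The model id `gpt2`, the `wikitext2` dataset name, and the 4-bit settings are illustrative assumptions, not part of the diff; `_weight_only_quantization` is an internal helper, shown here only to make the new `tokenizer`/`transform_fn` parameters concrete:

```python
# Hedged sketch, not part of the patch series. Assumes optimum-intel with
# OpenVINO extras and nncf installed; the model id and config values are
# placeholders for illustration only.
from transformers import AutoTokenizer

from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig
from optimum.intel.openvino.quantization import _weight_only_quantization

causal_model = OVModelForCausalLM.from_pretrained("gpt2", export=True)
config = OVWeightQuantizationConfig(bits=4, dataset="wikitext2", num_samples=128)

# After this patch the helper builds the nncf.Dataset itself from the string
# dataset name, mapping each tokenized sample to model inputs via transform_fn.
_weight_only_quantization(
    causal_model.model,
    config,
    tokenizer=AutoTokenizer.from_pretrained("gpt2"),
    transform_fn=lambda x: causal_model.prepare_inputs(**x),
)
```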
From 99471b2f8dec0f93568064e1e321dd1c61006e63 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev <nikita.savelyev@intel.com>
Date: Tue, 23 Apr 2024 15:34:20 +0200
Subject: [PATCH 2/6] Tweak tokenizer

---
 optimum/intel/openvino/modeling_decoder.py | 7 ++++---
 optimum/intel/openvino/quantization.py     | 3 +--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index ca7352076c..18ee086fbd 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import copy
 import logging
 import os
 from pathlib import Path
@@ -633,10 +633,11 @@ def _from_pretrained(
                     f"For the given model, we recommend the following `quantization_config` : {default_config}"
                 )
 
+            quantization_config_copy = copy.deepcopy(quantization_config)
+            quantization_config_copy.tokenizer = quantization_config.tokenizer or model_id
             _weight_only_quantization(
                 model,
-                quantization_config,
-                tokenizer=AutoTokenizer.from_pretrained(quantization_config.tokenizer or model_id),
+                quantization_config_copy,
                 transform_fn=lambda x: causal_model.prepare_inputs(**x),
             )
 
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index eb55909a8a..c31cc5bacc 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -622,7 +622,6 @@ def _weight_only_quantization(
     model: openvino.runtime.Model,
     quantization_config: Union[OVWeightQuantizationConfig, Dict],
     calibration_dataset: Optional[Union[nncf.Dataset, Iterable]] = None,
-    tokenizer: Optional[PreTrainedTokenizer] = None,
     transform_fn: Optional[Callable] = None,
 ) -> openvino.runtime.Model:
     config = quantization_config
@@ -647,7 +646,7 @@ def _weight_only_quantization(
         else:
             dataset = nncf.Dataset(calibration_dataset)
     elif config.dataset is not None and isinstance(config.dataset, str):
-        tokenizer = tokenizer or AutoTokenizer.from_pretrained(config.tokenizer)
+        tokenizer = AutoTokenizer.from_pretrained(config.tokenizer)
 
         from optimum.gptq.data import get_dataset, prepare_dataset

From b986830826d832c4f319eb6c9849ab623e0f6085 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev <nikita.savelyev@intel.com>
Date: Tue, 23 Apr 2024 15:36:48 +0200
Subject: [PATCH 3/6] Remove unused import

---
 optimum/intel/openvino/quantization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index c31cc5bacc..39cd7ead94 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -33,7 +33,7 @@
 from openvino.runtime import Core, Tensor
 from torch.utils._pytree import tree_map
 from torch.utils.data import DataLoader, RandomSampler
-from transformers import AutoTokenizer, DataCollator, PreTrainedModel, default_data_collator, PreTrainedTokenizer
+from transformers import AutoTokenizer, DataCollator, PreTrainedModel, default_data_collator
 from transformers.pytorch_utils import Conv1D
 from transformers.utils import is_accelerate_available
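Patch 2 applies the tokenizer fallback on a deep copy of the quantization config, so the object the caller passed in is never mutated. A self-contained sketch of that pattern, using a hypothetical stand-in for `OVWeightQuantizationConfig`:

```python
import copy
from dataclasses import dataclass
from typing import Optional


@dataclass
class WeightQuantConfig:
    # Hypothetical stand-in for OVWeightQuantizationConfig.
    dataset: Optional[str] = None
    tokenizer: Optional[str] = None


def resolve_tokenizer(config: WeightQuantConfig, model_id: str) -> WeightQuantConfig:
    # Deep-copy first, then fill in the fallback tokenizer id locally,
    # leaving the caller's config object untouched.
    config_copy = copy.deepcopy(config)
    config_copy.tokenizer = config.tokenizer or model_id
    return config_copy


user_config = WeightQuantConfig(dataset="wikitext2")
resolved = resolve_tokenizer(user_config, "gpt2")
assert user_config.tokenizer is None  # original left as-is
assert resolved.tokenizer == "gpt2"   # fallback applied on the copy
```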
From cdbedb42aad95116743241fd7142bdc1a6101566 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev <nikita.savelyev@intel.com>
Date: Tue, 23 Apr 2024 15:52:27 +0200
Subject: [PATCH 4/6] ruff

---
 optimum/intel/openvino/modeling_decoder.py |  2 +-
 optimum/intel/openvino/quantization.py     | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index 18ee086fbd..2bf33eb38d 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -23,7 +23,7 @@
 import torch
 from openvino.preprocess import PrePostProcessor
 from openvino.runtime import Core, Tensor, Type
-from transformers import AutoModelForCausalLM, AutoTokenizer, PretrainedConfig
+from transformers import AutoModelForCausalLM, PretrainedConfig
 from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from transformers.generation import GenerationMixin
 from transformers.modeling_outputs import CausalLMOutputWithPast
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index 39cd7ead94..66c7dab498 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -21,14 +21,9 @@
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import datasets
-import nncf
 import openvino
 import torch
 import transformers
-from nncf import CompressWeightsMode, SensitivityMetric
-from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters, OverflowFix
-from nncf.torch import register_module
-from nncf.torch.initialization import PTInitializingDataLoader
 from openvino._offline_transformations import compress_quantize_weights_transformation
 from openvino.runtime import Core, Tensor
 from torch.utils._pytree import tree_map
@@ -37,6 +32,11 @@
 from transformers.pytorch_utils import Conv1D
 from transformers.utils import is_accelerate_available
 
+import nncf
+from nncf import CompressWeightsMode, SensitivityMetric
+from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters, OverflowFix
+from nncf.torch import register_module
+from nncf.torch.initialization import PTInitializingDataLoader
 from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed
 from optimum.exporters.tasks import TasksManager
 from optimum.quantization_base import OptimumQuantizer

From fa4065f5f64953a339f94d3ae5770b7150e15823 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev <nikita.savelyev@intel.com>
Date: Tue, 23 Apr 2024 15:53:47 +0200
Subject: [PATCH 5/6] ruff 2

---
 optimum/intel/openvino/quantization.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index 66c7dab498..39cd7ead94 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -21,9 +21,14 @@
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import datasets
+import nncf
 import openvino
 import torch
 import transformers
+from nncf import CompressWeightsMode, SensitivityMetric
+from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters, OverflowFix
+from nncf.torch import register_module
+from nncf.torch.initialization import PTInitializingDataLoader
 from openvino._offline_transformations import compress_quantize_weights_transformation
 from openvino.runtime import Core, Tensor
 from torch.utils._pytree import tree_map
@@ -32,11 +37,6 @@
 from transformers.pytorch_utils import Conv1D
 from transformers.utils import is_accelerate_available
 
-import nncf
-from nncf import CompressWeightsMode, SensitivityMetric
-from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters, OverflowFix
-from nncf.torch import register_module
-from nncf.torch.initialization import PTInitializingDataLoader
 from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed
 from optimum.exporters.tasks import TasksManager
 from optimum.quantization_base import OptimumQuantizer

From ea3f2113df52015e309aacd6a92ef85eba9580f5 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev <nikita.savelyev@intel.com>
Date: Thu, 25 Apr 2024 09:51:34 +0200
Subject: [PATCH 6/6] Refactor through OVQuantizer call

---
 optimum/intel/openvino/modeling_decoder.py | 10 ++--
 optimum/intel/openvino/quantization.py     | 57 +++++++++++++---------
 2 files changed, 38 insertions(+), 29 deletions(-)

diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index 2bf33eb38d..3acd18dab0 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -625,21 +625,19 @@ def _from_pretrained(
                     "Quantization of the weights requires nncf, please install it with `pip install nncf`"
                 )
 
-            from optimum.intel.openvino.quantization import _weight_only_quantization
+            from optimum.intel.openvino.quantization import OVQuantizer
 
             default_config = _check_default_4bit_configs(config)
+
             if default_config:
                 logger.info(
                     f"For the given model, we recommend the following `quantization_config` : {default_config}"
                 )
 
+            quantizer = OVQuantizer(causal_model)
             quantization_config_copy = copy.deepcopy(quantization_config)
             quantization_config_copy.tokenizer = quantization_config.tokenizer or model_id
-            _weight_only_quantization(
-                model,
-                quantization_config_copy,
-                transform_fn=lambda x: causal_model.prepare_inputs(**x),
-            )
+            quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config_copy))
 
         return causal_model
 
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index 39cd7ead94..f3b09f5aa7 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -198,7 +198,7 @@ def from_pretrained(cls, model: PreTrainedModel, **kwargs):
     def quantize(
         self,
         calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None,
-        save_directory: Union[str, Path] = None,
+        save_directory: Optional[Union[str, Path]] = None,
         ov_config: OVConfig = None,
         file_name: Optional[str] = None,
         batch_size: int = 1,
@@ -214,7 +214,7 @@ def quantize(
             calibration_dataset (`datasets.Dataset` or `nncf.Dataset` or `Iterable`, *optional*):
                 A collection of data samples to use for quantization calibration. Is optional for weight-only
                 quantization and is required for full quantization.
-            save_directory (`Union[str, Path]`):
+            save_directory (`Union[str, Path]`, *optional*):
                 The directory where the quantized model should be saved.
             ov_config (`OVConfig`, *optional*):
                 The configuration containing the parameters related to quantization. If not provided, 8-bit symmetric
@@ -262,10 +262,6 @@ def quantize(
                 "as an instance of `OVWeightQuantizationConfig` for weight-only compression or as an instance of `OVQuantizationConfig` for full model quantization."
             )
 
-        if save_directory is None:
-            # TODO : can be set to self.model.config.name_or_path for OVModels when not provided
-            raise ValueError("`save_directory` needs to be specified")
-
         if ov_config is None:
             ov_config = OVConfig()
         if not isinstance(ov_config, OVConfig):
@@ -318,21 +314,41 @@ def quantize(
     def _quantize_ovbasemodel(
         self,
         ov_config: OVConfig,
-        save_directory: Union[str, Path],
+        save_directory: Union[str, Path] = None,
         calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None,
         batch_size: int = 1,
         data_collator: Optional[DataCollator] = None,
         remove_unused_columns: bool = True,
         **kwargs,
     ):
-        save_directory = Path(save_directory)
-        save_directory.mkdir(parents=True, exist_ok=True)
+        if save_directory is not None:
+            save_directory = Path(save_directory)
+            save_directory.mkdir(parents=True, exist_ok=True)
 
         quantization_config = ov_config.quantization_config
         if isinstance(quantization_config, OVWeightQuantizationConfig):
+            if calibration_dataset is None and isinstance(quantization_config.dataset, str):
+                from optimum.intel import OVModelForCausalLM
+
+                if isinstance(self.model, OVModelForCausalLM):
+                    from optimum.gptq.data import get_dataset, prepare_dataset
+
+                    tokenizer = AutoTokenizer.from_pretrained(quantization_config.tokenizer)
+                    nsamples = quantization_config.num_samples if quantization_config.num_samples else 128
+                    calibration_dataset = get_dataset(
+                        quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples
+                    )
+                    calibration_dataset = prepare_dataset(calibration_dataset)
+                    calibration_dataset = nncf.Dataset(calibration_dataset, lambda x: self.model.prepare_inputs(**x))
+                else:
+                    raise ValueError(
+                        f"Can't create weight compression calibration dataset from string for {type(self.model)}"
+                    )
+
             _weight_only_quantization(self.model.model, quantization_config, calibration_dataset)
-            self.model.save_pretrained(save_directory)
-            ov_config.save_pretrained(save_directory)
+            if save_directory is not None:
+                self.model.save_pretrained(save_directory)
+                ov_config.save_pretrained(save_directory)
             return
         if not isinstance(quantization_config, OVQuantizationConfig):
             raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}")
@@ -384,8 +400,9 @@ def _quantize_ovbasemodel(
             **kwargs,
         )
         self.model.model = quantized_model
-        self.model.save_pretrained(save_directory)
-        ov_config.save_pretrained(save_directory)
+        if save_directory is not None:
+            self.model.save_pretrained(save_directory)
+            ov_config.save_pretrained(save_directory)
 
     def _quantize_torchmodel(
         self,
@@ -398,6 +415,10 @@ def _quantize_torchmodel(
         remove_unused_columns: bool = True,
         **kwargs,
     ):
+        if save_directory is None:
+            # TODO : can be set to self.model.config.name_or_path for OVModels when not provided
+            raise ValueError("`save_directory` needs to be specified")
+
         self._set_task()
         save_directory = Path(save_directory)
         save_directory.mkdir(parents=True, exist_ok=True)
@@ -622,7 +643,6 @@ def _weight_only_quantization(
     model: openvino.runtime.Model,
     quantization_config: Union[OVWeightQuantizationConfig, Dict],
     calibration_dataset: Optional[Union[nncf.Dataset, Iterable]] = None,
-    transform_fn: Optional[Callable] = None,
 ) -> openvino.runtime.Model:
     config = quantization_config
     if isinstance(config, dict):
@@ -645,15 +665,6 @@ def _weight_only_quantization(
             dataset = calibration_dataset
         else:
             dataset = nncf.Dataset(calibration_dataset)
-    elif config.dataset is not None and isinstance(config.dataset, str):
-        tokenizer = AutoTokenizer.from_pretrained(config.tokenizer)
-
-        from optimum.gptq.data import get_dataset, prepare_dataset
-
-        nsamples = config.num_samples if config.num_samples else 128
-        dataset = get_dataset(config.dataset, tokenizer, seqlen=32, nsamples=nsamples)
-        dataset = prepare_dataset(dataset)
-        dataset = nncf.Dataset(dataset, transform_fn)
 
     sensitivity_metric = None
     if isinstance(config.sensitivity_metric, str):
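With the final patch in place, weight-only compression from a string dataset routes through `OVQuantizer`, and `save_directory` becomes optional for OpenVINO models. An end-to-end sketch of the refactored flow; the model id, tokenizer id, and config values are illustrative assumptions:

```python
# Hedged sketch of the post-refactor public API; "gpt2" and the 4-bit
# settings are placeholders, any causal LM exported to OpenVINO works
# the same way.
from optimum.intel import (
    OVConfig,
    OVModelForCausalLM,
    OVQuantizer,
    OVWeightQuantizationConfig,
)

model = OVModelForCausalLM.from_pretrained("gpt2", export=True)
quantizer = OVQuantizer(model)

quantization_config = OVWeightQuantizationConfig(
    bits=4,
    dataset="wikitext2",  # string dataset: the quantizer now builds the
    num_samples=128,      # GPTQ-style calibration set internally
    tokenizer="gpt2",
)

# `save_directory` is optional after this patch: omitting it compresses the
# weights in memory without serializing the model to disk.
quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config))
```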