Commit ff5d185
Introduce OVQuantizationConfig for nncf.quantize() parameters (huggingface#638)
* Introduce OVQuantizationConfig for nncf.quantize() parameters
* Ignored scope tweaks
* Added **kwargs to quantization call. Added config serialization test.
* Ignored scope changes. Tests pass.
* Added documentation
* Linters
* Linters
* Tweak ignored scope serialization
* Added deprecation errors, tweak docs
* Addressed minor comments
* Make quantization config contain only serializable properties.
* Small tweaks
* Address comments
* Fix ruff
* Fix ruff 2
1 parent 0540b12 commit ff5d185

9 files changed, +654 −244 lines changed
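
The headline change is a new OVQuantizationConfig class carrying nncf.quantize() parameters, exported alongside the existing OVWeightQuantizationConfig. Based only on the exports added below and on the OVConfig(quantization_config=...) call visible in the modeling_base.py diff, a minimal sketch of what becomes importable (the OVQuantizationConfig definition itself sits in the unrendered configuration.py diff):

# Both quantization config classes are now exported from optimum.intel;
# OVConfig(quantization_config=...) wraps either one, as seen in the
# modeling_base.py diff further down.
from optimum.intel import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig

ov_config = OVConfig(quantization_config=OVWeightQuantizationConfig(bits=8))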

optimum/intel/__init__.py (+2)

@@ -124,6 +124,7 @@
         "OVModelForVision2Seq",
         "OVModelForSequenceClassification",
         "OVModelForTokenClassification",
+        "OVQuantizationConfig",
         "OVWeightQuantizationConfig",
         "OVConfig",
     ]
@@ -243,6 +244,7 @@
     OVModelForSpeechSeq2Seq,
     OVModelForTokenClassification,
     OVModelForVision2Seq,
+    OVQuantizationConfig,
     OVWeightQuantizationConfig,
 )

optimum/intel/openvino/__init__.py (+1 −1)

@@ -43,7 +43,7 @@
 from .trainer import OVTrainer


-from .configuration import OVConfig, OVWeightQuantizationConfig
+from .configuration import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig
 from .modeling import (
     OVModelForAudioClassification,
     OVModelForAudioFrameClassification,

optimum/intel/openvino/configuration.py (+222 −99)

Large diffs are not rendered by default.
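
The commit message notes that the quantization config was reworked to "contain only serializable properties" and that a config serialization test was added. A hedged sketch of the round-trip this enables, assuming the to_dict()/from_dict() helpers that transformers-style quantization configs usually provide (the exact OVQuantizationConfig fields live in the unrendered diff above):

# Hedged sketch: round-trip a quantization config through a plain dict.
# to_dict()/from_dict() are assumed from the transformers-style
# QuantizationConfigMixin base; exact fields are defined in configuration.py.
from optimum.intel import OVWeightQuantizationConfig

wq_config = OVWeightQuantizationConfig(bits=4, sym=True)
as_dict = wq_config.to_dict()  # JSON-serializable, per the commit's stated goal
restored = OVWeightQuantizationConfig.from_dict(as_dict)
assert restored.bits == 4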

optimum/intel/openvino/modeling_base.py (+10 −2)

@@ -100,13 +100,21 @@ def __init__(
         self._openvino_config = OVConfig(quantization_config=quantization_config)

     @staticmethod
-    def load_model(file_name: Union[str, Path], quantization_config: Union[OVWeightQuantizationConfig, Dict] = None):
+    def load_model(
+        file_name: Union[str, Path],
+        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
+        calibration_dataset: Optional = None,
+    ):
         """
         Loads the model.

         Arguments:
             file_name (`str` or `Path`):
                 The path of the model ONNX or XML file.
+            quantization_config (`OVWeightQuantizationConfig` or `Dict`, *optional*):
+                Quantization config to apply after model is loaded.
+            calibration_dataset (`nncf.Dataset`, *optional*):
+                Optional nncf.Dataset to feed to model weight compression when quantization config is provided.
         """

         def fix_op_names_duplicates(model: openvino.runtime.Model):
@@ -135,7 +143,7 @@ def fix_op_names_duplicates(model: openvino.runtime.Model):

         from optimum.intel.openvino.quantization import _weight_only_quantization

-        model = _weight_only_quantization(model, quantization_config)
+        model = _weight_only_quantization(model, quantization_config, calibration_dataset=calibration_dataset)

         return model

optimum/intel/openvino/modeling_decoder.py (+10 −5)

@@ -572,7 +572,7 @@ def _from_pretrained(
         from_onnx: bool = False,
         local_files_only: bool = False,
         load_in_8bit: bool = False,
-        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
+        quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None,
         **kwargs,
     ):
         model_path = Path(model_id)
@@ -596,7 +596,12 @@
         quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)

         load_in_4bit = quantization_config.bits == 4 if quantization_config else False
-        model = cls.load_model(model_cache_path, quantization_config=None if load_in_4bit else quantization_config)
+        calibration_dataset = kwargs.get("calibration_dataset", None)
+        model = cls.load_model(
+            model_cache_path,
+            quantization_config=None if load_in_4bit else quantization_config,
+            calibration_dataset=calibration_dataset,
+        )

         model_type = config.model_type.replace("_", "-")
         if model_type == "bloom":
@@ -632,7 +637,7 @@
                     f"For the given model, we recommend the following `quantization_config` : {default_config}"
                 )

-            if isinstance(quantization_config.dataset, str):
+            if calibration_dataset is None and isinstance(quantization_config.dataset, str):
                 tokenizer = quantization_config.tokenizer or AutoTokenizer.from_pretrained(model_id)

                 from optimum.gptq.data import get_dataset, prepare_dataset
@@ -644,9 +649,9 @@
                 dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples)
                 dataset = prepare_dataset(dataset)
                 quantization_config = copy.deepcopy(quantization_config)
-                quantization_config.dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x))
+                calibration_dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x))

-            _weight_only_quantization(model, quantization_config)
+            _weight_only_quantization(model, quantization_config, calibration_dataset)

         return causal_model
