Skip to content

Commit ee65304

Browse files
Hybrid quantization as mixed quantization
1 parent 1336d47 commit ee65304

File tree

2 files changed

+33
-39
lines changed

2 files changed

+33
-39
lines changed

optimum/intel/openvino/configuration.py

+6-5
Original file line numberDiff line numberDiff line change
@@ -596,14 +596,13 @@ def to_nncf_dict(self) -> Dict[str, Any]:
596596
"ignored_scope": self.get_ignored_scope_instance(),
597597
"all_layers": self.all_layers,
598598
"sensitivity_metric": sensitivity_metric,
599+
"subset_size": self.num_samples or 128,
599600
"awq": awq,
600601
"scale_estimation": self.scale_estimation,
601602
"gptq": self.gptq,
602603
"lora_correction": self.lora_correction,
603604
"backup_mode": backup_mode,
604605
}
605-
if self.num_samples is not None:
606-
result["subset_size"] = self.num_samples
607606
return result
608607

609608

@@ -733,9 +732,11 @@ def post_init(self):
733732
if self.bits != 8:
734733
raise ValueError(f"Only support 8-bit for static quantization but found {self.bits}")
735734

736-
if self.smooth_quant_alpha is not None and not (0 <= self.smooth_quant_alpha <= 1):
735+
if self.smooth_quant_alpha is not None and (
736+
self.smooth_quant_alpha != -1 and not (0 <= self.smooth_quant_alpha <= 1)
737+
):
737738
raise ValueError(
738-
f"SmoothQuant alpha parameter must be in range [0, 1], but found {self.smooth_quant_alpha}"
739+
f"SmoothQuant alpha parameter can equal -1 or be in range [0, 1], but found {self.smooth_quant_alpha}"
739740
)
740741

741742
def to_nncf_dict(self) -> Dict[str, Any]:
@@ -894,7 +895,7 @@ def __init__(
894895
# Pull dataset-related parameters from child configs. This is not the intended use case, but we process it just
895896
# in case user sets those parameters inside child configs only.
896897
wqc, aqc = self.weight_quantization_config, self.activation_quantization_config
897-
num_samples = num_samples or wqc.num_samples or aqc.num_samples
898+
num_samples = max(num_samples or 0, max(wqc.num_samples, aqc.num_samples))
898899
dataset = dataset or wqc.dataset or aqc.dataset
899900
tokenizer = tokenizer or wqc.tokenizer or aqc.tokenizer
900901
processor = processor or wqc.processor or aqc.processor

optimum/intel/openvino/quantization.py

+27-34
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
import torch
3131
import transformers
3232
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
33-
from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters, OverflowFix
33+
from nncf.quantization.advanced_parameters import OverflowFix
3434
from nncf.torch import register_module
3535
from nncf.torch.initialization import PTInitializingDataLoader
3636
from openvino._offline_transformations import compress_quantize_weights_transformation
@@ -1056,9 +1056,11 @@ def _full_quantization(
10561056
model: openvino.runtime.Model,
10571057
quantization_config: OVQuantizationConfig,
10581058
calibration_dataset: nncf.Dataset,
1059+
verify_not_optimized: bool = True,
10591060
**kwargs,
10601061
):
1061-
_verify_not_optimized(model)
1062+
if verify_not_optimized:
1063+
_verify_not_optimized(model)
10621064
q_kwargs = copy.deepcopy(kwargs)
10631065
q_kwargs.update(quantization_config.to_nncf_dict())
10641066
return nncf.quantize(
@@ -1131,38 +1133,32 @@ def _hybrid_quantization(
11311133
Returns:
11321134
The OpenVINO Runtime model with applied hybrid quantization.
11331135
"""
1134-
ops_to_compress = _collect_ops_with_weights(model)
11351136

11361137
wc_config = quantization_config.clone()
11371138
wc_config.ignored_scope = wc_config.ignored_scope or {}
1138-
11391139
wc_ignored_types = ["Convolution"] if any(op.get_type_name() == "Convolution" for op in model.get_ops()) else []
11401140
wc_config.ignored_scope["types"] = wc_config.ignored_scope.get("types", []) + wc_ignored_types
1141-
compressed_model = _weight_only_quantization(model, wc_config, **kwargs)
1142-
1143-
ptq_ignored_scope = quantization_config.get_ignored_scope_instance()
1144-
ptq_ignored_scope.names += ops_to_compress
1145-
1146-
subset_size = quantization_config.num_samples if quantization_config.num_samples else 200
1147-
quantized_model = nncf.quantize(
1148-
model=compressed_model,
1149-
calibration_dataset=dataset,
1150-
model_type=nncf.ModelType.TRANSFORMER,
1151-
ignored_scope=ptq_ignored_scope,
1152-
# SQ algo should be disabled for MatMul nodes because their weights are already compressed
1153-
advanced_parameters=nncf.AdvancedQuantizationParameters(
1154-
smooth_quant_alphas=AdvancedSmoothQuantParameters(matmul=-1)
1155-
),
1156-
subset_size=subset_size,
1141+
1142+
q_config = OVQuantizationConfig(
1143+
ignored_scope=quantization_config.ignored_scope,
1144+
num_samples=quantization_config.num_samples or 200,
1145+
smooth_quant_alpha=-1,
1146+
**kwargs,
1147+
)
1148+
1149+
mixed_quantization_config = OVMixedQuantizationConfig(
1150+
weight_quantization_config=wc_config,
1151+
activation_quantization_config=q_config,
11571152
**kwargs,
11581153
)
1159-
return quantized_model
1154+
1155+
return _mixed_quantization(model, mixed_quantization_config, dataset, **kwargs)
11601156

11611157

11621158
def _mixed_quantization(
11631159
model: openvino.Model,
11641160
quantization_config: OVMixedQuantizationConfig,
1165-
calibration_dataset: nncf.Dataset,
1161+
dataset: nncf.Dataset,
11661162
**kwargs,
11671163
) -> openvino.Model:
11681164
"""
@@ -1175,25 +1171,22 @@ def _mixed_quantization(
11751171
The OpenVINO Runtime model for applying quantization.
11761172
quantization_config (`OVMixedQuantizationConfig`):
11771173
The configuration containing the parameters related to quantization.
1178-
calibration_dataset (`nncf.Dataset`):
1174+
dataset (`nncf.Dataset`):
11791175
The dataset used for quantization.
11801176
Returns:
11811177
The OpenVINO Runtime model with applied quantization.
11821178
"""
11831179

1180+
wc_config = quantization_config.weight_quantization_config
1181+
wc_dataset = dataset if wc_config.bits != 8 else None
1182+
1183+
q_config = quantization_config.activation_quantization_config.clone()
1184+
q_config.ignored_scope = q_config.ignored_scope or {}
11841185
ops_with_weights = _collect_ops_with_weights(model)
1185-
compressed_model = _weight_only_quantization(
1186-
model, quantization_config.weight_quantization_config, calibration_dataset, **kwargs
1187-
)
1186+
q_config.ignored_scope["names"] = q_config.ignored_scope.get("names", []) + ops_with_weights
11881187

1189-
activation_quantization_config = quantization_config.activation_quantization_config.clone()
1190-
if activation_quantization_config.ignored_scope is None:
1191-
activation_quantization_config.ignored_scope = {}
1192-
ignored_names = activation_quantization_config.ignored_scope.get("names", []) + ops_with_weights
1193-
activation_quantization_config.ignored_scope["names"] = ignored_names
1194-
quantized_model = _full_quantization(
1195-
compressed_model, activation_quantization_config, calibration_dataset, **kwargs
1196-
)
1188+
compressed_model = _weight_only_quantization(model, wc_config, wc_dataset, **kwargs)
1189+
quantized_model = _full_quantization(compressed_model, q_config, dataset, verify_not_optimized=False, **kwargs)
11971190
return quantized_model
11981191

11991192

Comments (0)