30
30
import torch
31
31
import transformers
32
32
from huggingface_hub .constants import HUGGINGFACE_HUB_CACHE
33
- from nncf .quantization .advanced_parameters import AdvancedSmoothQuantParameters , OverflowFix
33
+ from nncf .quantization .advanced_parameters import OverflowFix
34
34
from nncf .torch import register_module
35
35
from nncf .torch .initialization import PTInitializingDataLoader
36
36
from openvino ._offline_transformations import compress_quantize_weights_transformation
@@ -1056,9 +1056,11 @@ def _full_quantization(
1056
1056
model : openvino .runtime .Model ,
1057
1057
quantization_config : OVQuantizationConfig ,
1058
1058
calibration_dataset : nncf .Dataset ,
1059
+ verify_not_optimized : bool = True ,
1059
1060
** kwargs ,
1060
1061
):
1061
- _verify_not_optimized (model )
1062
+ if verify_not_optimized :
1063
+ _verify_not_optimized (model )
1062
1064
q_kwargs = copy .deepcopy (kwargs )
1063
1065
q_kwargs .update (quantization_config .to_nncf_dict ())
1064
1066
return nncf .quantize (
@@ -1131,38 +1133,32 @@ def _hybrid_quantization(
1131
1133
Returns:
1132
1134
The OpenVINO Runtime model with applied hybrid quantization.
1133
1135
"""
1134
- ops_to_compress = _collect_ops_with_weights (model )
1135
1136
1136
1137
wc_config = quantization_config .clone ()
1137
1138
wc_config .ignored_scope = wc_config .ignored_scope or {}
1138
-
1139
1139
wc_ignored_types = ["Convolution" ] if any (op .get_type_name () == "Convolution" for op in model .get_ops ()) else []
1140
1140
wc_config .ignored_scope ["types" ] = wc_config .ignored_scope .get ("types" , []) + wc_ignored_types
1141
- compressed_model = _weight_only_quantization (model , wc_config , ** kwargs )
1142
-
1143
- ptq_ignored_scope = quantization_config .get_ignored_scope_instance ()
1144
- ptq_ignored_scope .names += ops_to_compress
1145
-
1146
- subset_size = quantization_config .num_samples if quantization_config .num_samples else 200
1147
- quantized_model = nncf .quantize (
1148
- model = compressed_model ,
1149
- calibration_dataset = dataset ,
1150
- model_type = nncf .ModelType .TRANSFORMER ,
1151
- ignored_scope = ptq_ignored_scope ,
1152
- # SQ algo should be disabled for MatMul nodes because their weights are already compressed
1153
- advanced_parameters = nncf .AdvancedQuantizationParameters (
1154
- smooth_quant_alphas = AdvancedSmoothQuantParameters (matmul = - 1 )
1155
- ),
1156
- subset_size = subset_size ,
1141
+
1142
+ q_config = OVQuantizationConfig (
1143
+ ignored_scope = quantization_config .ignored_scope ,
1144
+ num_samples = quantization_config .num_samples or 200 ,
1145
+ smooth_quant_alpha = - 1 ,
1146
+ ** kwargs ,
1147
+ )
1148
+
1149
+ mixed_quantization_config = OVMixedQuantizationConfig (
1150
+ weight_quantization_config = wc_config ,
1151
+ activation_quantization_config = q_config ,
1157
1152
** kwargs ,
1158
1153
)
1159
- return quantized_model
1154
+
1155
+ return _mixed_quantization (model , mixed_quantization_config , dataset , ** kwargs )
1160
1156
1161
1157
1162
1158
def _mixed_quantization(
    model: openvino.Model,
    quantization_config: OVMixedQuantizationConfig,
    dataset: nncf.Dataset,
    **kwargs,
) -> openvino.Model:
    """
    Run weight-only quantization on `model`, then full quantization on the result.

    The weight-only step uses `quantization_config.weight_quantization_config`. The full quantization step uses a
    clone of `quantization_config.activation_quantization_config` whose ignored scope `"names"` entry is extended
    with the operations returned by `_collect_ops_with_weights(model)`.

    Args:
        model (`openvino.Model`):
            The OpenVINO Runtime model for applying quantization.
        quantization_config (`OVMixedQuantizationConfig`):
            The configuration containing the parameters related to quantization.
        dataset (`nncf.Dataset`):
            The dataset used for quantization.
        **kwargs:
            Extra keyword arguments, forwarded unchanged to both the weight-only and the full quantization step.
    Returns:
        The OpenVINO Runtime model with applied quantization.
    """
    weight_config = quantization_config.weight_quantization_config
    # The weight-only step receives the dataset only when the weight config is not 8-bit.
    # NOTE(review): presumably 8-bit weight compression needs no calibration data; confirm.
    if weight_config.bits != 8:
        weight_dataset = dataset
    else:
        weight_dataset = None

    # Work on a clone so the ignored-scope changes below are applied to the copy.
    activation_config = quantization_config.activation_quantization_config.clone()
    activation_config.ignored_scope = activation_config.ignored_scope or {}

    # Collect the weighted operations from the model *before* it is passed to weight compression.
    weighted_op_names = _collect_ops_with_weights(model)
    already_ignored_names = activation_config.ignored_scope.get("names", [])
    activation_config.ignored_scope["names"] = already_ignored_names + weighted_op_names

    weights_compressed_model = _weight_only_quantization(model, weight_config, weight_dataset, **kwargs)
    # The "not optimized" verification is switched off for the full quantization step here.
    return _full_quantization(
        weights_compressed_model,
        activation_config,
        dataset,
        verify_not_optimized=False,
        **kwargs,
    )
1198
1191
1199
1192
0 commit comments