54
54
)
55
55
56
56
57
- # TODO : remove as unused
58
- _COMPRESSION_OPTIONS = {
59
- "int8" : {"mode" : nncf .CompressWeightsMode .INT8 },
60
- "int4_sym_g128" : {"mode" : nncf .CompressWeightsMode .INT4_SYM , "group_size" : 128 },
61
- "int4_asym_g128" : {"mode" : nncf .CompressWeightsMode .INT4_ASYM , "group_size" : 128 },
62
- "int4_sym_g64" : {"mode" : nncf .CompressWeightsMode .INT4_SYM , "group_size" : 64 },
63
- "int4_asym_g64" : {"mode" : nncf .CompressWeightsMode .INT4_ASYM , "group_size" : 64 },
64
- }
65
-
66
57
register_module (ignored_algorithms = [])(Conv1D )
67
58
68
59
core = Core ()
@@ -234,27 +225,16 @@ def quantize(
234
225
ov_config = ov_config or quantization_config
235
226
236
227
if isinstance (self .model , OVBaseModel ):
237
- if self .model .export_feature == "text-generation" and self .model .use_cache :
238
- self ._quantize_ovcausallm (
239
- calibration_dataset ,
240
- save_directory ,
241
- batch_size ,
242
- data_collator ,
243
- remove_unused_columns ,
244
- weights_only ,
245
- ov_config ,
246
- ** kwargs ,
247
- )
248
- else :
249
- self ._quantize_ovbasemodel (
250
- calibration_dataset ,
251
- save_directory ,
252
- batch_size ,
253
- data_collator ,
254
- remove_unused_columns ,
255
- weights_only ,
256
- ** kwargs ,
257
- )
228
+ self ._quantize_ovbasemodel (
229
+ calibration_dataset ,
230
+ save_directory ,
231
+ batch_size ,
232
+ data_collator ,
233
+ remove_unused_columns ,
234
+ weights_only ,
235
+ ov_config ,
236
+ ** kwargs ,
237
+ )
258
238
259
239
elif isinstance (self .model , torch .nn .Module ):
260
240
self ._quantize_torchmodel (
@@ -270,51 +250,7 @@ def quantize(
270
250
else :
271
251
raise TypeError (f"Unsupported model type: { type (self .model )} " )
272
252
273
- def _get_compression_options (self , config : OVConfig ):
274
- options = {}
275
- if config is not None and "type" in config .compression :
276
- options = _COMPRESSION_OPTIONS [config .compression ["type" ]]
277
- if "ratio" in config .compression :
278
- options ["ratio" ] = config .compression ["ratio" ]
279
- return options
280
-
281
253
def _quantize_ovbasemodel (
282
- self ,
283
- calibration_dataset : Dataset ,
284
- save_directory : Union [str , Path ],
285
- batch_size : int = 1 ,
286
- data_collator : Optional [DataCollator ] = None ,
287
- remove_unused_columns : bool = True ,
288
- weights_only : bool = False ,
289
- ** kwargs ,
290
- ):
291
- save_directory = Path (save_directory )
292
- save_directory .mkdir (parents = True , exist_ok = True )
293
-
294
- if weights_only :
295
- self .model .model = nncf .compress_weights (self .model .model )
296
- self .model .save_pretrained (save_directory )
297
- return
298
-
299
- calibration_dataloader = self ._get_calibration_dataloader (
300
- calibration_dataset = calibration_dataset ,
301
- batch_size = batch_size ,
302
- remove_unused_columns = remove_unused_columns ,
303
- data_collator = data_collator ,
304
- )
305
-
306
- quantization_dataset = nncf .Dataset (calibration_dataloader , lambda x : x )
307
- quantized_model = nncf .quantize (
308
- self .model .model ,
309
- quantization_dataset ,
310
- model_type = nncf .ModelType .TRANSFORMER if not kwargs .get ("model_type" ) else kwargs .get ("model_type" ),
311
- fast_bias_correction = kwargs .get ("fast_bias_correction" , True ),
312
- ** kwargs ,
313
- )
314
- self .model .model = quantized_model
315
- self .model .save_pretrained (save_directory )
316
-
317
- def _quantize_ovcausallm (
318
254
self ,
319
255
calibration_dataset : Dataset ,
320
256
save_directory : Union [str , Path ],
@@ -329,11 +265,11 @@ def _quantize_ovcausallm(
329
265
save_directory .mkdir (parents = True , exist_ok = True )
330
266
331
267
if weights_only :
332
- quantization_config = None if ov_config is None else ov_config . quantization_config
333
- if quantization_config is None :
334
- # Use default 8-bit compression
335
- quantization_config = OVWeightQuantizationConfig ( bits = 8 , sym = True )
336
- _weight_only_quantization (self .model , quantization_config )
268
+ # Use default 8-bit compression if not provided
269
+ q_config = (
270
+ OVWeightQuantizationConfig ( bits = 8 , sym = True ) if ov_config is None else ov_config . quantization_config
271
+ )
272
+ _weight_only_quantization (self .model , q_config )
337
273
338
274
self .model .save_pretrained (save_directory )
339
275
return
@@ -345,21 +281,23 @@ def _quantize_ovcausallm(
345
281
data_collator = data_collator ,
346
282
)
347
283
348
- # Prefeth past_key_values
349
- self .model .update_pkv_precision (True )
350
- self .model .compile ()
351
- subset_size = kwargs .get ("subset_size" , 300 )
352
- data_cache = []
284
+ if self .model .export_feature == "text-generation" and self .model .use_cache :
285
+ # Prefetch past_key_values
286
+ self .model .update_pkv_precision (True )
287
+ self .model .compile ()
288
+ subset_size = kwargs .get ("subset_size" , 300 )
289
+ data_cache = []
353
290
354
- self .model .request = InferRequestWrapper (self .model .request , data_cache )
355
- for _ , data in enumerate (calibration_dataloader ):
356
- self .model .generate (** data , max_new_tokens = 1 )
357
- if len (data_cache ) >= subset_size :
358
- break
359
- self .model .request = self .model .request .request
291
+ self .model .request = InferRequestWrapper (self .model .request , data_cache )
292
+ for _ , data in enumerate (calibration_dataloader ):
293
+ self .model .generate (** data , max_new_tokens = 1 )
294
+ if len (data_cache ) >= subset_size :
295
+ break
296
+ self .model .request = self .model .request .request
297
+ calibration_dataloader = data_cache
360
298
361
299
# Actual model quantization
362
- quantization_dataset = nncf .Dataset (data_cache , lambda x : x )
300
+ quantization_dataset = nncf .Dataset (calibration_dataloader , lambda x : x )
363
301
quantized_model = nncf .quantize (
364
302
self .model .model ,
365
303
quantization_dataset ,
0 commit comments