@@ -75,6 +75,16 @@ def parse_args_openvino(parser: "ArgumentParser"):
75
75
default = None ,
76
76
help = "The weight format of the exported model." ,
77
77
)
78
+ optional_group .add_argument (
79
+ "--quant-mode" ,
80
+ type = str ,
81
+ choices = ["int8" ],
82
+ default = None ,
83
+ help = (
84
+ "Quantization precision mode. This is used for applying full model quantization including activations. "
85
+ "The only currently supported choice is 'int8' for int8 quantization of both weights and activations."
86
+ ),
87
+ )
78
88
optional_group .add_argument (
79
89
"--library" ,
80
90
type = str ,
@@ -228,6 +238,15 @@ def parse_args_openvino(parser: "ArgumentParser"):
228
238
action = "store_true" ,
229
239
help = "Do not add converted tokenizer and detokenizer OpenVINO models." ,
230
240
)
241
+ optional_group .add_argument (
242
+ "--smooth-quant-alpha" ,
243
+ type = float ,
244
+ default = None ,
245
+ help = (
246
+ "SmoothQuant alpha parameter that improves the distribution of activations before MatMul layers and "
247
+ "reduces quantization error. Valid only when activations quantization is enabled."
248
+ ),
249
+ )
231
250
232
251
233
252
def no_compression_parameter_provided (args ):
@@ -252,6 +271,20 @@ def no_compression_parameter_provided(args):
252
271
)
253
272
254
273
274
def no_quantization_parameter_provided(args):
    """Tell whether every quantization-related option on ``args`` is unset.

    Inspects ``args.sym``, ``args.dataset``, ``args.num_samples`` and
    ``args.smooth_quant_alpha``. An option counts as unset only when it is
    exactly ``None``; falsy values such as ``False``, ``0`` or ``0.0`` are
    treated as explicitly provided.

    Args:
        args: Object exposing the four attributes listed above.

    Returns:
        ``True`` if all four attributes are ``None``, ``False`` otherwise.
    """
    # Read every attribute up front so a missing one fails the same way
    # regardless of the values of the others.
    quantization_options = (
        args.sym,
        args.dataset,
        args.num_samples,
        args.smooth_quant_alpha,
    )
    for option_value in quantization_options:
        if option_value is not None:
            return False
    return True
286
+
287
+
255
288
class OVExportCommand (BaseOptimumCLICommand ):
256
289
COMMAND = CommandInfo (name = "openvino" , help = "Export PyTorch models to OpenVINO IR." )
257
290
@@ -291,16 +324,21 @@ def run(self):
291
324
else :
292
325
library_name = self .args .library
293
326
294
- if self .args .weight_format is None :
327
+ if self .args .weight_format is None and self . args . quant_mode is None :
295
328
ov_config = None
296
329
if not no_compression_parameter_provided (self .args ):
297
330
raise ValueError (
298
331
"Some compression parameters are provided, but the weight format is not specified. "
299
332
"Please provide it with --weight-format argument."
300
333
)
334
+ if not no_quantization_parameter_provided (self .args ):
335
+ raise ValueError (
336
+ "Some quantization parameters are provided, but the quantization mode is not specified. "
337
+ "Please provide it with --quant-mode argument."
338
+ )
301
339
elif self .args .weight_format in {"fp16" , "fp32" }:
302
340
ov_config = OVConfig (dtype = self .args .weight_format )
303
- else :
341
+ elif self . args . weight_format is not None :
304
342
# For int4 quantization if no parameter is provided, then use the default config if exists
305
343
if no_compression_parameter_provided (self .args ) and self .args .weight_format == "int4" :
306
344
quantization_config = get_default_int4_config (self .args .model )
@@ -326,6 +364,21 @@ def run(self):
326
364
if quantization_config .get ("dataset" , None ) is not None :
327
365
quantization_config ["trust_remote_code" ] = self .args .trust_remote_code
328
366
ov_config = OVConfig (quantization_config = quantization_config )
367
+ else :
368
+ if self .args .quant_mode != "int8" :
369
+ raise ValueError ("Only 'int8' quantization mode is currently supported." )
370
+
371
+ quantization_config = {
372
+ "weight_format" : self .args .quant_mode ,
373
+ "activation_format" : self .args .quant_mode ,
374
+ "bits" : 8 ,
375
+ "sym" : self .args .sym or False ,
376
+ "dataset" : self .args .dataset ,
377
+ "num_samples" : self .args .num_samples ,
378
+ "smooth_quant_alpha" : self .args .smooth_quant_alpha ,
379
+ "trust_remote_code" : self .args .trust_remote_code ,
380
+ }
381
+ ov_config = OVConfig (quantization_config = quantization_config )
329
382
330
383
quantization_config = ov_config .quantization_config if ov_config else None
331
384
quantize_with_dataset = quantization_config and getattr (quantization_config , "dataset" , None ) is not None
@@ -368,17 +421,25 @@ def run(self):
368
421
model .save_pretrained (self .args .output )
369
422
if not self .args .disable_convert_tokenizer :
370
423
maybe_convert_tokenizers (library_name , self .args .output , model , task = task )
371
- elif (task .startswith ("text-generation" ) or task == "image-text-to-text" ) and quantize_with_dataset :
424
+ elif (
425
+ quantize_with_dataset
426
+ and (task .startswith ("text-generation" ) or task == "automatic-speech-recognition" )
427
+ or (task == "image-text-to-text" and quantization_config is not None )
428
+ ):
372
429
if task .startswith ("text-generation" ):
373
430
from optimum .intel import OVModelForCausalLM
374
431
375
432
model_cls = OVModelForCausalLM
376
- else :
433
+ elif task == "image-text-to-text" :
377
434
from optimum .intel import OVModelForVisualCausalLM
378
435
379
436
model_cls = OVModelForVisualCausalLM
437
+ else :
438
+ from optimum .intel import OVModelForSpeechSeq2Seq
439
+
440
+ model_cls = OVModelForSpeechSeq2Seq
380
441
381
- # To quantize a model with a dataset, an instance of a model class is required
442
+ # In this case, to apply quantization an instance of a model class is required
382
443
model = model_cls .from_pretrained (
383
444
self .args .model ,
384
445
export = True ,
0 commit comments