@@ -232,6 +232,7 @@ def main_export(
    )

    do_gptq_patching = False
+    do_quant_patching = False
    custom_architecture = False
    patch_16bit = False
    loading_kwargs = model_loading_kwargs or {}
@@ -247,7 +248,11 @@ def main_export(
            trust_remote_code=trust_remote_code,
        )
        quantization_config = getattr(config, "quantization_config", None)
-        do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq"
+        supported_quant_methods = ["gptq"]
+        if is_openvino_version(">=", "2024.6.0"):
+            supported_quant_methods.append("awq")
+        do_quant_patching = quantization_config and quantization_config["quant_method"] in supported_quant_methods
+        do_gptq_patching = do_quant_patching and quantization_config["quant_method"] == "gptq"
        model_type = config.model_type.replace("_", "-")
        if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
            custom_architecture = True
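
Illustration (not part of the diff): the detection added above only triggers patching when the checkpoint's config declares a supported quantization method, and "awq" is only accepted on OpenVINO >= 2024.6.0. A minimal, self-contained sketch of that decision, using a hypothetical config dict in place of the real `config.quantization_config` (the helper name and example values are made up; only the decision logic mirrors the diff):

```python
# Standalone sketch of the quant-method check introduced above.
def needs_quant_patching(quantization_config, awq_supported):
    supported_quant_methods = ["gptq"]
    if awq_supported:  # in the diff: is_openvino_version(">=", "2024.6.0")
        supported_quant_methods.append("awq")
    return bool(quantization_config) and quantization_config.get("quant_method") in supported_quant_methods


example = {"quant_method": "awq", "bits": 4}  # hypothetical AWQ entry from a model's config.json
print(needs_quant_patching(example, awq_supported=True))   # True -> patch before loading the model
print(needs_quant_patching(example, awq_supported=False))  # False -> older OpenVINO, AWQ not handled
```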
@@ -296,7 +301,6 @@ def main_export(
        if (
            dtype is None
            and framework == "pt"
-            and not do_gptq_patching
            and (
                task.startswith("text-generation")
                or getattr(config, "model_type", None) in MULTI_MODAL_TEXT_GENERATION_MODELS
@@ -315,28 +319,28 @@ def main_export(
                    patch_16bit = True
                loading_kwargs["torch_dtype"] = dtype
        # Patch the modules to export of GPTQ models w/o GPU
-        if do_gptq_patching:
-            torch.set_default_dtype(torch.float32)
+        if do_quant_patching:
            orig_cuda_check = torch.cuda.is_available
            torch.cuda.is_available = lambda: True

-            from optimum.gptq import GPTQQuantizer
+            if do_gptq_patching:
+                from optimum.gptq import GPTQQuantizer

-            orig_post_init_model = GPTQQuantizer.post_init_model
+                orig_post_init_model = GPTQQuantizer.post_init_model

-            def post_init_model(self, model):
-                from auto_gptq import exllama_set_max_input_length
+                def post_init_model(self, model):
+                    from auto_gptq import exllama_set_max_input_length

-                class StoreAttr(object):
-                    pass
+                    class StoreAttr(object):
+                        pass

-                model.quantize_config = StoreAttr()
-                model.quantize_config.desc_act = self.desc_act
-                if self.desc_act and not self.disable_exllama and self.max_input_length is not None:
-                    model = exllama_set_max_input_length(model, self.max_input_length)
-                return model
+                    model.quantize_config = StoreAttr()
+                    model.quantize_config.desc_act = self.desc_act
+                    if self.desc_act and not self.disable_exllama and self.max_input_length is not None:
+                        model = exllama_set_max_input_length(model, self.max_input_length)
+                    return model

-            GPTQQuantizer.post_init_model = post_init_model
+                GPTQQuantizer.post_init_model = post_init_model
    elif library_name == "diffusers" and is_openvino_version(">=", "2024.6"):
        dtype = deduce_diffusers_dtype(
            model_name_or_path,
@@ -351,143 +355,150 @@ class StoreAttr(object):
            loading_kwargs["torch_dtype"] = dtype
            patch_16bit = True

-    if library_name == "open_clip":
-        model = _OpenClipForZeroShotImageClassification.from_pretrained(model_name_or_path, cache_dir=cache_dir)
-    else:
-        model = TasksManager.get_model_from_task(
-            task,
-            model_name_or_path,
-            subfolder=subfolder,
-            revision=revision,
-            cache_dir=cache_dir,
-            token=token,
-            local_files_only=local_files_only,
-            force_download=force_download,
-            trust_remote_code=trust_remote_code,
-            framework=framework,
-            device=device,
-            library_name=library_name,
-            **loading_kwargs,
-        )
+    try:
+        if library_name == "open_clip":
+            model = _OpenClipForZeroShotImageClassification.from_pretrained(model_name_or_path, cache_dir=cache_dir)
+        else:
+            model = TasksManager.get_model_from_task(
+                task,
+                model_name_or_path,
+                subfolder=subfolder,
+                revision=revision,
+                cache_dir=cache_dir,
+                token=token,
+                local_files_only=local_files_only,
+                force_download=force_download,
+                trust_remote_code=trust_remote_code,
+                framework=framework,
+                device=device,
+                library_name=library_name,
+                **loading_kwargs,
+            )

-    needs_pad_token_id = task == "text-classification" and getattr(model.config, "pad_token_id", None) is None
+        needs_pad_token_id = task == "text-classification" and getattr(model.config, "pad_token_id", None) is None

-    if needs_pad_token_id:
-        if pad_token_id is not None:
-            model.config.pad_token_id = pad_token_id
-        else:
-            tok = AutoTokenizer.from_pretrained(model_name_or_path)
-            pad_token_id = getattr(tok, "pad_token_id", None)
-            if pad_token_id is None:
-                raise ValueError(
-                    "Could not infer the pad token id, which is needed in this case, please provide it with the --pad_token_id argument"
-                )
-            model.config.pad_token_id = pad_token_id
+        if needs_pad_token_id:
+            if pad_token_id is not None:
+                model.config.pad_token_id = pad_token_id
+            else:
+                tok = AutoTokenizer.from_pretrained(model_name_or_path)
+                pad_token_id = getattr(tok, "pad_token_id", None)
+                if pad_token_id is None:
+                    raise ValueError(
+                        "Could not infer the pad token id, which is needed in this case, please provide it with the --pad_token_id argument"
+                    )
+                model.config.pad_token_id = pad_token_id

-    if hasattr(model.config, "export_model_type"):
-        model_type = model.config.export_model_type.replace("_", "-")
-    else:
-        model_type = model.config.model_type.replace("_", "-")
-
-    if (
-        not custom_architecture
-        and library_name != "diffusers"
-        and task + "-with-past"
-        in TasksManager.get_supported_tasks_for_model_type(model_type, exporter="openvino", library_name=library_name)
-    ):
-        # Make -with-past the default if --task was not explicitely specified
-        if original_task == "auto":
-            task = task + "-with-past"
+        if hasattr(model.config, "export_model_type"):
+            model_type = model.config.export_model_type.replace("_", "-")
        else:
-            logger.info(
-                f"The task `{task}` was manually specified, and past key values will not be reused in the decoding."
-                f" if needed, please pass `--task {task}-with-past` to export using the past key values."
+            model_type = model.config.model_type.replace("_", "-")
+
+        if (
+            not custom_architecture
+            and library_name != "diffusers"
+            and task + "-with-past"
+            in TasksManager.get_supported_tasks_for_model_type(
+                model_type, exporter="openvino", library_name=library_name
            )
+        ):
+            # Make -with-past the default if --task was not explicitely specified
+            if original_task == "auto":
+                task = task + "-with-past"
+            else:
+                logger.info(
+                    f"The task `{task}` was manually specified, and past key values will not be reused in the decoding."
+                    f" if needed, please pass `--task {task}-with-past` to export using the past key values."
+                )

-    if original_task == "auto":
-        synonyms_for_task = sorted(TasksManager.synonyms_for_task(task))
-        if synonyms_for_task:
-            synonyms_for_task = ", ".join(synonyms_for_task)
-            possible_synonyms = f" (possible synonyms are: {synonyms_for_task})"
-        else:
-            possible_synonyms = ""
-        logger.info(f"Automatic task detection to {task}{possible_synonyms}.")
+        if original_task == "auto":
+            synonyms_for_task = sorted(TasksManager.synonyms_for_task(task))
+            if synonyms_for_task:
+                synonyms_for_task = ", ".join(synonyms_for_task)
+                possible_synonyms = f" (possible synonyms are: {synonyms_for_task})"
+            else:
+                possible_synonyms = ""
+            logger.info(f"Automatic task detection to {task}{possible_synonyms}.")

-    preprocessors = maybe_load_preprocessors(
-        model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code
-    )
+        preprocessors = maybe_load_preprocessors(
+            model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code
+        )

-    submodel_paths = export_from_model(
-        model=model,
-        output=output,
-        task=task,
-        ov_config=ov_config,
-        stateful=stateful,
-        model_kwargs=model_kwargs,
-        custom_export_configs=custom_export_configs,
-        fn_get_submodels=fn_get_submodels,
-        preprocessors=preprocessors,
-        device=device,
-        trust_remote_code=trust_remote_code,
-        patch_16bit_model=patch_16bit,
-        **kwargs_shapes,
-    )
+        submodel_paths = export_from_model(
+            model=model,
+            output=output,
+            task=task,
+            ov_config=ov_config,
+            stateful=stateful,
+            model_kwargs=model_kwargs,
+            custom_export_configs=custom_export_configs,
+            fn_get_submodels=fn_get_submodels,
+            preprocessors=preprocessors,
+            device=device,
+            trust_remote_code=trust_remote_code,
+            patch_16bit_model=patch_16bit,
+            **kwargs_shapes,
+        )

-    if convert_tokenizer:
-        maybe_convert_tokenizers(library_name, output, model, preprocessors, task=task)
-
-    clear_class_registry()
-    del model
-    gc.collect()
-
-    for submodel_path in submodel_paths:
-        submodel_path = Path(output) / submodel_path
-        submodel = core.read_model(submodel_path)
-
-        quantization_config = None
-        if ov_config is None:
-            num_parameters = 0
-            for op in submodel.get_ops():
-                if op.get_type_name() == "Constant" and op.get_element_type() in [Type.f16, Type.f32, Type.bf16]:
-                    num_parameters += reduce(operator.mul, op.shape, 1)
-                del op
-            if num_parameters >= _MAX_UNCOMPRESSED_SIZE:
-                if is_nncf_available():
-                    quantization_config = {"bits": 8, "sym": False}
-                    logger.info("The model weights will be quantized to int8_asym.")
-                else:
-                    logger.warning(
-                        "The model will be converted with no weights quantization. Quantization of the weights to int8 "
-                        "requires nncf. Please install it with `pip install nncf`"
-                    )
-                    break
-        else:
-            quantization_config = ov_config.quantization_config
-        if quantization_config is None:
-            del submodel
-            gc.collect()
-            continue
+        if convert_tokenizer:
+            maybe_convert_tokenizers(library_name, output, model, preprocessors, task=task)

-        if not is_nncf_available():
-            raise ImportError("Quantization of the weights requires nncf, please install it with `pip install nncf`")
+        clear_class_registry()
+        del model
+        gc.collect()

-        from optimum.intel.openvino.quantization import _weight_only_quantization
+        for submodel_path in submodel_paths:
+            submodel_path = Path(output) / submodel_path
+            submodel = core.read_model(submodel_path)
+
+            quantization_config = None
+            if ov_config is None:
+                num_parameters = 0
+                for op in submodel.get_ops():
+                    if op.get_type_name() == "Constant" and op.get_element_type() in [Type.f16, Type.f32, Type.bf16]:
+                        num_parameters += reduce(operator.mul, op.shape, 1)
+                    del op
+                if num_parameters >= _MAX_UNCOMPRESSED_SIZE:
+                    if is_nncf_available():
+                        quantization_config = {"bits": 8, "sym": False}
+                        logger.info("The model weights will be quantized to int8_asym.")
+                    else:
+                        logger.warning(
+                            "The model will be converted with no weights quantization. Quantization of the weights to int8 "
+                            "requires nncf. Please install it with `pip install nncf`"
+                        )
+                        break
+            else:
+                quantization_config = ov_config.quantization_config
+            if quantization_config is None:
+                del submodel
+                gc.collect()
+                continue
+
+            if not is_nncf_available():
+                raise ImportError(
+                    "Quantization of the weights requires nncf, please install it with `pip install nncf`"
+                )

-        _weight_only_quantization(submodel, quantization_config)
-        compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
-        save_model(submodel, compressed_submodel_path, compress_to_fp16=False)
-        del submodel
-        gc.collect()
+            from optimum.intel.openvino.quantization import _weight_only_quantization

-        submodel_path.unlink()
-        submodel_path.with_suffix(".bin").unlink()
-        compressed_submodel_path.rename(submodel_path)
-        compressed_submodel_path.with_suffix(".bin").rename(submodel_path.with_suffix(".bin"))
+            _weight_only_quantization(submodel, quantization_config)
+            compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
+            save_model(submodel, compressed_submodel_path, compress_to_fp16=False)
+            del submodel
+            gc.collect()

-    # Unpatch modules after GPTQ export
-    if do_gptq_patching:
-        torch.cuda.is_available = orig_cuda_check
-        GPTQQuantizer.post_init_model = orig_post_init_model
+            submodel_path.unlink()
+            submodel_path.with_suffix(".bin").unlink()
+            compressed_submodel_path.rename(submodel_path)
+            compressed_submodel_path.with_suffix(".bin").rename(submodel_path.with_suffix(".bin"))
+
+    finally:
+        # Unpatch modules after quantized model export
+        if do_quant_patching:
+            torch.cuda.is_available = orig_cuda_check
+        if do_gptq_patching:
+            GPTQQuantizer.post_init_model = orig_post_init_model


def maybe_convert_tokenizers(library_name: str, output: Path, model=None, preprocessors=None, task=None):
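
Note on the last hunk (not part of the diff): model loading, export and weight compression now run inside a try/finally, so the CUDA-availability patch, and the GPTQ `post_init_model` patch when present, are undone even if the export raises. A minimal sketch of that pattern in isolation; the wrapper name and `export_fn` are hypothetical, only the `torch.cuda.is_available` patching comes from the diff:

```python
import torch


def run_with_fake_cuda(export_fn):
    # Pretend a GPU is present so GPTQ/AWQ-quantized modules can be instantiated on a CPU-only host.
    orig_cuda_check = torch.cuda.is_available
    torch.cuda.is_available = lambda: True
    try:
        return export_fn()
    finally:
        # Restore the original check even if export_fn raises, which is the guarantee
        # the try/finally added in this change provides for main_export.
        torch.cuda.is_available = orig_cuda_check
```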