 import gc
 import logging
+import operator
 import warnings
+from functools import reduce
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union

 from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase
 from transformers.utils import is_torch_available

+from openvino.runtime import Core, Type, save_model
 from optimum.exporters import TasksManager
 from optimum.exporters.onnx.base import OnnxConfig
 from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED
 from optimum.exporters.openvino.convert import export_from_model
 from optimum.intel.utils.import_utils import (
     is_openvino_tokenizers_available,
     is_openvino_version,
-    is_transformers_version,
+    is_transformers_version, is_nncf_available,
 )
 from optimum.utils.save_utils import maybe_load_preprocessors

-from .utils import clear_class_registry
-
+from .utils import clear_class_registry, _MAX_UNCOMPRESSED_SIZE


 if TYPE_CHECKING:
     from optimum.intel.openvino.configuration import OVConfig
@@ -402,7 +404,7 @@ class StoreAttr(object):
         model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code
     )

-    export_from_model(
+    submodel_paths = export_from_model(
         model=model,
         output=output,
         task=task,
@@ -425,6 +427,48 @@ class StoreAttr(object):
     del model
     gc.collect()

+    core = Core()
+    compressed_submodel_paths = []
+    for submodel_path in submodel_paths:
+        submodel_path = Path(output) / submodel_path
+        submodel = core.read_model(submodel_path)
+
+        quantization_config = ov_config.quantization_config if ov_config is not None else None
+        if ov_config is None:
+            num_parameters = 0
+            for op in submodel.get_ops():
+                if op.get_type_name() == "Constant" and op.get_element_type() in [Type.f16, Type.f32, Type.bf16]:
+                    num_parameters += reduce(operator.mul, op.shape, 1)
+            if num_parameters >= _MAX_UNCOMPRESSED_SIZE:
+                if is_nncf_available():
+                    quantization_config = {"bits": 8, "sym": False}
+                    logger.info("The model weights will be quantized to int8_asym.")
+                else:
+                    continue
+        if not quantization_config:
+            continue
+
+        if not is_nncf_available():
+            raise ImportError(
+                "Quantization of the weights requires nncf, please install it with `pip install nncf`"
+            )
+
+        from optimum.intel.openvino.quantization import _weight_only_quantization
+
+        _weight_only_quantization(submodel, quantization_config)
+
+        compressed_submodel_path = Path(str(submodel_path).replace(".xml", "_compressed.xml"))
+        save_model(submodel, compressed_submodel_path, compress_to_fp16=ov_config and ov_config.dtype == "fp16")
+        compressed_submodel_paths.append((submodel_path, compressed_submodel_path))
+
+        del submodel
+
+    for submodel_path, compressed_submodel_path in compressed_submodel_paths:
+        submodel_path.unlink()
+        submodel_path.with_suffix(".bin").unlink()
+        compressed_submodel_path.rename(submodel_path)
+        compressed_submodel_path.with_suffix(".bin").rename(submodel_path.with_suffix(".bin"))
+
     # Unpatch modules after GPTQ export
     if do_gptq_patching:
         torch.cuda.is_available = orig_cuda_check
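For reference, the size gate in the second hunk estimates a submodel's parameter count by summing the element counts of its floating-point Constant ops, and only applies default compression when that count reaches _MAX_UNCOMPRESSED_SIZE. Below is a minimal standalone sketch of that same check; the model path and the threshold value are illustrative assumptions, not taken from this diff.

import operator
from functools import reduce

from openvino.runtime import Core, Type

core = Core()
# Hypothetical path to an exported OpenVINO submodel.
submodel = core.read_model("openvino_model.xml")

# Every f16/f32/bf16 Constant is assumed to hold weights; its element
# count is the product of its shape dimensions.
num_parameters = 0
for op in submodel.get_ops():
    if op.get_type_name() == "Constant" and op.get_element_type() in [Type.f16, Type.f32, Type.bf16]:
        num_parameters += reduce(operator.mul, op.shape, 1)

MAX_UNCOMPRESSED_SIZE = int(1e9)  # illustrative threshold, not the value imported from .utils
print(f"compress by default: {num_parameters >= MAX_UNCOMPRESSED_SIZE}")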
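The {"bits": 8, "sym": False} config passed to _weight_only_quantization requests data-free asymmetric int8 weight compression. _weight_only_quantization is internal to optimum-intel, but a hedged sketch of roughly what that step plus the save amounts to, using NNCF's public compress_weights API (an assumption about equivalence, not the internal implementation), looks like this:

import nncf
from openvino.runtime import Core, save_model

core = Core()
submodel = core.read_model("openvino_model.xml")  # hypothetical path

# Data-free int8 asymmetric weight-only compression.
compressed = nncf.compress_weights(submodel, mode=nncf.CompressWeightsMode.INT8_ASYM)

# Write next to the original; the exporter then unlinks the original
# .xml/.bin pair and renames the compressed files into its place, as in the diff.
save_model(compressed, "openvino_model_compressed.xml", compress_to_fp16=False)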