File tree 2 files changed +7
-1
lines changed
2 files changed +7
-1
lines changed Original file line number Diff line number Diff line change @@ -345,7 +345,7 @@ def run(self):
345
345
is_int8 = self .args .weight_format == "int8"
346
346
quantization_config = {
347
347
"bits" : 8 if is_int8 else 4 ,
348
- "ratio" : 1 if is_int8 else (self .args .ratio or _DEFAULT_4BIT_CONFIG ["ratio" ]),
348
+ "ratio" : 1.0 if is_int8 else (self .args .ratio or _DEFAULT_4BIT_CONFIG ["ratio" ]),
349
349
"sym" : self .args .sym or False ,
350
350
"group_size" : - 1 if is_int8 else self .args .group_size ,
351
351
"all_layers" : None if is_int8 else self .args .all_layers ,
Original file line number Diff line number Diff line change @@ -488,6 +488,12 @@ class StoreAttr(object):
488
488
from optimum .intel .openvino .quantization import _weight_only_quantization
489
489
490
490
_weight_only_quantization (submodel , quantization_config )
491
+ # KV cache compression is disabled when no quantization config is provided;
492
+ # to keep the result of auto int8 compression aligned with that of an explicitly set config, remove KV_CACHE_PRECISION from the runtime options
493
+ if submodel .has_rt_info (["runtime_options" , "KV_CACHE_PRECISION" ]):
494
+ prev_rt_info = submodel .get_rt_info ("runtime_options" ).value
495
+ prev_rt_info .pop ("KV_CACHE_PRECISION" )
496
+ submodel .set_rt_info (prev_rt_info , "runtime_options" )
491
497
compressed_submodel_path = submodel_path .parent / f"{ submodel_path .stem } _compressed.xml"
492
498
save_model (submodel , compressed_submodel_path , compress_to_fp16 = False )
493
499
del submodel
You can’t perform that action at this time.
0 commit comments