Skip to content

Commit 28dfaf6

Browse files
committed
Align rt_info of int8 models compressed by default and via config
1 parent 3ef8ae2 commit 28dfaf6

File tree

2 files changed

+7
-1
lines changed

2 files changed

+7
-1
lines changed

optimum/commands/export/openvino.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@ def run(self):
345345
is_int8 = self.args.weight_format == "int8"
346346
quantization_config = {
347347
"bits": 8 if is_int8 else 4,
348-
"ratio": 1 if is_int8 else (self.args.ratio or _DEFAULT_4BIT_CONFIG["ratio"]),
348+
"ratio": 1.0 if is_int8 else (self.args.ratio or _DEFAULT_4BIT_CONFIG["ratio"]),
349349
"sym": self.args.sym or False,
350350
"group_size": -1 if is_int8 else self.args.group_size,
351351
"all_layers": None if is_int8 else self.args.all_layers,

optimum/exporters/openvino/__main__.py

+6
Original file line numberDiff line numberDiff line change
@@ -488,6 +488,12 @@ class StoreAttr(object):
488488
from optimum.intel.openvino.quantization import _weight_only_quantization
489489

490490
_weight_only_quantization(submodel, quantization_config)
491+
# KV cache compression is disabled if the quantization config is not provided,
492+
# so to keep the result of automatic int8 compression aligned with that of an explicitly set config, we should update it
493+
if submodel.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]):
494+
prev_rt_info = submodel.get_rt_info("runtime_options").value
495+
prev_rt_info.pop("KV_CACHE_PRECISION")
496+
submodel.set_rt_info(prev_rt_info, "runtime_options")
491497
compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
492498
save_model(submodel, compressed_submodel_path, compress_to_fp16=False)
493499
del submodel

0 commit comments

Comments
 (0)