Skip to content

Commit c94b3f5

Browse files
authored
Set FP16 KV-cache for non-quantized text models (huggingface#1043)
* Set FP16 KV-cache for non-quantized text models
* Style
1 parent ba45714 commit c94b3f5

File tree

3 files changed

+16
-4
lines changed

3 files changed

+16
-4
lines changed

optimum/exporters/openvino/__main__.py

+2
Original file line number | Diff line number | Diff line change
@@ -456,6 +456,8 @@ class StoreAttr(object):
456456
from optimum.intel.openvino.quantization import _weight_only_quantization
457457

458458
_weight_only_quantization(submodel, quantization_config)
459+
if "text-generation" in task:
460+
submodel.set_rt_info("u8", ["runtime_options", "KV_CACHE_PRECISION"])
459461

460462
compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
461463
save_model(submodel, compressed_submodel_path, compress_to_fp16=False)

optimum/exporters/openvino/convert.py

+13-4
Original file line number | Diff line number | Diff line change
@@ -99,11 +99,15 @@ def _set_runtime_options(
9999
],
100100
task: str,
101101
library_name: str,
102+
quantized_model: bool,
102103
):
103104
for model_name in models_and_export_configs.keys():
104105
_, sub_export_config = models_and_export_configs[model_name]
106+
sub_export_config.runtime_options = {}
105107
if "diffusers" in library_name or "text-generation" in task:
106-
sub_export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"}
108+
sub_export_config.runtime_options["ACTIVATIONS_SCALE_FACTOR"] = "8.0"
109+
if not quantized_model and "text-generation" in task:
110+
sub_export_config.runtime_options["KV_CACHE_PRECISION"] = "f16"
107111

108112

109113
def _save_model(
@@ -116,8 +120,8 @@ def _save_model(
116120
compress_to_fp16 = ov_config is not None and ov_config.dtype == "fp16"
117121
model = _add_version_info_to_model(model, library_name)
118122

119-
if hasattr(config, "runtime_options"):
120-
model = _add_runtime_options_to_rt_info(model, config.runtime_options)
123+
runtime_options = config.runtime_options if hasattr(config, "runtime_options") else {}
124+
model = _add_runtime_options_to_rt_info(model, runtime_options)
121125
save_model(model, path, compress_to_fp16)
122126
del model
123127
gc.collect()
@@ -755,7 +759,12 @@ def export_from_model(
755759

756760
model.save_config(output)
757761

758-
_set_runtime_options(models_and_export_configs, task, library_name)
762+
_set_runtime_options(
763+
models_and_export_configs,
764+
task,
765+
library_name,
766+
hasattr(ov_config, "quantization_config") and ov_config.quantization_config,
767+
)
759768

760769
export_models(
761770
models_and_export_configs=models_and_export_configs,

tests/openvino/test_export.py

+1
Original file line number | Diff line number | Diff line change
@@ -132,6 +132,7 @@ def _openvino_export(
132132
ov_model.model.get_rt_info()["optimum"]["transformers_version"], _transformers_version
133133
)
134134
self.assertTrue(ov_model.model.has_rt_info(["runtime_options", "ACTIVATIONS_SCALE_FACTOR"]))
135+
self.assertTrue(ov_model.model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))
135136

136137
if library_name == "diffusers":
137138
self.assertTrue(

0 commit comments

Comments
 (0)