
Commit f601b8b

Remove KV-cache compression disabling flag for compressed models (#1141)
* Remove kv cache compression disabling flag for compressed models
* Add kv-cache precision flag check to a separate method
* Add deprecation warning for
* Fix test
* Update optimum/intel/openvino/configuration.py
  Co-authored-by: Alexander Kozlov <alexander.kozlov@intel.com>

---------

Co-authored-by: Alexander Kozlov <alexander.kozlov@intel.com>
1 parent 61a74cd commit f601b8b
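
For context, the "KV-cache compression disabling flag" is a KV_CACHE_PRECISION entry stored under the model's runtime_options rt_info; this change removes that entry right after weight-only compression (and only when it is set to "f16") instead of during export. Below is a minimal, self-contained sketch of the rt_info round trip involved, using a toy single-op model as a stand-in for a compressed submodel; the hard-coded "f16" and "32" values are illustrative only.

import openvino as ov
from openvino.runtime import opset13 as ops

# Toy single-op model standing in for a compressed submodel.
x = ops.parameter([1, 4], ov.Type.f32, name="x")
toy = ov.Model([ops.relu(x)], [x], "toy")

# Emulate runtime options left on the model after weight compression.
toy.set_rt_info("f16", ["runtime_options", "KV_CACHE_PRECISION"])
toy.set_rt_info("32", ["runtime_options", "DYNAMIC_QUANTIZATION_GROUP_SIZE"])

# Drop the KV-cache entry and write the remaining options back
# (the real code in _weight_only_quantization does this only when the value is "f16").
rt_options = toy.get_rt_info("runtime_options").value
rt_options.pop("KV_CACHE_PRECISION", None)
toy.set_rt_info(rt_options, "runtime_options")

print(toy.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))  # expected: False
print(toy.has_rt_info(["runtime_options", "DYNAMIC_QUANTIZATION_GROUP_SIZE"]))  # expected: True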

6 files changed (+85 −59 lines changed)

optimum/exporters/openvino/__main__.py

+0 −6
@@ -493,12 +493,6 @@ class StoreAttr(object):
         from optimum.intel.openvino.quantization import _weight_only_quantization

         _weight_only_quantization(submodel, quantization_config)
-        # kv cache compression disabled if quantization config is not provided,
-        # to keep aligned result of applying auto int8 compression and via explicit setting config, we should update it
-        if submodel.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]):
-            prev_rt_info = submodel.get_rt_info("runtime_options").value
-            prev_rt_info.pop("KV_CACHE_PRECISION")
-            submodel.set_rt_info(prev_rt_info, "runtime_options")
         compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
         save_model(submodel, compressed_submodel_path, compress_to_fp16=False)
         del submodel

optimum/intel/openvino/configuration.py

+7 −0
@@ -578,6 +578,13 @@ def __init__(
     ):
         super().__init__(bits=bits, sym=sym, group_size=weights_group_size, **kwargs)
         self.activations_group_size = activations_group_size
+        logger.warning(
+            "OVDynamicQuantizationConfig is deprecated and will be removed in optimum-intel v1.24.0. "
+            "Dynamic quantization and KV cache compression are enabled by default starting from OpenVINO 2024.6 and "
+            "there is no need to enable them manually. If you need precise control over these parameters, please "
+            "provide `DYNAMIC_QUANTIZATION_GROUP_SIZE` and `KV_CACHE_PRECISION` with `ov_config` argument during model "
+            "inference."
+        )


 @dataclass
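
As the new warning suggests, the replacement for OVDynamicQuantizationConfig is to pass these runtime options via the ov_config argument at inference time. A minimal sketch; the model ID is a placeholder and the "u8"/"32" values are illustrative rather than prescribed defaults:

from optimum.intel import OVModelForCausalLM

# Explicit control over KV-cache precision and dynamic-quantization group size.
model = OVModelForCausalLM.from_pretrained(
    "model_id_or_path",  # placeholder
    ov_config={
        "KV_CACHE_PRECISION": "u8",
        "DYNAMIC_QUANTIZATION_GROUP_SIZE": "32",
    },
)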

optimum/intel/openvino/quantization.py

+10 −1
@@ -1042,7 +1042,7 @@ def _weight_only_quantization(
         else:
             mode = CompressWeightsMode.INT4_SYM if config.sym else CompressWeightsMode.INT4_ASYM

-    return nncf.compress_weights(
+    compressed_model = nncf.compress_weights(
         model,
         mode=mode,
         ratio=config.ratio,
@@ -1060,6 +1060,15 @@ def _weight_only_quantization(
         **kwargs,
     )

+    # If KV cache compression was disabled, remove the disabling flag from the model
+    if compressed_model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]):
+        prev_rt_info = compressed_model.get_rt_info("runtime_options").value
+        if prev_rt_info["KV_CACHE_PRECISION"] == "f16":
+            prev_rt_info.pop("KV_CACHE_PRECISION")
+            compressed_model.set_rt_info(prev_rt_info, "runtime_options")
+
+    return compressed_model
+

 def _full_quantization(
     model: openvino.runtime.Model,
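
The observable effect matches what the updated tests assert: submodels exported with default int8 compression no longer carry the disabling flag. A small check along those lines, assuming a hypothetical path to one of the exported submodels:

import openvino as ov

core = ov.Core()
# Placeholder path to an exported, compressed submodel.
submodel = core.read_model("exported_model/openvino_language_model.xml")

print(submodel.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))  # expected: False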

tests/openvino/test_exporters_cli.py

+21 −21
@@ -21,7 +21,7 @@
 from utils_tests import (
     _ARCHITECTURES_TO_EXPECTED_INT8,
     MODEL_NAMES,
-    compare_num_quantized_nodes_per_model,
+    check_compression_state_per_model,
     get_num_quantized_nodes,
 )

@@ -192,27 +192,27 @@ class OVCLIExportTestCase(unittest.TestCase):
                "image-text-to-text",
                "llava_next",
                "int4 --group-size 16 --ratio 0.8",
-                [{"int8": 14, "int4": 16}, {"int8": 9}, {"int8": 1}],
+                [{"int8": 14, "int4": 16}, {"int8": 1}, {"int8": 9}],
            ),
            (
                "image-text-to-text",
                "llava_next",
                'int4 --group-size 16 --ratio 0.8 --sensitivity-metric "hessian_input_activation" '
                "--dataset contextual --num-samples 1",
-                [{"int8": 6, "int4": 24}, {"int8": 9}, {"int8": 1}],
+                [{"int8": 6, "int4": 24}, {"int8": 1}, {"int8": 9}],
            ),
            (
                "image-text-to-text",
                "nanollava",
                "int4 --group-size 8 --ratio 0.8 --trust-remote-code",
-                [{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}],
+                [{"int8": 16, "int4": 14}, {"int8": 1}, {"int8": 15}],
            ),
            (
                "image-text-to-text",
                "nanollava",
                'int4 --group-size 8 --ratio 0.8 --sensitivity-metric "mean_activation_variance" '
                "--dataset contextual --num-samples 1 --trust-remote-code",
-                [{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}],
+                [{"int8": 16, "int4": 14}, {"int8": 1}, {"int8": 15}],
            ),
        ]
    )
@@ -224,40 +224,40 @@ class OVCLIExportTestCase(unittest.TestCase):
                "image-text-to-text",
                "minicpmv",
                "int4 --group-size 4 --ratio 0.8 --trust-remote-code",
-                [{"int8": 10, "int4": 20}, {"int8": 26}, {"int8": 1}, {"int8": 6}],
+                [{"int8": 10, "int4": 20}, {"int8": 1}, {"int8": 26}, {"int8": 6}],
            ),
            (
                "image-text-to-text",
                "minicpmv",
                'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" '
                "--dataset contextual --num-samples 1 --trust-remote-code",
-                [{"int8": 8, "int4": 22}, {"int8": 26}, {"int8": 1}, {"int8": 6}],
+                [{"int8": 8, "int4": 22}, {"int8": 1}, {"int8": 26}, {"int8": 6}],
            ),
            (
                "image-text-to-text",
                "internvl2",
                "int4 --group-size 4 --ratio 0.8 --trust-remote-code",
-                [{"int8": 8, "int4": 22}, {"int8": 11}, {"int8": 1}],
+                [{"int8": 8, "int4": 22}, {"int8": 1}, {"int8": 11}],
            ),
            (
                "image-text-to-text",
                "internvl2",
                'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" '
                "--dataset contextual --num-samples 1 --trust-remote-code",
-                [{"int8": 8, "int4": 22}, {"int8": 11}, {"int8": 1}],
+                [{"int8": 8, "int4": 22}, {"int8": 1}, {"int8": 11}],
            ),
            (
                "image-text-to-text",
                "phi3_v",
                "int4 --group-size 4 --ratio 0.8 --trust-remote-code",
-                [{"int8": 8, "int4": 10}, {"int8": 7}, {"int8": 1}, {"int8": 2}],
+                [{"int8": 8, "int4": 10}, {"int8": 1}, {"int8": 7}, {"int8": 2}],
            ),
            (
                "image-text-to-text",
                "phi3_v",
                'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" '
                "--dataset contextual --num-samples 1 --trust-remote-code",
-                [{"int8": 4, "int4": 14}, {"int8": 7}, {"int8": 1}, {"int8": 2}],
+                [{"int8": 4, "int4": 14}, {"int8": 1}, {"int8": 7}, {"int8": 2}],
            ),
            (
                "image-text-to-text",
@@ -369,14 +369,15 @@ def test_exporters_cli_int8(self, task: str, model_type: str):
                model.text_encoder if model_type in ["stable-diffusion", "sana"] else model.text_encoder_2
            )
        elif task.startswith("image-text-to-text"):
-            models = [model.language_model, model.vision_embeddings]
+            models = list(model.submodels.values())
        else:
            models = [model]

        expected_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type]
-        for i, model in enumerate(models):
-            _, num_weight_nodes = get_num_quantized_nodes(model)
-            self.assertEqual(expected_int8[i], num_weight_nodes["int8"])
+        expected_int8 = [{"int8": it} for it in expected_int8]
+        if task.startswith("text2text-generation") and (not task.endswith("with-past") or model.decoder.stateful):
+            expected_int8 = expected_int8[:2]
+        check_compression_state_per_model(self, models, expected_int8)

    @parameterized.expand(SUPPORTED_SD_HYBRID_ARCHITECTURES)
    def test_exporters_cli_hybrid_quantization(
@@ -389,11 +390,11 @@ def test_exporters_cli_hybrid_quantization(
            check=True,
        )
        model = eval(_HEAD_TO_AUTOMODELS[model_type.replace("-refiner", "")]).from_pretrained(tmpdir)
-        num_fake_nodes, num_weight_nodes = get_num_quantized_nodes(
-            model.unet if model.unet is not None else model.transformer
-        )
+        vision_model = model.unet.model if model.unet is not None else model.transformer.model
+        num_fake_nodes, num_weight_nodes = get_num_quantized_nodes(vision_model)
        self.assertEqual(expected_int8_nodes, num_weight_nodes["int8"])
        self.assertEqual(expected_fake_nodes, num_fake_nodes)
+        self.assertFalse(vision_model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))

    @parameterized.expand(TEST_4BIT_CONFIGURATIONS)
    def test_exporters_cli_4bit(
@@ -419,10 +420,9 @@ def test_exporters_cli_4bit(
        if task == "text-generation-with-past":
            submodels = [model]
        elif task == "image-text-to-text":
-            submodels = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model]
-            submodels += [getattr(model, part) for part in model.additional_parts]
+            submodels = list(model.submodels.values())

-        compare_num_quantized_nodes_per_model(self, submodels, expected_num_weight_nodes_per_model)
+        check_compression_state_per_model(self, submodels, expected_num_weight_nodes_per_model)

        self.assertTrue("--awq" not in option or b"Applying AWQ" in result.stdout)
        self.assertTrue("--scale-estimation" not in option or b"Applying Scale Estimation" in result.stdout)
