Commit eec3b7a · committed Feb 5, 2025 · 1 parent cd44f82

Remove kv cache compression disabling flag for compressed models

4 files changed: +84 −55 lines

optimum/exporters/openvino/__main__.py (+0 −6)

@@ -493,12 +493,6 @@ class StoreAttr(object):
             from optimum.intel.openvino.quantization import _weight_only_quantization

             _weight_only_quantization(submodel, quantization_config)
-            # kv cache compression disabled if quantization config is not provided,
-            # to keep aligned result of applying auto int8 compression and via explicit setting config, we should update it
-            if submodel.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]):
-                prev_rt_info = submodel.get_rt_info("runtime_options").value
-                prev_rt_info.pop("KV_CACHE_PRECISION")
-                submodel.set_rt_info(prev_rt_info, "runtime_options")
             compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
             save_model(submodel, compressed_submodel_path, compress_to_fp16=False)
             del submodel
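
This is the removal side of the commit: the cleanup no longer happens only when compression is invoked from `main_export`. The equivalent (and now guarded) logic moves into `_weight_only_quantization` itself, shown in the next file, so it applies on every weight-compression path.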

optimum/intel/openvino/quantization.py (+10 −1)

@@ -1042,7 +1042,7 @@ def _weight_only_quantization(
     else:
         mode = CompressWeightsMode.INT4_SYM if config.sym else CompressWeightsMode.INT4_ASYM

-    return nncf.compress_weights(
+    compressed_model = nncf.compress_weights(
         model,
         mode=mode,
         ratio=config.ratio,
@@ -1060,6 +1060,15 @@ def _weight_only_quantization(
         **kwargs,
     )

+    # If KV cache compression was disabled, remove the disabling flag from the model
+    if compressed_model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]):
+        prev_rt_info = compressed_model.get_rt_info("runtime_options").value
+        if prev_rt_info["KV_CACHE_PRECISION"] == "f16":
+            prev_rt_info.pop("KV_CACHE_PRECISION")
+            compressed_model.set_rt_info(prev_rt_info, "runtime_options")
+
+    return compressed_model


 def _full_quantization(
     model: openvino.runtime.Model,
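
For context, the `KV_CACHE_PRECISION` hint lives in the model's `rt_info`: exporting without a quantization config stamps it to `f16`, which tells the runtime not to compress the KV cache, and `_weight_only_quantization` now strips it again after compressing weights. Unlike the removed block in `__main__.py`, the new code only pops the flag when its value is `f16`, so a deliberately set custom precision would survive. A minimal standalone sketch of the round trip follows; the toy model construction is illustrative, only the rt_info calls mirror the diff:

    import openvino as ov
    import openvino.runtime.opset13 as ops

    # Toy one-op model; any openvino.Model carries rt_info the same way.
    x = ops.parameter([1], ov.Type.f32, name="x")
    model = ov.Model([ops.relu(x)], [x])

    # What export-without-quantization effectively records:
    model.set_rt_info("f16", ["runtime_options", "KV_CACHE_PRECISION"])

    # What _weight_only_quantization now does after compression:
    if model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]):
        rt_info = model.get_rt_info("runtime_options").value
        if rt_info["KV_CACHE_PRECISION"] == "f16":
            rt_info.pop("KV_CACHE_PRECISION")
            model.set_rt_info(rt_info, "runtime_options")

    assert not model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])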

tests/openvino/test_exporters_cli.py (+27 −22)

@@ -192,27 +192,27 @@ class OVCLIExportTestCase(unittest.TestCase):
             "image-text-to-text",
             "llava_next",
             "int4 --group-size 16 --ratio 0.8",
-            [{"int8": 14, "int4": 16}, {"int8": 9}, {"int8": 1}],
+            [{"int8": 14, "int4": 16}, {"int8": 1}, {"int8": 9}],
         ),
         (
             "image-text-to-text",
             "llava_next",
             'int4 --group-size 16 --ratio 0.8 --sensitivity-metric "hessian_input_activation" '
             "--dataset contextual --num-samples 1",
-            [{"int8": 6, "int4": 24}, {"int8": 9}, {"int8": 1}],
+            [{"int8": 6, "int4": 24}, {"int8": 1}, {"int8": 9}],
         ),
         (
             "image-text-to-text",
             "nanollava",
             "int4 --group-size 8 --ratio 0.8 --trust-remote-code",
-            [{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}],
+            [{"int8": 16, "int4": 14}, {"int8": 1}, {"int8": 15}],
         ),
         (
             "image-text-to-text",
             "nanollava",
             'int4 --group-size 8 --ratio 0.8 --sensitivity-metric "mean_activation_variance" '
             "--dataset contextual --num-samples 1 --trust-remote-code",
-            [{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}],
+            [{"int8": 16, "int4": 14}, {"int8": 1}, {"int8": 15}],
         ),
     ]
 )
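
Positions two and three of each expected-counts list swap throughout this file (and in tests/openvino/test_quantization.py below) because the tests now iterate `model.submodels`, which apparently yields the text-embeddings submodel before the vision-embeddings submodel. For orientation, each row above drives an export along the lines of `optimum-cli export openvino --model <model_id> --task image-text-to-text --weight-format int4 --group-size 16 --ratio 0.8 <output_dir>` (placeholders in angle brackets are illustrative, not values from this commit).
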
@@ -224,40 +224,40 @@ class OVCLIExportTestCase(unittest.TestCase):
             "image-text-to-text",
             "minicpmv",
             "int4 --group-size 4 --ratio 0.8 --trust-remote-code",
-            [{"int8": 10, "int4": 20}, {"int8": 26}, {"int8": 1}, {"int8": 6}],
+            [{"int8": 10, "int4": 20}, {"int8": 1}, {"int8": 26}, {"int8": 6}],
         ),
         (
             "image-text-to-text",
             "minicpmv",
             'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" '
             "--dataset contextual --num-samples 1 --trust-remote-code",
-            [{"int8": 8, "int4": 22}, {"int8": 26}, {"int8": 1}, {"int8": 6}],
+            [{"int8": 8, "int4": 22}, {"int8": 1}, {"int8": 26}, {"int8": 6}],
         ),
         (
             "image-text-to-text",
             "internvl2",
             "int4 --group-size 4 --ratio 0.8 --trust-remote-code",
-            [{"int8": 8, "int4": 22}, {"int8": 11}, {"int8": 1}],
+            [{"int8": 8, "int4": 22}, {"int8": 1}, {"int8": 11}],
         ),
         (
             "image-text-to-text",
             "internvl2",
             'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" '
             "--dataset contextual --num-samples 1 --trust-remote-code",
-            [{"int8": 8, "int4": 22}, {"int8": 11}, {"int8": 1}],
+            [{"int8": 8, "int4": 22}, {"int8": 1}, {"int8": 11}],
         ),
         (
             "image-text-to-text",
             "phi3_v",
             "int4 --group-size 4 --ratio 0.8 --trust-remote-code",
-            [{"int8": 8, "int4": 10}, {"int8": 7}, {"int8": 1}, {"int8": 2}],
+            [{"int8": 8, "int4": 10}, {"int8": 1}, {"int8": 7}, {"int8": 2}],
         ),
         (
             "image-text-to-text",
             "phi3_v",
             'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" '
             "--dataset contextual --num-samples 1 --trust-remote-code",
-            [{"int8": 4, "int4": 14}, {"int8": 7}, {"int8": 1}, {"int8": 2}],
+            [{"int8": 4, "int4": 14}, {"int8": 1}, {"int8": 7}, {"int8": 2}],
         ),
         (
             "image-text-to-text",
@@ -356,27 +356,31 @@ def test_exporters_cli_int8(self, task: str, model_type: str):
         ).from_pretrained(tmpdir, **model_kwargs)

         if task.startswith("text2text-generation"):
-            models = [model.encoder, model.decoder]
+            models = [model.encoder.model, model.decoder.model]
             if task.endswith("with-past") and not model.decoder.stateful:
-                models.append(model.decoder_with_past)
+                models.append(model.decoder_with_past.model)
         elif (
             model_type.startswith("stable-diffusion")
             or model_type.startswith("flux")
             or model_type.startswith("sana")
         ):
-            models = [model.unet or model.transformer, model.vae_encoder, model.vae_decoder]
+            vision_model = model.unet.model if model.unet is not None else model.transformer.model
+            models = [vision_model, model.vae_encoder.model, model.vae_decoder.model]
             models.append(
-                model.text_encoder if model_type in ["stable-diffusion", "sana"] else model.text_encoder_2
+                model.text_encoder.model
+                if model_type in ["stable-diffusion", "sana"]
+                else model.text_encoder_2.model
             )
         elif task.startswith("image-text-to-text"):
-            models = [model.language_model, model.vision_embeddings]
+            models = list(model.submodels.values())
         else:
-            models = [model]
+            models = [model.model]

         expected_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type]
         for i, model in enumerate(models):
             _, num_weight_nodes = get_num_quantized_nodes(model)
             self.assertEqual(expected_int8[i], num_weight_nodes["int8"])
+            self.assertFalse(model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))

     @parameterized.expand(SUPPORTED_SD_HYBRID_ARCHITECTURES)
     def test_exporters_cli_hybrid_quantization(
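
Note the pattern in this hunk: the collected objects are now the underlying `openvino.Model` handles (`model.encoder.model`, `model.vae_encoder.model`, ...) rather than the optimum wrappers, since `has_rt_info` lives on `openvino.Model`; the new assertion then verifies that no compressed submodel still carries the KV-cache disabling flag.
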
@@ -389,11 +393,11 @@ def test_exporters_cli_hybrid_quantization(
             check=True,
         )
         model = eval(_HEAD_TO_AUTOMODELS[model_type.replace("-refiner", "")]).from_pretrained(tmpdir)
-        num_fake_nodes, num_weight_nodes = get_num_quantized_nodes(
-            model.unet if model.unet is not None else model.transformer
-        )
+        vision_model = model.unet.model if model.unet is not None else model.transformer.model
+        num_fake_nodes, num_weight_nodes = get_num_quantized_nodes(vision_model)
         self.assertEqual(expected_int8_nodes, num_weight_nodes["int8"])
         self.assertEqual(expected_fake_nodes, num_fake_nodes)
+        self.assertFalse(vision_model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))

     @parameterized.expand(TEST_4BIT_CONFIGURATIONS)
     def test_exporters_cli_4bit(
@@ -417,10 +421,11 @@ def test_exporters_cli_4bit(

         submodels = []
         if task == "text-generation-with-past":
-            submodels = [model]
+            submodels = [model.model]
         elif task == "image-text-to-text":
-            submodels = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model]
-            submodels += [getattr(model, part) for part in model.additional_parts]
+            submodels = list(model.submodels.values())
+        for submodel in submodels:
+            self.assertFalse(submodel.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))

         compare_num_quantized_nodes_per_model(self, submodels, expected_num_weight_nodes_per_model)
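
Throughout these tests the hand-maintained submodel lists (`[model.lm_model, model.vision_embeddings_model, model.text_embeddings_model]` plus `additional_parts`) give way to `list(model.submodels.values())`, so new architectures are covered without updating each test by hand.
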
‎tests/openvino/test_quantization.py
(+47 −26)

@@ -450,7 +450,7 @@ class OVWeightCompressionTest(unittest.TestCase):
                 num_samples=1,
                 processor=MODEL_NAMES["llava_next"],
             ),
-            [{"int8": 6, "int4": 24}, {"int8": 9}, {"int8": 1}],
+            [{"int8": 6, "int4": 24}, {"int8": 1}, {"int8": 9}],
         ),
         (
             OVModelForVisualCausalLM,
@@ -467,7 +467,7 @@ class OVWeightCompressionTest(unittest.TestCase):
                 tokenizer=MODEL_NAMES["nanollava"],
                 trust_remote_code=True,
             ),
-            [{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}],
+            [{"int8": 16, "int4": 14}, {"int8": 1}, {"int8": 15}],
         ),
     ]
 )
@@ -489,7 +489,7 @@ class OVWeightCompressionTest(unittest.TestCase):
                 processor=MODEL_NAMES["minicpmv"],
                 trust_remote_code=True,
             ),
-            [{"int8": 8, "int4": 22}, {"int8": 26}, {"int8": 1}, {"int8": 6}],
+            [{"int8": 8, "int4": 22}, {"int8": 1}, {"int8": 26}, {"int8": 6}],
         ),
         (
             OVModelForVisualCausalLM,
@@ -504,7 +504,7 @@ class OVWeightCompressionTest(unittest.TestCase):
                 num_samples=1,
                 trust_remote_code=True,
             ),
-            [{"int8": 8, "int4": 22}, {"int8": 11}, {"int8": 1}],
+            [{"int8": 8, "int4": 22}, {"int8": 1}, {"int8": 11}],
         ),
         (
             OVModelForVisualCausalLM,
@@ -519,7 +519,7 @@ class OVWeightCompressionTest(unittest.TestCase):
                 num_samples=1,
                 trust_remote_code=True,
             ),
-            [{"int8": 4, "int4": 14}, {"int8": 7}, {"int8": 1}, {"int8": 2}],
+            [{"int8": 4, "int4": 14}, {"int8": 1}, {"int8": 7}, {"int8": 2}],
         ),
         (
             OVModelForVisualCausalLM,
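
The same position swap as in test_exporters_cli.py: the second and third expected-count dictionaries trade places to follow the `model.submodels` iteration order.
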
@@ -610,6 +610,7 @@ def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_i
             if isinstance(v, Enum):
                 original_config_as_dict[k] = v.value
         self.assertEqual(original_config_as_dict, loaded_config.quantization_config.to_dict())
+        self.assertFalse(model.model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))

     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS)
     def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_pt_int8, expected_ov_int8):
@@ -636,6 +637,7 @@ def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_p
         # Verify that the configuration is correctly saved and loaded
         loaded_config = OVConfig.from_pretrained(tmp_dir)
         self.assertEqual(OVWeightQuantizationConfig().to_dict(), loaded_config.quantization_config.to_dict())
+        self.assertFalse(model.model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))

     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS)
     def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_int8_nodes, expected_int4_nodes):
@@ -663,6 +665,7 @@ def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_i
         # Verify that the configuration is correctly saved and loaded
         loaded_config = OVConfig.from_pretrained(tmp_dir)
         self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict())
+        self.assertFalse(model.model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))

     @parameterized.expand(SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS)
     def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_name, expected_pt_int8, expected_ov_int8):
@@ -688,6 +691,7 @@ def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_name, e
         # Verify that the configuration is correctly saved and loaded
         loaded_config = OVConfig.from_pretrained(tmp_dir)
         self.assertEqual(OVWeightQuantizationConfig().to_dict(), loaded_config.quantization_config.to_dict())
+        self.assertFalse(model.model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))

     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION)
     def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type, trust_remote_code):
@@ -709,19 +713,20 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type, trust
         self.assertEqual(model._openvino_config.dtype, "int8")

         if model.export_feature.startswith("text2text-generation"):
-            models = [model.encoder, model.decoder]
+            models = [model.encoder.model, model.decoder.model]
             if model.decoder_with_past is not None:
-                models.append(model.decoder_with_past)
+                models.append(model.decoder_with_past.model)
         elif model.export_feature == "text-to-image":
-            models = [model.unet, model.vae_encoder, model.vae_decoder]
-            models.append(model.text_encoder if model_type == "stable-diffusion" else model.text_encoder_2)
+            models = [model.unet.model, model.vae_encoder.model, model.vae_decoder.model]
+            models.append(
+                model.text_encoder.model if model_type in ["stable-diffusion", "sana"] else model.text_encoder_2.model
+            )
         elif model_type == "open-clip":
             models = [model.text_model, model.visual_model]
         elif model.export_feature == "image-text-to-text":
-            models = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model]
-            models += [getattr(model, part) for part in model.additional_parts]
+            models = list(model.submodels.values())
         else:
-            models = [model]
+            models = [model.model]

         if model_type == "open-clip":
             pytest.skip(reason="ticket 161043")
@@ -734,6 +739,7 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type, trust
         for i, model in enumerate(models):
             _, num_weight_nodes = get_num_quantized_nodes(model)
             self.assertEqual(expected_ov_int8[i], num_weight_nodes["int8"])
+            self.assertFalse(model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))

     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION)
     def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_fake_nodes, expected_int8_nodes):
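
Two adjustments ride along in this hunk: the text-to-image branch now also selects `text_encoder` for `sana` pipelines (previously only `stable-diffusion`), and every branch unwraps to the raw `openvino.Model` so the added rt_info assertion can run on it.
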
@@ -834,9 +840,10 @@ def test_ovmodel_4bit_auto_compression_with_config(
         if isinstance(model, OVModelForCausalLM):
             submodels = [model.model]
         elif isinstance(model, OVModelForVisualCausalLM):
-            submodels = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model]
-            submodels += [getattr(model, part) for part in model.additional_parts]
+            submodels = list(model.submodels.values())
         compare_num_quantized_nodes_per_model(self, submodels, expected_num_weight_nodes_per_model)
+        for submodel in submodels:
+            self.assertFalse(submodel.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))

         model.save_pretrained(tmp_dir)
         # At the moment the first model in the list is the only one we apply data-aware compression to
@@ -863,35 +870,45 @@ def test_ovmodel_stateful_load_with_compressed_weights(self, model_cls, model_ty
         expected_ov_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type][0]
         _, num_weight_nodes = get_num_quantized_nodes(model)
         self.assertEqual(expected_ov_int8, num_weight_nodes["int8"])
+        self.assertFalse(model.model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))

     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION)
     def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type, trust_remote_code):
         model = model_cls.from_pretrained(
             MODEL_NAMES[model_type], export=True, load_in_8bit=False, trust_remote_code=trust_remote_code
         )
         if model.export_feature.startswith("text2text-generation"):
-            models = [model.encoder, model.decoder]
+            models = [model.encoder.model, model.decoder.model]
             if model.decoder_with_past is not None:
-                models.append(model.decoder_with_past)
+                models.append(model.decoder_with_past.model)
         elif model.export_feature == "text-to-image":
-            models = [model.unet, model.vae_encoder, model.vae_decoder]
-            models.append(model.text_encoder if model_type == "stable-diffusion" else model.text_encoder_2)
+            models = [model.unet.model, model.vae_encoder.model, model.vae_decoder.model]
+            models.append(
+                model.text_encoder.model if model_type in ["stable-diffusion", "sana"] else model.text_encoder_2.model
+            )
         elif model_type == "open-clip":
             models = [model.text_model, model.visual_model]
         elif model.export_feature == "image-text-to-text":
-            models = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model]
-            models += [getattr(model, part) for part in model.additional_parts]
+            models = list(model.submodels.values())
         else:
-            models = [model]
+            models = [model.model]

-        for i, model in enumerate(models):
-            _, num_weight_nodes = get_num_quantized_nodes(model)
+        for i, submodel in enumerate(models):
+            _, num_weight_nodes = get_num_quantized_nodes(submodel)
             self.assertEqual(0, num_weight_nodes["int8"])
+            if "text-generation" in model.export_feature or ("image-text-to-text" in model.export_feature and i == 0):
+                self.assertTrue(submodel.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))
+                kv_cache_precision = submodel.get_rt_info(["runtime_options", "KV_CACHE_PRECISION"]).value
+                self.assertTrue(kv_cache_precision == "f16")

     def test_ovmodel_load_large_model_with_default_compressed_weights(self):
+        compressed_model_mock_obj = unittest.mock.Mock()
+        compressed_model_mock_obj.has_rt_info.return_value = False
+
         def main_export_in_stacktrace(*args, **kwargs):
             # Compression was called from `main_export`
             self.assertTrue(inspect.stack()[5].function == "main_export")
+            return compressed_model_mock_obj

         with unittest.mock.patch(
             "openvino.runtime.op.Constant.shape", new_callable=unittest.mock.PropertyMock
@@ -929,15 +946,20 @@ def test_ovmodel_load_large_model_with_uncompressed_weights(self):
         ) as ov_constant_shape:
             ov_constant_shape.return_value = (2000000000,)
             with unittest.mock.patch("nncf.compress_weights") as compress_weights_patch:
-                _ = OVModelForCausalLM.from_pretrained(
+                model = OVModelForCausalLM.from_pretrained(
                     MODEL_NAMES["llama"], export=True, load_in_8bit=False, compile=False, use_cache=False
                 )
                 compress_weights_patch.assert_not_called()
+                self.assertTrue(model.model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))

     def test_ovmodel_load_large_model_with_additional_quantization_config(self):
+        compressed_model_mock_obj = unittest.mock.Mock()
+        compressed_model_mock_obj.has_rt_info.return_value = False
+
         def main_export_not_in_stacktrace(*args, **kwargs):
             # Compression was not called from `main_export`
             self.assertTrue(all(frame_info.function != "main_export" for frame_info in inspect.stack()))
+            return compressed_model_mock_obj

         with unittest.mock.patch(
             "openvino.runtime.op.Constant.shape", new_callable=unittest.mock.PropertyMock
@@ -990,8 +1012,7 @@ def test_ovmodel_4bit_dynamic_with_config(
         if isinstance(model, OVModelForCausalLM):
             submodels = [model.model]
         elif isinstance(model, OVModelForVisualCausalLM):
-            submodels = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model]
-            submodels += [getattr(model, part) for part in model.additional_parts]
+            submodels = list(model.submodels.values())
         compare_num_quantized_nodes_per_model(self, submodels, expected_num_weight_nodes_per_model)

         model.save_pretrained(tmp_dir)

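A note on the two mock-based large-model tests above: `_weight_only_quantization` now calls `has_rt_info` on whatever `nncf.compress_weights` returns, so the patched side-effect functions must hand back an object that can answer it. Roughly, the pattern looks like this (a sketch of the diff's own mock setup, not a new API):

    import unittest.mock

    compressed_model_mock_obj = unittest.mock.Mock()
    # A bare Mock would return a truthy Mock from has_rt_info(), steering
    # _weight_only_quantization into the rt_info-stripping branch; an explicit
    # False makes it skip that branch.
    compressed_model_mock_obj.has_rt_info.return_value = False

    with unittest.mock.patch("nncf.compress_weights") as compress_weights_patch:
        compress_weights_patch.side_effect = lambda *a, **kw: compressed_model_mock_obj
        ...  # exporting a (mocked) large model now exercises the new check safely

By contrast, `test_ovmodel_load_large_model_with_uncompressed_weights` asserts the positive case: with `load_in_8bit=False` the exported model must still carry `KV_CACHE_PRECISION == "f16"`.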