
Commit 91bedf9

Add kv-cache precision flag check to a separate method
1 parent eec3b7a commit 91bedf9

File tree

3 files changed: 39 additions, 51 deletions

tests/openvino/test_exporters_cli.py (+10 -17)
tests/openvino/test_quantization.py (+21 -28)
tests/openvino/utils_tests.py (+8 -6)
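For context: the KV_CACHE_PRECISION flag these tests check lives in each model's rt_info, OpenVINO's per-model metadata store, under the ["runtime_options", "KV_CACHE_PRECISION"] path. A minimal sketch of setting and querying such a flag on a toy graph (the set_rt_info call illustrates how an exporter might stamp the hint; it is not the actual export code):

import openvino as ov
from openvino.runtime import opset13 as ops

# Toy model: one parameter through a relu, just to have a valid ov.Model.
param = ops.parameter([1], ov.Type.f32, name="x")
model = ov.Model([ops.relu(param)], [param], "toy")

# Illustrative only: stamp the runtime option the way an exporter might.
model.set_rt_info("f16", ["runtime_options", "KV_CACHE_PRECISION"])

assert model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])
assert model.get_rt_info(["runtime_options", "KV_CACHE_PRECISION"]).value == "f16"

Uncompressed exports are expected to carry this hint (see test_ovmodel_load_with_uncompressed_weights below), while compressed models should have it removed, which is exactly what the new shared helper asserts for every submodel.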

tests/openvino/test_exporters_cli.py (+10 -17)

@@ -21,7 +21,7 @@
 from utils_tests import (
     _ARCHITECTURES_TO_EXPECTED_INT8,
     MODEL_NAMES,
-    compare_num_quantized_nodes_per_model,
+    check_compression_state_per_model,
     get_num_quantized_nodes,
 )

@@ -356,31 +356,26 @@ def test_exporters_cli_int8(self, task: str, model_type: str):
         ).from_pretrained(tmpdir, **model_kwargs)

         if task.startswith("text2text-generation"):
-            models = [model.encoder.model, model.decoder.model]
+            models = [model.encoder, model.decoder]
             if task.endswith("with-past") and not model.decoder.stateful:
-                models.append(model.decoder_with_past.model)
+                models.append(model.decoder_with_past)
         elif (
             model_type.startswith("stable-diffusion")
             or model_type.startswith("flux")
             or model_type.startswith("sana")
         ):
-            vision_model = model.unet.model if model.unet is not None else model.transformer.model
-            models = [vision_model, model.vae_encoder.model, model.vae_decoder.model]
+            models = [model.unet or model.transformer, model.vae_encoder, model.vae_decoder]
             models.append(
-                model.text_encoder.model
-                if model_type in ["stable-diffusion", "sana"]
-                else model.text_encoder_2.model
+                model.text_encoder if model_type in ["stable-diffusion", "sana"] else model.text_encoder_2
             )
         elif task.startswith("image-text-to-text"):
             models = list(model.submodels.values())
         else:
-            models = [model.model]
+            models = [model]

         expected_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type]
-        for i, model in enumerate(models):
-            _, num_weight_nodes = get_num_quantized_nodes(model)
-            self.assertEqual(expected_int8[i], num_weight_nodes["int8"])
-            self.assertFalse(model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))
+        expected_int8 = [{"int8": it} for it in expected_int8]
+        check_compression_state_per_model(self, models, expected_int8)

     @parameterized.expand(SUPPORTED_SD_HYBRID_ARCHITECTURES)
     def test_exporters_cli_hybrid_quantization(

@@ -421,13 +416,11 @@ def test_exporters_cli_4bit(

         submodels = []
         if task == "text-generation-with-past":
-            submodels = [model.model]
+            submodels = [model]
         elif task == "image-text-to-text":
             submodels = list(model.submodels.values())
-            for submodel in submodels:
-                self.assertFalse(submodel.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))

-        compare_num_quantized_nodes_per_model(self, submodels, expected_num_weight_nodes_per_model)
+        check_compression_state_per_model(self, submodels, expected_num_weight_nodes_per_model)

         self.assertTrue("--awq" not in option or b"Applying AWQ" in result.stdout)
         self.assertTrue("--scale-estimation" not in option or b"Applying Scale Estimation" in result.stdout)

tests/openvino/test_quantization.py (+21 -28)

@@ -23,6 +23,7 @@
 from functools import partial
 from typing import Union

+import openvino as ov
 import pytest
 import evaluate
 import numpy as np

@@ -82,7 +83,7 @@
     MODEL_NAMES,
     get_num_quantized_nodes,
     _ARCHITECTURES_TO_EXPECTED_INT8,
-    compare_num_quantized_nodes_per_model,
+    check_compression_state_per_model,
 )

 _TASK_TO_DATASET = {

@@ -713,20 +714,18 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type, trust_remote_code):
         self.assertEqual(model._openvino_config.dtype, "int8")

         if model.export_feature.startswith("text2text-generation"):
-            models = [model.encoder.model, model.decoder.model]
+            models = [model.encoder, model.decoder]
             if model.decoder_with_past is not None:
-                models.append(model.decoder_with_past.model)
+                models.append(model.decoder_with_past)
         elif model.export_feature == "text-to-image":
-            models = [model.unet.model, model.vae_encoder.model, model.vae_decoder.model]
-            models.append(
-                model.text_encoder.model if model_type in ["stable-diffusion", "sana"] else model.text_encoder_2.model
-            )
+            models = [model.unet, model.vae_encoder, model.vae_decoder]
+            models.append(model.text_encoder if model_type in ["stable-diffusion", "sana"] else model.text_encoder_2)
         elif model_type == "open-clip":
             models = [model.text_model, model.visual_model]
         elif model.export_feature == "image-text-to-text":
             models = list(model.submodels.values())
         else:
-            models = [model.model]
+            models = [model]

         if model_type == "open-clip":
             pytest.skip(reason="ticket 161043")

@@ -736,10 +735,8 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type, trust_remote_code):
         check_optimization_not_applicable_to_optimized_model(model, quantization_config={"bits": 8})

         expected_ov_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type]
-        for i, model in enumerate(models):
-            _, num_weight_nodes = get_num_quantized_nodes(model)
-            self.assertEqual(expected_ov_int8[i], num_weight_nodes["int8"])
-            self.assertFalse(model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))
+        expected_ov_int8 = [{"int8": it} for it in expected_ov_int8]
+        check_compression_state_per_model(self, models, expected_ov_int8)

     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION)
     def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_fake_nodes, expected_int8_nodes):

@@ -841,9 +838,7 @@ def test_ovmodel_4bit_auto_compression_with_config(
             submodels = [model.model]
         elif isinstance(model, OVModelForVisualCausalLM):
             submodels = list(model.submodels.values())
-        compare_num_quantized_nodes_per_model(self, submodels, expected_num_weight_nodes_per_model)
-        for submodel in submodels:
-            self.assertFalse(submodel.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))
+        check_compression_state_per_model(self, submodels, expected_num_weight_nodes_per_model)

         model.save_pretrained(tmp_dir)
         # At the moment the first model in the list is the only one we apply data-aware compression to

@@ -869,36 +864,34 @@ def test_ovmodel_stateful_load_with_compressed_weights(self, model_cls, model_type):

         expected_ov_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type][0]
         _, num_weight_nodes = get_num_quantized_nodes(model)
-        self.assertEqual(expected_ov_int8, num_weight_nodes["int8"])
-        self.assertFalse(model.model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))
+        check_compression_state_per_model(self, [model.model], [{"int8": expected_ov_int8}])

     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION)
     def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type, trust_remote_code):
         model = model_cls.from_pretrained(
             MODEL_NAMES[model_type], export=True, load_in_8bit=False, trust_remote_code=trust_remote_code
         )
         if model.export_feature.startswith("text2text-generation"):
-            models = [model.encoder.model, model.decoder.model]
+            models = [model.encoder, model.decoder]
             if model.decoder_with_past is not None:
-                models.append(model.decoder_with_past.model)
+                models.append(model.decoder_with_past)
         elif model.export_feature == "text-to-image":
-            models = [model.unet.model, model.vae_encoder.model, model.vae_decoder.model]
-            models.append(
-                model.text_encoder.model if model_type in ["stable-diffusion", "sana"] else model.text_encoder_2.model
-            )
+            models = [model.unet, model.vae_encoder, model.vae_decoder]
+            models.append(model.text_encoder if model_type in ["stable-diffusion", "sana"] else model.text_encoder_2)
         elif model_type == "open-clip":
             models = [model.text_model, model.visual_model]
         elif model.export_feature == "image-text-to-text":
             models = list(model.submodels.values())
         else:
-            models = [model.model]
+            models = [model]

         for i, submodel in enumerate(models):
-            _, num_weight_nodes = get_num_quantized_nodes(submodel)
+            ov_model = submodel if isinstance(submodel, ov.Model) else submodel.model
+            _, num_weight_nodes = get_num_quantized_nodes(ov_model)
             self.assertEqual(0, num_weight_nodes["int8"])
             if "text-generation" in model.export_feature or ("image-text-to-text" in model.export_feature and i == 0):
-                self.assertTrue(submodel.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))
-                kv_cache_precision = submodel.get_rt_info(["runtime_options", "KV_CACHE_PRECISION"]).value
+                self.assertTrue(ov_model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))
+                kv_cache_precision = ov_model.get_rt_info(["runtime_options", "KV_CACHE_PRECISION"]).value
                 self.assertTrue(kv_cache_precision == "f16")

     def test_ovmodel_load_large_model_with_default_compressed_weights(self):

@@ -1013,7 +1006,7 @@ def test_ovmodel_4bit_dynamic_with_config(
             submodels = [model.model]
         elif isinstance(model, OVModelForVisualCausalLM):
             submodels = list(model.submodels.values())
-        compare_num_quantized_nodes_per_model(self, submodels, expected_num_weight_nodes_per_model)
+        check_compression_state_per_model(self, submodels, expected_num_weight_nodes_per_model)

         model.save_pretrained(tmp_dir)
         openvino_config = OVConfig.from_pretrained(tmp_dir)
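Because the test lists now mix wrapper objects (model.encoder, model.vae_decoder, the submodels of a visual-language model) with raw graphs such as the open-clip entries, the uncompressed-weights loop above and the shared helper both unwrap before counting nodes. A sketch of the pattern, matching the helper's List[Union[ov.Model, OVBaseModel]] annotation (unwrap_ov_model is a hypothetical name; the tests inline the expression):

import openvino as ov

def unwrap_ov_model(submodel):
    # Hypothetical helper: wrapper submodels expose the raw graph on `.model`;
    # plain ov.Model instances pass through unchanged.
    return submodel if isinstance(submodel, ov.Model) else submodel.model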

tests/openvino/utils_tests.py (+8 -6)

@@ -196,10 +196,10 @@
     "stable-diffusion-3": (66, 42, 58, 30),
     "flux": (56, 24, 28, 64),
     "flux-fill": (56, 24, 28, 64),
-    "llava": (30, 9, 1),
-    "llava_next": (30, 9, 1),
-    "minicpmv": (30, 26, 1, 6),
-    "nanollava": (30, 15, 1),
+    "llava": (30, 1, 9),
+    "llava_next": (30, 1, 9),
+    "minicpmv": (30, 1, 26, 6),
+    "nanollava": (30, 1, 15),
     "qwen2_vl": (30, 1, 1, 10),
     "sana": (58, 28, 28, 18),
 }

@@ -290,15 +290,17 @@ def new_forward(
         WQLinearMMFunction.forward = orig_gemm_forward


-def compare_num_quantized_nodes_per_model(
+def check_compression_state_per_model(
     test_case: unittest.TestCase,
     models: List[Union[ov.Model, OVBaseModel]],
     expected_num_weight_nodes_per_model: List[Dict],
 ):
     test_case.assertEqual(len(models), len(expected_num_weight_nodes_per_model))
     actual_num_weights_per_model = []
     for submodel, expected_num_weight_nodes in zip(models, expected_num_weight_nodes_per_model):
-        _, num_weight_nodes = get_num_quantized_nodes(submodel)
+        ov_model = submodel if isinstance(submodel, ov.Model) else submodel.model
+        _, num_weight_nodes = get_num_quantized_nodes(ov_model)
         expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)})
         actual_num_weights_per_model.append(num_weight_nodes)
+        test_case.assertFalse(ov_model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))
     test_case.assertEqual(expected_num_weight_nodes_per_model, actual_num_weights_per_model)
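With the rt_info assertion folded into the renamed helper, a call site needs a single line where the node-count loop and the KV-cache flag loop previously lived separately. A minimal usage sketch (values borrowed from the "llava" row above):

submodels = list(model.submodels.values())  # e.g. from an OVModelForVisualCausalLM
check_compression_state_per_model(self, submodels, [{"int8": 30}, {"int8": 1}, {"int8": 9}])
# Fails if the counts differ, if the list lengths differ, or if any submodel
# still carries ["runtime_options", "KV_CACHE_PRECISION"] in its rt_info.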
