From eec3b7ac45d6b3c87cc1e9b7b672343f880c7e87 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 5 Feb 2025 09:35:59 +0100 Subject: [PATCH 1/5] Remove kv cache compression disabling flag for compressed models --- optimum/exporters/openvino/__main__.py | 6 --- optimum/intel/openvino/quantization.py | 11 +++- tests/openvino/test_exporters_cli.py | 49 +++++++++-------- tests/openvino/test_quantization.py | 73 +++++++++++++++++--------- 4 files changed, 84 insertions(+), 55 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 3d5e7818a2..88c738999a 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -493,12 +493,6 @@ class StoreAttr(object): from optimum.intel.openvino.quantization import _weight_only_quantization _weight_only_quantization(submodel, quantization_config) - # kv cache compression disabled if quantization config is not provided, - # to keep aligned result of applying auto int8 compression and via explicit setting config, we should update it - if submodel.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]): - prev_rt_info = submodel.get_rt_info("runtime_options").value - prev_rt_info.pop("KV_CACHE_PRECISION") - submodel.set_rt_info(prev_rt_info, "runtime_options") compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml" save_model(submodel, compressed_submodel_path, compress_to_fp16=False) del submodel diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index fa169ff547..2ba74244d8 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -1042,7 +1042,7 @@ def _weight_only_quantization( else: mode = CompressWeightsMode.INT4_SYM if config.sym else CompressWeightsMode.INT4_ASYM - return nncf.compress_weights( + compressed_model = nncf.compress_weights( model, mode=mode, ratio=config.ratio, @@ -1060,6 +1060,15 @@ def _weight_only_quantization( **kwargs, ) + # If KV cache compression was disabled, remove the disabling flag from the model + if compressed_model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]): + prev_rt_info = compressed_model.get_rt_info("runtime_options").value + if prev_rt_info["KV_CACHE_PRECISION"] == "f16": + prev_rt_info.pop("KV_CACHE_PRECISION") + compressed_model.set_rt_info(prev_rt_info, "runtime_options") + + return compressed_model + def _full_quantization( model: openvino.runtime.Model, diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 06e6adb049..5e71109529 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -192,27 +192,27 @@ class OVCLIExportTestCase(unittest.TestCase): "image-text-to-text", "llava_next", "int4 --group-size 16 --ratio 0.8", - [{"int8": 14, "int4": 16}, {"int8": 9}, {"int8": 1}], + [{"int8": 14, "int4": 16}, {"int8": 1}, {"int8": 9}], ), ( "image-text-to-text", "llava_next", 'int4 --group-size 16 --ratio 0.8 --sensitivity-metric "hessian_input_activation" ' "--dataset contextual --num-samples 1", - [{"int8": 6, "int4": 24}, {"int8": 9}, {"int8": 1}], + [{"int8": 6, "int4": 24}, {"int8": 1}, {"int8": 9}], ), ( "image-text-to-text", "nanollava", "int4 --group-size 8 --ratio 0.8 --trust-remote-code", - [{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}], + [{"int8": 16, "int4": 14}, {"int8": 1}, {"int8": 15}], ), ( "image-text-to-text", "nanollava", 'int4 --group-size 8 --ratio 0.8 --sensitivity-metric 
"mean_activation_variance" ' "--dataset contextual --num-samples 1 --trust-remote-code", - [{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}], + [{"int8": 16, "int4": 14}, {"int8": 1}, {"int8": 15}], ), ] ) @@ -224,40 +224,40 @@ class OVCLIExportTestCase(unittest.TestCase): "image-text-to-text", "minicpmv", "int4 --group-size 4 --ratio 0.8 --trust-remote-code", - [{"int8": 10, "int4": 20}, {"int8": 26}, {"int8": 1}, {"int8": 6}], + [{"int8": 10, "int4": 20}, {"int8": 1}, {"int8": 26}, {"int8": 6}], ), ( "image-text-to-text", "minicpmv", 'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" ' "--dataset contextual --num-samples 1 --trust-remote-code", - [{"int8": 8, "int4": 22}, {"int8": 26}, {"int8": 1}, {"int8": 6}], + [{"int8": 8, "int4": 22}, {"int8": 1}, {"int8": 26}, {"int8": 6}], ), ( "image-text-to-text", "internvl2", "int4 --group-size 4 --ratio 0.8 --trust-remote-code", - [{"int8": 8, "int4": 22}, {"int8": 11}, {"int8": 1}], + [{"int8": 8, "int4": 22}, {"int8": 1}, {"int8": 11}], ), ( "image-text-to-text", "internvl2", 'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" ' "--dataset contextual --num-samples 1 --trust-remote-code", - [{"int8": 8, "int4": 22}, {"int8": 11}, {"int8": 1}], + [{"int8": 8, "int4": 22}, {"int8": 1}, {"int8": 11}], ), ( "image-text-to-text", "phi3_v", "int4 --group-size 4 --ratio 0.8 --trust-remote-code", - [{"int8": 8, "int4": 10}, {"int8": 7}, {"int8": 1}, {"int8": 2}], + [{"int8": 8, "int4": 10}, {"int8": 1}, {"int8": 7}, {"int8": 2}], ), ( "image-text-to-text", "phi3_v", 'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" ' "--dataset contextual --num-samples 1 --trust-remote-code", - [{"int8": 4, "int4": 14}, {"int8": 7}, {"int8": 1}, {"int8": 2}], + [{"int8": 4, "int4": 14}, {"int8": 1}, {"int8": 7}, {"int8": 2}], ), ( "image-text-to-text", @@ -356,27 +356,31 @@ def test_exporters_cli_int8(self, task: str, model_type: str): ).from_pretrained(tmpdir, **model_kwargs) if task.startswith("text2text-generation"): - models = [model.encoder, model.decoder] + models = [model.encoder.model, model.decoder.model] if task.endswith("with-past") and not model.decoder.stateful: - models.append(model.decoder_with_past) + models.append(model.decoder_with_past.model) elif ( model_type.startswith("stable-diffusion") or model_type.startswith("flux") or model_type.startswith("sana") ): - models = [model.unet or model.transformer, model.vae_encoder, model.vae_decoder] + vision_model = model.unet.model if model.unet is not None else model.transformer.model + models = [vision_model, model.vae_encoder.model, model.vae_decoder.model] models.append( - model.text_encoder if model_type in ["stable-diffusion", "sana"] else model.text_encoder_2 + model.text_encoder.model + if model_type in ["stable-diffusion", "sana"] + else model.text_encoder_2.model ) elif task.startswith("image-text-to-text"): - models = [model.language_model, model.vision_embeddings] + models = list(model.submodels.values()) else: - models = [model] + models = [model.model] expected_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type] for i, model in enumerate(models): _, num_weight_nodes = get_num_quantized_nodes(model) self.assertEqual(expected_int8[i], num_weight_nodes["int8"]) + self.assertFalse(model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) @parameterized.expand(SUPPORTED_SD_HYBRID_ARCHITECTURES) def test_exporters_cli_hybrid_quantization( @@ -389,11 +393,11 @@ def 
test_exporters_cli_hybrid_quantization( check=True, ) model = eval(_HEAD_TO_AUTOMODELS[model_type.replace("-refiner", "")]).from_pretrained(tmpdir) - num_fake_nodes, num_weight_nodes = get_num_quantized_nodes( - model.unet if model.unet is not None else model.transformer - ) + vision_model = model.unet.model if model.unet is not None else model.transformer.model + num_fake_nodes, num_weight_nodes = get_num_quantized_nodes(vision_model) self.assertEqual(expected_int8_nodes, num_weight_nodes["int8"]) self.assertEqual(expected_fake_nodes, num_fake_nodes) + self.assertFalse(vision_model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) @parameterized.expand(TEST_4BIT_CONFIGURATIONS) def test_exporters_cli_4bit( @@ -417,10 +421,11 @@ def test_exporters_cli_4bit( submodels = [] if task == "text-generation-with-past": - submodels = [model] + submodels = [model.model] elif task == "image-text-to-text": - submodels = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model] - submodels += [getattr(model, part) for part in model.additional_parts] + submodels = list(model.submodels.values()) + for submodel in submodels: + self.assertFalse(submodel.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) compare_num_quantized_nodes_per_model(self, submodels, expected_num_weight_nodes_per_model) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 3822ec203a..a5305b7cd5 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -450,7 +450,7 @@ class OVWeightCompressionTest(unittest.TestCase): num_samples=1, processor=MODEL_NAMES["llava_next"], ), - [{"int8": 6, "int4": 24}, {"int8": 9}, {"int8": 1}], + [{"int8": 6, "int4": 24}, {"int8": 1}, {"int8": 9}], ), ( OVModelForVisualCausalLM, @@ -467,7 +467,7 @@ class OVWeightCompressionTest(unittest.TestCase): tokenizer=MODEL_NAMES["nanollava"], trust_remote_code=True, ), - [{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}], + [{"int8": 16, "int4": 14}, {"int8": 1}, {"int8": 15}], ), ] ) @@ -489,7 +489,7 @@ class OVWeightCompressionTest(unittest.TestCase): processor=MODEL_NAMES["minicpmv"], trust_remote_code=True, ), - [{"int8": 8, "int4": 22}, {"int8": 26}, {"int8": 1}, {"int8": 6}], + [{"int8": 8, "int4": 22}, {"int8": 1}, {"int8": 26}, {"int8": 6}], ), ( OVModelForVisualCausalLM, @@ -504,7 +504,7 @@ class OVWeightCompressionTest(unittest.TestCase): num_samples=1, trust_remote_code=True, ), - [{"int8": 8, "int4": 22}, {"int8": 11}, {"int8": 1}], + [{"int8": 8, "int4": 22}, {"int8": 1}, {"int8": 11}], ), ( OVModelForVisualCausalLM, @@ -519,7 +519,7 @@ class OVWeightCompressionTest(unittest.TestCase): num_samples=1, trust_remote_code=True, ), - [{"int8": 4, "int4": 14}, {"int8": 7}, {"int8": 1}, {"int8": 2}], + [{"int8": 4, "int4": 14}, {"int8": 1}, {"int8": 7}, {"int8": 2}], ), ( OVModelForVisualCausalLM, @@ -610,6 +610,7 @@ def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_i if isinstance(v, Enum): original_config_as_dict[k] = v.value self.assertEqual(original_config_as_dict, loaded_config.quantization_config.to_dict()) + self.assertFalse(model.model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS) def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_pt_int8, expected_ov_int8): @@ -636,6 +637,7 @@ def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_p # Verify that the configuration is 
correctly saved and loaded loaded_config = OVConfig.from_pretrained(tmp_dir) self.assertEqual(OVWeightQuantizationConfig().to_dict(), loaded_config.quantization_config.to_dict()) + self.assertFalse(model.model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS) def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_int8_nodes, expected_int4_nodes): @@ -663,6 +665,7 @@ def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_i # Verify that the configuration is correctly saved and loaded loaded_config = OVConfig.from_pretrained(tmp_dir) self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) + self.assertFalse(model.model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) @parameterized.expand(SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS) def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_name, expected_pt_int8, expected_ov_int8): @@ -688,6 +691,7 @@ def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_name, e # Verify that the configuration is correctly saved and loaded loaded_config = OVConfig.from_pretrained(tmp_dir) self.assertEqual(OVWeightQuantizationConfig().to_dict(), loaded_config.quantization_config.to_dict()) + self.assertFalse(model.model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION) def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type, trust_remote_code): @@ -709,19 +713,20 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type, trust self.assertEqual(model._openvino_config.dtype, "int8") if model.export_feature.startswith("text2text-generation"): - models = [model.encoder, model.decoder] + models = [model.encoder.model, model.decoder.model] if model.decoder_with_past is not None: - models.append(model.decoder_with_past) + models.append(model.decoder_with_past.model) elif model.export_feature == "text-to-image": - models = [model.unet, model.vae_encoder, model.vae_decoder] - models.append(model.text_encoder if model_type == "stable-diffusion" else model.text_encoder_2) + models = [model.unet.model, model.vae_encoder.model, model.vae_decoder.model] + models.append( + model.text_encoder.model if model_type in ["stable-diffusion", "sana"] else model.text_encoder_2.model + ) elif model_type == "open-clip": models = [model.text_model, model.visual_model] elif model.export_feature == "image-text-to-text": - models = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model] - models += [getattr(model, part) for part in model.additional_parts] + models = list(model.submodels.values()) else: - models = [model] + models = [model.model] if model_type == "open-clip": pytest.skip(reason="ticket 161043") @@ -734,6 +739,7 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type, trust for i, model in enumerate(models): _, num_weight_nodes = get_num_quantized_nodes(model) self.assertEqual(expected_ov_int8[i], num_weight_nodes["int8"]) + self.assertFalse(model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION) def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_fake_nodes, expected_int8_nodes): @@ -834,9 +840,10 @@ def test_ovmodel_4bit_auto_compression_with_config( if isinstance(model, 
OVModelForCausalLM): submodels = [model.model] elif isinstance(model, OVModelForVisualCausalLM): - submodels = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model] - submodels += [getattr(model, part) for part in model.additional_parts] + submodels = list(model.submodels.values()) compare_num_quantized_nodes_per_model(self, submodels, expected_num_weight_nodes_per_model) + for submodel in submodels: + self.assertFalse(submodel.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) model.save_pretrained(tmp_dir) # At the moment the first model in the list is the only one we apply data-aware compression to @@ -863,6 +870,7 @@ def test_ovmodel_stateful_load_with_compressed_weights(self, model_cls, model_ty expected_ov_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type][0] _, num_weight_nodes = get_num_quantized_nodes(model) self.assertEqual(expected_ov_int8, num_weight_nodes["int8"]) + self.assertFalse(model.model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION) def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type, trust_remote_code): @@ -870,28 +878,37 @@ def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type, tru MODEL_NAMES[model_type], export=True, load_in_8bit=False, trust_remote_code=trust_remote_code ) if model.export_feature.startswith("text2text-generation"): - models = [model.encoder, model.decoder] + models = [model.encoder.model, model.decoder.model] if model.decoder_with_past is not None: - models.append(model.decoder_with_past) + models.append(model.decoder_with_past.model) elif model.export_feature == "text-to-image": - models = [model.unet, model.vae_encoder, model.vae_decoder] - models.append(model.text_encoder if model_type == "stable-diffusion" else model.text_encoder_2) + models = [model.unet.model, model.vae_encoder.model, model.vae_decoder.model] + models.append( + model.text_encoder.model if model_type in ["stable-diffusion", "sana"] else model.text_encoder_2.model + ) elif model_type == "open-clip": models = [model.text_model, model.visual_model] elif model.export_feature == "image-text-to-text": - models = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model] - models += [getattr(model, part) for part in model.additional_parts] + models = list(model.submodels.values()) else: - models = [model] + models = [model.model] - for i, model in enumerate(models): - _, num_weight_nodes = get_num_quantized_nodes(model) + for i, submodel in enumerate(models): + _, num_weight_nodes = get_num_quantized_nodes(submodel) self.assertEqual(0, num_weight_nodes["int8"]) + if "text-generation" in model.export_feature or ("image-text-to-text" in model.export_feature and i == 0): + self.assertTrue(submodel.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) + kv_cache_precision = submodel.get_rt_info(["runtime_options", "KV_CACHE_PRECISION"]).value + self.assertTrue(kv_cache_precision == "f16") def test_ovmodel_load_large_model_with_default_compressed_weights(self): + compressed_model_mock_obj = unittest.mock.Mock() + compressed_model_mock_obj.has_rt_info.return_value = False + def main_export_in_stacktrace(*args, **kwargs): # Compression was called from `main_export` self.assertTrue(inspect.stack()[5].function == "main_export") + return compressed_model_mock_obj with unittest.mock.patch( "openvino.runtime.op.Constant.shape", new_callable=unittest.mock.PropertyMock @@ -929,15 +946,20 @@ def 
test_ovmodel_load_large_model_with_uncompressed_weights(self): ) as ov_constant_shape: ov_constant_shape.return_value = (2000000000,) with unittest.mock.patch("nncf.compress_weights") as compress_weights_patch: - _ = OVModelForCausalLM.from_pretrained( + model = OVModelForCausalLM.from_pretrained( MODEL_NAMES["llama"], export=True, load_in_8bit=False, compile=False, use_cache=False ) compress_weights_patch.assert_not_called() + self.assertTrue(model.model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) def test_ovmodel_load_large_model_with_additional_quantization_config(self): + compressed_model_mock_obj = unittest.mock.Mock() + compressed_model_mock_obj.has_rt_info.return_value = False + def main_export_not_in_stacktrace(*args, **kwargs): # Compression was not called from `main_export` self.assertTrue(all(frame_info.function != "main_export" for frame_info in inspect.stack())) + return compressed_model_mock_obj with unittest.mock.patch( "openvino.runtime.op.Constant.shape", new_callable=unittest.mock.PropertyMock @@ -990,8 +1012,7 @@ def test_ovmodel_4bit_dynamic_with_config( if isinstance(model, OVModelForCausalLM): submodels = [model.model] elif isinstance(model, OVModelForVisualCausalLM): - submodels = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model] - submodels += [getattr(model, part) for part in model.additional_parts] + submodels = list(model.submodels.values()) compare_num_quantized_nodes_per_model(self, submodels, expected_num_weight_nodes_per_model) model.save_pretrained(tmp_dir) From 91bedf9b62d35d6e64c834945db269bd95e19303 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 5 Feb 2025 10:07:17 +0100 Subject: [PATCH 2/5] Add kv-cache precision flag check to a separate method --- tests/openvino/test_exporters_cli.py | 27 ++++++--------- tests/openvino/test_quantization.py | 49 ++++++++++++---------------- tests/openvino/utils_tests.py | 14 ++++---- 3 files changed, 39 insertions(+), 51 deletions(-) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 5e71109529..e697c4f4d1 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -21,7 +21,7 @@ from utils_tests import ( _ARCHITECTURES_TO_EXPECTED_INT8, MODEL_NAMES, - compare_num_quantized_nodes_per_model, + check_compression_state_per_model, get_num_quantized_nodes, ) @@ -356,31 +356,26 @@ def test_exporters_cli_int8(self, task: str, model_type: str): ).from_pretrained(tmpdir, **model_kwargs) if task.startswith("text2text-generation"): - models = [model.encoder.model, model.decoder.model] + models = [model.encoder, model.decoder] if task.endswith("with-past") and not model.decoder.stateful: - models.append(model.decoder_with_past.model) + models.append(model.decoder_with_past) elif ( model_type.startswith("stable-diffusion") or model_type.startswith("flux") or model_type.startswith("sana") ): - vision_model = model.unet.model if model.unet is not None else model.transformer.model - models = [vision_model, model.vae_encoder.model, model.vae_decoder.model] + models = [model.unet or model.transformer, model.vae_encoder, model.vae_decoder] models.append( - model.text_encoder.model - if model_type in ["stable-diffusion", "sana"] - else model.text_encoder_2.model + model.text_encoder if model_type in ["stable-diffusion", "sana"] else model.text_encoder_2 ) elif task.startswith("image-text-to-text"): models = list(model.submodels.values()) else: - models = [model.model] + models = [model] expected_int8 = 
_ARCHITECTURES_TO_EXPECTED_INT8[model_type] - for i, model in enumerate(models): - _, num_weight_nodes = get_num_quantized_nodes(model) - self.assertEqual(expected_int8[i], num_weight_nodes["int8"]) - self.assertFalse(model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) + expected_int8 = [{"int8": it} for it in expected_int8] + check_compression_state_per_model(self, models, expected_int8) @parameterized.expand(SUPPORTED_SD_HYBRID_ARCHITECTURES) def test_exporters_cli_hybrid_quantization( @@ -421,13 +416,11 @@ def test_exporters_cli_4bit( submodels = [] if task == "text-generation-with-past": - submodels = [model.model] + submodels = [model] elif task == "image-text-to-text": submodels = list(model.submodels.values()) - for submodel in submodels: - self.assertFalse(submodel.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) - compare_num_quantized_nodes_per_model(self, submodels, expected_num_weight_nodes_per_model) + check_compression_state_per_model(self, submodels, expected_num_weight_nodes_per_model) self.assertTrue("--awq" not in option or b"Applying AWQ" in result.stdout) self.assertTrue("--scale-estimation" not in option or b"Applying Scale Estimation" in result.stdout) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index a5305b7cd5..56b2a77e9a 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -23,6 +23,7 @@ from functools import partial from typing import Union +import openvino as ov import pytest import evaluate import numpy as np @@ -82,7 +83,7 @@ MODEL_NAMES, get_num_quantized_nodes, _ARCHITECTURES_TO_EXPECTED_INT8, - compare_num_quantized_nodes_per_model, + check_compression_state_per_model, ) _TASK_TO_DATASET = { @@ -713,20 +714,18 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type, trust self.assertEqual(model._openvino_config.dtype, "int8") if model.export_feature.startswith("text2text-generation"): - models = [model.encoder.model, model.decoder.model] + models = [model.encoder, model.decoder] if model.decoder_with_past is not None: - models.append(model.decoder_with_past.model) + models.append(model.decoder_with_past) elif model.export_feature == "text-to-image": - models = [model.unet.model, model.vae_encoder.model, model.vae_decoder.model] - models.append( - model.text_encoder.model if model_type in ["stable-diffusion", "sana"] else model.text_encoder_2.model - ) + models = [model.unet, model.vae_encoder, model.vae_decoder] + models.append(model.text_encoder if model_type in ["stable-diffusion", "sana"] else model.text_encoder_2) elif model_type == "open-clip": models = [model.text_model, model.visual_model] elif model.export_feature == "image-text-to-text": models = list(model.submodels.values()) else: - models = [model.model] + models = [model] if model_type == "open-clip": pytest.skip(reason="ticket 161043") @@ -736,10 +735,8 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type, trust check_optimization_not_applicable_to_optimized_model(model, quantization_config={"bits": 8}) expected_ov_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type] - for i, model in enumerate(models): - _, num_weight_nodes = get_num_quantized_nodes(model) - self.assertEqual(expected_ov_int8[i], num_weight_nodes["int8"]) - self.assertFalse(model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) + expected_ov_int8 = [{"int8": it} for it in expected_ov_int8] + check_compression_state_per_model(self, models, expected_ov_int8) 
@parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION) def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_fake_nodes, expected_int8_nodes): @@ -841,9 +838,7 @@ def test_ovmodel_4bit_auto_compression_with_config( submodels = [model.model] elif isinstance(model, OVModelForVisualCausalLM): submodels = list(model.submodels.values()) - compare_num_quantized_nodes_per_model(self, submodels, expected_num_weight_nodes_per_model) - for submodel in submodels: - self.assertFalse(submodel.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) + check_compression_state_per_model(self, submodels, expected_num_weight_nodes_per_model) model.save_pretrained(tmp_dir) # At the moment the first model in the list is the only one we apply data-aware compression to @@ -869,8 +864,7 @@ def test_ovmodel_stateful_load_with_compressed_weights(self, model_cls, model_ty expected_ov_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type][0] _, num_weight_nodes = get_num_quantized_nodes(model) - self.assertEqual(expected_ov_int8, num_weight_nodes["int8"]) - self.assertFalse(model.model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) + check_compression_state_per_model(self, [model.model], [{"int8": expected_ov_int8}]) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION) def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type, trust_remote_code): @@ -878,27 +872,26 @@ def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type, tru MODEL_NAMES[model_type], export=True, load_in_8bit=False, trust_remote_code=trust_remote_code ) if model.export_feature.startswith("text2text-generation"): - models = [model.encoder.model, model.decoder.model] + models = [model.encoder, model.decoder] if model.decoder_with_past is not None: - models.append(model.decoder_with_past.model) + models.append(model.decoder_with_past) elif model.export_feature == "text-to-image": - models = [model.unet.model, model.vae_encoder.model, model.vae_decoder.model] - models.append( - model.text_encoder.model if model_type in ["stable-diffusion", "sana"] else model.text_encoder_2.model - ) + models = [model.unet, model.vae_encoder, model.vae_decoder] + models.append(model.text_encoder if model_type in ["stable-diffusion", "sana"] else model.text_encoder_2) elif model_type == "open-clip": models = [model.text_model, model.visual_model] elif model.export_feature == "image-text-to-text": models = list(model.submodels.values()) else: - models = [model.model] + models = [model] for i, submodel in enumerate(models): - _, num_weight_nodes = get_num_quantized_nodes(submodel) + ov_model = submodel if isinstance(submodel, ov.Model) else submodel.model + _, num_weight_nodes = get_num_quantized_nodes(ov_model) self.assertEqual(0, num_weight_nodes["int8"]) if "text-generation" in model.export_feature or ("image-text-to-text" in model.export_feature and i == 0): - self.assertTrue(submodel.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) - kv_cache_precision = submodel.get_rt_info(["runtime_options", "KV_CACHE_PRECISION"]).value + self.assertTrue(ov_model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) + kv_cache_precision = ov_model.get_rt_info(["runtime_options", "KV_CACHE_PRECISION"]).value self.assertTrue(kv_cache_precision == "f16") def test_ovmodel_load_large_model_with_default_compressed_weights(self): @@ -1013,7 +1006,7 @@ def test_ovmodel_4bit_dynamic_with_config( submodels = [model.model] elif isinstance(model, OVModelForVisualCausalLM): submodels = 
list(model.submodels.values()) - compare_num_quantized_nodes_per_model(self, submodels, expected_num_weight_nodes_per_model) + check_compression_state_per_model(self, submodels, expected_num_weight_nodes_per_model) model.save_pretrained(tmp_dir) openvino_config = OVConfig.from_pretrained(tmp_dir) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 83ea3751d6..e4c2ede8e9 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -196,10 +196,10 @@ "stable-diffusion-3": (66, 42, 58, 30), "flux": (56, 24, 28, 64), "flux-fill": (56, 24, 28, 64), - "llava": (30, 9, 1), - "llava_next": (30, 9, 1), - "minicpmv": (30, 26, 1, 6), - "nanollava": (30, 15, 1), + "llava": (30, 1, 9), + "llava_next": (30, 1, 9), + "minicpmv": (30, 1, 26, 6), + "nanollava": (30, 1, 15), "qwen2_vl": (30, 1, 1, 10), "sana": (58, 28, 28, 18), } @@ -290,7 +290,7 @@ def new_forward( WQLinearMMFunction.forward = orig_gemm_forward -def compare_num_quantized_nodes_per_model( +def check_compression_state_per_model( test_case: unittest.TestCase, models: List[Union[ov.Model, OVBaseModel]], expected_num_weight_nodes_per_model: List[Dict], @@ -298,7 +298,9 @@ def compare_num_quantized_nodes_per_model( test_case.assertEqual(len(models), len(expected_num_weight_nodes_per_model)) actual_num_weights_per_model = [] for submodel, expected_num_weight_nodes in zip(models, expected_num_weight_nodes_per_model): - _, num_weight_nodes = get_num_quantized_nodes(submodel) + ov_model = submodel if isinstance(submodel, ov.Model) else submodel.model + _, num_weight_nodes = get_num_quantized_nodes(ov_model) expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)}) actual_num_weights_per_model.append(num_weight_nodes) + test_case.assertFalse(ov_model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) test_case.assertEqual(expected_num_weight_nodes_per_model, actual_num_weights_per_model) From 96310278bee6cb02904a4cf71bc19c2b475cbf61 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 5 Feb 2025 10:20:27 +0100 Subject: [PATCH 3/5] Add deprecation warning for --- optimum/intel/openvino/configuration.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 59b4b65ddd..795ad52073 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -578,6 +578,13 @@ def __init__( ): super().__init__(bits=bits, sym=sym, group_size=weights_group_size, **kwargs) self.activations_group_size = activations_group_size + logger.warning( + "OVDynamicQuantizationConfig is deprecated and will be removed in optimum-intel v1.24.0. " + "Dynamic quantization and KV cache compression are enabled by default starting from OpenVINO 2025.0 and " + "there is no need to enable them manually. If you need precise control over these parameters, please " + "provide `DYNAMIC_QUANTIZATION_GROUP_SIZE` and `KV_CACHE_PRECISION` with `ov_config` argument during model " + "inference." 
+ ) @dataclass From 283645886cdd446207c98ca76e7ec757e295d7aa Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 5 Feb 2025 10:58:53 +0100 Subject: [PATCH 4/5] Fix test --- tests/openvino/test_exporters_cli.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index e697c4f4d1..4bd47b535b 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -375,6 +375,8 @@ def test_exporters_cli_int8(self, task: str, model_type: str): expected_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type] expected_int8 = [{"int8": it} for it in expected_int8] + if task.startswith("text2text-generation") and (not task.endswith("with-past") or model.decoder.stateful): + expected_int8 = expected_int8[:2] check_compression_state_per_model(self, models, expected_int8) @parameterized.expand(SUPPORTED_SD_HYBRID_ARCHITECTURES) From 81dd46857b80d06cdce20afec1fd8e0d75f3e2ea Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 5 Feb 2025 11:53:26 +0100 Subject: [PATCH 5/5] Update optimum/intel/openvino/configuration.py Co-authored-by: Alexander Kozlov --- optimum/intel/openvino/configuration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 795ad52073..966ab57c51 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -580,7 +580,7 @@ def __init__( self.activations_group_size = activations_group_size logger.warning( "OVDynamicQuantizationConfig is deprecated and will be removed in optimum-intel v1.24.0. " - "Dynamic quantization and KV cache compression are enabled by default starting from OpenVINO 2025.0 and " + "Dynamic quantization and KV cache compression are enabled by default starting from OpenVINO 2024.6 and " "there is no need to enable them manually. If you need precise control over these parameters, please " "provide `DYNAMIC_QUANTIZATION_GROUP_SIZE` and `KV_CACHE_PRECISION` with `ov_config` argument during model " "inference."
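
For reference, the deprecation message touched by patches 3 and 5 points users at the `ov_config` argument instead of `OVDynamicQuantizationConfig`. A minimal sketch of what that could look like at inference time follows; the model path is a placeholder and the property values are illustrative assumptions rather than something taken from these patches ("f16" matches the value the tests above treat as "KV cache compression disabled", and a group size of "0" is the conventional way to turn dynamic quantization off):

    # Sketch only: pass OpenVINO runtime options through `ov_config` when loading the model.
    # "<model_id_or_path>" is a placeholder; the property values shown are illustrative.
    from optimum.intel import OVModelForCausalLM

    model = OVModelForCausalLM.from_pretrained(
        "<model_id_or_path>",
        ov_config={
            "KV_CACHE_PRECISION": "f16",             # keep the KV cache in f16 (no u8 compression)
            "DYNAMIC_QUANTIZATION_GROUP_SIZE": "0",  # disable dynamic quantization
        },
    )

Passing these keys per model load gives the same control the deprecated config object offered, without baking the setting into the exported IR.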