
Commit f601b8b

Remove KV-cache compression disabling flag for compressed models (#1141)
* Remove kv cache compression disabling flag for compressed models
* Add kv-cache precision flag check to a separate method
* Add deprecation warning for
* Fix test
* Update optimum/intel/openvino/configuration.py
  Co-authored-by: Alexander Kozlov <alexander.kozlov@intel.com>

---------

Co-authored-by: Alexander Kozlov <alexander.kozlov@intel.com>
1 parent 61a74cd commit f601b8b
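
For context, the "KV-cache compression disabling flag" is a KV_CACHE_PRECISION entry stored under the model's runtime_options rt_info; this change removes that entry right after weight-only compression (and only when it is set to "f16") instead of during export. Below is a minimal, self-contained sketch of the rt_info round trip involved, using a toy single-op model as a stand-in for a compressed submodel; the hard-coded "f16" and "32" values are illustrative only.

import openvino as ov
from openvino.runtime import opset13 as ops

# Toy single-op model standing in for a compressed submodel.
x = ops.parameter([1, 4], ov.Type.f32, name="x")
toy = ov.Model([ops.relu(x)], [x], "toy")

# Emulate runtime options left on the model after weight compression.
toy.set_rt_info("f16", ["runtime_options", "KV_CACHE_PRECISION"])
toy.set_rt_info("32", ["runtime_options", "DYNAMIC_QUANTIZATION_GROUP_SIZE"])

# Drop the KV-cache entry and write the remaining options back
# (the real code in _weight_only_quantization does this only when the value is "f16").
rt_options = toy.get_rt_info("runtime_options").value
rt_options.pop("KV_CACHE_PRECISION", None)
toy.set_rt_info(rt_options, "runtime_options")

print(toy.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))  # expected: False
print(toy.has_rt_info(["runtime_options", "DYNAMIC_QUANTIZATION_GROUP_SIZE"]))  # expected: True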

6 files changed (+85 −59 lines changed)

optimum/exporters/openvino/__main__.py

+0 −6
@@ -493,12 +493,6 @@ class StoreAttr(object):
         from optimum.intel.openvino.quantization import _weight_only_quantization

         _weight_only_quantization(submodel, quantization_config)
-        # kv cache compression disabled if quantization config is not provided,
-        # to keep aligned result of applying auto int8 compression and via explicit setting config, we should update it
-        if submodel.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]):
-            prev_rt_info = submodel.get_rt_info("runtime_options").value
-            prev_rt_info.pop("KV_CACHE_PRECISION")
-            submodel.set_rt_info(prev_rt_info, "runtime_options")
         compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
         save_model(submodel, compressed_submodel_path, compress_to_fp16=False)
         del submodel

optimum/intel/openvino/configuration.py

+7 −0
@@ -578,6 +578,13 @@ def __init__(
     ):
         super().__init__(bits=bits, sym=sym, group_size=weights_group_size, **kwargs)
         self.activations_group_size = activations_group_size
+        logger.warning(
+            "OVDynamicQuantizationConfig is deprecated and will be removed in optimum-intel v1.24.0. "
+            "Dynamic quantization and KV cache compression are enabled by default starting from OpenVINO 2024.6 and "
+            "there is no need to enable them manually. If you need precise control over these parameters, please "
+            "provide `DYNAMIC_QUANTIZATION_GROUP_SIZE` and `KV_CACHE_PRECISION` with `ov_config` argument during model "
+            "inference."
+        )


 @dataclass
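
As the new warning suggests, the replacement for OVDynamicQuantizationConfig is to pass these runtime options via the ov_config argument at inference time. A minimal sketch; the model ID is a placeholder and the "u8"/"32" values are illustrative rather than prescribed defaults:

from optimum.intel import OVModelForCausalLM

# Explicit control over KV-cache precision and dynamic-quantization group size.
model = OVModelForCausalLM.from_pretrained(
    "model_id_or_path",  # placeholder
    ov_config={
        "KV_CACHE_PRECISION": "u8",
        "DYNAMIC_QUANTIZATION_GROUP_SIZE": "32",
    },
)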

optimum/intel/openvino/quantization.py

+10 −1
@@ -1042,7 +1042,7 @@ def _weight_only_quantization(
         else:
             mode = CompressWeightsMode.INT4_SYM if config.sym else CompressWeightsMode.INT4_ASYM

-    return nncf.compress_weights(
+    compressed_model = nncf.compress_weights(
         model,
         mode=mode,
         ratio=config.ratio,
@@ -1060,6 +1060,15 @@ def _weight_only_quantization(
         **kwargs,
     )

+    # If KV cache compression was disabled, remove the disabling flag from the model
+    if compressed_model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]):
+        prev_rt_info = compressed_model.get_rt_info("runtime_options").value
+        if prev_rt_info["KV_CACHE_PRECISION"] == "f16":
+            prev_rt_info.pop("KV_CACHE_PRECISION")
+            compressed_model.set_rt_info(prev_rt_info, "runtime_options")
+
+    return compressed_model
+

 def _full_quantization(
     model: openvino.runtime.Model,
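
The observable effect matches what the updated tests assert: submodels exported with default int8 compression no longer carry the disabling flag. A small check along those lines, assuming a hypothetical path to one of the exported submodels:

import openvino as ov

core = ov.Core()
# Placeholder path to an exported, compressed submodel.
submodel = core.read_model("exported_model/openvino_language_model.xml")

print(submodel.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))  # expected: False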

tests/openvino/test_exporters_cli.py

+21 −21
@@ -21,7 +21,7 @@
 from utils_tests import (
     _ARCHITECTURES_TO_EXPECTED_INT8,
     MODEL_NAMES,
-    compare_num_quantized_nodes_per_model,
+    check_compression_state_per_model,
     get_num_quantized_nodes,
 )

@@ -192,27 +192,27 @@ class OVCLIExportTestCase(unittest.TestCase):
                "image-text-to-text",
                "llava_next",
                "int4 --group-size 16 --ratio 0.8",
-                [{"int8": 14, "int4": 16}, {"int8": 9}, {"int8": 1}],
+                [{"int8": 14, "int4": 16}, {"int8": 1}, {"int8": 9}],
            ),
            (
                "image-text-to-text",
                "llava_next",
                'int4 --group-size 16 --ratio 0.8 --sensitivity-metric "hessian_input_activation" '
                "--dataset contextual --num-samples 1",
-                [{"int8": 6, "int4": 24}, {"int8": 9}, {"int8": 1}],
+                [{"int8": 6, "int4": 24}, {"int8": 1}, {"int8": 9}],
            ),
            (
                "image-text-to-text",
                "nanollava",
                "int4 --group-size 8 --ratio 0.8 --trust-remote-code",
-                [{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}],
+                [{"int8": 16, "int4": 14}, {"int8": 1}, {"int8": 15}],
            ),
            (
                "image-text-to-text",
                "nanollava",
                'int4 --group-size 8 --ratio 0.8 --sensitivity-metric "mean_activation_variance" '
                "--dataset contextual --num-samples 1 --trust-remote-code",
-                [{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}],
+                [{"int8": 16, "int4": 14}, {"int8": 1}, {"int8": 15}],
            ),
        ]
    )
@@ -224,40 +224,40 @@ class OVCLIExportTestCase(unittest.TestCase):
                "image-text-to-text",
                "minicpmv",
                "int4 --group-size 4 --ratio 0.8 --trust-remote-code",
-                [{"int8": 10, "int4": 20}, {"int8": 26}, {"int8": 1}, {"int8": 6}],
+                [{"int8": 10, "int4": 20}, {"int8": 1}, {"int8": 26}, {"int8": 6}],
            ),
            (
                "image-text-to-text",
                "minicpmv",
                'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" '
                "--dataset contextual --num-samples 1 --trust-remote-code",
-                [{"int8": 8, "int4": 22}, {"int8": 26}, {"int8": 1}, {"int8": 6}],
+                [{"int8": 8, "int4": 22}, {"int8": 1}, {"int8": 26}, {"int8": 6}],
            ),
            (
                "image-text-to-text",
                "internvl2",
                "int4 --group-size 4 --ratio 0.8 --trust-remote-code",
-                [{"int8": 8, "int4": 22}, {"int8": 11}, {"int8": 1}],
+                [{"int8": 8, "int4": 22}, {"int8": 1}, {"int8": 11}],
            ),
            (
                "image-text-to-text",
                "internvl2",
                'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" '
                "--dataset contextual --num-samples 1 --trust-remote-code",
-                [{"int8": 8, "int4": 22}, {"int8": 11}, {"int8": 1}],
+                [{"int8": 8, "int4": 22}, {"int8": 1}, {"int8": 11}],
            ),
            (
                "image-text-to-text",
                "phi3_v",
                "int4 --group-size 4 --ratio 0.8 --trust-remote-code",
-                [{"int8": 8, "int4": 10}, {"int8": 7}, {"int8": 1}, {"int8": 2}],
+                [{"int8": 8, "int4": 10}, {"int8": 1}, {"int8": 7}, {"int8": 2}],
            ),
            (
                "image-text-to-text",
                "phi3_v",
                'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" '
                "--dataset contextual --num-samples 1 --trust-remote-code",
-                [{"int8": 4, "int4": 14}, {"int8": 7}, {"int8": 1}, {"int8": 2}],
+                [{"int8": 4, "int4": 14}, {"int8": 1}, {"int8": 7}, {"int8": 2}],
            ),
            (
                "image-text-to-text",
@@ -369,14 +369,15 @@ def test_exporters_cli_int8(self, task: str, model_type: str):
                model.text_encoder if model_type in ["stable-diffusion", "sana"] else model.text_encoder_2
            )
        elif task.startswith("image-text-to-text"):
-            models = [model.language_model, model.vision_embeddings]
+            models = list(model.submodels.values())
        else:
            models = [model]

        expected_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type]
-        for i, model in enumerate(models):
-            _, num_weight_nodes = get_num_quantized_nodes(model)
-            self.assertEqual(expected_int8[i], num_weight_nodes["int8"])
+        expected_int8 = [{"int8": it} for it in expected_int8]
+        if task.startswith("text2text-generation") and (not task.endswith("with-past") or model.decoder.stateful):
+            expected_int8 = expected_int8[:2]
+        check_compression_state_per_model(self, models, expected_int8)

    @parameterized.expand(SUPPORTED_SD_HYBRID_ARCHITECTURES)
    def test_exporters_cli_hybrid_quantization(
@@ -389,11 +390,11 @@ def test_exporters_cli_hybrid_quantization(
            check=True,
        )
        model = eval(_HEAD_TO_AUTOMODELS[model_type.replace("-refiner", "")]).from_pretrained(tmpdir)
-        num_fake_nodes, num_weight_nodes = get_num_quantized_nodes(
-            model.unet if model.unet is not None else model.transformer
-        )
+        vision_model = model.unet.model if model.unet is not None else model.transformer.model
+        num_fake_nodes, num_weight_nodes = get_num_quantized_nodes(vision_model)
        self.assertEqual(expected_int8_nodes, num_weight_nodes["int8"])
        self.assertEqual(expected_fake_nodes, num_fake_nodes)
+        self.assertFalse(vision_model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))

    @parameterized.expand(TEST_4BIT_CONFIGURATIONS)
    def test_exporters_cli_4bit(
@@ -419,10 +420,9 @@ def test_exporters_cli_4bit(
        if task == "text-generation-with-past":
            submodels = [model]
        elif task == "image-text-to-text":
-            submodels = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model]
-            submodels += [getattr(model, part) for part in model.additional_parts]
+            submodels = list(model.submodels.values())

-        compare_num_quantized_nodes_per_model(self, submodels, expected_num_weight_nodes_per_model)
+        check_compression_state_per_model(self, submodels, expected_num_weight_nodes_per_model)

        self.assertTrue("--awq" not in option or b"Applying AWQ" in result.stdout)
        self.assertTrue("--scale-estimation" not in option or b"Applying Scale Estimation" in result.stdout)
