Added kv_cache_precision property

dmitry-gorokhov · dmitry-gorokhov · commit 1c2cb9d1584b · 2024-02-14T16:53:20.000+04:00
diff --git a/src/bindings/python/src/openvino/runtime/properties/__init__.py b/src/bindings/python/src/openvino/runtime/properties/__init__.py
@@ -27,7 +27,6 @@
 from openvino._pyopenvino.properties import range_for_async_infer_requests
 from openvino._pyopenvino.properties import execution_devices
 from openvino._pyopenvino.properties import loaded_from_cache
-from openvino._pyopenvino.properties import dynamic_quantization_group_size
 
 # Submodules
 from openvino.runtime.properties import hint
diff --git a/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py b/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py
@@ -19,3 +19,5 @@
 from openvino._pyopenvino.properties.hint import num_requests
 from openvino._pyopenvino.properties.hint import model
 from openvino._pyopenvino.properties.hint import allow_auto_batching
+from openvino._pyopenvino.properties.hint import dynamic_quantization_group_size
+from openvino._pyopenvino.properties.hint import kv_cache_precision
diff --git a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp
@@ -38,7 +38,6 @@ void regmodule_properties(py::module m) {
     OPENVINO_SUPPRESS_DEPRECATED_END
     wrap_property_RW(m_properties, ov::force_tbb_terminate, "force_tbb_terminate");
     wrap_property_RW(m_properties, ov::enable_mmap, "enable_mmap");
-    wrap_property_RW(m_properties, ov::dynamic_quantization_group_size, "dynamic_quantization_group_size");
 
     wrap_property_RO(m_properties, ov::supported_properties, "supported_properties");
     wrap_property_RO(m_properties, ov::available_devices, "available_devices");
@@ -87,6 +86,8 @@ void regmodule_properties(py::module m) {
     wrap_property_RW(m_hint, ov::hint::num_requests, "num_requests");
     wrap_property_RW(m_hint, ov::hint::model, "model");
     wrap_property_RW(m_hint, ov::hint::allow_auto_batching, "allow_auto_batching");
+    wrap_property_RW(m_hint, ov::hint::dynamic_quantization_group_size, "dynamic_quantization_group_size");
+    wrap_property_RW(m_hint, ov::hint::kv_cache_precision, "kv_cache_precision");
 
     // Submodule intel_cpu
     py::module m_intel_cpu =
diff --git a/src/bindings/python/tests/test_runtime/test_properties.py b/src/bindings/python/tests/test_runtime/test_properties.py
@@ -253,11 +253,6 @@ def test_properties_ro(ov_property_ro, expected_value):
         ),
         (props.force_tbb_terminate, "FORCE_TBB_TERMINATE", ((True, True), (False, False))),
         (props.enable_mmap, "ENABLE_MMAP", ((True, True), (False, False))),
-        (
-            props.dynamic_quantization_group_size,
-            "DYNAMIC_QUANTIZATION_GROUP_SIZE",
-            ((64, 64),),
-        ),
         (hints.inference_precision, "INFERENCE_PRECISION_HINT", ((Type.f32, Type.f32),)),
         (
             hints.model_priority,
@@ -309,6 +304,12 @@ def test_properties_ro(ov_property_ro, expected_value):
             "ALLOW_AUTO_BATCHING",
             ((True, True),),
         ),
+        (
+            hints.dynamic_quantization_group_size,
+            "DYNAMIC_QUANTIZATION_GROUP_SIZE",
+            ((64, 64),),
+        ),
+        (hints.kv_cache_precision, "KV_CACHE_PRECISION", ((Type.f32, Type.f32),)),
         (
             intel_cpu.denormals_optimization,
             "CPU_DENORMALS_OPTIMIZATION",
diff --git a/src/inference/include/openvino/runtime/properties.hpp b/src/inference/include/openvino/runtime/properties.hpp
@@ -504,6 +504,26 @@ inline std::istream& operator>>(std::istream& is, ExecutionMode& mode) {
  */
 static constexpr Property<ExecutionMode> execution_mode{"EXECUTION_MODE_HINT"};
 
+/**
+ * @brief This property defines group size for dynamic quantization optimization
+ * @ingroup ov_runtime_cpp_prop_api
+ *
+ * Dynamic quantization optimization provides the ability to get a performance benefit from int8 compute.
+ * In contrast with static quantization, the dynamic approach assumes activations are quantized during inference.
+ * Although dynamic quantization has some runtime overhead, it might provide better accuracy metrics.
+ * This property defines the granularity (aka block size) for dynamic quantization algorithms. Lower group size
+ * values might result in better accuracy, but the drawback is worse performance. A group size equal to 0 means
+ * the dynamic quantization optimization is disabled.
+ */
+static constexpr Property<uint64_t, PropertyMutability::RW> dynamic_quantization_group_size{
+    "DYNAMIC_QUANTIZATION_GROUP_SIZE"};
+
+/**
+ * @brief Hint for the device to use the specified precision for KV cache compression
+ * @ingroup ov_runtime_cpp_prop_api
+ */
+static constexpr Property<element::Type, PropertyMutability::RW> kv_cache_precision{"KV_CACHE_PRECISION"};
+
 }  // namespace hint
 
 /**
@@ -1159,17 +1179,4 @@ static constexpr Property<Affinity> affinity{"AFFINITY"};
  */
 static constexpr Property<std::vector<std::string>, PropertyMutability::RO> execution_devices{"EXECUTION_DEVICES"};
 
-/**
- * @brief This property defines group size for dynamic quantization optimization
- * @ingroup ov_runtime_cpp_prop_api
- *
- * Dynamic quantization optimization provides an ability to get performance benefit from int8 compute.
- * In contrast with static quantization dynamic approach assumes activations are quantized during inference.
- * Despite the fact dynamic quantization has some runtime overheads, it might provide better accuracy metrics.
- * This property defines granularity (aka block size) for dynamic quantization algorithms. Lower group size values
- * might result in better accuracy, but the drawback is worse performance. Group size equal 0 means dynamic
- * quantization optimization is disabled.
- */
-static constexpr Property<uint64_t, PropertyMutability::RW> dynamic_quantization_group_size{
-    "DYNAMIC_QUANTIZATION_GROUP_SIZE"};
 }  // namespace ov
diff --git a/src/plugins/intel_cpu/src/compiled_model.cpp b/src/plugins/intel_cpu/src/compiled_model.cpp
@@ -198,7 +198,8 @@ ov::Any CompiledModel::get_property(const std::string& name) const {
             RO_property(ov::intel_cpu::denormals_optimization.name()),
             RO_property(ov::log::level.name()),
             RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
-            RO_property(ov::dynamic_quantization_group_size.name()),
+            RO_property(ov::hint::dynamic_quantization_group_size.name()),
+            RO_property(ov::hint::kv_cache_precision.name()),
         };
     }
 
@@ -262,9 +263,11 @@ ov::Any CompiledModel::get_property(const std::string& name) const {
     } else if (name == ov::intel_cpu::sparse_weights_decompression_rate) {
         return decltype(ov::intel_cpu::sparse_weights_decompression_rate)::value_type(
             config.fcSparseWeiDecompressionRate);
-    } else if (name == ov::dynamic_quantization_group_size) {
-        return decltype(ov::dynamic_quantization_group_size)::value_type(
+    } else if (name == ov::hint::dynamic_quantization_group_size) {
+        return decltype(ov::hint::dynamic_quantization_group_size)::value_type(
             config.fcDynamicQuantizationGroupSize);
+    } else if (name == ov::hint::kv_cache_precision) {
+        return decltype(ov::hint::kv_cache_precision)::value_type(config.kvCachePrecision);
     }
     OPENVINO_THROW("Unsupported property: ", name);
 }
diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp
@@ -12,6 +12,7 @@
 #include "openvino/runtime/properties.hpp"
 #include "utils/debug_capabilities.h"
 #include "utils/precision_support.h"
+#include "utils/cpu_utils.hpp"
 
 #include <algorithm>
 #include <map>
@@ -215,12 +216,12 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
             } else {
                 fcSparseWeiDecompressionRate = val_f;
             }
-        } else if (key == ov::dynamic_quantization_group_size.name()) {
+        } else if (key == ov::hint::dynamic_quantization_group_size.name()) {
             try {
                 fcDynamicQuantizationGroupSize = val.as<uint64_t>();
             } catch (const ov::Exception&) {
                 OPENVINO_THROW("Wrong value for property key ",
-                                ov::dynamic_quantization_group_size.name(),
+                                ov::hint::dynamic_quantization_group_size.name(),
                                 ". Expected only unsinged integer numbers");
             }
         } else if (key == ov::enable_profiling.name()) {
@@ -341,6 +342,21 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
                                ov::hint::execution_mode.name(),
                                ". Supported values: ov::hint::ExecutionMode::PERFORMANCE/ACCURACY");
             }
+        } else if (key == ov::hint::kv_cache_precision.name()) {
+            // Only f32/f16/bf16/u8 are accepted; an ov::Exception raised below is rethrown with the user-facing text.
+            try {
+                auto const prec = val.as<ov::element::Type>();
+                if (!one_of(prec, ov::element::f32, ov::element::f16, ov::element::bf16, ov::element::u8)) {
+                    OPENVINO_THROW("invalid value");
+                }
+                kvCachePrecision = prec;
+            } catch (const ov::Exception&) {
+                OPENVINO_THROW("Wrong value ",
+                               val.as<std::string>(),
+                               " for property key ",
+                               ov::hint::kv_cache_precision.name(),
+                               ". Supported values: u8, bf16, f16, f32");
+            }
         } else {
             OPENVINO_THROW("NotFound: Unsupported property ", key, " by CPU plugin.");
         }
diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h
@@ -56,6 +56,7 @@ struct Config {
     std::string device_id = {};
     float fcSparseWeiDecompressionRate = 1.0f;
     uint64_t fcDynamicQuantizationGroupSize = 0;
+    ov::element::Type kvCachePrecision = ov::element::f16;
 #if defined(OPENVINO_ARCH_X86_64)
     size_t rtCacheCapacity = 5000ul;
 #else
diff --git a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp
@@ -1555,9 +1555,11 @@ void ScaledDotProductAttention::updatePastkv(const MemoryPtr& mem_cur_k, const M
 ov::element::Type ScaledDotProductAttention::getKVCachePrecision() {
     ov::element::Type kvcache_precision;
     auto rtPrecision = getRuntimePrecision();
-    bool enableKVCacheFP16 = m_config.config.fuse_concat && mayiuse(cpu_isa_t::avx2) && rtPrecision != ov::element::bf16;
+    auto kvCachePrecisionHint = context->getConfig().kvCachePrecision;
+    bool enableKVCacheFP16 = m_config.config.fuse_concat && mayiuse(cpu_isa_t::avx2) &&
+        rtPrecision != ov::element::bf16 && kvCachePrecisionHint == ov::element::f16;
     kvcache_precision = enableKVCacheFP16 ? ov::element::f16 : rtPrecision;
-    bool use_int8_kv_cache_precision = false;
+    bool use_int8_kv_cache_precision = kvCachePrecisionHint == ov::element::u8;
     if (use_int8_kv_cache_precision)
         kvcache_precision = ov::element::u8;
     else
diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp
@@ -441,6 +441,10 @@ ov::Any Engine::get_property(const std::string& name, const ov::AnyMap& options)
         return res;
     } else if (name == ov::internal::exclusive_async_requests.name()) {
         return engConfig.exclusiveAsyncRequests;
+    } else if (name == ov::hint::dynamic_quantization_group_size) {
+        return decltype(ov::hint::dynamic_quantization_group_size)::value_type(engConfig.fcDynamicQuantizationGroupSize);
+    } else if (name == ov::hint::kv_cache_precision) {
+        return decltype(ov::hint::kv_cache_precision)::value_type(engConfig.kvCachePrecision);
     }
     return get_ro_property(name, options);
 }
@@ -480,7 +484,8 @@ ov::Any Engine::get_ro_property(const std::string& name, const ov::AnyMap& optio
                                                     RW_property(ov::intel_cpu::denormals_optimization.name()),
                                                     RW_property(ov::log::level.name()),
                                                     RW_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
-                                                    RW_property(ov::dynamic_quantization_group_size.name()),
+                                                    RW_property(ov::hint::dynamic_quantization_group_size.name()),
+                                                    RW_property(ov::hint::kv_cache_precision.name()),
         };
 
         std::vector<ov::PropertyName> supportedProperties;
@@ -525,8 +530,6 @@ ov::Any Engine::get_ro_property(const std::string& name, const ov::AnyMap& optio
         return decltype(ov::intel_cpu::denormals_optimization)::value_type(engConfig.denormalsOptMode == Config::DenormalsOptMode::DO_On);
     } else if (name == ov::intel_cpu::sparse_weights_decompression_rate) {
         return decltype(ov::intel_cpu::sparse_weights_decompression_rate)::value_type(engConfig.fcSparseWeiDecompressionRate);
-    } else if (name == ov::dynamic_quantization_group_size) {
-        return decltype(ov::dynamic_quantization_group_size)::value_type(engConfig.fcDynamicQuantizationGroupSize);
     } else if (name == ov::execution_devices) {
         return decltype(ov::execution_devices)::value_type{get_device_name()};
     } else if (name == ov::device::type) {
diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -288,8 +288,8 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
                                                      ov::element::u4,
                                                      ov::element::i4,
                                                      ov::element::nf4};
-    bool fold_subtract_const = config.fcDynamicQuantizationGroupSize != 0;
-    CPU_REGISTER_PASS_X64(decompression_handling_manager, ov::pass::MarkDequantizationSubgraph, decompression_precisions, fold_subtract_const);
+    CPU_REGISTER_PASS_X64(decompression_handling_manager, ov::pass::MarkDequantizationSubgraph, decompression_precisions,
+        config.fcDynamicQuantizationGroupSize != 0);
     CPU_SET_CALLBACK_X64(decompression_handling_manager, [&](const_node_ptr &node) -> bool {
         return !is_decompression_multiply(node);
     }, ov::pass::MarkDequantizationSubgraph);
diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp
@@ -38,7 +38,8 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkSupportedPropertiesAreAvailable
         RO_property(ov::intel_cpu::denormals_optimization.name()),
         RO_property(ov::log::level.name()),
         RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
-        RO_property(ov::dynamic_quantization_group_size.name()),
+        RO_property(ov::hint::dynamic_quantization_group_size.name()),
+        RO_property(ov::hint::kv_cache_precision.name()),
     };
 
     ov::Core ie;
@@ -162,14 +163,25 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckSparseWeigthsDecompression
 TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckDynamicQuantizationGroupSize) {
     ov::Core core;
 
-    core.set_property(deviceName, ov::dynamic_quantization_group_size(64));
+    core.set_property(deviceName, ov::hint::dynamic_quantization_group_size(64));
     ov::CompiledModel compiledModel = core.compile_model(model, deviceName);
 
     size_t groupSize = 0;
-    ASSERT_NO_THROW(groupSize = compiledModel.get_property(ov::dynamic_quantization_group_size));
+    ASSERT_NO_THROW(groupSize = compiledModel.get_property(ov::hint::dynamic_quantization_group_size));
     ASSERT_EQ(groupSize, 64);
 }
 
+TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckKVCachePrecision) {
+    ov::Core core;
+    // Set the hint on the device before compiling; the compiled model is expected to report the same value.
+    core.set_property(deviceName, ov::hint::kv_cache_precision(ov::element::f32));
+    ov::CompiledModel compiledModel = core.compile_model(model, deviceName);
+    // Start from a value that differs from the expected f32 so the final check cannot pass by default.
+    auto kv_cache_precision_value = ov::element::undefined;
+    ASSERT_NO_THROW(kv_cache_precision_value = compiledModel.get_property(ov::hint::kv_cache_precision));
+    ASSERT_EQ(kv_cache_precision_value, ov::element::f32);
+}
+
 const auto bf16_if_can_be_emulated = ov::with_cpu_x86_avx512_core() ? ov::element::bf16 : ov::element::f32;
 
 TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckExecutionModeIsAvailableInCoreAndModel) {
diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp
@@ -52,7 +52,8 @@ TEST_F(OVClassConfigTestCPU, smoke_PluginAllSupportedPropertiesAreAvailable) {
         RW_property(ov::intel_cpu::denormals_optimization.name()),
         RW_property(ov::log::level.name()),
         RW_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
-        RW_property(ov::dynamic_quantization_group_size.name()),
+        RW_property(ov::hint::dynamic_quantization_group_size.name()),
+        RW_property(ov::hint::kv_cache_precision.name()),
     };
 
     ov::Core ie;