Skip to content

Commit 1c2cb9d

Browse files
Added kv_cache_precision property
1 parent 483793b commit 1c2cb9d

File tree

13 files changed

+84
-36
lines changed

13 files changed

+84
-36
lines changed

src/bindings/python/src/openvino/runtime/properties/__init__.py

-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
from openvino._pyopenvino.properties import range_for_async_infer_requests
2828
from openvino._pyopenvino.properties import execution_devices
2929
from openvino._pyopenvino.properties import loaded_from_cache
30-
from openvino._pyopenvino.properties import dynamic_quantization_group_size
3130

3231
# Submodules
3332
from openvino.runtime.properties import hint

src/bindings/python/src/openvino/runtime/properties/hint/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,5 @@
1919
from openvino._pyopenvino.properties.hint import num_requests
2020
from openvino._pyopenvino.properties.hint import model
2121
from openvino._pyopenvino.properties.hint import allow_auto_batching
22+
from openvino._pyopenvino.properties.hint import dynamic_quantization_group_size
23+
from openvino._pyopenvino.properties.hint import kv_cache_precision

src/bindings/python/src/pyopenvino/core/properties/properties.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ void regmodule_properties(py::module m) {
3838
OPENVINO_SUPPRESS_DEPRECATED_END
3939
wrap_property_RW(m_properties, ov::force_tbb_terminate, "force_tbb_terminate");
4040
wrap_property_RW(m_properties, ov::enable_mmap, "enable_mmap");
41-
wrap_property_RW(m_properties, ov::dynamic_quantization_group_size, "dynamic_quantization_group_size");
4241

4342
wrap_property_RO(m_properties, ov::supported_properties, "supported_properties");
4443
wrap_property_RO(m_properties, ov::available_devices, "available_devices");
@@ -87,6 +86,8 @@ void regmodule_properties(py::module m) {
8786
wrap_property_RW(m_hint, ov::hint::num_requests, "num_requests");
8887
wrap_property_RW(m_hint, ov::hint::model, "model");
8988
wrap_property_RW(m_hint, ov::hint::allow_auto_batching, "allow_auto_batching");
89+
wrap_property_RW(m_hint, ov::hint::dynamic_quantization_group_size, "dynamic_quantization_group_size");
90+
wrap_property_RW(m_hint, ov::hint::kv_cache_precision, "kv_cache_precision");
9091

9192
// Submodule intel_cpu
9293
py::module m_intel_cpu =

src/bindings/python/tests/test_runtime/test_properties.py

+6-5
Original file line numberDiff line numberDiff line change
@@ -253,11 +253,6 @@ def test_properties_ro(ov_property_ro, expected_value):
253253
),
254254
(props.force_tbb_terminate, "FORCE_TBB_TERMINATE", ((True, True), (False, False))),
255255
(props.enable_mmap, "ENABLE_MMAP", ((True, True), (False, False))),
256-
(
257-
props.dynamic_quantization_group_size,
258-
"DYNAMIC_QUANTIZATION_GROUP_SIZE",
259-
((64, 64),),
260-
),
261256
(hints.inference_precision, "INFERENCE_PRECISION_HINT", ((Type.f32, Type.f32),)),
262257
(
263258
hints.model_priority,
@@ -309,6 +304,12 @@ def test_properties_ro(ov_property_ro, expected_value):
309304
"ALLOW_AUTO_BATCHING",
310305
((True, True),),
311306
),
307+
(
308+
hints.dynamic_quantization_group_size,
309+
"DYNAMIC_QUANTIZATION_GROUP_SIZE",
310+
((64, 64),),
311+
),
312+
(hints.kv_cache_precision, "KV_CACHE_PRECISION", ((Type.f32, Type.f32),)),
312313
(
313314
intel_cpu.denormals_optimization,
314315
"CPU_DENORMALS_OPTIMIZATION",

src/inference/include/openvino/runtime/properties.hpp

+20-13
Original file line numberDiff line numberDiff line change
@@ -504,6 +504,26 @@ inline std::istream& operator>>(std::istream& is, ExecutionMode& mode) {
504504
*/
505505
static constexpr Property<ExecutionMode> execution_mode{"EXECUTION_MODE_HINT"};
506506

507+
/**
508+
* @brief This property defines group size for dynamic quantization optimization
509+
* @ingroup ov_runtime_cpp_prop_api
510+
*
511+
* Dynamic quantization optimization makes it possible to get a performance benefit from int8 compute.
512+
* In contrast with static quantization, the dynamic approach assumes activations are quantized during inference.
513+
* Although dynamic quantization has some runtime overhead, it might provide better accuracy metrics.
514+
* This property defines granularity (aka block size) for dynamic quantization algorithms. Lower group size values
515+
* might result in better accuracy, but the drawback is worse performance. Group size equal to 0 means dynamic
516+
* quantization optimization is disabled.
517+
*/
518+
static constexpr Property<uint64_t, PropertyMutability::RW> dynamic_quantization_group_size{
519+
"DYNAMIC_QUANTIZATION_GROUP_SIZE"};
520+
521+
/**
522+
* @brief Hint for the device to use the specified precision for KV cache compression
523+
* @ingroup ov_runtime_cpp_prop_api
524+
*/
525+
static constexpr Property<element::Type, PropertyMutability::RW> kv_cache_precision{"KV_CACHE_PRECISION"};
526+
507527
} // namespace hint
508528

509529
/**
@@ -1159,17 +1179,4 @@ static constexpr Property<Affinity> affinity{"AFFINITY"};
11591179
*/
11601180
static constexpr Property<std::vector<std::string>, PropertyMutability::RO> execution_devices{"EXECUTION_DEVICES"};
11611181

1162-
/**
1163-
* @brief This property defines group size for dynamic quantization optimization
1164-
* @ingroup ov_runtime_cpp_prop_api
1165-
*
1166-
* Dynamic quantization optimization provides an ability to get performance benefit from int8 compute.
1167-
* In contrast with static quantization dynamic approach assumes activations are quantized during inference.
1168-
* Despite the fact dynamic quantization has some runtime overheads, it might provide better accuracy metrics.
1169-
* This property defines granularity (aka block size) for dynamic quantization algorithms. Lower group size values
1170-
* might result in better accuracy, but the drawback is worse performance. Group size equal 0 means dynamic
1171-
* quantization optimization is disabled.
1172-
*/
1173-
static constexpr Property<uint64_t, PropertyMutability::RW> dynamic_quantization_group_size{
1174-
"DYNAMIC_QUANTIZATION_GROUP_SIZE"};
11751182
} // namespace ov

src/plugins/intel_cpu/src/compiled_model.cpp

+6-3
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,8 @@ ov::Any CompiledModel::get_property(const std::string& name) const {
198198
RO_property(ov::intel_cpu::denormals_optimization.name()),
199199
RO_property(ov::log::level.name()),
200200
RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
201-
RO_property(ov::dynamic_quantization_group_size.name()),
201+
RO_property(ov::hint::dynamic_quantization_group_size.name()),
202+
RO_property(ov::hint::kv_cache_precision.name()),
202203
};
203204
}
204205

@@ -262,9 +263,11 @@ ov::Any CompiledModel::get_property(const std::string& name) const {
262263
} else if (name == ov::intel_cpu::sparse_weights_decompression_rate) {
263264
return decltype(ov::intel_cpu::sparse_weights_decompression_rate)::value_type(
264265
config.fcSparseWeiDecompressionRate);
265-
} else if (name == ov::dynamic_quantization_group_size) {
266-
return decltype(ov::dynamic_quantization_group_size)::value_type(
266+
} else if (name == ov::hint::dynamic_quantization_group_size) {
267+
return decltype(ov::hint::dynamic_quantization_group_size)::value_type(
267268
config.fcDynamicQuantizationGroupSize);
269+
} else if (name == ov::hint::kv_cache_precision) {
270+
return decltype(ov::hint::kv_cache_precision)::value_type(config.kvCachePrecision);
268271
}
269272
OPENVINO_THROW("Unsupported property: ", name);
270273
}

src/plugins/intel_cpu/src/config.cpp

+18-2
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include "openvino/runtime/properties.hpp"
1313
#include "utils/debug_capabilities.h"
1414
#include "utils/precision_support.h"
15+
#include "utils/cpu_utils.hpp"
1516

1617
#include <algorithm>
1718
#include <map>
@@ -215,12 +216,12 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
215216
} else {
216217
fcSparseWeiDecompressionRate = val_f;
217218
}
218-
} else if (key == ov::dynamic_quantization_group_size.name()) {
219+
} else if (key == ov::hint::dynamic_quantization_group_size.name()) {
219220
try {
220221
fcDynamicQuantizationGroupSize = val.as<uint64_t>();
221222
} catch (const ov::Exception&) {
222223
OPENVINO_THROW("Wrong value for property key ",
223-
ov::dynamic_quantization_group_size.name(),
224+
ov::hint::dynamic_quantization_group_size.name(),
224225
". Expected only unsinged integer numbers");
225226
}
226227
} else if (key == ov::enable_profiling.name()) {
@@ -341,6 +342,21 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
341342
ov::hint::execution_mode.name(),
342343
". Supported values: ov::hint::ExecutionMode::PERFORMANCE/ACCURACY");
343344
}
345+
} else if (key == ov::hint::kv_cache_precision.name()) {
346+
try {
347+
auto const prec = val.as<ov::element::Type>();
348+
if (one_of(prec, ov::element::f32, ov::element::f16, ov::element::bf16, ov::element::u8)) {
349+
kvCachePrecision = prec;
350+
} else {
351+
OPENVINO_THROW("invalid value");
352+
}
353+
} catch (ov::Exception&) {
354+
OPENVINO_THROW("Wrong value ",
355+
val.as<std::string>(),
356+
" for property key ",
357+
ov::hint::kv_cache_precision.name(),
358+
". Supported values: s8, bf16, f16, f32");
359+
}
344360
} else {
345361
OPENVINO_THROW("NotFound: Unsupported property ", key, " by CPU plugin.");
346362
}

src/plugins/intel_cpu/src/config.h

+1
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ struct Config {
5656
std::string device_id = {};
5757
float fcSparseWeiDecompressionRate = 1.0f;
5858
uint64_t fcDynamicQuantizationGroupSize = 0;
59+
ov::element::Type kvCachePrecision = ov::element::f16;
5960
#if defined(OPENVINO_ARCH_X86_64)
6061
size_t rtCacheCapacity = 5000ul;
6162
#else

src/plugins/intel_cpu/src/nodes/scaled_attn.cpp

+4-2
Original file line numberDiff line numberDiff line change
@@ -1555,9 +1555,11 @@ void ScaledDotProductAttention::updatePastkv(const MemoryPtr& mem_cur_k, const M
15551555
ov::element::Type ScaledDotProductAttention::getKVCachePrecision() {
15561556
ov::element::Type kvcache_precision;
15571557
auto rtPrecision = getRuntimePrecision();
1558-
bool enableKVCacheFP16 = m_config.config.fuse_concat && mayiuse(cpu_isa_t::avx2) && rtPrecision != ov::element::bf16;
1558+
auto kvCachePrecisionHint = context->getConfig().kvCachePrecision;
1559+
bool enableKVCacheFP16 = m_config.config.fuse_concat && mayiuse(cpu_isa_t::avx2) &&
1560+
rtPrecision != ov::element::bf16 && kvCachePrecisionHint == ov::element::f16;
15591561
kvcache_precision = enableKVCacheFP16 ? ov::element::f16 : rtPrecision;
1560-
bool use_int8_kv_cache_precision = false;
1562+
bool use_int8_kv_cache_precision = kvCachePrecisionHint == ov::element::u8;
15611563
if (use_int8_kv_cache_precision)
15621564
kvcache_precision = ov::element::u8;
15631565
else

src/plugins/intel_cpu/src/plugin.cpp

+6-3
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,10 @@ ov::Any Engine::get_property(const std::string& name, const ov::AnyMap& options)
441441
return res;
442442
} else if (name == ov::internal::exclusive_async_requests.name()) {
443443
return engConfig.exclusiveAsyncRequests;
444+
} else if (name == ov::hint::dynamic_quantization_group_size) {
445+
return decltype(ov::hint::dynamic_quantization_group_size)::value_type(engConfig.fcDynamicQuantizationGroupSize);
446+
} else if (name == ov::hint::kv_cache_precision) {
447+
return decltype(ov::hint::kv_cache_precision)::value_type(engConfig.kvCachePrecision);
444448
}
445449
return get_ro_property(name, options);
446450
}
@@ -480,7 +484,8 @@ ov::Any Engine::get_ro_property(const std::string& name, const ov::AnyMap& optio
480484
RW_property(ov::intel_cpu::denormals_optimization.name()),
481485
RW_property(ov::log::level.name()),
482486
RW_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
483-
RW_property(ov::dynamic_quantization_group_size.name()),
487+
RW_property(ov::hint::dynamic_quantization_group_size.name()),
488+
RW_property(ov::hint::kv_cache_precision.name()),
484489
};
485490

486491
std::vector<ov::PropertyName> supportedProperties;
@@ -525,8 +530,6 @@ ov::Any Engine::get_ro_property(const std::string& name, const ov::AnyMap& optio
525530
return decltype(ov::intel_cpu::denormals_optimization)::value_type(engConfig.denormalsOptMode == Config::DenormalsOptMode::DO_On);
526531
} else if (name == ov::intel_cpu::sparse_weights_decompression_rate) {
527532
return decltype(ov::intel_cpu::sparse_weights_decompression_rate)::value_type(engConfig.fcSparseWeiDecompressionRate);
528-
} else if (name == ov::dynamic_quantization_group_size) {
529-
return decltype(ov::dynamic_quantization_group_size)::value_type(engConfig.fcDynamicQuantizationGroupSize);
530533
} else if (name == ov::execution_devices) {
531534
return decltype(ov::execution_devices)::value_type{get_device_name()};
532535
} else if (name == ov::device::type) {

src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -288,8 +288,8 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
288288
ov::element::u4,
289289
ov::element::i4,
290290
ov::element::nf4};
291-
bool fold_subtract_const = config.fcDynamicQuantizationGroupSize != 0;
292-
CPU_REGISTER_PASS_X64(decompression_handling_manager, ov::pass::MarkDequantizationSubgraph, decompression_precisions, fold_subtract_const);
291+
CPU_REGISTER_PASS_X64(decompression_handling_manager, ov::pass::MarkDequantizationSubgraph, decompression_precisions,
292+
config.fcDynamicQuantizationGroupSize != 0);
293293
CPU_SET_CALLBACK_X64(decompression_handling_manager, [&](const_node_ptr &node) -> bool {
294294
return !is_decompression_multiply(node);
295295
}, ov::pass::MarkDequantizationSubgraph);

src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp

+15-3
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,8 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkSupportedPropertiesAreAvailable
3838
RO_property(ov::intel_cpu::denormals_optimization.name()),
3939
RO_property(ov::log::level.name()),
4040
RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
41-
RO_property(ov::dynamic_quantization_group_size.name()),
41+
RO_property(ov::hint::dynamic_quantization_group_size.name()),
42+
RO_property(ov::hint::kv_cache_precision.name()),
4243
};
4344

4445
ov::Core ie;
@@ -162,14 +163,25 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckSparseWeigthsDecompression
162163
TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckDynamicQuantizationGroupSize) {
163164
ov::Core core;
164165

165-
core.set_property(deviceName, ov::dynamic_quantization_group_size(64));
166+
core.set_property(deviceName, ov::hint::dynamic_quantization_group_size(64));
166167
ov::CompiledModel compiledModel = core.compile_model(model, deviceName);
167168

168169
size_t groupSize = 0;
169-
ASSERT_NO_THROW(groupSize = compiledModel.get_property(ov::dynamic_quantization_group_size));
170+
ASSERT_NO_THROW(groupSize = compiledModel.get_property(ov::hint::dynamic_quantization_group_size));
170171
ASSERT_EQ(groupSize, 64);
171172
}
172173

174+
TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckKVCachePrecision) {
175+
ov::Core core;
176+
177+
core.set_property(deviceName, ov::hint::kv_cache_precision(ov::element::f32));
178+
ov::CompiledModel compiledModel = core.compile_model(model, deviceName);
179+
180+
auto kv_cache_precision_value = ov::element::undefined;
181+
ASSERT_NO_THROW(kv_cache_precision_value = compiledModel.get_property(ov::hint::kv_cache_precision));
182+
ASSERT_EQ(kv_cache_precision_value, ov::element::f32);
183+
}
184+
173185
const auto bf16_if_can_be_emulated = ov::with_cpu_x86_avx512_core() ? ov::element::bf16 : ov::element::f32;
174186

175187
TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckExecutionModeIsAvailableInCoreAndModel) {

src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,8 @@ TEST_F(OVClassConfigTestCPU, smoke_PluginAllSupportedPropertiesAreAvailable) {
5252
RW_property(ov::intel_cpu::denormals_optimization.name()),
5353
RW_property(ov::log::level.name()),
5454
RW_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
55-
RW_property(ov::dynamic_quantization_group_size.name()),
55+
RW_property(ov::hint::dynamic_quantization_group_size.name()),
56+
RW_property(ov::hint::kv_cache_precision.name()),
5657
};
5758

5859
ov::Core ie;

0 commit comments

Comments
 (0)