Commit aa552d1

CB: support different number of K and V heads per layer (openvinotoolkit#1610)
CVS-160810
1 parent e0488c8 · commit aa552d1

File tree

5 files changed: +81 −74 lines
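
Previously `DeviceConfig` assumed a single `(num_kv_heads, head_size)` pair for the whole model; this commit introduces a per-layer `KVHeadConfig` so the key and value caches can differ in both head count and head size. A minimal sketch of the new configuration surface (the struct mirrors the one added in this commit; the layer values are invented for illustration):

```cpp
#include <cstddef>
#include <vector>

// Mirrors the struct added in src/cpp/src/device_config.hpp.
// Aggregate initialization follows declaration order:
// { num_v_heads, num_k_heads, v_head_size, k_head_size }.
struct KVHeadConfig {
    size_t num_v_heads, num_k_heads;
    size_t v_head_size, k_head_size;
};

int main() {
    // Hypothetical 3-layer model where layer 1 uses different K and V geometry.
    std::vector<KVHeadConfig> kv_heads_config = {
        KVHeadConfig { 8, 8, 64, 64 },   // symmetric K/V
        KVHeadConfig { 4, 8, 128, 64 },  // fewer, wider V heads than K heads
        KVHeadConfig { 8, 8, 64, 64 },
    };
    // device_config.set_kv_head_configs(kv_heads_config); // new API in this commit
    (void)kv_heads_config;
}
```

This member order is why the tests' `KVHeadConfig { 12, 12, 64, 64 }` reads as 12 V heads, 12 K heads, each 64 wide.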

src/cpp/src/device_config.hpp (+41 −35)
```diff
@@ -10,22 +10,27 @@
 #include "openvino/genai/scheduler_config.hpp"
 
 namespace ov::genai {
+
+/**
+ * Per layer KV cache size configuration
+ */
+struct KVHeadConfig {
+    size_t num_v_heads, num_k_heads;
+    size_t v_head_size, k_head_size;
+};
+
 class DeviceConfig {
     ov::element::Type m_kv_cache_type;
     std::vector<ov::PartialShape> m_key_cache_shape, m_value_cache_shape;
-    std::vector<ov::Shape::value_type> m_num_kv_heads;
-    ov::Shape::value_type m_head_size, m_num_decoder_layers;
-    size_t m_num_kv_blocks = 0;
-    size_t m_block_size = 0;
-    size_t m_cache_size = 0;
+    std::vector<KVHeadConfig> m_kv_heads_config;
+    size_t m_num_decoder_layers = 0;
+    size_t m_num_kv_blocks = 0, m_cache_size = 0; // KV cache sizes in either blocks or GBs
+    size_t m_block_size = 0; // block size is per inference device
     std::string m_device;
 
     size_t get_block_size_by_device(const std::string& device) const {
-        const size_t cpu_block_size = 32;
-        const size_t gpu_block_size = 16;
-
-        bool is_gpu = device.find("GPU") != std::string::npos;
-
+        const size_t cpu_block_size = 32, gpu_block_size = 16;
+        const bool is_gpu = device.find("GPU") != std::string::npos;
         return is_gpu ? gpu_block_size : cpu_block_size;
     }
@@ -83,17 +88,14 @@ class DeviceConfig {
 
         if (scheduling_config.num_kv_blocks > 0) {
             m_num_kv_blocks = scheduling_config.num_kv_blocks;
-        }
-        else if (scheduling_config.cache_size > 0) {
+        } else if (scheduling_config.cache_size > 0) {
             m_cache_size = scheduling_config.cache_size;
         }
     }
 
-    void set_model_params(std::vector<size_t> num_kv_heads, size_t head_size, size_t num_decoder_layers) {
-        m_head_size = head_size;
-        m_num_decoder_layers = num_decoder_layers;
-
-        m_num_kv_heads.assign(num_kv_heads.begin(), num_kv_heads.end());
+    void set_kv_head_configs(std::vector<KVHeadConfig> kv_heads_config) {
+        m_kv_heads_config = kv_heads_config;
+        m_num_decoder_layers = m_kv_heads_config.size();
         m_key_cache_shape.reserve(m_num_decoder_layers);
         m_value_cache_shape.reserve(m_num_decoder_layers);
@@ -103,35 +105,37 @@ class DeviceConfig {
             // |scale(f32)|zeropoint(f32)|quantized data(u8,idx_1)|quantized data(u8,idx_2)|...|quantized data(u8,idx_head_size)|
             // so, we have to extend head_size by 8, which is sizeof(float)
             // for scale and sizeof(float) for zeropoint
-            if (m_kv_cache_type == ov::element::u8)
-                m_head_size += 8;
+            if (m_kv_cache_type == ov::element::u8) {
+                for (size_t layer_id = 0; layer_id < m_num_decoder_layers; ++layer_id) {
+                    m_kv_heads_config[layer_id].k_head_size += 8;
+                    m_kv_heads_config[layer_id].v_head_size += 8;
+                }
+            }
         }
 
         if (m_num_kv_blocks == 0 && m_cache_size > 0) {
-            size_t block_size = 0;
-            size_t size_in_bytes = m_cache_size * 1024 * 1024 * 1024;
-            for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) {
-                block_size += 2 * m_num_kv_heads[layer_id] * m_block_size * m_head_size * m_kv_cache_type.size();
-            }
-            m_num_kv_blocks = size_in_bytes / block_size;
+            size_t size_in_bytes = m_cache_size * 1024 * 1024 * 1024; // convert GBs to bytes
+            m_num_kv_blocks = size_in_bytes / get_block_size_in_bytes();
         }
 
         for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) {
+            const KVHeadConfig& config = m_kv_heads_config[layer_id];
+
             m_value_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
-                                                           ov::Dimension(m_num_kv_heads[layer_id]),
+                                                           ov::Dimension(config.num_v_heads),
                                                            ov::Dimension(m_block_size),
-                                                           ov::Dimension(m_head_size)});
+                                                           ov::Dimension(config.v_head_size)});
 
             if (m_device.find("GPU") == std::string::npos) {
                 m_key_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
-                                                             ov::Dimension(m_num_kv_heads[layer_id]),
+                                                             ov::Dimension(config.num_k_heads),
                                                              ov::Dimension(m_block_size),
-                                                             ov::Dimension(m_head_size)});
-            } else if (m_device.find("GPU") != std::string::npos) {
+                                                             ov::Dimension(config.k_head_size)});
+            } else if (m_device.find("GPU") != std::string::npos) {
                 // Update key shape, as the key's shape is different from the value's shape
                 m_key_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
-                                                             ov::Dimension(m_num_kv_heads[layer_id]),
-                                                             ov::Dimension(m_head_size),
+                                                             ov::Dimension(config.num_k_heads),
+                                                             ov::Dimension(config.k_head_size),
                                                              ov::Dimension(m_block_size)});
             }
         }
@@ -168,11 +172,13 @@ class DeviceConfig {
     }
 
     size_t get_block_size_in_bytes() const {
-        size_t block_size = 0;
+        size_t block_size_in_bytes = 0;
         for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) {
-            block_size += 2 * m_num_kv_heads[layer_id] * m_block_size * m_head_size * get_cache_precision().size();
+            const KVHeadConfig& config = m_kv_heads_config[layer_id];
+            block_size_in_bytes += config.k_head_size * config.num_k_heads + config.v_head_size * config.num_v_heads;
        }
-        return block_size;
+        block_size_in_bytes *= get_block_size() * get_cache_precision().size();
+        return block_size_in_bytes;
     }
 };
 }
```
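
The reworked `get_block_size_in_bytes()` sums K and V contributions separately per layer and only scales by block size and element size at the end, so `m_num_kv_blocks = size_in_bytes / get_block_size_in_bytes()` stays correct when K and V differ. A standalone sanity check of the formula, assuming the tests' 12-layer symmetric configuration and an f16 cache (illustrative arithmetic only, not part of the commit):

```cpp
#include <cassert>
#include <cstddef>

int main() {
    const size_t num_layers = 12, num_heads = 12, head_size = 64;
    const size_t block_size = 32;  // CPU block size from get_block_size_by_device
    const size_t elem_bytes = 2;   // assuming f16 cache precision

    // Per layer: k_head_size * num_k_heads + v_head_size * num_v_heads elements.
    size_t block_elements = 0;
    for (size_t layer_id = 0; layer_id < num_layers; layer_id++)
        block_elements += head_size * num_heads + head_size * num_heads;

    // 12 * (64*12 + 64*12) = 18432 elements; * 32 token slots * 2 bytes = 1179648.
    assert(block_elements * block_size * elem_bytes == 1179648);
}
```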

src/cpp/src/utils/paged_attention_transformations.cpp (+19 −21)
```diff
@@ -31,37 +31,35 @@ void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, boo
 }
 
 void set_kv_cache_type_and_shape(std::shared_ptr<ov::Model> model, DeviceConfig& device_config) {
-    const ov::ParameterVector& parameters = model->get_parameters();
-
     std::map<std::string, std::shared_ptr<ov::op::v0::Parameter>> key_cache_params, value_cache_params;
-    for (const auto& param_ptr : parameters) {
+    for (const auto& param_ptr : model->get_parameters()) {
         const auto& name = param_ptr->get_friendly_name();
         if (name.find("key_cache.") == 0) {
             key_cache_params[name] = param_ptr;
-        }
-        else if (name.find("value_cache.") == 0) {
+        } else if (name.find("value_cache.") == 0) {
             value_cache_params[name] = param_ptr;
         }
     }
 
-    OPENVINO_ASSERT(key_cache_params.size() > 0);
-    OPENVINO_ASSERT(key_cache_params.size() == value_cache_params.size());
+    OPENVINO_ASSERT(key_cache_params.size() == value_cache_params.size() && key_cache_params.size() > 0);
 
-    size_t num_layers = key_cache_params.size();
-    // extract num_kv_heads and head_size
-    std::string key_cache_param_name = "key_cache.0";
-    OPENVINO_ASSERT(key_cache_params.count(key_cache_param_name) != 0, "key_cache.0 tensor not found among model parameters");
-    ov::PartialShape k_shape = key_cache_params[key_cache_param_name]->get_partial_shape();
-    OPENVINO_ASSERT(k_shape.rank().get_length() == 3, "KV cache shape is expected to have rank 3, while shape is ", k_shape);
-    size_t head_size = k_shape[2].get_length();
-    std::vector<size_t> num_kv_heads(num_layers);
-    for (size_t idx = 0; idx < num_layers; idx++) {
-        size_t num_heads = key_cache_params[std::string("key_cache.") + std::to_string(idx)]->get_partial_shape()[1].get_length();
-        num_kv_heads[idx] = num_heads;
+    size_t num_decoder_layers = key_cache_params.size();
+    std::vector<KVHeadConfig> kv_heads_config(num_decoder_layers);
+
+    for (size_t idx = 0; idx < num_decoder_layers; idx++) {
+        KVHeadConfig& config = kv_heads_config[idx];
+
+        auto key_shape = key_cache_params[std::string("key_cache.") + std::to_string(idx)]->get_partial_shape();
+        config.num_k_heads = key_shape[1].get_length();
+        config.k_head_size = key_shape[2].get_length();
+
+        auto value_shape = value_cache_params[std::string("value_cache.") + std::to_string(idx)]->get_partial_shape();
+        config.num_v_heads = value_shape[1].get_length();
+        config.v_head_size = value_shape[2].get_length();
     }
-    device_config.set_model_params(num_kv_heads, head_size, num_layers);
+    device_config.set_kv_head_configs(kv_heads_config);
 
-    for (size_t idx = 0; idx < num_layers; idx++) {
+    for (size_t idx = 0; idx < num_decoder_layers; idx++) {
         auto k = key_cache_params[std::string("key_cache.") + std::to_string(idx)];
         auto v = value_cache_params[std::string("value_cache.") + std::to_string(idx)];
         k->set_element_type(device_config.get_cache_precision());
@@ -80,4 +78,4 @@ void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, Dev
 
 } // namespace utils
 } // namespace genai
-} // namespace ov
+} // namespace ov
```
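
The rewritten `set_kv_cache_type_and_shape` no longer treats `key_cache.0` as representative of every layer: each layer's K and V configs are read from its own rank-3 cache parameters, shaped `[num_blocks, num_heads, head_size]`. A standalone sketch of that per-layer extraction over plain shape triples (a hypothetical helper over made-up types, not the OpenVINO API):

```cpp
#include <array>
#include <cstddef>
#include <vector>

struct KVHeadConfig {
    size_t num_v_heads, num_k_heads;
    size_t v_head_size, k_head_size;
};

// Stand-in for querying ov::PartialShape on key_cache.N / value_cache.N parameters.
std::vector<KVHeadConfig> extract_configs(const std::vector<std::array<size_t, 3>>& key_shapes,
                                          const std::vector<std::array<size_t, 3>>& value_shapes) {
    std::vector<KVHeadConfig> configs(key_shapes.size());
    for (size_t idx = 0; idx < key_shapes.size(); idx++) {
        configs[idx].num_k_heads = key_shapes[idx][1];   // dim 1: head count
        configs[idx].k_head_size = key_shapes[idx][2];   // dim 2: head size
        configs[idx].num_v_heads = value_shapes[idx][1]; // V is read independently of K,
        configs[idx].v_head_size = value_shapes[idx][2]; // so the two may now differ
    }
    return configs;
}
```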

tests/cpp/cache_manager.cpp (+13 −12)
```diff
@@ -56,9 +56,9 @@ TEST(TestCacheManager, test_cache_size_param) {
 
     const std::string device = "CPU";
     ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
-    size_t num_decoder_layers = 12;
-    std::vector<size_t> num_kv_heads(12, 12);
-    device_config.set_model_params(num_kv_heads, 64, num_decoder_layers);
+    const size_t num_decoder_layers = 12;
+    const std::vector<KVHeadConfig> kv_heads_config(num_decoder_layers, KVHeadConfig { 12, 12, 64, 64 });
+    device_config.set_kv_head_configs(kv_heads_config);
 
     ov::InferRequest request = core.compile_model(get_dummy_model(core, num_decoder_layers)).create_infer_request();
     auto cache_manager = std::make_shared<ov::genai::CacheManager>(device_config, request, core);
@@ -79,9 +79,9 @@ TEST(TestCacheManager, test_kv_blocks_param) {
 
     const std::string device = "CPU";
     ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
-    size_t num_decoder_layers = 12;
-    std::vector<size_t> num_kv_heads(12, 12);
-    device_config.set_model_params(num_kv_heads, 64, num_decoder_layers);
+    const size_t num_decoder_layers = 12;
+    const std::vector<KVHeadConfig> kv_heads_config(num_decoder_layers, KVHeadConfig { 12, 12, 64, 64 });
+    device_config.set_kv_head_configs(kv_heads_config);
 
     ov::InferRequest request = core.compile_model(get_dummy_model(core, num_decoder_layers)).create_infer_request();
     auto cache_manager = std::make_shared<ov::genai::CacheManager>(device_config, request, core);
@@ -100,15 +100,16 @@ TEST(TestCacheManager, test_dynamic_cache_increase) {
 
     const std::string device = "CPU";
     ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
-    size_t num_decoder_layers = 12;
-    size_t head_size = 64;
-    std::vector<size_t> num_kv_heads(12, 12);
-    device_config.set_model_params(num_kv_heads, head_size, num_decoder_layers);
+    const size_t num_decoder_layers = 12;
+    const std::vector<KVHeadConfig> kv_heads_config(num_decoder_layers, KVHeadConfig { 12, 12, 64, 64 });
+    device_config.set_kv_head_configs(kv_heads_config);
+
     size_t block_size_in_bytes = 0;
     for (size_t layer_id = 0; layer_id < num_decoder_layers; layer_id++) {
-        block_size_in_bytes += 2 * num_kv_heads[layer_id] * device_config.get_block_size() * head_size * device_config.get_cache_precision().size();
+        KVHeadConfig config = kv_heads_config[layer_id];
+        block_size_in_bytes += config.k_head_size * config.num_k_heads + config.v_head_size * config.num_v_heads;
     }
-
+    block_size_in_bytes *= device_config.get_block_size() * device_config.get_cache_precision().size();
 
     ov::InferRequest request = core.compile_model(get_dummy_model(core, num_decoder_layers)).create_infer_request();
     auto cache_manager = std::make_shared<ov::genai::CacheManager>(device_config, request, core);
```

tests/cpp/device_config.cpp (+6 −4)
```diff
@@ -18,13 +18,15 @@ TEST(TestDeviceConfig, kv_cache_precision_u8) {
     const std::string device = "CPU";
     size_t num_decoder_layers = 12;
     size_t head_size = 64, head_size_u8 = head_size + 8;
-    std::vector<size_t> num_kv_heads(12, 12);
 
-    ov::genai::DeviceConfig device_config_default(core, scheduler_config, "CPU");
-    device_config_default.set_model_params(num_kv_heads, head_size_u8, num_decoder_layers);
+    ov::genai::KVHeadConfig kv_head_config { 12, 12, head_size_u8, head_size_u8 };
+    ov::genai::KVHeadConfig kv_head_config_u8 { 12, 12, head_size, head_size };
 
+    ov::genai::DeviceConfig device_config_default(core, scheduler_config, "CPU");
     ov::genai::DeviceConfig device_config_u8(core, scheduler_config, "CPU", { ov::hint::kv_cache_precision(ov::element::u8) });
-    device_config_u8.set_model_params(num_kv_heads, head_size, num_decoder_layers);
+
+    device_config_default.set_kv_head_configs(std::vector<ov::genai::KVHeadConfig>(num_decoder_layers, kv_head_config));
+    device_config_u8.set_kv_head_configs(std::vector<ov::genai::KVHeadConfig>(num_decoder_layers, kv_head_config_u8));
 
     const auto ratio = ov::element::f16.size() / ov::element::u8.size();
     ASSERT_EQ(device_config_default.get_num_kv_blocks() * ratio, device_config_u8.get_num_kv_blocks());
```
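
This test leans on the padding applied in `device_config.hpp`: with a u8 cache, every head row additionally stores an f32 scale and f32 zero point, so each head size grows by 8. Feeding the f16 config `head_size + 8` and the u8 config `head_size` makes both configs hold the same number of elements per block, so their block counts should differ exactly by the f16/u8 byte ratio of 2. A quick check of that arithmetic (illustrative sketch, not part of the test):

```cpp
#include <cassert>
#include <cstddef>

int main() {
    const size_t head_size = 64, head_size_u8 = head_size + 8;
    // The f16 config is fed the padded width directly; the u8 config is fed 64
    // and padded to 72 internally when the cache precision is u8.
    const size_t f16_row_bytes = head_size_u8 * 2; // 72 elements * 2 bytes
    const size_t u8_row_bytes  = head_size_u8 * 1; // 72 elements * 1 byte
    assert(f16_row_bytes == 2 * u8_row_bytes);     // hence num_kv_blocks doubles
}
```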

tests/cpp/scheduler.cpp (+2 −2)
```diff
@@ -47,9 +47,9 @@ std::shared_ptr<CacheManager> init_cache_manager(SchedulerConfig scheduler_confi
     size_t num_decoder_layers = 12;
     ov::InferRequest request = core.compile_model(get_model(core, num_decoder_layers)).create_infer_request();
     size_t head_size = 64, head_size_u8 = head_size + 8;
-    std::vector<size_t> num_kv_heads(12, 12);
+    std::vector<KVHeadConfig> kv_head_configs(num_decoder_layers, KVHeadConfig { 12, 12, head_size_u8, head_size_u8 });
     ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
-    device_config.set_model_params(num_kv_heads, head_size_u8, num_decoder_layers);
+    device_config.set_kv_head_configs(kv_head_configs);
     return std::make_shared<CacheManager>(device_config, request, core);
 }
```