@@ -10,22 +10,27 @@
 #include "openvino/genai/scheduler_config.hpp"
 
 namespace ov::genai {
+
+/**
+ * Per-layer KV cache size configuration
+ */
+struct KVHeadConfig {
+    size_t num_v_heads, num_k_heads;
+    size_t v_head_size, k_head_size;
+};
+
 class DeviceConfig {
     ov::element::Type m_kv_cache_type;
     std::vector<ov::PartialShape> m_key_cache_shape, m_value_cache_shape;
-    std::vector<ov::Shape::value_type> m_num_kv_heads;
-    ov::Shape::value_type m_head_size, m_num_decoder_layers;
-    size_t m_num_kv_blocks = 0;
-    size_t m_block_size = 0;
-    size_t m_cache_size = 0;
+    std::vector<KVHeadConfig> m_kv_heads_config;
+    size_t m_num_decoder_layers = 0;
+    size_t m_num_kv_blocks = 0, m_cache_size = 0; // KV cache sizes in either blocks or GBs
+    size_t m_block_size = 0; // block size is per inference device
     std::string m_device;
 
     size_t get_block_size_by_device(const std::string& device) const {
-        const size_t cpu_block_size = 32;
-        const size_t gpu_block_size = 16;
-
-        bool is_gpu = device.find("GPU") != std::string::npos;
-
+        const size_t cpu_block_size = 32, gpu_block_size = 16;
+        const bool is_gpu = device.find("GPU") != std::string::npos;
         return is_gpu ? gpu_block_size : cpu_block_size;
     }
 
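Note: the new KVHeadConfig struct lets every decoder layer carry its own K/V head counts and head sizes, which the old single m_head_size could not express (e.g. for GQA models or layers with asymmetric K/V geometry). A minimal sketch of building such a config; the numbers (32 layers, 8 KV heads, head size 128) are illustrative Llama-style assumptions, not values from this PR:

    #include <vector>
    using ov::genai::KVHeadConfig;

    std::vector<KVHeadConfig> make_uniform_kv_config() {
        KVHeadConfig layer;
        layer.num_k_heads = layer.num_v_heads = 8;    // GQA: fewer KV heads than query heads
        layer.k_head_size = layer.v_head_size = 128;  // per-head embedding size
        return std::vector<KVHeadConfig>(32, layer);  // 32 identical decoder layers
    }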
@@ -83,17 +88,14 @@ class DeviceConfig {
 
         if (scheduling_config.num_kv_blocks > 0) {
             m_num_kv_blocks = scheduling_config.num_kv_blocks;
-        }
-        else if (scheduling_config.cache_size > 0) {
+        } else if (scheduling_config.cache_size > 0) {
             m_cache_size = scheduling_config.cache_size;
         }
     }
 
-    void set_model_params(std::vector<size_t> num_kv_heads, size_t head_size, size_t num_decoder_layers) {
-        m_head_size = head_size;
-        m_num_decoder_layers = num_decoder_layers;
-
-        m_num_kv_heads.assign(num_kv_heads.begin(), num_kv_heads.end());
+    void set_kv_head_configs(std::vector<KVHeadConfig> kv_heads_config) {
+        m_kv_heads_config = kv_heads_config;
+        m_num_decoder_layers = m_kv_heads_config.size();
         m_key_cache_shape.reserve(m_num_decoder_layers);
         m_value_cache_shape.reserve(m_num_decoder_layers);
 
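For callers, migrating from set_model_params() to set_kv_head_configs() is mechanical when K and V share the same geometry. A sketch of the adaptation; the names num_kv_heads, head_size, and num_decoder_layers mirror the old signature, but the loop itself is an assumed usage, not code from this PR:

    std::vector<KVHeadConfig> kv_heads_config(num_decoder_layers);
    for (size_t layer_id = 0; layer_id < num_decoder_layers; ++layer_id) {
        KVHeadConfig& config = kv_heads_config[layer_id];
        config.num_k_heads = config.num_v_heads = num_kv_heads[layer_id];  // same head count for K and V
        config.k_head_size = config.v_head_size = head_size;               // one head size for all layers
    }
    device_config.set_kv_head_configs(kv_heads_config);  // replaces set_model_params(...)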
@@ -103,35 +105,37 @@ class DeviceConfig {
             // |scale(f32)|zeropoint(f32)|quantized data(u8,idx_1)|quantized data(u8,idx_2)|...|quantized data(u8,idx_head_size)|
             // so, we have to extend head_size by 8, which is sizeof(float)
             // for scale and sizeof(float) for zeropoint
-            if (m_kv_cache_type == ov::element::u8)
-                m_head_size += 8;
+            if (m_kv_cache_type == ov::element::u8) {
+                for (size_t layer_id = 0; layer_id < m_num_decoder_layers; ++layer_id) {
+                    m_kv_heads_config[layer_id].k_head_size += 8;
+                    m_kv_heads_config[layer_id].v_head_size += 8;
+                }
+            }
         }
 
         if (m_num_kv_blocks == 0 && m_cache_size > 0) {
-            size_t block_size = 0;
-            size_t size_in_bytes = m_cache_size * 1024 * 1024 * 1024;
-            for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) {
-                block_size += 2 * m_num_kv_heads[layer_id] * m_block_size * m_head_size * m_kv_cache_type.size();
-            }
-            m_num_kv_blocks = size_in_bytes / block_size;
+            size_t size_in_bytes = m_cache_size * 1024 * 1024 * 1024; // convert GBs to bytes
+            m_num_kv_blocks = size_in_bytes / get_block_size_in_bytes();
         }
 
         for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) {
+            const KVHeadConfig& config = m_kv_heads_config[layer_id];
+
             m_value_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
-                                                           ov::Dimension(m_num_kv_heads[layer_id]),
+                                                           ov::Dimension(config.num_v_heads),
                                                            ov::Dimension(m_block_size),
-                                                           ov::Dimension(m_head_size)});
+                                                           ov::Dimension(config.v_head_size)});
 
             if (m_device.find("GPU") == std::string::npos) {
                 m_key_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
-                                                             ov::Dimension(m_num_kv_heads[layer_id]),
+                                                             ov::Dimension(config.num_k_heads),
                                                              ov::Dimension(m_block_size),
-                                                             ov::Dimension(m_head_size)});
-            } else if (m_device.find("GPU") != std::string::npos) {
+                                                             ov::Dimension(config.k_head_size)});
+            } else if (m_device.find("GPU") != std::string::npos) {
                 // Update key shape, as the key's shape is different from the value's shape
                 m_key_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
-                                                             ov::Dimension(m_num_kv_heads[layer_id]),
-                                                             ov::Dimension(m_head_size),
+                                                             ov::Dimension(config.num_k_heads),
+                                                             ov::Dimension(config.k_head_size),
                                                              ov::Dimension(m_block_size)});
             }
         }
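To make the cache_size path concrete: with the hypothetical model from the first sketch (32 layers, 8 K and 8 V heads per layer, head size 128), an f16 cache (2 bytes per element), and the CPU block size of 32 tokens, one KV block spans all layers. All figures below are assumptions for illustration:

    size_t elems_per_token = 32 * (8 * 128 + 8 * 128);  // 65,536 cache elements per token across all layers
    size_t bytes_per_block = elems_per_token * 32 * 2;  // * block_size * sizeof(f16) = 4,194,304 B = 4 MiB
    size_t num_kv_blocks   = 4ull * 1024 * 1024 * 1024 / bytes_per_block;  // 4 GB budget -> 1024 blocks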
@@ -168,11 +172,13 @@ class DeviceConfig {
     }
 
     size_t get_block_size_in_bytes() const {
-        size_t block_size = 0;
+        size_t block_size_in_bytes = 0;
         for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) {
-            block_size += 2 * m_num_kv_heads[layer_id] * m_block_size * m_head_size * get_cache_precision().size();
+            const KVHeadConfig& config = m_kv_heads_config[layer_id];
+            block_size_in_bytes += config.k_head_size * config.num_k_heads + config.v_head_size * config.num_v_heads;
        }
-        return block_size;
+        block_size_in_bytes *= get_block_size() * get_cache_precision().size();
+        return block_size_in_bytes;
     }
};
}
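One design note on the rewritten get_block_size_in_bytes(): the old formula 2 * num_kv_heads * block_size * head_size baked in the assumption that K and V have identical geometry, while the new version sums the K and V contributions per layer and multiplies by the shared block size and element size once at the end; when K and V do match, the two agree. A standalone sketch of the new formula, using the KVHeadConfig type from this PR (the free-function form and its parameters are hypothetical):

    size_t block_size_in_bytes(const std::vector<ov::genai::KVHeadConfig>& layers,
                               size_t block_size, size_t elem_size) {
        size_t elems = 0;
        for (const auto& c : layers)
            elems += c.k_head_size * c.num_k_heads + c.v_head_size * c.num_v_heads;  // K and V counted separately
        return elems * block_size * elem_size;  // bytes for one KV block across all decoder layers
    }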