12
12
13
13
#include " sequence_group.hpp"
14
14
15
-
16
15
namespace ov ::genai {
17
16
18
17
class KVCacheBlock {
@@ -188,7 +187,10 @@ class CacheStateDumper;
188
187
*/
189
188
class BlockAllocator {
190
189
std::vector<std::list<KVCacheBlock::Ptr >> m_free_blocks;
191
- int m_total_num_blocks;
190
+ // We keep a separate m_free_blocks_num counter instead of calling m_free_blocks[X].size() to work around old C++ standard library implementations in which std::list::size() is not constant-time
191
+ // see https://stackoverflow.com/questions/13157164/why-isnt-stdlist-size-constant-time
192
+ std::vector<size_t > m_free_blocks_num;
193
+ size_t m_total_num_blocks;
192
194
friend class CacheStateDumper ;
193
195
size_t m_num_layers;
194
196
bool m_enable_prefix_caching;
@@ -202,8 +204,8 @@ class BlockAllocator {
202
204
* @param num_layers The number of separate attention layers with KV caches in the LLM associated with the pipeline.
203
205
* Blocks returned will be vectors with this size, each vector entry to be associated with a separate layer's KV cache.
204
206
*/
205
- BlockAllocator (int num_blocks, bool enable_prefix_caching, size_t num_layers = 1 ) :
206
- m_total_num_blocks (num_blocks), m_num_layers(num_layers), m_enable_prefix_caching(enable_prefix_caching), m_overwriteable_blocks(num_layers) {
207
+ BlockAllocator (size_t num_blocks, bool enable_prefix_caching, size_t num_layers = 1 ) :
208
+ m_free_blocks_num (num_layers, num_blocks), m_total_num_blocks(num_blocks), m_num_layers(num_layers), m_enable_prefix_caching(enable_prefix_caching), m_overwriteable_blocks(num_layers) {
207
209
OPENVINO_ASSERT (num_layers != 0 , " num_layers must be non-zero" );
208
210
m_free_blocks.resize (m_num_layers);
209
211
for (auto & per_layer_block_list : m_free_blocks) {
@@ -224,7 +226,7 @@ class BlockAllocator {
224
226
* @return Number of free blocks for this layer.
225
227
*/
226
228
size_t num_free_blocks (size_t layer_idx) const {
227
- return m_free_blocks [layer_idx]. size () + m_overwriteable_blocks. num_blocks ();
229
+ return m_free_blocks_num [layer_idx] + num_overwriteable_blocks ();
228
230
}
229
231
230
232
/* *
@@ -270,6 +272,7 @@ class BlockAllocator {
270
272
block_ptr->release ();
271
273
if (block_ptr->is_free ()) {
272
274
m_free_blocks[layer_idx].push_back (block_ptr);
275
+ ++m_free_blocks_num[layer_idx];
273
276
}
274
277
}
275
278
@@ -325,6 +328,7 @@ class BlockAllocator {
325
328
// actual collision case
326
329
for (size_t layer_idx = 0 ; layer_idx < colliding_blocks_per_layer.size (); layer_idx++) {
327
330
m_free_blocks[layer_idx].push_back (colliding_blocks_per_layer[layer_idx]);
331
+ ++m_free_blocks_num[layer_idx];
328
332
}
329
333
}
330
334
m_overwriteable_blocks.add (blocks_for_all_layers);
@@ -333,12 +337,14 @@ class BlockAllocator {
333
337
// TODO (vshampor): more fine-grained hash store control
334
338
for (size_t layer_idx = 0 ; layer_idx < blocks_for_all_layers.size (); layer_idx++) {
335
339
m_free_blocks[layer_idx].push_back (blocks_for_all_layers[layer_idx]);
340
+ ++m_free_blocks_num[layer_idx];
336
341
}
337
342
}
338
343
}
339
344
else {
340
345
for (size_t layer_idx = 0 ; layer_idx < blocks_for_all_layers.size (); layer_idx++) {
341
346
m_free_blocks[layer_idx].push_back (blocks_for_all_layers[layer_idx]);
347
+ ++m_free_blocks_num[layer_idx];
342
348
}
343
349
}
344
350
}
@@ -368,6 +374,7 @@ class BlockAllocator {
368
374
KVCacheBlock::Ptr allocated_block = m_free_blocks[layer_idx].front ();
369
375
allocated_block->increment ();
370
376
m_free_blocks[layer_idx].pop_front ();
377
+ --m_free_blocks_num[layer_idx];
371
378
return allocated_block;
372
379
}
373
380
@@ -386,7 +393,7 @@ class BlockAllocator {
386
393
OPENVINO_ASSERT (m_enable_prefix_caching);
387
394
OPENVINO_ASSERT (can_allocate_blocks (1 ));
388
395
389
- if (m_free_blocks [0 ]. size () > 0 ) {
396
+ if (m_free_blocks_num [0 ] > 0 ) {
390
397
// allocate new empty block
391
398
BlocksPerLayer allocated_blocks;
392
399
allocated_blocks.reserve (m_num_layers);
@@ -396,6 +403,7 @@ class BlockAllocator {
396
403
allocated_block->set_hash (hash);
397
404
allocated_blocks.push_back (allocated_block);
398
405
m_free_blocks[i].pop_front ();
406
+ --m_free_blocks_num[i];
399
407
}
400
408
cached_blocks[hash] = allocated_blocks;
401
409
return allocated_blocks;
0 commit comments