Skip to content

Commit 3c9fd76

Browse files
CB: fixed scheduler perf on old platforms (openvinotoolkit#1284)
Ported openvinotoolkit#1283 to current master
1 parent cbb1fa0 commit 3c9fd76

File tree

2 files changed

+17
-6
lines changed

2 files changed

+17
-6
lines changed

src/cpp/src/block_manager.hpp

+14-6
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212

1313
#include "sequence_group.hpp"
1414

15-
1615
namespace ov::genai {
1716

1817
class KVCacheBlock {
@@ -188,7 +187,10 @@ class CacheStateDumper;
188187
*/
189188
class BlockAllocator {
190189
std::vector<std::list<KVCacheBlock::Ptr>> m_free_blocks;
191-
int m_total_num_blocks;
190+
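// We track the per-layer free block count in m_free_blocks_num instead of calling m_free_blocks[X].size() to work around old C++ standard library implementations where std::list::size() is not constant-time (it walks the whole list),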
191+
// see https://stackoverflow.com/questions/13157164/why-isnt-stdlist-size-constant-time
192+
std::vector<size_t> m_free_blocks_num;
193+
size_t m_total_num_blocks;
192194
friend class CacheStateDumper;
193195
size_t m_num_layers;
194196
bool m_enable_prefix_caching;
@@ -202,8 +204,8 @@ class BlockAllocator {
202204
* @param num_layers The number of separate attention layers with KV caches in the LLM associated with the pipeline.
203205
* Blocks returned will be vectors with this size, each vector entry to be associated with a separate layer's KV cache.
204206
*/
205-
BlockAllocator(int num_blocks, bool enable_prefix_caching, size_t num_layers = 1) :
206-
m_total_num_blocks(num_blocks), m_num_layers(num_layers), m_enable_prefix_caching(enable_prefix_caching), m_overwriteable_blocks(num_layers) {
207+
BlockAllocator(size_t num_blocks, bool enable_prefix_caching, size_t num_layers = 1) :
208+
m_free_blocks_num(num_layers, num_blocks), m_total_num_blocks(num_blocks), m_num_layers(num_layers), m_enable_prefix_caching(enable_prefix_caching), m_overwriteable_blocks(num_layers) {
207209
OPENVINO_ASSERT(num_layers != 0, "num_layers must be non-zero");
208210
m_free_blocks.resize(m_num_layers);
209211
for (auto& per_layer_block_list : m_free_blocks) {
@@ -224,7 +226,7 @@ class BlockAllocator {
224226
* @return Number of free blocks for this layer.
225227
*/
226228
size_t num_free_blocks(size_t layer_idx) const {
227-
return m_free_blocks[layer_idx].size() + m_overwriteable_blocks.num_blocks();
229+
return m_free_blocks_num[layer_idx] + num_overwriteable_blocks();
228230
}
229231

230232
/**
@@ -270,6 +272,7 @@ class BlockAllocator {
270272
block_ptr->release();
271273
if (block_ptr->is_free()) {
272274
m_free_blocks[layer_idx].push_back(block_ptr);
275+
++m_free_blocks_num[layer_idx];
273276
}
274277
}
275278

@@ -325,6 +328,7 @@ class BlockAllocator {
325328
// actual collision case
326329
for (size_t layer_idx = 0; layer_idx < colliding_blocks_per_layer.size(); layer_idx++) {
327330
m_free_blocks[layer_idx].push_back(colliding_blocks_per_layer[layer_idx]);
331+
++m_free_blocks_num[layer_idx];
328332
}
329333
}
330334
m_overwriteable_blocks.add(blocks_for_all_layers);
@@ -333,12 +337,14 @@ class BlockAllocator {
333337
// TODO (vshampor): more fine-grained hash store control
334338
for (size_t layer_idx = 0; layer_idx < blocks_for_all_layers.size(); layer_idx++) {
335339
m_free_blocks[layer_idx].push_back(blocks_for_all_layers[layer_idx]);
340+
++m_free_blocks_num[layer_idx];
336341
}
337342
}
338343
}
339344
else {
340345
for (size_t layer_idx = 0; layer_idx < blocks_for_all_layers.size(); layer_idx++) {
341346
m_free_blocks[layer_idx].push_back(blocks_for_all_layers[layer_idx]);
347+
++m_free_blocks_num[layer_idx];
342348
}
343349
}
344350
}
@@ -368,6 +374,7 @@ class BlockAllocator {
368374
KVCacheBlock::Ptr allocated_block = m_free_blocks[layer_idx].front();
369375
allocated_block->increment();
370376
m_free_blocks[layer_idx].pop_front();
377+
--m_free_blocks_num[layer_idx];
371378
return allocated_block;
372379
}
373380

@@ -386,7 +393,7 @@ class BlockAllocator {
386393
OPENVINO_ASSERT(m_enable_prefix_caching);
387394
OPENVINO_ASSERT(can_allocate_blocks(1));
388395

389-
if (m_free_blocks[0].size() > 0) {
396+
if (m_free_blocks_num[0] > 0) {
390397
// allocate new empty block
391398
BlocksPerLayer allocated_blocks;
392399
allocated_blocks.reserve(m_num_layers);
@@ -396,6 +403,7 @@ class BlockAllocator {
396403
allocated_block->set_hash(hash);
397404
allocated_blocks.push_back(allocated_block);
398405
m_free_blocks[i].pop_front();
406+
--m_free_blocks_num[i];
399407
}
400408
cached_blocks[hash] = allocated_blocks;
401409
return allocated_blocks;

src/cpp/src/sequence_group.hpp

+3
Original file line numberDiff line numberDiff line change
@@ -477,6 +477,9 @@ class SequenceGroup {
477477
}
478478

479479
void clear_waiting_sequences() {
480+
if (!is_waiting())
481+
return;
482+
480483
for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) {
481484
if (m_sequences[seq_id]->is_waiting()) {
482485
m_sequences[seq_id]->set_status(SequenceStatus::RUNNING);

0 commit comments

Comments
 (0)