Skip to content

Commit a4bb9f0

Browse files
Merge pull request #11 from ilya-lavrenov/change-block-size
Changed block_size according to latest CPU changes
2 parents e7fd50f + 2daf27b commit a4bb9f0

File tree

5 files changed

+6
-5
lines changed

5 files changed

+6
-5
lines changed

text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp

+1 -1
Original file line number | Diff line number | Diff line change
@@ -66,7 +66,7 @@ int main(int argc, char* argv[]) try {
6666
.max_num_batched_tokens = 32,
6767
// cache params
6868
.num_kv_blocks = 364,
69-
.block_size = 16,
69+
.block_size = 32,
7070
// mode - vLLM or dynamic_split_fuse
7171
.dynamic_split_fuse = dynamic_split_fuse,
7272
// vLLM specific params

text_generation/causal_lm/cpp/continuous_batching/apps/throughput_benchmark.cpp

+2 -2
Original file line number | Diff line number | Diff line change
@@ -177,8 +177,8 @@ int main(int argc, char* argv[]) try {
177177
// Perform the first inference
178178
SchedulerConfig scheduler_config {
179179
.max_num_batched_tokens = max_batch_size,
180-
.num_kv_blocks = 36800,
181-
.block_size = 16,
180+
.num_kv_blocks = 15000,
181+
.block_size = 32,
182182
.dynamic_split_fuse = dynamic_split_fuse,
183183
.max_num_seqs = 256, // not used if dynamic_split_fuse=True
184184
.max_paddings = 256, // not used if dynamic_split_fuse=True

text_generation/causal_lm/cpp/continuous_batching/library/include/generation_config.hpp

+1
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66

77
#include <cstdlib>
88
#include <limits>
9+
#include <string>
910
#include <functional>
1011

1112
enum class StopCriteria {

text_generation/causal_lm/cpp/continuous_batching/library/include/scheduler_config.hpp

+1 -1
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ struct SchedulerConfig {
1616
std::size_t num_kv_blocks = 16;
1717

1818
// block size for KV cache
19-
std::size_t block_size = 16;
19+
std::size_t block_size = 32;
2020

2121
// whether to split prompt / generate to different scheduling phases
2222
bool dynamic_split_fuse = true;

text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44

55
from common import run_test_pipeline, get_models_list
66

7-
scheduler_params_list = [{"num_kv_blocks": 300, "block_size": 16, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256},
7+
scheduler_params_list = [{"num_kv_blocks": 300, "block_size": 32, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256},
88
{"num_kv_blocks": 40, "block_size": 4, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, # test preemption for dynamic_split_fuse
99
{"num_kv_blocks": 40, "block_size": 4, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}] # test preemption for vllm
1010
@pytest.mark.parametrize("scheduler_params", scheduler_params_list)

0 commit comments

Comments (0)