Prefix caching for sequences with embeddings. #1841

Merged
15 changes: 7 additions & 8 deletions src/cpp/src/block_manager.hpp
@@ -1073,7 +1073,7 @@ class BlockManager {
         // When add_request() is executed in multiple threads accessing to cached_blocks causes segfault.
         // The mutex is needed to prevent such segfaults.
         const std::lock_guard<std::mutex> lock(m_cached_blocks_map_mutex);
-        auto prompt_ids = group->get_prompt_ids();
+        auto prompt_len = group->get_prompt_len();
         auto sequences = group->get_not_finished_sequences();
         OPENVINO_ASSERT(sequences.size() == 1);
         auto sequence = sequences[0];
@@ -1085,11 +1085,11 @@
         auto& block_table = m_block_table[seq_id];

         size_t content_len = 0;
-        while (content_len < prompt_ids.size()) {
+        while (content_len < prompt_len) {
             size_t prev_iteration_content_len = content_len;
             content_len += m_block_size;
-            if (content_len > prompt_ids.size()) {
-                content_len = prompt_ids.size();
+            if (content_len > prompt_len) {
+                content_len = prompt_len;
             }
             // restore fully filled blocks
             auto full_block_hash = sequence->get_hash(content_len);
@@ -1101,11 +1101,11 @@
                 block->set_timestamp(timestamp);
                 block_table[layer_idx].push_back(block);
             }
-            group->update_processed_tokens_num(content_len == prompt_ids.size() ? content_len - 1 : content_len);
+            group->update_processed_tokens_num(content_len == prompt_len ? content_len - 1 : content_len);
         } else {
             // restore partially filled block
             for (size_t i = 1; i < m_block_size; i++) {
-                if (prev_iteration_content_len + i > prompt_ids.size()) {
+                if (prev_iteration_content_len + i > prompt_len) {
                     break;
                 }
                 auto hash = sequence->get_hash(prev_iteration_content_len + i);
@@ -1118,8 +1118,7 @@
                     block->set_timestamp(timestamp);
                     block_table[layer_idx].push_back(block);
                 }
-
-                group->update_processed_tokens_num(prev_iteration_content_len + i == prompt_ids.size() ? prev_iteration_content_len + i - 1 : prev_iteration_content_len + i);
+                group->update_processed_tokens_num(prev_iteration_content_len + i == prompt_len ? prev_iteration_content_len + i - 1 : prev_iteration_content_len + i);

                 break;
             }
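The only functional change here replaces `prompt_ids.size()` with the group's `get_prompt_len()`, so sequence groups driven by embeddings (which carry no prompt token ids) can restore KV blocks from the prefix cache too. The walk itself is untouched: advance in `m_block_size` steps, hash and look up each fully filled block, then probe the trailing partial block. A minimal sketch of that walk, using illustrative names (`block_size`, `prompt_len`), not the real `BlockManager` API:

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>

int main() {
    const size_t block_size = 16;
    const size_t prompt_len = 37;  // works for token or embedding inputs alike

    size_t content_len = 0;
    while (content_len < prompt_len) {
        size_t prev = content_len;
        content_len = std::min(content_len + block_size, prompt_len);
        if (content_len - prev == block_size) {
            // fully filled block: hash(prefix + block) and look it up in the cache
            std::cout << "full block covering [" << prev << ", " << content_len << ")\n";
        } else {
            // trailing partially filled block: probe hashes of shorter lengths
            std::cout << "partial block covering [" << prev << ", " << content_len << ")\n";
        }
    }
}
```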
7 changes: 4 additions & 3 deletions src/cpp/src/continuous_batching_impl.cpp
@@ -264,9 +264,6 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::add_request(uint64_t request
    SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(request_id, input_ids, sampling_params, m_block_size);

    if (m_scheduler->get_config().enable_prefix_caching) {
-        if (m_model_input_type == ModelInputType::EMBEDDINGS) {
-            OPENVINO_THROW("Prefix caching is not supported for VLM models.");
-        }
        m_scheduler->restore_cached_blocks(sequence_group);
    }

@@ -400,6 +397,10 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {

        free_fork_timer.end();
    }
+
+    // append embeddings for generated tokens
+    if (m_model_input_type == ModelInputType::EMBEDDINGS)
+        m_model_runner->append_embeddings(m_requests, scheduler_output);

    // notify requests dropped by handle
    {
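With the `OPENVINO_THROW` removed from `add_request()`, prefix caching now also runs for embedding (VLM) inputs, and `step()` gains one stage: after sampling, the embeddings of freshly generated tokens are computed and cached on their sequences so the next forward pass can gather them. A rough sketch of the resulting order of operations; every name below is an illustrative stub, not the pipeline's real internals:

```cpp
#include <iostream>

void schedule()          { std::cout << "pick sequence groups and KV blocks\n"; }
void infer_and_sample()  { std::cout << "forward pass, sample new token ids\n"; }
void append_embeddings() { std::cout << "embed newly sampled ids, cache per sequence\n"; }
void notify_handles()    { std::cout << "notify requests dropped by handle\n"; }

int main() {
    schedule();
    infer_and_sample();
    append_embeddings();  // the stage this PR adds for EMBEDDINGS inputs
    notify_handles();
}
```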
87 changes: 59 additions & 28 deletions src/cpp/src/model_runner.hpp
@@ -119,7 +119,6 @@ class ModelRunner {
        size_t total_num_tokens = 0, total_num_blocks = 0;
        size_t max_context_len_val = 0;
        size_t hidden_size = 0;
-        size_t num_generated_ids = 0;
        OPENVINO_ASSERT(sequence_groups.size() > 0);
        auto sequence_group_type = sequence_groups[0]->get_sequence_group_type();
        if (sequence_group_type == SequenceGroupType::EMBEDDINGS) {
@@ -135,9 +134,6 @@
            total_num_tokens += sequence_group->get_num_scheduled_tokens() * num_sequences;
            total_num_blocks += sequence_group->get_num_blocks() * num_sequences;
            max_context_len_val = std::max(max_context_len_val, sequence_group->get_context_len());
-            for (auto seq: sequence_group->get_running_sequences()) {
-                num_generated_ids += seq->get_generated_len();
-            }
        }

        ov::Tensor
@@ -163,27 +159,6 @@
        if (sequence_group_type == SequenceGroupType::EMBEDDINGS) {
            OPENVINO_ASSERT(m_embedding.get_request(), "Got sequence group with embeddings, but embeddings model wasn't set.");
            inputs_embeds_data = inputs_embeds.data<float>();
-
-            ov::Tensor generated_ids = ov::Tensor(ov::element::i64, {1, num_generated_ids});
-            int64_t *generated_ids_data = generated_ids.data<int64_t>();
-            size_t pos = 0;
-            for (size_t i = 0; i < num_sequence_groups; ++i) {
-                size_t seq_group_id = scheduler_output.m_scheduled_sequence_groups_ids[i];
-                SequenceGroup::CPtr sequence_group = sequence_groups[seq_group_id];
-                for (auto seq: sequence_group->get_running_sequences()) {
-                    auto generated_ids = seq->get_generated_ids();
-                    for (size_t token_idx = 0; token_idx < generated_ids.size(); token_idx++) {
-                        generated_ids_data[pos] = generated_ids[token_idx];
-                        pos++;
-                    }
-                }
-            }
-            if (pos > 0) {
-                // TODO: Compute embeddings only for last generated token, while previously generated embeddings save in SequenceGroup
-                generated_ids_embeds = m_embedding.infer(generated_ids);
-                generated_ids_embeds_data = generated_ids_embeds.data<float>();
-            }
-
        } else if (sequence_group_type == SequenceGroupType::TOKENS) {
            input_ids_data = input_ids.data<int64_t>();
        }
@@ -234,8 +209,8 @@
                    sequence_group->get_prompt_ids()[position_id] :
                    sequence->get_generated_ids()[position_id - prompt_len];
            } else if (sequence_group_type == SequenceGroupType::EMBEDDINGS) {
-                auto embeds_pos = position_id < prompt_len ? 0 : hidden_size * (position_id - prompt_len);
-                const float* src = position_id < prompt_len ? sequence_group->get_input_embeds()[position_id].data() : generated_ids_embeds_data + embeds_pos;
+                const auto& generated_embeds = sequence->get_generated_ids_embeds();
+                const float* src = position_id < prompt_len ? sequence_group->get_input_embeds()[position_id].data() : generated_embeds[position_id - prompt_len].data();
                std::copy_n(src, hidden_size, inputs_embeds_data + token_id * hidden_size);
            } else {
                OPENVINO_THROW("Unknown model inputs type.");
@@ -271,7 +246,6 @@
                input_ids_data += num_scheduled_tokens;
            } else if (sequence_group_type == SequenceGroupType::EMBEDDINGS) {
                inputs_embeds_data += num_scheduled_tokens * hidden_size;
-                generated_ids_embeds_data += sequence->get_generated_len() * hidden_size;
            }

            position_ids_data += num_scheduled_tokens;
@@ -337,6 +311,63 @@
        return m_request.get_tensor("logits");
    }

+    void append_embeddings(const std::vector<SequenceGroup::Ptr> & sequence_groups, const Scheduler::Output& scheduler_output) {
+        size_t num_sequence_groups = scheduler_output.m_scheduled_sequence_groups_ids.size();
+        size_t num_generated_ids_without_embeddings = 0;
+        OPENVINO_ASSERT(sequence_groups.size() > 0);
+
+        // compute aggregated values
+        for (size_t i = 0; i < num_sequence_groups; ++i) {
+            size_t seq_group_id = scheduler_output.m_scheduled_sequence_groups_ids[i];
+            SequenceGroup::CPtr sequence_group = sequence_groups[seq_group_id];
+            size_t num_sequences = sequence_group->num_running_seqs();
+            OPENVINO_ASSERT(sequence_group->get_sequence_group_type() == SequenceGroupType::EMBEDDINGS);
+            for (auto seq: sequence_group->get_running_sequences()) {
+                num_generated_ids_without_embeddings += seq->get_generated_len() - seq->get_generated_ids_embeds().size();
+            }
+        }
+        size_t hidden_size = sequence_groups[0]->get_hidden_size();
+
+        ov::Tensor generated_ids_embeds;
+        float *generated_ids_embeds_data = nullptr;
+
+        OPENVINO_ASSERT(m_embedding.get_request(), "Got sequence group with embeddings, but embeddings model wasn't set.");
+
+        ov::Tensor generated_ids = ov::Tensor(ov::element::i64, {1, num_generated_ids_without_embeddings});
+        int64_t *generated_ids_data = generated_ids.data<int64_t>();
+        size_t pos = 0;
+        for (size_t i = 0; i < num_sequence_groups; ++i) {
+            size_t seq_group_id = scheduler_output.m_scheduled_sequence_groups_ids[i];
+            SequenceGroup::CPtr sequence_group = sequence_groups[seq_group_id];
+            for (auto seq: sequence_group->get_running_sequences()) {
+                const auto& generated_ids = seq->get_generated_ids();
+                for (size_t token_idx = seq->get_generated_ids_embeds().size(); token_idx < generated_ids.size(); token_idx++) {
+                    generated_ids_data[pos] = generated_ids[token_idx];
+                    pos++;
+                }
+            }
+        }
+        if (pos > 0) {
+            generated_ids_embeds = m_embedding.infer(generated_ids);
+            generated_ids_embeds_data = generated_ids_embeds.data<float>();
+
+            for (size_t i = 0; i < num_sequence_groups; ++i) {
+                size_t seq_group_id = scheduler_output.m_scheduled_sequence_groups_ids[i];
+                size_t embeds_pos = 0;
+                SequenceGroup::Ptr sequence_group = sequence_groups[seq_group_id];
+                for (auto seq: sequence_group->get_running_sequences()) {
+                    auto generated_ids = seq->get_generated_ids();
+                    size_t new_embeds_count = seq->get_generated_len() - seq->get_generated_ids_embeds().size();
+                    ov::Coordinate start{0, embeds_pos, 0};
+                    ov::Coordinate end{1, embeds_pos + new_embeds_count, hidden_size};
+                    ov::Tensor embedding(generated_ids_embeds, start, end);
+                    seq->append_generated_ids_embeds(embedding);
+                    embeds_pos += new_embeds_count;
+                }
+            }
+        }
+    }
+
 private:
    void _fill_indices_from_block_tables(
        const std::vector<std::string>& dst_tensor_names,
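The new `append_embeddings()` gathers every generated token id that has no cached embedding yet into a single `[1, N]` id tensor, runs the embedding model once, and hands each sequence its rows as a zero-copy region-of-interest view; `ov::Tensor(other, begin, end)` is the standard OpenVINO ROI constructor. A sketch of just the slicing step, with made-up token counts:

```cpp
#include <openvino/runtime/tensor.hpp>
#include <iostream>
#include <vector>

int main() {
    const size_t total_tokens = 5, hidden = 8;
    // one batched result from the embedding model: [1, total_tokens, hidden]
    ov::Tensor all_embeds(ov::element::f32, {1, total_tokens, hidden});

    // e.g. sequence A contributed 2 new tokens and sequence B contributed 3
    std::vector<size_t> new_tokens_per_seq{2, 3};
    size_t pos = 0;
    for (size_t count : new_tokens_per_seq) {
        ov::Coordinate begin{0, pos, 0};
        ov::Coordinate end{1, pos + count, hidden};
        ov::Tensor slice(all_embeds, begin, end);  // zero-copy view of rows [pos, pos + count)
        std::cout << "slice shape: " << slice.get_shape() << '\n';
        pos += count;
    }
}
```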
50 changes: 43 additions & 7 deletions src/cpp/src/sequence_group.cpp
@@ -24,20 +24,56 @@ size_t Sequence::_make_hash(size_t content_length) {
    content.insert(content.end(), m_prefix_hashes.begin(), m_prefix_hashes.begin() + prefix_hashes_needed_count);

    // get tokens corresponding to current block
-    const auto prompt_ids = sequence_group->get_prompt_ids();
-    OPENVINO_ASSERT(content_length <= prompt_ids.size() + m_generated_ids.size());
-    if (block_start_idx < prompt_ids.size()) {
-        content.insert(content.end(), prompt_ids.begin() + block_start_idx, prompt_ids.begin() + std::min(prompt_ids.size(), content_length));
+    if (sequence_group->get_sequence_group_type() == SequenceGroupType::TOKENS) {
+        const auto prompt_ids = sequence_group->get_prompt_ids();
+        OPENVINO_ASSERT(content_length <= prompt_ids.size() + m_generated_ids.size());
+        if (block_start_idx < prompt_ids.size()) {
+            content.insert(content.end(), prompt_ids.begin() + block_start_idx, prompt_ids.begin() + std::min(prompt_ids.size(), content_length));
+        }
+        if (content_length > prompt_ids.size()) {
+            size_t start = block_start_idx < prompt_ids.size() ? 0 : block_start_idx - prompt_ids.size();
+            content.insert(content.end(), m_generated_ids.begin() + start, m_generated_ids.begin() + content_length - prompt_ids.size());
+        }
    }
-    if (content_length > prompt_ids.size()) {
-        size_t start = block_start_idx < prompt_ids.size() ? 0 : block_start_idx - prompt_ids.size();
-        content.insert(content.end(), m_generated_ids.begin() + start, m_generated_ids.begin() + content_length - prompt_ids.size());
+    else if (sequence_group->get_sequence_group_type() == SequenceGroupType::EMBEDDINGS) {
+        const auto& input_embeds = sequence_group->get_input_embeds();
+        const auto generated_embeds = m_generated_ids_embeds;
+        OPENVINO_ASSERT(content_length <= input_embeds.size() + generated_embeds.size());
+
+        // get inputs embeddings
+        if (block_start_idx < input_embeds.size()) {
+            for (size_t idx = block_start_idx; idx < std::min(input_embeds.size(), content_length); idx++) {
+                auto embed = _reduce_embedding(input_embeds[idx]);
+                content.insert(content.end(), embed.begin(), embed.end());
+            }
+        }
+
+        // get generated ids embeddings
+        if (content_length > input_embeds.size()) {
+            size_t start = block_start_idx < input_embeds.size() ? 0 : block_start_idx - input_embeds.size();
+            for (size_t idx = start; idx < content_length - input_embeds.size(); idx++) {
+                auto embed = _reduce_embedding(generated_embeds[idx]);
+                content.insert(content.end(), embed.begin(), embed.end());
+            }
+        }
    }
+    else {
+        OPENVINO_THROW("Hash calculation is not supported for this sequence type.");
+    }
    const char* data = reinterpret_cast<const char*>(content.data());
    std::size_t size = content.size() * sizeof(content[0]);
    return std::hash<std::string_view>{}(std::string_view(data, size));
}

+std::vector<int64_t> Sequence::_reduce_embedding(const std::vector<float>& embedding) {
+    size_t res_size = std::min((size_t)ceil(float(embedding.size()) / m_embeddings_hash_calculation_stride), m_embeddings_hash_max_num_values);
+    std::vector<int64_t> res(res_size);
+    for (size_t i = 0, idx = 0; idx < res_size; i += m_embeddings_hash_calculation_stride, idx++) {
+        res[idx] = std::round(embedding[i] * m_multiplier);
+    }
+    return res;
+}
+
 // Each KV block can be uniquely identified by
 // the tokens within the block and the tokens in the prefix before the block.
 // hash(prefix tokens + block tokens) <--> KV Block
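Hashing raw float vectors for every block would be costly and numerically brittle, so `_reduce_embedding()` first shrinks each embedding to a short integer signature: at most `m_embeddings_hash_max_num_values` (10) samples taken every `m_embeddings_hash_calculation_stride` (50) elements, each scaled by `m_multiplier` (10000) and rounded. For a hidden size of 3584, for example, that is min(ceil(3584/50), 10) = 10 samples at indices 0, 50, ..., 450. A standalone sketch with the same constants:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

// Same reduction as the diff's _reduce_embedding, lifted out of the class.
std::vector<int64_t> reduce_embedding(const std::vector<float>& e) {
    const size_t stride = 50, max_vals = 10, multiplier = 10000;
    size_t n = std::min<size_t>(std::ceil(float(e.size()) / stride), max_vals);
    std::vector<int64_t> out(n);
    for (size_t i = 0, idx = 0; idx < n; i += stride, idx++)
        out[idx] = std::llround(e[i] * multiplier);  // quantize: 4 decimal places survive
    return out;
}

int main() {
    std::vector<float> e(3584, 0.5f);  // toy embedding, hidden size 3584
    for (int64_t v : reduce_embedding(e))
        std::cout << v << ' ';         // prints "5000" ten times
}
```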
48 changes: 40 additions & 8 deletions src/cpp/src/sequence_group.hpp
@@ -49,26 +49,38 @@ class Sequence {
    std::vector<int64_t> m_prefix_hashes;
    SequenceGroup* m_sequence_group = nullptr;
    static std::mutex m_counter_mutex;
+    std::vector<std::vector<float>> m_generated_ids_embeds;
+    SequenceGroupType m_type;
+    size_t m_hidden_size;
+
+    // Embeddings hash calculation params
+    static constexpr size_t m_embeddings_hash_max_num_values = 10; // max number of values used for embeddings hash calculation
+    static constexpr size_t m_embeddings_hash_calculation_stride = 50; // the stride with which values are taken from embeddings vector
+    static constexpr size_t m_multiplier = 10000; // multiplier by which float values are multiplied before conversion to size_t

    size_t _make_hash(size_t content_length);

-    explicit Sequence(const uint64_t id) : m_grouped_id(id) {}
+    static std::vector<int64_t> _reduce_embedding(const std::vector<float>& embedding);
+
+    explicit Sequence(const uint64_t id, const SequenceGroupType type, const size_t hidden_size) : m_grouped_id(id), m_type(type), m_hidden_size(hidden_size) {}

    Sequence(const Sequence& seq, const uint64_t id) :
        m_generated_ids(seq.m_generated_ids),
        m_grouped_id(id),
        m_status(seq.m_status),
        m_cumulative_log_prob(seq.m_cumulative_log_prob),
-        m_sequence_group(seq.m_sequence_group) {
+        m_sequence_group(seq.m_sequence_group),
+        m_type(seq.m_type),
+        m_hidden_size(seq.m_hidden_size) {
        OPENVINO_ASSERT(seq.m_id != m_id);
    }

 public:
    using Ptr = std::shared_ptr<Sequence>;
    using CPtr = std::shared_ptr<const Sequence>;

-    static Sequence::Ptr create(const uint64_t id) {
-        return Sequence::Ptr(new Sequence(id));
+    static Sequence::Ptr create(const uint64_t id, const SequenceGroupType type = SequenceGroupType::TOKENS, const size_t hidden_size = 0) {
+        return Sequence::Ptr(new Sequence(id, type, hidden_size));
    }

    static Sequence::Ptr fork(Sequence::CPtr sequence, const uint64_t id) {
@@ -191,6 +203,25 @@
        m_sequence_group = sequence_group;
    }

+    const std::vector<std::vector<float>>& get_generated_ids_embeds() const {
+        OPENVINO_ASSERT(m_type == ov::genai::SequenceGroupType::EMBEDDINGS);
+        return m_generated_ids_embeds;
+    }
+
+    void append_generated_ids_embeds(ov::Tensor generated_ids_embeds) {
+        OPENVINO_ASSERT(m_type == SequenceGroupType::EMBEDDINGS);
+        auto embeds_count = generated_ids_embeds.get_shape()[1];
+        OPENVINO_ASSERT(m_hidden_size == generated_ids_embeds.get_shape()[2]);
+
+        auto current_embeds_size = m_generated_ids_embeds.size();
+        for (size_t i = current_embeds_size, idx = 0; i < current_embeds_size + embeds_count; i++, idx++) {
+            m_generated_ids_embeds.emplace_back(std::vector<float>());
+            m_generated_ids_embeds[i].resize(m_hidden_size);
+            std::copy_n(generated_ids_embeds.data<float>() + idx * m_hidden_size, m_hidden_size, m_generated_ids_embeds[i].begin());
+        }
+    }
+
    std::shared_ptr<SequenceGroup> get_sequence_group_ptr() const;

    // Each KV block can be uniquely identified by
@@ -261,6 +292,7 @@ class SequenceGroup : public std::enable_shared_from_this<SequenceGroup> {
        : SequenceGroup(request_id, sampling_params, block_size) {

        size_t prompt_len;
+        size_t hidden_size = 0;
        if (input_ids.get_shape().size() > 1) {
            prompt_len = input_ids.get_shape()[1];
        } else {
@@ -273,11 +305,11 @@
            std::copy_n(input_ids.data<int64_t>(), prompt_len, m_prompt_ids.begin());
            m_sequence_group_type = SequenceGroupType::TOKENS;
        } else if (input_ids.get_element_type() == ov::element::f32) {
-            auto embeds_len = input_ids.get_shape()[2];
+            hidden_size = input_ids.get_shape()[2];
            m_input_embeds.resize(prompt_len);
            for (size_t i = 0; i < prompt_len; i++) {
-                m_input_embeds[i].resize(embeds_len);
-                std::copy_n(input_ids.data<float>() + i * embeds_len, embeds_len, m_input_embeds[i].begin());
+                m_input_embeds[i].resize(hidden_size);
+                std::copy_n(input_ids.data<float>() + i * hidden_size, hidden_size, m_input_embeds[i].begin());
            }
            m_sequence_group_type = SequenceGroupType::EMBEDDINGS;
        }
@@ -287,7 +319,7 @@
        m_prompt_log_probs.reserve(prompt_len);

        // create a single sequence
-        add_sequence(Sequence::create(m_next_sequence_id++));
+        add_sequence(Sequence::create(m_next_sequence_id++, m_sequence_group_type, hidden_size));
    }

    void add_sequence(const Sequence::Ptr & sequence) {
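On the storage side, `append_generated_ids_embeds()` unpacks a `[1, count, hidden_size]` tensor into per-token `std::vector<float>` rows on the sequence, which both the model-runner gather and the hash reduction later read. A self-contained sketch of that unpacking, with toy sizes and a plain vector standing in for the `Sequence` member:

```cpp
#include <openvino/runtime/tensor.hpp>
#include <algorithm>
#include <iostream>
#include <vector>

int main() {
    const size_t count = 2, hidden = 4;
    ov::Tensor embeds(ov::element::f32, {1, count, hidden});
    std::fill_n(embeds.data<float>(), count * hidden, 0.5f);

    std::vector<std::vector<float>> cache;  // stands in for Sequence::m_generated_ids_embeds
    for (size_t idx = 0; idx < count; idx++) {
        std::vector<float> row(hidden);
        std::copy_n(embeds.data<float>() + idx * hidden, hidden, row.begin());
        cache.push_back(std::move(row));
    }
    std::cout << cache.size() << " rows cached\n";  // prints "2 rows cached"
}
```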