
Commit 2e3cc74

Add sampling to vlm pipeline by Sampler
1 parent 2450d96 commit 2e3cc74

File tree (5 files changed, +220 / -57 lines):

- samples/CMakeLists.txt
- samples/cpp/visual_language_chat/visual_language_chat.cpp
- src/cpp/src/sampler.cpp
- src/cpp/src/sampler.hpp
- src/cpp/src/vlm_pipeline.cpp

samples/CMakeLists.txt (+2)

```diff
@@ -29,6 +29,8 @@ install(DIRECTORY
     cpp/whisper_speech_recognition
     cpp/stable_diffusion
     cpp/lora_greedy_causal_lm
+    # cpp/visual_language_chat
+    # cpp/continuous_batching_accuracy
     DESTINATION samples/cpp COMPONENT cpp_samples_genai)
 
 install(DIRECTORY
```

samples/cpp/visual_language_chat/visual_language_chat.cpp (+9 / -2)

```diff
@@ -31,13 +31,20 @@ int main(int argc, char* argv[]) try {
     }
     pipe.generate(
         prompt,
-        ov::genai::image(std::move(image)),
+        // ov::genai::image(std::move(image)),
+        // ov::genai::generation_config(ov::genai::beam_search()),
+        ov::genai::generation_config(ov::genai::greedy()),
+        // ov::genai::generation_config(ov::genai::multinomial()),
         ov::genai::streamer(print_subword)
     );
     std::cout << "\n----------\n"
         "question:\n";
     while (std::getline(std::cin, prompt)) {
-        pipe.generate(prompt, ov::genai::streamer(print_subword));
+        pipe.generate(prompt,
+            // ov::genai::generation_config(ov::genai::beam_search()),
+            ov::genai::generation_config(ov::genai::greedy()),
+            // ov::genai::generation_config(ov::genai::multinomial()),
+            ov::genai::streamer(print_subword));
         std::cout << "\n----------\n"
             "question:\n";
     }
```
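The sample now hard-codes greedy decoding and leaves the image input, beam search, and multinomial sampling commented out. As a minimal sketch of how a caller could pick one of the three predefined configs at run time instead of editing the source (the `config_for` helper and its `mode` string are hypothetical, not part of the sample):

```cpp
#include "openvino/genai/generation_config.hpp"
#include <string>

// Hypothetical helper: map a mode string to one of the predefined configs
// referenced in the diff above. Assumes ov::genai::greedy(), beam_search()
// and multinomial() each return a ready-to-use ov::genai::GenerationConfig,
// as the sample's calls imply.
ov::genai::GenerationConfig config_for(const std::string& mode) {
    if (mode == "beam")        return ov::genai::beam_search();
    if (mode == "multinomial") return ov::genai::multinomial();
    return ov::genai::greedy();  // default, matching the committed sample
}
```

The result would then be passed as `ov::genai::generation_config(config_for(mode))` in the `pipe.generate(...)` calls shown above.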

src/cpp/src/sampler.cpp (+22)

```diff
@@ -230,6 +230,20 @@ Sampler::GroupBeamSearcher::GroupBeamSearcher(SequenceGroup::Ptr sequence_group,
     }
 }
 
+std::vector<int32_t> Sampler::GroupBeamSearcher::get_beam_idxs() {
+    std::vector<int32_t> next_beams;
+
+    for (Group& group : m_groups) {
+        if (!group.done) {
+            for (Beam& beam : group.ongoing) {
+                next_beams.push_back(beam.m_global_beam_idx);
+            }
+        }
+    }
+
+    return next_beams;
+}
+
 void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutput& sampler_output) {
     assert(m_parameters.num_beams % m_parameters.num_beam_groups == 0 &&
         "number of beams should be divisible by number of groups");
@@ -560,6 +574,14 @@ std::vector<int64_t> Sampler::_try_finish_generation(SequenceGroup::Ptr & sequen
     return dropped_seq_ids;
 }
 
+std::vector<int32_t> Sampler::get_beam_idxs(uint64_t request_id) {
+    std::vector<int32_t> beams;
+    if (m_beam_search_info.find(request_id) != m_beam_search_info.end()) {
+        GroupBeamSearcher beam_searcher = m_beam_search_info.at(request_id);
+        std::vector<int32_t> beams = beam_searcher.get_beam_idxs();
+    }
+    return beams;
+}
 
 SamplerOutput Sampler::sample(std::vector<SequenceGroup::Ptr> & sequence_groups, ov::Tensor logits) {
     const float * logits_data = logits.data<float>();
```

src/cpp/src/sampler.hpp (+4)

```diff
@@ -61,6 +61,8 @@ class Sampler {
     SamplerOutput sample(std::vector<SequenceGroup::Ptr> & sequence_groups, ov::Tensor logits);
     void set_seed(size_t seed) { rng_engine.seed(seed); }
     void clear_beam_search_info(uint64_t request_id);
+
+    std::vector<int32_t> get_beam_idxs(uint64_t request_id);
 };
 
 class Sampler::GroupBeamSearcher {
@@ -105,5 +107,7 @@ class Sampler::GroupBeamSearcher {
 
     void select_next_tokens(const ov::Tensor& logits, SamplerOutput& sampler_output);
     void finalize(SamplerOutput& sampler_output);
+
+    std::vector<int32_t> get_beam_idxs();
 };
 }
```
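The new `get_beam_idxs` accessor exposes, per request, the global indices of the beams that are still ongoing, which is what a stateful LM needs in its `beam_idx` input to reorder the KV cache between steps. A simplified sketch of that consumption, mirroring the vlm_pipeline.cpp change below (`sampler`, `language`, `request_id`, and `num_sequences` are assumed from surrounding code, so this is illustrative rather than an exact excerpt):

```cpp
// Map each running sequence to the beam it should continue from.
// Falls back to beam 0 when beam search is not active for this request.
std::vector<int32_t> beam_idxs = sampler->get_beam_idxs(request_id);
ov::Tensor beam_idx(ov::element::i32, {num_sequences});
int32_t* beam_idx_data = beam_idx.data<int32_t>();
for (size_t i = 0; i < num_sequences; ++i) {
    beam_idx_data[i] = i < beam_idxs.size() ? beam_idxs[i] : 0;
}
language.set_tensor("beam_idx", beam_idx);
```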

src/cpp/src/vlm_pipeline.cpp (+183 / -55)

```diff
@@ -13,6 +13,10 @@
 #include <optional>
 #include <random>
 
+#include "sampler.hpp"
+
+#include "debug_utils.hpp"
+
 using namespace ov::genai;
 
 namespace {
@@ -416,6 +420,160 @@ ov::Tensor get_image_embedding(const EncodedImage& encoded_image, Tokenizer& tok
     }
 }
 
+void forward_embedings_and_lm(SequenceGroup::CPtr sequence_group, ov::InferRequest& embedding, ov::InferRequest& language, const VLMConfig m_vlm_config, const std::shared_ptr<Sampler> sampler) {
+    // compute aggregated values
+    size_t num_sequences = sequence_group->num_running_seqs();
+    size_t batch_size_in_sequences = num_sequences;
+    size_t total_num_tokens = sequence_group->get_num_scheduled_tokens() * num_sequences;
+    size_t total_num_blocks = sequence_group->get_num_blocks() * num_sequences;
+    size_t max_context_len_val = std::max(max_context_len_val, sequence_group->get_context_len());
+
+    ov::Tensor
+        input_ids(ov::element::i64, {total_num_tokens, 1}),
+        position_ids(ov::element::i64, {total_num_tokens, 1}),
+        beam_idx(ov::element::i32, { total_num_tokens });
+
+    // get raw pointers to copy to
+    int64_t
+        * input_ids_data = input_ids.data<int64_t>(),
+        * position_ids_data = position_ids.data<int64_t>();
+    int32_t
+        * beam_idx_data = beam_idx.data<int32_t>();
+
+    std::vector<Sequence::CPtr> running_sequences = sequence_group->get_running_sequences();
+    size_t num_running_sequences = running_sequences.size();
+    size_t num_scheduled_tokens = sequence_group->get_num_scheduled_tokens();
+    size_t group_position_id = sequence_group->get_num_processed_tokens();
+
+    // spec: In case of multiple input tokens for current sequence (prompt_len > 1),
+    // context_len corresponds to first token within subgroup of scheduled tokens
+    size_t group_context_len = group_position_id;
+
+    for (size_t seq_id = 0; seq_id < num_running_sequences; ++seq_id) {
+        Sequence::CPtr sequence = running_sequences[seq_id];
+
+        for (size_t token_id = 0, position_id = group_position_id; token_id < num_scheduled_tokens; ++token_id, ++position_id) {
+            // compute token for current sequence
+            input_ids_data[token_id] = position_id < sequence_group->get_prompt_len() ?
+                sequence_group->get_prompt_ids()[position_id] :
+                sequence->get_generated_ids()[position_id - sequence_group->get_prompt_len()];
+
+            position_ids_data[token_id] = position_id;
+        }
+
+        // apply strides to shift to a next sequence
+        input_ids_data += num_scheduled_tokens;
+        position_ids_data += num_scheduled_tokens;
+    }
+
+    embedding.set_input_tensor(input_ids);
+
+    embedding.infer();
+    const ov::Tensor& embed_prompt_tensor = embedding.get_output_tensor();
+    float* embed_data = embed_prompt_tensor.data<float>();
+    for (auto idx = 0; idx < embed_prompt_tensor.get_size(); idx++) {
+        embed_data[idx] = embed_data[idx] * m_vlm_config.scale_emb;
+    }
+
+    language.set_tensor("inputs_embeds", embed_prompt_tensor);
+
+    language.get_tensor("attention_mask").set_shape({ total_num_tokens, language.get_tensor("attention_mask").get_shape()[1] + 1 });
+    std::fill_n(language.get_tensor("attention_mask").data<int64_t>(), language.get_tensor("attention_mask").get_size(), 1);
+
+    language.set_tensor("position_ids", position_ids);
+    std::vector<int32_t> beam_idxs = sampler->get_beam_idxs(sequence_group->get_request_id());
+    if (beam_idxs.empty()) {
+        for (size_t i = 0; i < num_sequences; i++) {
+            beam_idx_data[i] = 0;
+        }
+    } else {
+        for (size_t i = 0; i < beam_idxs.size(); i++) {
+            beam_idx_data[i] = beam_idxs.at(i);
+        }
+    }
+    language.set_tensor("beam_idx", beam_idx);
+
+    // print_tensor("input_ids", input_ids);
+    // print_tensor("position_ids", position_ids);
+    // print_tensor("attention_mask", language.get_tensor("attention_mask"));
+    // print_tensor("beam_idx", beam_idx);
+
+    language.infer();
+}
+
+EncodedGenerationResult get_lm_encoded_results(
+    ov::InferRequest& language,
+    ov::InferRequest& embedding,
+    ov::Tensor inputs_embeds,
+    const VLMConfig m_vlm_config,
+    const std::shared_ptr<StreamerBase> streamer_ptr,
+    const std::shared_ptr<Sampler> sampler,
+    std::vector<SequenceGroup::Ptr> requests
+) {
+    SequenceGroup::Ptr request = requests.back();
+    GenerationHandle generation = std::make_shared<GenerationHandleImpl>(request->get_generation_stream(), request->get_sampling_parameters());
+
+    language.set_tensor("inputs_embeds", inputs_embeds);
+
+    size_t history_len = language.get_tensor("attention_mask").get_shape().at(1);
+    language.get_tensor("attention_mask").set_shape({1, history_len + inputs_embeds.get_shape()[1]});
+    std::fill_n(language.get_tensor("attention_mask").data<int64_t>(), language.get_tensor("attention_mask").get_size(), 1);
+
+    language.get_tensor("position_ids").set_shape({1, inputs_embeds.get_shape().at(1)});
+    std::iota(language.get_tensor("position_ids").data<int64_t>(), language.get_tensor("position_ids").data<int64_t>() + language.get_tensor("position_ids").get_size(), history_len);
+
+    language.get_tensor("beam_idx").set_shape({ BATCH_SIZE });
+    language.get_tensor("beam_idx").data<int32_t>()[0] = 0;
+
+    language.infer();
+
+    int64_t sequence_len = language.get_tensor("logits").get_shape().at(1);
+    request->schedule_tokens(sequence_len);
+
+    SamplerOutput sampler_output = sampler->sample(requests, language.get_tensor("logits"));
+
+    language.get_tensor("inputs_embeds").set_shape({BATCH_SIZE, 1, m_vlm_config.hidden_size});
+    language.get_tensor("position_ids").set_shape({ BATCH_SIZE, 1 });
+
+
+    while (!request->has_finished()) {
+        request->schedule_tokens(1);
+
+        forward_embedings_and_lm(request, embedding, language, m_vlm_config, sampler);
+
+        if (streamer_ptr) {
+            // first sequences
+            int64_t out_token = request.get()->operator[](0)->get_generated_ids().back();
+            if (streamer_ptr->put(out_token)) {
+                break;
+            }
+        }
+
+        sampler_output = sampler->sample(requests, language.get_tensor("logits"));
+    }
+
+    if (streamer_ptr) {
+        streamer_ptr->end();
+    }
+
+    EncodedGenerationResult result;
+    result.m_request_id = 1;
+    std::vector<GenerationOutput> generation_outputs = generation->read_all();
+    std::sort(generation_outputs.begin(), generation_outputs.end(), [=] (GenerationOutput& r1, GenerationOutput& r2) {
+        return r1.score > r2.score;
+    });
+
+    auto num_outputs = std::min(request->get_sampling_parameters().num_return_sequences, generation_outputs.size());
+    for (size_t generation_output_idx = 0; generation_output_idx < num_outputs; ++generation_output_idx) {
+        const auto& generation_output = generation_outputs[generation_output_idx];
+        result.m_generation_ids.push_back(std::move(generation_output.generated_ids));
+        result.m_scores.push_back(generation_output.score);
+    }
+    result.m_status = generation->get_status();
+
+    return result;
+}
+
 VLMPipeline::VLMPipeline(
     const std::filesystem::path& model_dir,
     const Tokenizer& tokenizer,
```
```diff
@@ -473,31 +631,16 @@ DecodedResults VLMPipeline::generate(
         ov::Tensor prompt_tensor = process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb);
         inputs_embeds = concatenate_mid_dim(imgEmbedTensor, prompt_tensor);
     }
-    m_language.set_tensor("inputs_embeds", inputs_embeds);
-    size_t history_len = m_language.get_tensor("attention_mask").get_shape().at(1);
-    m_language.get_tensor("attention_mask").set_shape({1, history_len + inputs_embeds.get_shape()[1]});
-    std::fill_n(m_language.get_tensor("attention_mask").data<int64_t>(), m_language.get_tensor("attention_mask").get_size(), 1);
-    m_language.get_tensor("position_ids").set_shape({1, inputs_embeds.get_shape().at(1)});
-    std::iota(m_language.get_tensor("position_ids").data<int64_t>(), m_language.get_tensor("position_ids").data<int64_t>() + m_language.get_tensor("position_ids").get_size(), history_len);
-    m_language.get_tensor("beam_idx").set_shape({ BATCH_SIZE });
-    m_language.get_tensor("beam_idx").data<int32_t>()[0] = 0;
 
-    m_language.infer();
+    std::shared_ptr<Sampler> sampler = std::make_shared<Sampler>(m_tokenizer);
 
-    ov::Shape logits_shape = m_language.get_tensor("logits").get_shape();
-    auto attention_size = m_language.get_tensor("attention_mask").get_size();
+    std::vector<SequenceGroup::Ptr> requests;
+    // request_id, input_ids, generation_config, block_size, enable_prefix_caching
+    // now we have one prompt as input, so we need one request
+    SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(0, input_ids, generation_config, 1, false);
+    sequence_group->set_sequence_group_ptr(sequence_group);
+    requests.push_back(sequence_group);
 
-    int64_t sequence_len = m_language.get_tensor("logits").get_shape().at(1) - 1;
-    size_t vocab_size = m_language.get_tensor("logits").get_shape().back();
-    float* logits = m_language.get_tensor("logits").data<float>() + sequence_len * vocab_size;
-    int64_t out_token = std::max_element(logits, logits + vocab_size) - logits;
-
-    m_language.get_tensor("inputs_embeds").set_shape({BATCH_SIZE, 1, m_vlm_config.hidden_size});
-    m_language.get_tensor("position_ids").set_shape({ BATCH_SIZE, 1 });
-
-    m_embedding.get_input_tensor().set_shape({ 1, 1 });
-
-    int64_t eos_token_id = m_tokenizer.get_eos_token_id();
     std::shared_ptr<StreamerBase> streamer_ptr = std::visit(overloaded{
         [&m_tokenizer = m_tokenizer](
             const std::function<bool(std::string)>& callback
@@ -511,47 +654,23 @@ DecodedResults VLMPipeline::generate(
             return std::shared_ptr<StreamerBase>{nullptr};
         },
     }, streamer);
-    std::vector<int64_t> generated;
-    while (true) { //(out_token != eos_token_id)
-        //out_token embedding
-        m_embedding.get_input_tensor().data<int64_t>()[0] = out_token;
-        m_embedding.infer();
-        const ov::Tensor& embed_prompt_tensor = m_embedding.get_output_tensor();
-        float* embed_data = embed_prompt_tensor.data<float>();
-        for (auto idx = 0; idx < embed_prompt_tensor.get_size(); idx++) {
-            embed_data[idx] = embed_data[idx] * m_vlm_config.scale_emb;
-        }
 
-        m_language.set_tensor("inputs_embeds", embed_prompt_tensor);
-        m_language.get_tensor("attention_mask").set_shape({ BATCH_SIZE, m_language.get_tensor("attention_mask").get_shape()[1] + 1 });
-        std::fill_n(m_language.get_tensor("attention_mask").data<int64_t>(), m_language.get_tensor("attention_mask").get_size(), 1);
-        m_language.get_tensor("position_ids").data<int64_t>()[0] = int64_t(m_language.get_tensor("attention_mask").get_size() - 2);
-
-        m_language.infer();
-
-        generated.push_back(out_token);
-        if (streamer_ptr && streamer_ptr->put(out_token)) {
-            break;
-        }
-        logits = m_language.get_tensor("logits").data<float>();
-
-        out_token = std::max_element(logits, logits + vocab_size) - logits;
-        if (out_token == eos_token_id) {
-            break;
-        }
-    }
-
-    if (streamer_ptr) {
-        streamer_ptr->end();
-    }
+    EncodedGenerationResult encoded_result = get_lm_encoded_results(m_language, m_embedding, inputs_embeds, m_vlm_config, streamer_ptr, sampler, requests);
 
     if (!is_chat_conversation) {
         for (auto& variable : m_language.query_state()) {
             variable.reset();
         }
        m_language.get_tensor("attention_mask").set_shape({1, 0});
     }
-    return {{m_tokenizer.decode(generated)}};
+
+    DecodedResults decoded;
+    for (size_t idx = 0; idx < encoded_result.m_generation_ids.size(); ++idx) {
+        decoded.texts.push_back(m_tokenizer.decode(encoded_result.m_generation_ids.at(idx)));
+        decoded.scores.push_back(encoded_result.m_scores.at(idx));
+    }
+
+    return decoded;
 }
 
 DecodedResults VLMPipeline::generate(
@@ -562,6 +681,15 @@ DecodedResults VLMPipeline::generate(
     ov::genai::OptionalGenerationConfig config_arg = utils::get_config_from_map(config_map);
     GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config();
     config.update_generation_config(config_map);
+
+    // If eos_token_id was not provided, take value
+    if (config.eos_token_id == -1)
+        config.set_eos_token_id(m_tokenizer.get_eos_token_id());
+
+    // if (is_chat_conversation && config.num_return_sequences > 1) {
+    //     config.num_return_sequences = 1;
+    // }
+
     return generate(
         prompt,
         config_map.end() == image ? std::vector<ov::Tensor>{}
```
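Because decoding now runs through `Sampler` and results are collected from the `GenerationHandle`, the pipeline can hand back several scored hypotheses instead of a single greedy string. A rough usage sketch, assuming a `pipe` (`ov::genai::VLMPipeline`) and `prompt` already set up as in the visual_language_chat sample above; the specific config values chosen here are illustrative only:

```cpp
// Request beam search and keep the three highest-scoring beams.
// beam_search() is the predefined config referenced in the sample diff;
// num_return_sequences and max_new_tokens are example values.
ov::genai::GenerationConfig config = ov::genai::beam_search();
config.num_return_sequences = 3;
config.max_new_tokens = 100;

ov::genai::DecodedResults results = pipe.generate(
    prompt,
    ov::genai::generation_config(config));

// DecodedResults now carries one text and one score per returned sequence.
for (size_t i = 0; i < results.texts.size(); ++i) {
    std::cout << results.scores[i] << ": " << results.texts[i] << "\n";
}
```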
