#include <optional>
#include <random>

+ #include "sampler.hpp"
+
+ #include "debug_utils.hpp"
+
using namespace ov::genai;

namespace {
@@ -416,6 +420,160 @@ ov::Tensor get_image_embedding(const EncodedImage& encoded_image, Tokenizer& tok
    }
}

+ void forward_embedings_and_lm(SequenceGroup::CPtr sequence_group, ov::InferRequest& embedding, ov::InferRequest& language, const VLMConfig m_vlm_config, const std::shared_ptr<Sampler> sampler) {
+     // compute aggregated values
+     size_t num_sequences = sequence_group->num_running_seqs();
+     size_t batch_size_in_sequences = num_sequences;
+     size_t total_num_tokens = sequence_group->get_num_scheduled_tokens() * num_sequences;
+     size_t total_num_blocks = sequence_group->get_num_blocks() * num_sequences;
+     size_t max_context_len_val = sequence_group->get_context_len();
+
+     ov::Tensor
+         input_ids(ov::element::i64, {total_num_tokens, 1}),
+         position_ids(ov::element::i64, {total_num_tokens, 1}),
+         beam_idx(ov::element::i32, { total_num_tokens });
+
+     // get raw pointers to copy to
+     int64_t
+         * input_ids_data = input_ids.data<int64_t>(),
+         * position_ids_data = position_ids.data<int64_t>();
+     int32_t
+         * beam_idx_data = beam_idx.data<int32_t>();
+
+     std::vector<Sequence::CPtr> running_sequences = sequence_group->get_running_sequences();
+     size_t num_running_sequences = running_sequences.size();
+     size_t num_scheduled_tokens = sequence_group->get_num_scheduled_tokens();
+     size_t group_position_id = sequence_group->get_num_processed_tokens();
+
+     // spec: In case of multiple input tokens for current sequence (prompt_len > 1),
+     // context_len corresponds to first token within subgroup of scheduled tokens
+     size_t group_context_len = group_position_id;
+
+     for (size_t seq_id = 0; seq_id < num_running_sequences; ++seq_id) {
+         Sequence::CPtr sequence = running_sequences[seq_id];
+
+         for (size_t token_id = 0, position_id = group_position_id; token_id < num_scheduled_tokens; ++token_id, ++position_id) {
+             // compute token for current sequence
+             input_ids_data[token_id] = position_id < sequence_group->get_prompt_len() ?
+                 sequence_group->get_prompt_ids()[position_id] :
+                 sequence->get_generated_ids()[position_id - sequence_group->get_prompt_len()];
+
+             position_ids_data[token_id] = position_id;
+         }
+
+         // apply strides to shift to a next sequence
+         input_ids_data += num_scheduled_tokens;
+         position_ids_data += num_scheduled_tokens;
+     }
+
+     embedding.set_input_tensor(input_ids);
+
+     embedding.infer();
+     const ov::Tensor& embed_prompt_tensor = embedding.get_output_tensor();
+     float* embed_data = embed_prompt_tensor.data<float>();
+     for (auto idx = 0; idx < embed_prompt_tensor.get_size(); idx++) {
+         embed_data[idx] = embed_data[idx] * m_vlm_config.scale_emb;
+     }
+
+     language.set_tensor("inputs_embeds", embed_prompt_tensor);
+
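+     // grow the attention mask: one row per scheduled token, one extra column for the new position, all ones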
+     language.get_tensor("attention_mask").set_shape({ total_num_tokens, language.get_tensor("attention_mask").get_shape()[1] + 1 });
+     std::fill_n(language.get_tensor("attention_mask").data<int64_t>(), language.get_tensor("attention_mask").get_size(), 1);
+
+     language.set_tensor("position_ids", position_ids);
+     std::vector<int32_t> beam_idxs = sampler->get_beam_idxs(sequence_group->get_request_id());
+     if (beam_idxs.empty()) {
+         for (size_t i = 0; i < num_sequences; i++) {
+             beam_idx_data[i] = 0;
+         }
+     } else {
+         for (size_t i = 0; i < beam_idxs.size(); i++) {
+             beam_idx_data[i] = beam_idxs.at(i);
+         }
+     }
+     language.set_tensor("beam_idx", beam_idx);
+
+     // print_tensor("input_ids", input_ids);
+     // print_tensor("position_ids", position_ids);
+     // print_tensor("attention_mask", language.get_tensor("attention_mask"));
+     // print_tensor("beam_idx", beam_idx);
+
+     language.infer();
+ }
+
+ EncodedGenerationResult get_lm_encoded_results(
+     ov::InferRequest& language,
+     ov::InferRequest& embedding,
+     ov::Tensor inputs_embeds,
+     const VLMConfig m_vlm_config,
+     const std::shared_ptr<StreamerBase> streamer_ptr,
+     const std::shared_ptr<Sampler> sampler,
+     std::vector<SequenceGroup::Ptr> requests
+ ) {
+     SequenceGroup::Ptr request = requests.back();
+     GenerationHandle generation = std::make_shared<GenerationHandleImpl>(request->get_generation_stream(), request->get_sampling_parameters());
+
+     language.set_tensor("inputs_embeds", inputs_embeds);
+
+     size_t history_len = language.get_tensor("attention_mask").get_shape().at(1);
+     language.get_tensor("attention_mask").set_shape({1, history_len + inputs_embeds.get_shape()[1]});
+     std::fill_n(language.get_tensor("attention_mask").data<int64_t>(), language.get_tensor("attention_mask").get_size(), 1);
+
+     language.get_tensor("position_ids").set_shape({1, inputs_embeds.get_shape().at(1)});
+     std::iota(language.get_tensor("position_ids").data<int64_t>(), language.get_tensor("position_ids").data<int64_t>() + language.get_tensor("position_ids").get_size(), history_len);
+
+     language.get_tensor("beam_idx").set_shape({ BATCH_SIZE });
+     language.get_tensor("beam_idx").data<int32_t>()[0] = 0;
+
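+     // prefill: run the language model once over the whole prompt embedding sequence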
+     language.infer();
+
+     int64_t sequence_len = language.get_tensor("logits").get_shape().at(1);
+     request->schedule_tokens(sequence_len);
+
+     SamplerOutput sampler_output = sampler->sample(requests, language.get_tensor("logits"));
+
+     language.get_tensor("inputs_embeds").set_shape({BATCH_SIZE, 1, m_vlm_config.hidden_size});
+     language.get_tensor("position_ids").set_shape({ BATCH_SIZE, 1 });
+
+
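+     // decode loop: schedule one token per running sequence per step, then let the sampler pick the next tokens from the logits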
+     while (!request->has_finished()) {
+         request->schedule_tokens(1);
+
+         forward_embedings_and_lm(request, embedding, language, m_vlm_config, sampler);
+
+         if (streamer_ptr) {
+             // first sequences
+             int64_t out_token = request.get()->operator[](0)->get_generated_ids().back();
+             if (streamer_ptr->put(out_token)) {
+                 break;
+             }
+         }
+
+         sampler_output = sampler->sample(requests, language.get_tensor("logits"));
+     }
+
+     if (streamer_ptr) {
+         streamer_ptr->end();
+     }
+
+     EncodedGenerationResult result;
+     result.m_request_id = 1;
+     std::vector<GenerationOutput> generation_outputs = generation->read_all();
+     std::sort(generation_outputs.begin(), generation_outputs.end(), [=] (GenerationOutput& r1, GenerationOutput& r2) {
+         return r1.score > r2.score;
+     });
+
+     auto num_outputs = std::min(request->get_sampling_parameters().num_return_sequences, generation_outputs.size());
+     for (size_t generation_output_idx = 0; generation_output_idx < num_outputs; ++generation_output_idx) {
+         const auto& generation_output = generation_outputs[generation_output_idx];
+         result.m_generation_ids.push_back(std::move(generation_output.generated_ids));
+         result.m_scores.push_back(generation_output.score);
+     }
+     result.m_status = generation->get_status();
+
+     return result;
+ }
+
VLMPipeline::VLMPipeline(
    const std::filesystem::path& model_dir,
    const Tokenizer& tokenizer,
@@ -473,31 +631,16 @@ DecodedResults VLMPipeline::generate(
        ov::Tensor prompt_tensor = process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb);
        inputs_embeds = concatenate_mid_dim(imgEmbedTensor, prompt_tensor);
    }
-     m_language.set_tensor("inputs_embeds", inputs_embeds);
-     size_t history_len = m_language.get_tensor("attention_mask").get_shape().at(1);
-     m_language.get_tensor("attention_mask").set_shape({1, history_len + inputs_embeds.get_shape()[1]});
-     std::fill_n(m_language.get_tensor("attention_mask").data<int64_t>(), m_language.get_tensor("attention_mask").get_size(), 1);
-     m_language.get_tensor("position_ids").set_shape({1, inputs_embeds.get_shape().at(1)});
-     std::iota(m_language.get_tensor("position_ids").data<int64_t>(), m_language.get_tensor("position_ids").data<int64_t>() + m_language.get_tensor("position_ids").get_size(), history_len);
-     m_language.get_tensor("beam_idx").set_shape({ BATCH_SIZE });
-     m_language.get_tensor("beam_idx").data<int32_t>()[0] = 0;

-     m_language.infer();
+     std::shared_ptr<Sampler> sampler = std::make_shared<Sampler>(m_tokenizer);

-     ov::Shape logits_shape = m_language.get_tensor("logits").get_shape();
-     auto attention_size = m_language.get_tensor("attention_mask").get_size();
+     std::vector<SequenceGroup::Ptr> requests;
+     // request_id, input_ids, generation_config, block_size, enable_prefix_caching
+     // now we have one prompt as input, so we need one request
+     SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(0, input_ids, generation_config, 1, false);
+     sequence_group->set_sequence_group_ptr(sequence_group);
+     requests.push_back(sequence_group);

-     int64_t sequence_len = m_language.get_tensor("logits").get_shape().at(1) - 1;
-     size_t vocab_size = m_language.get_tensor("logits").get_shape().back();
-     float* logits = m_language.get_tensor("logits").data<float>() + sequence_len * vocab_size;
-     int64_t out_token = std::max_element(logits, logits + vocab_size) - logits;
-
-     m_language.get_tensor("inputs_embeds").set_shape({BATCH_SIZE, 1, m_vlm_config.hidden_size});
-     m_language.get_tensor("position_ids").set_shape({ BATCH_SIZE, 1 });
-
-     m_embedding.get_input_tensor().set_shape({ 1, 1 });
-
-     int64_t eos_token_id = m_tokenizer.get_eos_token_id();
    std::shared_ptr<StreamerBase> streamer_ptr = std::visit(overloaded{
        [&m_tokenizer = m_tokenizer](
            const std::function<bool(std::string)>& callback
@@ -511,47 +654,23 @@ DecodedResults VLMPipeline::generate(
            return std::shared_ptr<StreamerBase>{nullptr};
        },
    }, streamer);
-     std::vector<int64_t> generated;
-     while (true) { // (out_token != eos_token_id)
-         // out_token embedding
-         m_embedding.get_input_tensor().data<int64_t>()[0] = out_token;
-         m_embedding.infer();
-         const ov::Tensor& embed_prompt_tensor = m_embedding.get_output_tensor();
-         float* embed_data = embed_prompt_tensor.data<float>();
-         for (auto idx = 0; idx < embed_prompt_tensor.get_size(); idx++) {
-             embed_data[idx] = embed_data[idx] * m_vlm_config.scale_emb;
-         }

-         m_language.set_tensor("inputs_embeds", embed_prompt_tensor);
-         m_language.get_tensor("attention_mask").set_shape({ BATCH_SIZE, m_language.get_tensor("attention_mask").get_shape()[1] + 1 });
-         std::fill_n(m_language.get_tensor("attention_mask").data<int64_t>(), m_language.get_tensor("attention_mask").get_size(), 1);
-         m_language.get_tensor("position_ids").data<int64_t>()[0] = int64_t(m_language.get_tensor("attention_mask").get_size() - 2);
-
-         m_language.infer();
-
-         generated.push_back(out_token);
-         if (streamer_ptr && streamer_ptr->put(out_token)) {
-             break;
-         }
-         logits = m_language.get_tensor("logits").data<float>();
-
-         out_token = std::max_element(logits, logits + vocab_size) - logits;
-         if (out_token == eos_token_id) {
-             break;
-         }
-     }
-
-     if (streamer_ptr) {
-         streamer_ptr->end();
-     }
+     EncodedGenerationResult encoded_result = get_lm_encoded_results(m_language, m_embedding, inputs_embeds, m_vlm_config, streamer_ptr, sampler, requests);

    if (!is_chat_conversation) {
        for (auto& variable : m_language.query_state()) {
            variable.reset();
        }
        m_language.get_tensor("attention_mask").set_shape({1, 0});
    }
-     return {{m_tokenizer.decode(generated)}};
+
+     DecodedResults decoded;
+     for (size_t idx = 0; idx < encoded_result.m_generation_ids.size(); ++idx) {
+         decoded.texts.push_back(m_tokenizer.decode(encoded_result.m_generation_ids.at(idx)));
+         decoded.scores.push_back(encoded_result.m_scores.at(idx));
+     }
+
+     return decoded;
}

DecodedResults VLMPipeline::generate(
@@ -562,6 +681,15 @@ DecodedResults VLMPipeline::generate(
    ov::genai::OptionalGenerationConfig config_arg = utils::get_config_from_map(config_map);
    GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config();
    config.update_generation_config(config_map);
+
+     // If eos_token_id was not provided, take the value from the tokenizer
+     if (config.eos_token_id == -1)
+         config.set_eos_token_id(m_tokenizer.get_eos_token_id());
+
+     // if (is_chat_conversation && config.num_return_sequences > 1) {
+     //     config.num_return_sequences = 1;
+     // }
+
    return generate(
        prompt,
        config_map.end() == image ? std::vector<ov::Tensor>{}
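
For orientation, a minimal usage sketch of the pipeline after this change follows. It is not part of the diff: the header path, the constructor's device argument, the property keys ("image", "generation_config"), and the image tensor shape are assumptions inferred from the signatures visible above, not confirmed API.

    #include <iostream>

    #include "openvino/genai/vlm_pipeline.hpp"   // header path is an assumption

    int main() {
        // Placeholder model directory; the device argument of the constructor is assumed.
        ov::genai::Tokenizer tokenizer("./model_dir");
        ov::genai::VLMPipeline pipe("./model_dir", tokenizer, "CPU");

        ov::genai::GenerationConfig config;
        config.max_new_tokens = 100;
        // eos_token_id can stay at its default (-1); the last hunk fills it in from the tokenizer.

        // Dummy image tensor; the expected layout/shape is an assumption.
        ov::Tensor image(ov::element::u8, {1, 448, 448, 3});

        // Property keys are assumptions based on the `image` lookup in the AnyMap overload above.
        ov::AnyMap properties{
            {"image", image},
            {"generation_config", config},
        };

        ov::genai::DecodedResults result = pipe.generate("Describe this image.", properties);
        std::cout << result.texts.at(0) << std::endl;
        return 0;
    }

The behavioral change visible in the diff is that decoding now goes through SequenceGroup and Sampler instead of a hand-rolled argmax loop, so settings such as num_return_sequences and EOS handling are driven by GenerationConfig.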