@@ -367,7 +367,7 @@ DecodedResults VLMPipeline::generate(
         }
     }
     images_prompt += prompt;
-    std::string new_templated_chat_history;
+    ov::Tensor encoded_input;
     if (m_is_chat_conversation) {
         // KV cache in model already contains prompts and answers from previous iterations.
         // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns
@@ -379,32 +379,45 @@ DecodedResults VLMPipeline::generate(
         // KV cache contains it. So we have to add it manually or get it by tokenization all chat history.
         m_history.push_back({{"role", "user"}, {"content", images_prompt}});
         constexpr bool add_generation_prompt = true;
-        new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
+        std::string new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
+        ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history).input_ids;
+        if (0 == m_language.get_tensor("attention_mask").get_shape().at(1)) {
+            encoded_input = new_chat_tokens;
+        } else {
+            TokenizedInputs prev_chat_tokens = m_tokenizer.encode(
+                m_templated_chat_history
+            );
+            encoded_input = utils::subtract_chat_tokenized_inputs(
+                {new_chat_tokens}, prev_chat_tokens
+            ).input_ids;
+        }
+        m_templated_chat_history = std::move(new_templated_chat_history);
+    } else {
+        encoded_input = m_tokenizer.encode(images_prompt).input_ids;
     }
-    ov::Tensor special_tokens = m_tokenizer.encode(
-        m_vlm_config.im_start
-        + m_vlm_config.im_end
-        + m_vlm_config.slice_start
-        + m_vlm_config.slice_end
-    ).input_ids;
-    OPENVINO_ASSERT(
-        4 == special_tokens.get_shape().at(1),
-        "Every special token must be represented with a single int."
-    );
-    size_t im_start_id = special_tokens.data<int64_t>()[0];
-    size_t im_end_id = special_tokens.data<int64_t>()[1];
-    size_t slice_start_id = special_tokens.data<int64_t>()[2];
-    size_t slice_end_id = special_tokens.data<int64_t>()[3];
-    ov::Tensor input_ids = m_tokenizer.encode(new_templated_chat_history).input_ids;
-    m_embedding.set_input_tensor(input_ids);
+    m_embedding.set_input_tensor(encoded_input);
     m_embedding.infer();
     ov::Tensor inputs_embeds = m_embedding.get_output_tensor();
     OPENVINO_ASSERT(
         m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2),
         "Unexpected embedding size"
     );
     if (!rgbs.empty()) {
-        int64_t* ids = input_ids.data<int64_t>();
+        ov::Tensor special_tokens = m_tokenizer.encode(
+            m_vlm_config.im_start
+            + m_vlm_config.im_end
+            + m_vlm_config.slice_start
+            + m_vlm_config.slice_end
+        ).input_ids;
+        OPENVINO_ASSERT(
+            4 == special_tokens.get_shape().at(1),
+            "Every special token must be represented with a single int."
+        );
+        size_t im_start_id = special_tokens.data<int64_t>()[0];
+        size_t im_end_id = special_tokens.data<int64_t>()[1];
+        size_t slice_start_id = special_tokens.data<int64_t>()[2];
+        size_t slice_end_id = special_tokens.data<int64_t>()[3];
+        int64_t* ids = encoded_input.data<int64_t>();
         const ov::Tensor& resampled_source = resample(*this, embeds.resized_source, {embeds.resized_source_size});
         float* emb = resampled_source.data<float>();
         bool replacing = false;
@@ -519,22 +532,19 @@ DecodedResults VLMPipeline::generate(
         streamer_ptr->end();
     }

+    std::string decoded_results = m_tokenizer.decode(generated);
     if (m_is_chat_conversation) {
-        // auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history);
-        // if (m_is_cache_empty) {
-        //     encoded_input = new_chat_tokens;
-        // } else {
-        //     auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history);
-        //     encoded_input = subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens);
-        // }
-        // m_templated_chat_history = new_templated_chat_history;
+        // Tail of chat template is missing in KV cache.
+        // Find the tail to concatenate it with the next input prompt.
+        m_templated_chat_history.append(decoded_results);
+        m_history.push_back({{"role", "assistant"}, {"content", decoded_results}});
     } else {
         for (auto& variable : m_language.query_state()) {
             variable.reset();
         }
         m_language.get_tensor("attention_mask").set_shape({1, 0});
     }
-    return {{m_tokenizer.decode(generated)}};
+    return {{std::move(decoded_results)}};
 }

 DecodedResults VLMPipeline::generate(
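Note on the reworked chat branch: it tokenizes the full templated history and, when the language model's attention_mask already has a non-zero sequence length (i.e. the KV cache holds the previous turns), keeps only the token tail that the previously tokenized history does not cover. After generation, the decoded answer is appended to m_templated_chat_history so the next turn's subtraction lines up with what the KV cache contains. Below is a minimal sketch of that subtraction idea, using plain std::vector<int64_t> buffers and a hypothetical helper name instead of the ov::Tensor/TokenizedInputs types consumed by utils::subtract_chat_tokenized_inputs; it illustrates the technique, not the library's actual implementation.

    #include <cstdint>
    #include <vector>

    // Sketch only: return the part of the newly templated chat history that the
    // KV cache does not yet cover. Token ids are modeled as std::vector<int64_t>.
    std::vector<int64_t> chat_history_tail(const std::vector<int64_t>& new_chat_tokens,
                                           const std::vector<int64_t>& prev_chat_tokens) {
        // Length of the prefix shared with the previously submitted history,
        // which is already represented in the KV cache.
        size_t common = 0;
        while (common < prev_chat_tokens.size() && common < new_chat_tokens.size()
               && prev_chat_tokens[common] == new_chat_tokens[common]) {
            ++common;
        }
        // Only this uncovered suffix needs to be embedded and fed to the model.
        return {new_chat_tokens.begin() + common, new_chat_tokens.end()};
    }

This only works if the templated history for turn N+1 begins with the exact text of turn N's history plus the model's answer, which is why the last hunk appends decoded_results to m_templated_chat_history instead of re-templating the whole conversation.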