
Commit ebaadb7

Finalize chat_template
1 parent 447e745 commit ebaadb7

File tree

4 files changed: +54 -47 lines


samples/CMakeLists.txt (+1)

@@ -26,6 +26,7 @@ install(DIRECTORY
     cpp/multinomial_causal_lm
     # Don't install prompt_lookup_decoding_lm and speculative_decoding_lm because they don't use openvino_genai library and arent verifyed yet.
     # Don't install continuous_batching_accuracy and continuous_batching_benchmark because they depend on json.
+    cpp/visual_language_chat
     cpp/whisper_speech_recognition
     cpp/stable_diffusion
     cpp/lora_greedy_causal_lm

src/cpp/src/llm_pipeline.cpp (+1 -19)

@@ -17,24 +17,6 @@
 #include "text_callback_streamer.hpp"
 #include "openvino/genai/lora_adapter.hpp"

-namespace {
-
-ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& fisrt, const ov::genai::TokenizedInputs& second){
-    auto first_size = fisrt.input_ids.get_size();
-    auto second_size = second.input_ids.get_size();
-    ov::Shape new_shape{1, first_size - second_size};
-
-    ov::Tensor new_input_ids(ov::element::i64, new_shape);
-    auto data_ptr = fisrt.input_ids.data<int64_t>();
-    std::copy(data_ptr + second_size, data_ptr + first_size, new_input_ids.data<int64_t>());
-
-    ov::Tensor new_attention_mask(ov::element::i64, new_shape);
-    std::fill_n(new_attention_mask.data<int64_t>(), new_shape[1], 1);
-
-    return {new_input_ids, new_attention_mask};
-}
-}
-
 namespace ov {
 namespace genai {

@@ -156,7 +138,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
             encoded_input = new_chat_tokens;
         } else {
             auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(add_special_tokens_));
-            encoded_input = subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens);
+            encoded_input = utils::subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens);
         }
         m_templated_chat_history = new_templated_chat_history;
         // TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied

src/cpp/src/utils.hpp (+14)

@@ -86,6 +86,20 @@ ProcessorConfig from_any_map(

 std::pair<ov::AnyMap, ov::AnyMap> split_core_complile_config(const ov::AnyMap& plugin_config);

+inline ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& fisrt, const ov::genai::TokenizedInputs& second){
+    auto first_size = fisrt.input_ids.get_size();
+    auto second_size = second.input_ids.get_size();
+    ov::Shape new_shape{1, first_size - second_size};
+
+    ov::Tensor new_input_ids(ov::element::i64, new_shape);
+    auto data_ptr = fisrt.input_ids.data<int64_t>();
+    std::copy(data_ptr + second_size, data_ptr + first_size, new_input_ids.data<int64_t>());
+
+    ov::Tensor new_attention_mask(ov::element::i64, new_shape);
+    std::fill_n(new_attention_mask.data<int64_t>(), new_shape[1], 1);
+
+    return {new_input_ids, new_attention_mask};
+}
 } // namespace utils
 } // namespace genai
 } // namespace ov
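The helper promoted into utils assumes the tokenization of the previous chat history is an exact prefix of the tokenization of the full history: it copies only the trailing input ids and pairs them with a fresh all-ones attention mask. Below is a minimal standalone sketch of the same logic, with hypothetical token values and a stand-in struct instead of ov::genai::TokenizedInputs; the includes are assumptions.

// Sketch only: mirrors the subtraction logic from the commit with made-up data.
#include <openvino/openvino.hpp>
#include <algorithm>
#include <iostream>
#include <vector>

// Stand-in for ov::genai::TokenizedInputs (input_ids + attention_mask pair).
struct Tokenized {
    ov::Tensor input_ids;
    ov::Tensor attention_mask;
};

static ov::Tensor make_ids(const std::vector<int64_t>& v) {
    ov::Tensor t(ov::element::i64, ov::Shape{1, v.size()});
    std::copy(v.begin(), v.end(), t.data<int64_t>());
    return t;
}

// Same idea as utils::subtract_chat_tokenized_inputs: keep only the tail of
// `first` that is not already covered by `second`.
static Tokenized subtract(const Tokenized& first, const Tokenized& second) {
    auto first_size = first.input_ids.get_size();
    auto second_size = second.input_ids.get_size();
    ov::Shape new_shape{1, first_size - second_size};

    ov::Tensor new_input_ids(ov::element::i64, new_shape);
    auto data_ptr = first.input_ids.data<int64_t>();
    std::copy(data_ptr + second_size, data_ptr + first_size, new_input_ids.data<int64_t>());

    ov::Tensor new_attention_mask(ov::element::i64, new_shape);
    std::fill_n(new_attention_mask.data<int64_t>(), new_shape[1], 1);
    return {new_input_ids, new_attention_mask};
}

int main() {
    // "Previous history" tokens are a prefix of the "full history" tokens.
    Tokenized prev{make_ids({1, 15, 27, 300}), make_ids({1, 1, 1, 1})};
    Tokenized full{make_ids({1, 15, 27, 300, 42, 7, 99}), make_ids({1, 1, 1, 1, 1, 1, 1})};

    Tokenized tail = subtract(full, prev);  // only {42, 7, 99} remains
    for (size_t i = 0; i < tail.input_ids.get_size(); ++i) {
        std::cout << tail.input_ids.data<int64_t>()[i] << ' ';
    }
    std::cout << '\n';
}

In both pipelines this tail is the only thing fed to the model, since the KV cache already covers the earlier turns.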

src/cpp/src/vlm_pipeline.cpp (+38 -28)

@@ -367,7 +367,7 @@ DecodedResults VLMPipeline::generate(
         }
     }
     images_prompt += prompt;
-    std::string new_templated_chat_history;
+    ov::Tensor encoded_input;
     if (m_is_chat_conversation) {
         // KV cache in model already contains prompts and answers from previous iterations.
         // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns

@@ -379,32 +379,45 @@ DecodedResults VLMPipeline::generate(
         // KV cache contains it. So we have to add it manually or get it by tokenization all chat history.
         m_history.push_back({{"role", "user"}, {"content", images_prompt}});
         constexpr bool add_generation_prompt = true;
-        new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
+        std::string new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
+        ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history).input_ids;
+        if (0 == m_language.get_tensor("attention_mask").get_shape().at(1)) {
+            encoded_input = new_chat_tokens;
+        } else {
+            TokenizedInputs prev_chat_tokens = m_tokenizer.encode(
+                m_templated_chat_history
+            );
+            encoded_input = utils::subtract_chat_tokenized_inputs(
+                {new_chat_tokens}, prev_chat_tokens
+            ).input_ids;
+        }
+        m_templated_chat_history = std::move(new_templated_chat_history);
+    } else {
+        encoded_input = m_tokenizer.encode(images_prompt).input_ids;
     }
-    ov::Tensor special_tokens = m_tokenizer.encode(
-        m_vlm_config.im_start
-        + m_vlm_config.im_end
-        + m_vlm_config.slice_start
-        + m_vlm_config.slice_end
-    ).input_ids;
-    OPENVINO_ASSERT(
-        4 == special_tokens.get_shape().at(1),
-        "Every special token must be represented with a single int."
-    );
-    size_t im_start_id = special_tokens.data<int64_t>()[0];
-    size_t im_end_id = special_tokens.data<int64_t>()[1];
-    size_t slice_start_id = special_tokens.data<int64_t>()[2];
-    size_t slice_end_id = special_tokens.data<int64_t>()[3];
-    ov::Tensor input_ids = m_tokenizer.encode(new_templated_chat_history).input_ids;
-    m_embedding.set_input_tensor(input_ids);
+    m_embedding.set_input_tensor(encoded_input);
     m_embedding.infer();
     ov::Tensor inputs_embeds = m_embedding.get_output_tensor();
     OPENVINO_ASSERT(
         m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2),
         "Unexpected embedding size"
     );
     if (!rgbs.empty()) {
-        int64_t* ids = input_ids.data<int64_t>();
+        ov::Tensor special_tokens = m_tokenizer.encode(
+            m_vlm_config.im_start
+            + m_vlm_config.im_end
+            + m_vlm_config.slice_start
+            + m_vlm_config.slice_end
+        ).input_ids;
+        OPENVINO_ASSERT(
+            4 == special_tokens.get_shape().at(1),
+            "Every special token must be represented with a single int."
+        );
+        size_t im_start_id = special_tokens.data<int64_t>()[0];
+        size_t im_end_id = special_tokens.data<int64_t>()[1];
+        size_t slice_start_id = special_tokens.data<int64_t>()[2];
+        size_t slice_end_id = special_tokens.data<int64_t>()[3];
+        int64_t* ids = encoded_input.data<int64_t>();
         const ov::Tensor& resampled_source = resample(*this, embeds.resized_source, {embeds.resized_source_size});
         float* emb = resampled_source.data<float>();
         bool replacing = false;

@@ -519,22 +532,19 @@ DecodedResults VLMPipeline::generate(
         streamer_ptr->end();
     }

+    std::string decoded_results = m_tokenizer.decode(generated);
     if (m_is_chat_conversation) {
-        // auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history);
-        // if (m_is_cache_empty) {
-        //     encoded_input = new_chat_tokens;
-        // } else {
-        //     auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history);
-        //     encoded_input = subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens);
-        // }
-        // m_templated_chat_history = new_templated_chat_history;
+        // Tail of chat template is missing in KV cache.
+        // Find the tail to concatenate it with the next input prompt.
+        m_templated_chat_history.append(decoded_results);
+        m_history.push_back({{"role", "assistant"}, {"content", decoded_results}});
     } else {
         for (auto& variable : m_language.query_state()) {
             variable.reset();
         }
         m_language.get_tensor("attention_mask").set_shape({1, 0});
     }
-    return {{m_tokenizer.decode(generated)}};
+    return {{std::move(decoded_results)}};
 }

 DecodedResults VLMPipeline::generate(
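The reworked chat branch keeps two pieces of state in sync: m_history, the role/content messages passed to apply_chat_template, and m_templated_chat_history, the flat text whose tokens are already in the KV cache. The sketch below models that bookkeeping at the string level with hypothetical names; the real pipeline performs the equivalent subtraction on token ids via utils::subtract_chat_tokenized_inputs.

// Sketch only, not the real openvino_genai API.
#include <string>
#include <utility>
#include <vector>

struct Message { std::string role, content; };

struct ChatState {
    std::vector<Message> history;   // mirrors m_history
    std::string templated_history;  // mirrors m_templated_chat_history
};

// Before generation: the previously templated history is assumed to be a
// prefix of the newly templated one, so only the tail must be sent to the
// model -- the KV cache already holds the rest.
std::string tail_to_generate(ChatState& s, std::string new_templated) {
    std::string tail = new_templated.substr(s.templated_history.size());
    s.templated_history = std::move(new_templated);
    return tail;
}

// After generation: append the decoded answer so the next turn's prefix
// assumption still matches what the KV cache actually contains.
void commit_answer(ChatState& s, const std::string& decoded_results) {
    s.templated_history.append(decoded_results);
    s.history.push_back({"assistant", decoded_results});
}

Appending decoded_results to the templated history is what makes the next turn's subtraction line up: the assistant answer is present in the KV cache but would otherwise be missing from the stored history text.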
