@@ -44,12 +44,6 @@ class InputsEmbedder::IInputsEmbedder {
 public:
     virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) = 0;
 
-    ov::Tensor get_input_embeddings(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) {
-        ov::Tensor inputs_embeds = get_inputs_embeds(prompt, images, metrics);
-        m_inputs_embeds_size = inputs_embeds.get_shape().at(1);
-        return inputs_embeds;
-    }
-
     virtual std::pair<ov::Tensor, std::optional<int64_t>> get_position_ids(const size_t inputs_embeds_size, const size_t history_size) {
         ov::Tensor position_ids = ov::Tensor{ov::element::i64, { 1, inputs_embeds_size }};
         std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), history_size);
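
Note on the surviving context line: `std::iota` fills the position-id buffer with consecutive values starting at `history_size`, so a new turn continues numbering where the cached history left off. A minimal standalone sketch of just that fill, with a plain `std::vector` standing in for the `ov::Tensor` buffer and arbitrary example sizes:

```cpp
// Minimal illustration of the position-id fill above; the vector stands in
// for the tensor's int64 buffer, and the sizes are example values only.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
    const size_t inputs_embeds_size = 4;  // new tokens in this turn
    const size_t history_size = 10;       // tokens already in the KV cache
    std::vector<int64_t> position_ids(inputs_embeds_size);
    std::iota(position_ids.begin(), position_ids.end(), static_cast<int64_t>(history_size));
    for (int64_t id : position_ids)
        std::cout << id << ' ';  // prints: 10 11 12 13
    std::cout << '\n';
}
```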
@@ -72,32 +66,6 @@ class InputsEmbedder::IInputsEmbedder {
         m_stop_token_ids = stop_token_ids;
     }
 
-<<<<<<< HEAD
-=======
-    virtual void update_tokenized_history(const ov::genai::utils::GenerationFinishInfo generation_finish_info, bool is_beam_search, size_t last_answer_len) {
-        if (is_beam_search) {
-            m_kv_history_manager.trusted_history_length = m_tokenized_history.size();
-            m_kv_history_manager.num_tokens_to_remove_from_kv_cache = last_answer_len;
-        } else {
-            m_kv_history_manager.reset();
-        }
-
-        m_last_disappeared_token = generation_finish_info.probably_disappeared_token;
-
-        if (m_is_chat_conversation) {
-            if (generation_finish_info.streaming_finish_status == ov::genai::GenerationStatus::CANCEL) {
-                // let's remove last answer and prompt
-                m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_inputs_embeds_size + last_answer_len;
-                m_tokenized_history = m_prev_tokenized_history;
-                m_kv_history_manager.reset_kv_cache = m_tokenized_history.empty();
-            } else {
-                auto encoded_result = generation_finish_info.results.tokens[0];
-                std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_history));
-            }
-        }
-    }
-
->>>>>>> 19b756cd (update comments)
     void set_apply_chat_template_status(bool apply_chat_template) {
         m_apply_chat_template = apply_chat_template;
     }
@@ -114,19 +82,15 @@ class InputsEmbedder::IInputsEmbedder {
         m_history = {{{"role", "system"}, {"content", system_message}}};
     }
 
-    void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) {
+    virtual void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) {
+        m_kv_cache_state.num_tokens_to_trim = 0;
         if (generation_finish_status == ov::genai::GenerationStatus::CANCEL) {
             // If chat generation process was cancelled by user, let's rollback to previous state of history
             m_history.pop_back();
-            if (!m_history.empty()) {
-                constexpr bool add_generation_prompt = true;
-                m_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
-            }
         } else {
             // Tail of chat template is missing in KV cache.
             // Find the tail to concatenate it with the next input prompt.
             m_history.push_back({{"role", "assistant"}, {"content", decoded_results}});
-            m_kv_cache_state.num_tokens_to_trim = 0;
         }
     }
 
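
The net effect of this hunk: `num_tokens_to_trim` is reset unconditionally at the top of the now-virtual `update_chat_history`, and a cancelled turn simply drops the last history entry instead of re-applying the chat template. A self-contained sketch of that rollback contract, with simplified stand-ins for the real `ChatHistory` and `KVCacheState` types (all names outside the diff are illustrative):

```cpp
// Sketch of the rollback contract, not the actual OpenVINO GenAI types:
// CANCEL removes the rolled-back turn; success appends the assistant answer.
#include <cstddef>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

enum class GenerationStatus { FINISHED, CANCEL };

struct Embedder {
    std::vector<std::pair<std::string, std::string>> history;  // {role, content}
    size_t num_tokens_to_trim = 0;                             // stand-in for KVCacheState

    void update_chat_history(const std::string& decoded, GenerationStatus status) {
        num_tokens_to_trim = 0;  // reset unconditionally, as in the new code
        if (status == GenerationStatus::CANCEL)
            history.pop_back();  // drop the cancelled user turn
        else
            history.push_back({"assistant", decoded});
    }
};

int main() {
    Embedder e;
    e.history.push_back({"user", "hi"});
    e.update_chat_history("hello!", GenerationStatus::FINISHED);
    std::cout << e.history.size() << '\n';  // 2: user turn plus answer
    e.history.push_back({"user", "cancelled prompt"});
    e.update_chat_history("", GenerationStatus::CANCEL);
    std::cout << e.history.size() << '\n';  // 2: cancelled turn removed
}
```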
@@ -410,9 +374,9 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder {
         return inputs_embeds;
     }
 
-    virtual void update_tokenized_history(const ov::genai::utils::GenerationFinishInfo generation_finish_info, bool is_beam_search, size_t last_answer_len) {
-        IInputsEmbedder::update_tokenized_history(generation_finish_info, is_beam_search, last_answer_len);
-        if (generation_finish_info.streaming_finish_status == ov::genai::GenerationStatus::CANCEL) {
+    virtual void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) {
+        IInputsEmbedder::update_chat_history(decoded_results, generation_finish_status);
+        if (generation_finish_status == ov::genai::GenerationStatus::CANCEL) {
             m_image_id = m_prev_image_id;
         }
     }
@@ -1562,9 +1526,9 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder {
         m_tokens_per_images.clear();
     }
 
-    virtual void update_tokenized_history(const ov::genai::utils::GenerationFinishInfo generation_finish_info, bool is_beam_search, size_t last_answer_len) {
-        IInputsEmbedder::update_tokenized_history(generation_finish_info, is_beam_search, last_answer_len);
-        if (generation_finish_info.streaming_finish_status == ov::genai::GenerationStatus::CANCEL)
+    virtual void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) {
+        IInputsEmbedder::update_chat_history(decoded_results, generation_finish_status);
+        if (generation_finish_status == ov::genai::GenerationStatus::CANCEL)
             m_tokens_per_images = m_prev_tokens_per_images;
     }
 };
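
This mirrors the MiniCPM hunk above: each vision embedder overrides the now-virtual hook, delegates to the base class for the shared history/KV bookkeeping, then restores its own pre-generation snapshot on cancel. A condensed sketch of that override pattern (types and members here are illustrative, not the real classes):

```cpp
// Illustrative override pattern only: delegate to the base rollback, then
// restore model-specific state (e.g. an image counter) after a cancel.
#include <cstddef>
#include <iostream>
#include <string>

enum class GenerationStatus { FINISHED, CANCEL };

struct BaseEmbedder {
    virtual ~BaseEmbedder() = default;
    virtual void update_chat_history(const std::string&, GenerationStatus) {
        // shared chat-history / KV-cache bookkeeping would live here
    }
};

struct MiniCPMLikeEmbedder : BaseEmbedder {
    size_t m_prev_image_id = 3;  // snapshot taken before generation
    size_t m_image_id = 7;       // advanced during the cancelled turn
    void update_chat_history(const std::string& decoded, GenerationStatus status) override {
        BaseEmbedder::update_chat_history(decoded, status);
        if (status == GenerationStatus::CANCEL)
            m_image_id = m_prev_image_id;  // roll the counter back
    }
};

int main() {
    MiniCPMLikeEmbedder e;
    e.update_chat_history("", GenerationStatus::CANCEL);
    std::cout << e.m_image_id << '\n';  // 3: restored from the snapshot
}
```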
@@ -2014,10 +1978,6 @@ ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const st
     return m_impl->get_inputs_embeds(prompt, images, metrics);
 }
 
-ov::Tensor InputsEmbedder::get_input_embeddings(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) {
-    return m_impl->get_input_embeddings(prompt, images, metrics);
-}
-
 std::pair<ov::Tensor, std::optional<int64_t>> InputsEmbedder::get_position_ids(const size_t inputs_embeds_size, const size_t history_size) {
     return m_impl->get_position_ids(inputs_embeds_size, history_size);
 }
@@ -2034,10 +1994,6 @@ KVCacheState& InputsEmbedder::get_kv_cache_state() {
     return m_impl->get_kv_cache_state();
 }
 
-bool InputsEmbedder::should_reset_kv_cache() const {
-    return m_impl->should_reset_kv_cache();
-}
-
 Tokenizer InputsEmbedder::get_tokenizer() const {
     return m_impl->get_tokenizer();
 }