@@ -29,7 +29,6 @@ std::pair<ov::Tensor, std::optional<int64_t>> InputsEmbedder::IInputsEmbedder::g
 
 void InputsEmbedder::IInputsEmbedder::start_chat(const std::string& system_message) {
     m_is_chat_conversation = true;
-    m_kv_history_trim_manager.reset();
     if (!m_kv_cache_state.get_state().empty()) {
        m_history.clear();
        m_kv_cache_state.reset_state();
@@ -40,17 +39,20 @@ void InputsEmbedder::IInputsEmbedder::start_chat(const std::string& system_messa
     m_history = {{{"role", "system"}, {"content", system_message}}};
 }
 
-void InputsEmbedder::IInputsEmbedder::update_chat_history(const std::string& decoded_results) {
-    // Tail of chat template is missing in KV cache.
-    // Find the tail to concatenate it with the next input prompt.
-    m_history.push_back({{"role", "assistant"}, {"content", decoded_results}});
-    m_kv_history_trim_manager.reset();
+void InputsEmbedder::IInputsEmbedder::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) {
+    m_kv_cache_state.num_tokens_to_trim = 0;
+    if (generation_finish_status == ov::genai::GenerationStatus::CANCEL) {
+        // If the chat generation was cancelled by the user, roll the history back to its previous state
+        m_history.pop_back();
+    } else {
+        // Tail of chat template is missing in KV cache.
+        // Find the tail to concatenate it with the next input prompt.
+        m_history.push_back({{"role", "assistant"}, {"content", decoded_results}});
+    }
 }
 
 void InputsEmbedder::IInputsEmbedder::finish_chat() {
     m_is_chat_conversation = false;
-    m_kv_history_trim_manager.reset();
-
     m_history.clear();
     m_kv_cache_state.reset_state();
 }
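
This hunk changes the cancel path: instead of unconditionally appending the assistant turn and resetting a dedicated trim manager, the embedder now zeroes `num_tokens_to_trim` and, on `GenerationStatus::CANCEL`, pops the user message that is still pending in the history. A minimal standalone sketch of that rollback behavior; the enum and `Turn` struct below are stand-ins for illustration, not the real `ov::genai` types:

```cpp
#include <cassert>
#include <string>
#include <vector>

// Stand-in types: the real ov::genai::GenerationStatus has more states and
// the real chat history entries are string maps, not this struct.
enum class GenerationStatus { FINISHED, CANCEL };

struct Turn { std::string role, content; };

// Mirrors the branch added in this diff: CANCEL rolls back the pending user
// turn, any other status appends the decoded assistant reply.
void update_chat_history(std::vector<Turn>& history,
                         const std::string& decoded_results,
                         GenerationStatus status) {
    if (status == GenerationStatus::CANCEL) {
        history.pop_back();  // drop the user message that got no reply
    } else {
        history.push_back({"assistant", decoded_results});
    }
}

int main() {
    std::vector<Turn> history{{"system", "sys"}, {"user", "hi"}};

    // Normal completion: the assistant reply is appended.
    update_chat_history(history, "hello!", GenerationStatus::FINISHED);
    assert(history.size() == 3 && history.back().role == "assistant");

    // Cancelled generation: the pending user turn is rolled back instead.
    history = {{"system", "sys"}, {"user", "hi"}};
    update_chat_history(history, "", GenerationStatus::CANCEL);
    assert(history.size() == 1 && history.back().role == "system");
}
```
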
@@ -123,7 +125,7 @@ ov::Tensor InputsEmbedder::IInputsEmbedder::apply_chat_template_tokenize(const s
 ov::Tensor InputsEmbedder::IInputsEmbedder::update_history(const ov::Tensor& new_chat_tokens) {
     ov::Tensor encoded_inputs;
     if (m_is_chat_conversation) {
-        ov::genai::align_kv_cache_and_history(m_kv_history_trim_manager, new_chat_tokens, m_kv_cache_state);
+        ov::genai::align_kv_cache_and_history(new_chat_tokens, m_kv_cache_state);
        encoded_inputs = get_chat_encoded_input(new_chat_tokens, m_kv_cache_state).input_ids;
    } else {
        encoded_inputs = new_chat_tokens;
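
The dropped argument follows from the state change above: the trim bookkeeping that used to live in the separate `m_kv_history_trim_manager` is now carried by the cache state itself (the `num_tokens_to_trim` field cleared in `update_chat_history`), so `align_kv_cache_and_history` can work from the new tokens and the cache state alone. Below is a toy sketch of one plausible alignment strategy under that design; the semantics are assumed, since the real function body is not part of this diff:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Toy model (assumed semantics, not the real ov::genai implementation): the
// number of KV entries to drop can be derived from the tokens already in the
// cache versus the newly templated chat tokens, and stored on the state.
struct KVCacheState {
    std::vector<int64_t> tokens;    // tokens currently held in the KV cache
    size_t num_tokens_to_trim = 0;  // entries to drop before the next generate
};

void align_kv_cache_and_history(const std::vector<int64_t>& new_chat_tokens,
                                KVCacheState& state) {
    // Find the longest common prefix of the cache contents and the new prompt.
    size_t common = 0;
    while (common < state.tokens.size() && common < new_chat_tokens.size() &&
           state.tokens[common] == new_chat_tokens[common]) {
        ++common;
    }
    // Everything past the common prefix is stale (e.g. a cancelled turn).
    state.num_tokens_to_trim = state.tokens.size() - common;
    state.tokens.resize(common);
}

int main() {
    KVCacheState state{{1, 2, 3, 4, 5}, 0};
    align_kv_cache_and_history({1, 2, 3, 9, 9, 9}, state);
    assert(state.num_tokens_to_trim == 2);  // tokens 4 and 5 must be trimmed
    assert(state.tokens.size() == 3);
}
```
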
@@ -225,7 +227,7 @@ EmbeddingsModel InputsEmbedder::get_embedding_model() const {
     return m_impl->get_embedding_model();
 }
 
-KVCacheState& InputsEmbedder::get_kv_cache_state() {
+ov::genai::utils::KVCacheState& InputsEmbedder::get_kv_cache_state() {
    return m_impl->get_kv_cache_state();
 }
 