#include "openvino/opsets/opset13.hpp"

#include "utils.hpp"
+#include "debug_utils.hpp"
#include <regex>

namespace ov::genai {
@@ -82,25 +83,36 @@ class InputsEmbedder::IInputsEmbedder {
    m_stop_token_ids = stop_token_ids;
}

-void update_tokenized_history(const ov::genai::utils::GenerationFinishInfo generation_finish_info, bool is_beam_search, size_t last_answer_len, size_t inputs_embeds_size) {
+void update_tokenized_history(const ov::genai::utils::GenerationFinishInfo generation_finish_info, bool is_beam_search, size_t last_answer_len, size_t full_len) {
    if (is_beam_search) {
        m_kv_history_manager.trusted_history_length = m_tokenized_history.size();
        m_kv_history_manager.num_tokens_to_remove_from_kv_cache = last_answer_len;
+       std::cout << "1 1" << std::endl;
    } else {
        m_kv_history_manager.reset();
+       std::cout << "1 2" << std::endl;
    }

    m_last_disappeared_token = generation_finish_info.probably_disappeared_token;

    if (generation_finish_info.streaming_finish_status == ov::genai::GenerationStatus::CANCEL) {
        // let's remove last answer and prompt
-       m_kv_history_manager.num_tokens_to_remove_from_kv_cache = inputs_embeds_size + last_answer_len;
+       m_kv_history_manager.num_tokens_to_remove_from_kv_cache = full_len + last_answer_len;
        m_tokenized_history = std::move(m_prev_tokenized_history);
        m_kv_history_manager.reset_kv_cache = m_tokenized_history.empty();
+       // std::cout << "2 1" << std::endl;
    } else {
        auto encoded_result = generation_finish_info.results.tokens[0];
        std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_history));
+       std::cout << "2 2" << std::endl;
    }
+   std::cout << "\nCANCEL " << m_tokenized_history.size() << std::endl;
+   print_array(m_tokenized_history.data(), m_tokenized_history.size());
+
+   std::cout << "last_answer_len " << last_answer_len << std::endl;
+   std::cout << "full_len " << full_len << std::endl;
+   std::cout << "m_kv_history_manager.num_tokens_to_remove_from_kv_cache " << m_kv_history_manager.num_tokens_to_remove_from_kv_cache << std::endl;
+   std::cout << "m_kv_history_manager.trusted_history_length " << m_kv_history_manager.trusted_history_length << std::endl;
}

void set_apply_chat_template_status(bool apply_chat_template) {
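For context, a minimal standalone sketch of the CANCEL-path bookkeeping that update_tokenized_history performs above; the KVHistorySketch struct and rollback_on_cancel function are hypothetical stand-ins and are not part of this change:

#include <cstdint>
#include <utility>
#include <vector>

// Hypothetical stand-in for the pipeline's KV history manager.
struct KVHistorySketch {
    size_t trusted_history_length = 0;
    size_t num_tokens_to_remove_from_kv_cache = 0;
    bool reset_kv_cache = false;
};

// On CANCEL, the freshly appended prompt (full_len) and the partially generated
// answer (last_answer_len) are both scheduled for removal from the KV cache,
// and the tokenized history is rolled back to its pre-prompt snapshot.
void rollback_on_cancel(KVHistorySketch& kv,
                        std::vector<int64_t>& tokenized_history,
                        std::vector<int64_t>& prev_tokenized_history,
                        size_t full_len, size_t last_answer_len) {
    kv.num_tokens_to_remove_from_kv_cache = full_len + last_answer_len;
    tokenized_history = std::move(prev_tokenized_history);
    kv.reset_kv_cache = tokenized_history.empty();
}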
@@ -114,7 +126,6 @@ class InputsEmbedder::IInputsEmbedder {
        m_history.clear();
        m_templated_chat_history.clear();
        m_tokenized_history.clear();
-       m_prev_tokenized_history.clear();
    }
    if (system_message.empty()) {
        return;
@@ -143,7 +154,6 @@ class InputsEmbedder::IInputsEmbedder {
    m_history.clear();
    m_templated_chat_history.clear();
    m_tokenized_history.clear();
-   m_prev_tokenized_history.clear();
}

protected:
@@ -194,6 +204,9 @@ class InputsEmbedder::IInputsEmbedder {
        auto end_tokenizer_time = std::chrono::steady_clock::now();
        metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time));
        m_templated_chat_history = std::move(new_templated_chat_history);
+
+       print_tensor("prev_chat_tokens", prev_chat_tokens);
+       print_tensor("new_chat_tokens", new_chat_tokens);
        return {new_chat_tokens, prev_chat_tokens};
    } else {
        ov::Tensor encoded_input_ids;
@@ -235,24 +248,36 @@ class InputsEmbedder::IInputsEmbedder {
        }

        m_prev_tokenized_history.clear();
+
        if (m_tokenized_history.empty()) {
            encoded_input_ids = new_chat_tokens;
-
        } else if (trusted_history_length != SIZE_MAX || m_kv_history_manager.does_history_cache_need_to_update()) {
+           std::cout << "trusted_history_length " << trusted_history_length << std::endl;
+           std::cout << "m_kv_history_manager " << m_kv_history_manager.trusted_history_length << std::endl;
+           std::cout << "m_kv_history_manager " << m_kv_history_manager.num_tokens_to_remove_from_kv_cache << std::endl;
+
+           std::cout << "prev_chat_tokens " << prev_chat_tokens.get_size() << std::endl;
+           std::cout << "m_tokenized_history " << m_tokenized_history.size() << std::endl;
+
            // does_history_cache_need_to_update will be true here if beam search is activated
            // in beam search mode we want to remove all history about last model answer from kv cache and add the best answer directly
            // if we have difference in model answer and decoded answer it anyway will be less then entire history, so let's use data from m_kv_history_manager
            if (m_kv_history_manager.does_history_cache_need_to_update()) {
                trusted_history_length = m_kv_history_manager.trusted_history_length;
            } else {
-               auto num_tokens_to_remove_from_kv_cache = m_tokenized_history.size() - trusted_history_length;
+               int64_t num_tokens_to_remove_from_kv_cache = m_tokenized_history.size() - trusted_history_length;
+               std::cout << "num_tokens_to_remove_from_kv_cache " << num_tokens_to_remove_from_kv_cache << std::endl;
                // last generated token is present in tokenized_history, but not included to attention mask, let's keep it in history
-               num_tokens_to_remove_from_kv_cache -= 1;
+               if (num_tokens_to_remove_from_kv_cache > 0)
+                   num_tokens_to_remove_from_kv_cache -= 1;
+
+               std::cout << "num_tokens_to_remove_from_kv_cache " << num_tokens_to_remove_from_kv_cache << std::endl;

                // if streaming was used and cancelled on prev step, m_kv_history_manager.num_tokens_to_remove_from_kv_cache could be already set
                // and it would be bigger as it includes answer + prompt
                m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_kv_history_manager.num_tokens_to_remove_from_kv_cache > num_tokens_to_remove_from_kv_cache ?
                    m_kv_history_manager.num_tokens_to_remove_from_kv_cache : num_tokens_to_remove_from_kv_cache;
+               std::cout << "m_kv_history_manager.num_tokens_to_remove_from_kv_cache " << m_kv_history_manager.num_tokens_to_remove_from_kv_cache << std::endl;
            }

            std::copy_n(m_tokenized_history.data(), trusted_history_length, std::back_inserter(m_prev_tokenized_history));
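As a worked summary of the removal-count arithmetic in the hunk above, here is a standalone sketch (the helper below is hypothetical, not part of this change): the count is the tokenized history length minus the trusted prefix, decremented by one to keep the last generated token (already in the history but not yet in the attention mask), guarded against going negative now that the variable is signed, and finally merged with any removal already scheduled by a cancelled streaming step by taking the larger value.

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Hypothetical helper mirroring the arithmetic above; parameter names follow the diff.
size_t tokens_to_remove_from_kv_cache(size_t tokenized_history_size,
                                      size_t trusted_history_length,
                                      size_t already_scheduled) {
    // Signed difference, so a trusted length larger than the history cannot wrap around.
    int64_t to_remove = static_cast<int64_t>(tokenized_history_size) -
                        static_cast<int64_t>(trusted_history_length);
    // Keep the last generated token: it is in the history but not in the attention mask yet.
    if (to_remove > 0)
        to_remove -= 1;
    // A streaming CANCEL on the previous step may already have scheduled a larger
    // removal (prompt + answer); keep whichever is bigger.
    size_t clamped = static_cast<size_t>(std::max<int64_t>(to_remove, 0));
    return std::max(already_scheduled, clamped);
}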
@@ -277,6 +302,8 @@ class InputsEmbedder::IInputsEmbedder {
        }
        m_tokenized_history.clear();
        std::copy_n(new_chat_tokens.data<int64_t>(), new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history));
+
+       print_tensor("encoded_input_ids", encoded_input_ids);
        return encoded_input_ids;
    } else {
        m_tokenized_history.clear();