#include "openvino/opsets/opset13.hpp"

#include "utils.hpp"
+#include "debug_utils.hpp"
#include <regex>

namespace ov::genai {
@@ -82,25 +83,36 @@ class InputsEmbedder::IInputsEmbedder {
    m_stop_token_ids = stop_token_ids;
}

-void update_tokenized_history(const ov::genai::utils::GenerationFinishInfo generation_finish_info, bool is_beam_search, size_t last_answer_len, size_t inputs_embeds_size) {
+void update_tokenized_history(const ov::genai::utils::GenerationFinishInfo generation_finish_info, bool is_beam_search, size_t last_answer_len, size_t full_len) {
    if (is_beam_search) {
        m_kv_history_manager.trusted_history_length = m_tokenized_history.size();
        m_kv_history_manager.num_tokens_to_remove_from_kv_cache = last_answer_len;
+       std::cout << "1 1" << std::endl;
    } else {
        m_kv_history_manager.reset();
+       std::cout << "1 2" << std::endl;
    }

    m_last_disappeared_token = generation_finish_info.probably_disappeared_token;

    if (generation_finish_info.streaming_finish_status == ov::genai::GenerationStatus::CANCEL) {
        // let's remove last answer and prompt
-       m_kv_history_manager.num_tokens_to_remove_from_kv_cache = inputs_embeds_size + last_answer_len;
+       m_kv_history_manager.num_tokens_to_remove_from_kv_cache = full_len + last_answer_len;
        m_tokenized_history = std::move(m_prev_tokenized_history);
        m_kv_history_manager.reset_kv_cache = m_tokenized_history.empty();
+       // std::cout << "2 1" << std::endl;
    } else {
        auto encoded_result = generation_finish_info.results.tokens[0];
        std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_history));
+       std::cout << "2 2" << std::endl;
    }
+   std::cout << "\nCANCEL " << m_tokenized_history.size() << std::endl;
+   print_array(m_tokenized_history.data(), m_tokenized_history.size());
+
+   std::cout << "last_answer_len " << last_answer_len << std::endl;
+   std::cout << "full_len " << full_len << std::endl;
+   std::cout << "m_kv_history_manager.num_tokens_to_remove_from_kv_cache " << m_kv_history_manager.num_tokens_to_remove_from_kv_cache << std::endl;
+   std::cout << "m_kv_history_manager.trusted_history_length " << m_kv_history_manager.trusted_history_length << std::endl;
}

void set_apply_chat_template_status(bool apply_chat_template) {
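For context, a minimal standalone sketch of the CANCEL-path bookkeeping that update_tokenized_history performs above; the KVHistorySketch struct and rollback_on_cancel function are hypothetical stand-ins and are not part of this change:

#include <cstdint>
#include <utility>
#include <vector>

// Hypothetical stand-in for the pipeline's KV history manager.
struct KVHistorySketch {
    size_t trusted_history_length = 0;
    size_t num_tokens_to_remove_from_kv_cache = 0;
    bool reset_kv_cache = false;
};

// On CANCEL, the freshly appended prompt (full_len) and the partially generated
// answer (last_answer_len) are both scheduled for removal from the KV cache,
// and the tokenized history is rolled back to its pre-prompt snapshot.
void rollback_on_cancel(KVHistorySketch& kv,
                        std::vector<int64_t>& tokenized_history,
                        std::vector<int64_t>& prev_tokenized_history,
                        size_t full_len, size_t last_answer_len) {
    kv.num_tokens_to_remove_from_kv_cache = full_len + last_answer_len;
    tokenized_history = std::move(prev_tokenized_history);
    kv.reset_kv_cache = tokenized_history.empty();
}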
@@ -114,7 +126,6 @@ class InputsEmbedder::IInputsEmbedder {
        m_history.clear();
        m_templated_chat_history.clear();
        m_tokenized_history.clear();
-       m_prev_tokenized_history.clear();
    }
    if (system_message.empty()) {
        return;
@@ -143,7 +154,6 @@ class InputsEmbedder::IInputsEmbedder {
    m_history.clear();
    m_templated_chat_history.clear();
    m_tokenized_history.clear();
-   m_prev_tokenized_history.clear();
}

protected:
@@ -194,6 +204,9 @@ class InputsEmbedder::IInputsEmbedder {
        auto end_tokenizer_time = std::chrono::steady_clock::now();
        metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time));
        m_templated_chat_history = std::move(new_templated_chat_history);
+
+       print_tensor("prev_chat_tokens", prev_chat_tokens);
+       print_tensor("new_chat_tokens", new_chat_tokens);
        return {new_chat_tokens, prev_chat_tokens};
    } else {
        ov::Tensor encoded_input_ids;
@@ -235,24 +248,36 @@ class InputsEmbedder::IInputsEmbedder {
        }

        m_prev_tokenized_history.clear();
+
        if (m_tokenized_history.empty()) {
            encoded_input_ids = new_chat_tokens;
-
        } else if (trusted_history_length != SIZE_MAX || m_kv_history_manager.does_history_cache_need_to_update()) {
+           std::cout << "trusted_history_length " << trusted_history_length << std::endl;
+           std::cout << "m_kv_history_manager " << m_kv_history_manager.trusted_history_length << std::endl;
+           std::cout << "m_kv_history_manager " << m_kv_history_manager.num_tokens_to_remove_from_kv_cache << std::endl;
+
+           std::cout << "prev_chat_tokens " << prev_chat_tokens.get_size() << std::endl;
+           std::cout << "m_tokenized_history " << m_tokenized_history.size() << std::endl;
+
            // does_history_cache_need_to_update will be true here if beam search is activated
            // in beam search mode we want to remove all history about last model answer from kv cache and add the best answer directly
            // if we have difference in model answer and decoded answer it anyway will be less then entire history, so let's use data from m_kv_history_manager
            if (m_kv_history_manager.does_history_cache_need_to_update()) {
                trusted_history_length = m_kv_history_manager.trusted_history_length;
            } else {
-               auto num_tokens_to_remove_from_kv_cache = m_tokenized_history.size() - trusted_history_length;
+               int64_t num_tokens_to_remove_from_kv_cache = m_tokenized_history.size() - trusted_history_length;
+               std::cout << "num_tokens_to_remove_from_kv_cache " << num_tokens_to_remove_from_kv_cache << std::endl;
                // last generated token is present in tokenized_history, but not included to attention mask, let's keep it in history
-               num_tokens_to_remove_from_kv_cache -= 1;
+               if (num_tokens_to_remove_from_kv_cache > 0)
+                   num_tokens_to_remove_from_kv_cache -= 1;
+
+               std::cout << "num_tokens_to_remove_from_kv_cache " << num_tokens_to_remove_from_kv_cache << std::endl;

                // if streaming was used and cancelled on prev step, m_kv_history_manager.num_tokens_to_remove_from_kv_cache could be already set
                // and it would be bigger as it includes answer + prompt
                m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_kv_history_manager.num_tokens_to_remove_from_kv_cache > num_tokens_to_remove_from_kv_cache ?
                    m_kv_history_manager.num_tokens_to_remove_from_kv_cache : num_tokens_to_remove_from_kv_cache;
+               std::cout << "m_kv_history_manager.num_tokens_to_remove_from_kv_cache " << m_kv_history_manager.num_tokens_to_remove_from_kv_cache << std::endl;
            }

            std::copy_n(m_tokenized_history.data(), trusted_history_length, std::back_inserter(m_prev_tokenized_history));
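As a worked summary of the removal-count arithmetic in the hunk above, here is a standalone sketch (the helper below is hypothetical, not part of this change): the count is the tokenized history length minus the trusted prefix, decremented by one to keep the last generated token (already in the history but not yet in the attention mask), guarded against going negative now that the variable is signed, and finally merged with any removal already scheduled by a cancelled streaming step by taking the larger value.

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Hypothetical helper mirroring the arithmetic above; parameter names follow the diff.
size_t tokens_to_remove_from_kv_cache(size_t tokenized_history_size,
                                      size_t trusted_history_length,
                                      size_t already_scheduled) {
    // Signed difference, so a trusted length larger than the history cannot wrap around.
    int64_t to_remove = static_cast<int64_t>(tokenized_history_size) -
                        static_cast<int64_t>(trusted_history_length);
    // Keep the last generated token: it is in the history but not in the attention mask yet.
    if (to_remove > 0)
        to_remove -= 1;
    // A streaming CANCEL on the previous step may already have scheduled a larger
    // removal (prompt + answer); keep whichever is bigger.
    size_t clamped = static_cast<size_t>(std::max<int64_t>(to_remove, 0));
    return std::max(already_scheduled, clamped);
}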
@@ -277,6 +302,8 @@ class InputsEmbedder::IInputsEmbedder {
        }
        m_tokenized_history.clear();
        std::copy_n(new_chat_tokens.data<int64_t>(), new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history));
+
+       print_tensor("encoded_input_ids", encoded_input_ids);
        return encoded_input_ids;
    } else {
        m_tokenized_history.clear();