
Commit f9adf6b

update according to m_kv_cache_state

1 parent c95ae4f · commit f9adf6b

7 files changed: +24 -101 lines

src/cpp/src/llm_pipeline_stateful.cpp

+7 -8

@@ -45,7 +45,7 @@ StatefulLLMPipeline::StatefulLLMPipeline(
     m_use_full_chat_history = true;

     if (!m_use_full_chat_history)
-        m_kv_history_trim_manager.kv_cache_seq_length_axis = ov::genai::utils::get_kv_axes_pos(model).seq_len;
+        m_kv_cache_state.seq_length_axis = ov::genai::utils::get_kv_axes_pos(model).seq_len;

     auto filtered_properties = extract_adapters_from_properties(properties, &m_generation_config.adapters);
     if (m_generation_config.adapters) {

@@ -119,7 +119,7 @@ DecodedResults StatefulLLMPipeline::generate(
         if (m_use_full_chat_history) {
             encoded_input = new_chat_tokens;
         } else {
-            ov::genai::align_kv_cache_and_history(m_kv_history_trim_manager, new_chat_tokens.input_ids, m_kv_cache_state);
+            ov::genai::align_kv_cache_and_history(new_chat_tokens.input_ids, m_kv_cache_state);
             encoded_input = get_chat_encoded_input(new_chat_tokens.input_ids, m_kv_cache_state);
         }
         // TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied

@@ -208,7 +208,7 @@ EncodedResults StatefulLLMPipeline::generate(
     // Tail of previous output in chat mode is missing in KV cache.
     if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS) {
         ov::Tensor new_chat_tokens = ov::Tensor{ov::element::i64, {1, m_tokenized_chat_history.size()}, m_tokenized_chat_history.data()};
-        ov::genai::align_kv_cache_and_history(m_kv_history_trim_manager, new_chat_tokens, m_kv_cache_state);
+        ov::genai::align_kv_cache_and_history(new_chat_tokens, m_kv_cache_state);

         auto encoded_input = get_chat_encoded_input(new_chat_tokens, m_kv_cache_state);
         input_ids = encoded_input.input_ids;

@@ -245,8 +245,8 @@ EncodedResults StatefulLLMPipeline::generate(
         if (m_kv_cache_state.get_state().empty() || m_use_full_chat_history)
             reset_kv_state();
         else
-            ov::genai::utils::trim_kv_cache(m_model_runner, m_kv_history_trim_manager.num_tokens_to_trim,
-                                            m_kv_history_trim_manager.kv_cache_seq_length_axis, m_adapter_controller);
+            ov::genai::utils::trim_kv_cache(m_model_runner, m_kv_cache_state.num_tokens_to_trim,
+                                            m_kv_cache_state.seq_length_axis, m_adapter_controller);
     }

     size_t kv_cache_len = 0;

@@ -319,7 +319,7 @@ EncodedResults StatefulLLMPipeline::generate(
     m_chat_generation_finish_status = finish_info.streaming_finish_status;

     if (is_chat_conversation) {
-        m_kv_history_trim_manager.reset();
+        m_kv_cache_state.num_tokens_to_trim = 0;

         if (m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS) {
             if (m_chat_generation_finish_status == ov::genai::GenerationStatus::CANCEL) {

@@ -328,7 +328,7 @@ EncodedResults StatefulLLMPipeline::generate(
                 std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history));
             }
         } else if (config.is_beam_search()) {
-            m_kv_history_trim_manager.num_tokens_to_trim = m_model_runner.get_tensor("attention_mask").get_shape()[1] - prev_attn_mask_size;
+            m_kv_cache_state.num_tokens_to_trim = m_model_runner.get_tensor("attention_mask").get_shape()[1] - prev_attn_mask_size;
         }
     } else {
         m_kv_cache_state.reset_state();

@@ -369,7 +369,6 @@ void StatefulLLMPipeline::reset_kv_state() {

 void StatefulLLMPipeline::finish_chat() {
     is_chat_conversation = false;
-    m_kv_history_trim_manager.reset();
     m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
     bool have_state = 0 != m_model_runner.get_tensor("attention_mask").get_size();
     if (!m_kv_cache_state.get_state().empty() || have_state) {
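Note: the recurring pattern in these hunks is to reset the whole model state when nothing in the KV cache is reusable (or when the full chat history is replayed every turn), and otherwise to trim only the stale tail recorded in m_kv_cache_state.num_tokens_to_trim. Below is a minimal standalone sketch of that decision using simplified, hypothetical types rather than the pipeline's own classes.

// Sketch only: hypothetical simplified types, not ov::genai classes.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

struct KVCacheStateSketch {
    std::vector<int64_t> tokens;     // tokens currently reflected in the KV cache
    size_t num_tokens_to_trim = 0;   // stale tail to drop before the next step
};

void prepare_kv_cache(const KVCacheStateSketch& state, bool use_full_chat_history) {
    if (state.tokens.empty() || use_full_chat_history)
        std::cout << "reset_kv_state()\n";                                   // start from scratch
    else
        std::cout << "trim_kv_cache(" << state.num_tokens_to_trim << ")\n";  // drop only the stale tail
}

int main() {
    KVCacheStateSketch state;
    prepare_kv_cache(state, /*use_full_chat_history=*/false);   // empty cache -> reset_kv_state()

    state.tokens = {1, 2, 3, 4};
    state.num_tokens_to_trim = 2;                                // e.g. recorded after beam search
    prepare_kv_cache(state, /*use_full_chat_history=*/false);   // -> trim_kv_cache(2)
}

In the pipeline itself, num_tokens_to_trim is cleared only after generation (the @@ -319 hunk), not at trim time.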

src/cpp/src/llm_pipeline_stateful.hpp

+1 -5

@@ -18,15 +18,11 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
     ChatHistory m_history;
     std::vector<int64_t> m_tokenized_chat_history;
     ov::genai::utils::GenerationChatInputsType m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
-    // If sequence contains some symbols, which could be ambiguously encoded by tokenizer, we need to trim kv cache
-    // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history
-    // so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to keep in history
-    ov::genai::KVCacheTrimManager m_kv_history_trim_manager = {0, 2};
     // Finish reason of last generation for chat scenario
     ov::genai::GenerationStatus m_chat_generation_finish_status = ov::genai::GenerationStatus::RUNNING;
     // if True, full history will be used as prompt on each chat generation
     bool m_use_full_chat_history = false;
-    // reflection of tokens contained in the kv cache
+    // reflection of the tokens contained in the kv cache, plus the number of tokens to trim from the kv cache on the next chat step
     KVCacheState m_kv_cache_state;

     void reset_kv_state();

src/cpp/src/lm_encoding.cpp

+2 -2

@@ -320,7 +320,7 @@ TokenizedInputs get_chat_encoded_input(const ov::Tensor& new_chat_tokens, KVCach
 }


-void align_kv_cache_and_history(ov::genai::KVCacheTrimManager& kv_history_manager, const ov::Tensor& new_chat_tokens, KVCacheState& kv_cache_state) {
+void align_kv_cache_and_history(const ov::Tensor& new_chat_tokens, KVCacheState& kv_cache_state) {
     // KV cache in model already contains prompts and answers from previous iterations.
     // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns
     // token_ids = {<bos token>, ...<valuable tokens>}. So if tokenizer applies only to the new prompt,

@@ -338,7 +338,7 @@ void align_kv_cache_and_history(ov::genai::KVCacheTrimManage
     size_t first_diverse_tokens_idx = ov::genai::utils::get_first_history_difference(new_chat_tokens, state);
     // in the case of beam_search the longest answer is in the kv cache, but the best one is needed
     // so generated tokens were not added to KVCacheState and num_tokens_to_trim was set to the size of the generated sequence
-    kv_history_manager.num_tokens_to_trim = kv_history_manager.num_tokens_to_trim > 0 ? kv_history_manager.num_tokens_to_trim : (state.size() - first_diverse_tokens_idx);
+    kv_cache_state.num_tokens_to_trim = kv_cache_state.num_tokens_to_trim > 0 ? kv_cache_state.num_tokens_to_trim : (state.size() - first_diverse_tokens_idx);
     state.resize(first_diverse_tokens_idx);
 }
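Note: the comments in this hunk explain why the cached token history can diverge from the re-tokenized chat, and why beam search may pre-set num_tokens_to_trim. The standalone sketch below mirrors that alignment logic on plain vectors with hypothetical names; the real function operates on ov::Tensor and the library's KVCacheState.

// Standalone illustration (not the library code) of the alignment logic: find where the
// cached history and the freshly tokenized chat diverge, record how much KV cache tail
// to trim, then shrink the cached history to the common prefix.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

struct KVCacheStateSketch {
    std::vector<int64_t> tokens;     // reflection of tokens already in the KV cache
    size_t num_tokens_to_trim = 0;   // tail to cut from the KV cache before the next step
};

void align(KVCacheStateSketch& state, const std::vector<int64_t>& new_chat_tokens) {
    // First position where the cached history and the re-tokenized history differ.
    size_t first_diverse = 0;
    while (first_diverse < state.tokens.size() &&
           first_diverse < new_chat_tokens.size() &&
           state.tokens[first_diverse] == new_chat_tokens[first_diverse])
        ++first_diverse;

    // Beam search may have pre-set num_tokens_to_trim (the longest beam sits in the cache
    // while only the best answer is kept in history); keep that value if it is non-zero.
    if (state.num_tokens_to_trim == 0)
        state.num_tokens_to_trim = state.tokens.size() - first_diverse;
    state.tokens.resize(first_diverse);
}

int main() {
    KVCacheStateSketch state;
    state.tokens = {1, 10, 11, 12, 99, 98};                        // prompt + previous answer
    std::vector<int64_t> new_chat = {1, 10, 11, 12, 99, 97, 50};   // re-tokenized history diverges

    align(state, new_chat);
    std::cout << "trim " << state.num_tokens_to_trim << " token(s)\n";  // prints: trim 1 token(s)
}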

src/cpp/src/lm_encoding.hpp

+5 -12

@@ -11,6 +11,9 @@ namespace genai {
 class KVCacheState {
     std::vector<int64_t> state;
 public:
+    size_t num_tokens_to_trim = 0;
+    size_t seq_length_axis = 2;
+
     std::vector<int64_t>& get_state() {
         return state;
     }

@@ -20,18 +23,8 @@ class KVCacheState {
     }

     void reset_state() {
-        return state.clear();
-    }
-};
-
-
-struct KVCacheTrimManager
-{
-    size_t num_tokens_to_trim = 0;
-    size_t kv_cache_seq_length_axis = 2;
-
-    void reset() {
         num_tokens_to_trim = 0;
+        state.clear();
     }
 };


@@ -41,7 +34,7 @@ ov::genai::utils::GenerationFinishInfo get_lm_encoded_results(ov::InferRequest&
     std::optional<ov::Tensor> position_ids, KVCacheState& m_kv_cache_state, std::optional<EmbeddingsModel> m_embedding, std::optional<int64_t> rope_delta = std::nullopt);


-void align_kv_cache_and_history(ov::genai::KVCacheTrimManager& kv_history_manager, const ov::Tensor& new_chat_tokens, KVCacheState& kv_cache_state);
+void align_kv_cache_and_history(const ov::Tensor& new_chat_tokens, KVCacheState& kv_cache_state);


 TokenizedInputs get_chat_encoded_input(const ov::Tensor& new_chat_tokens, KVCacheState& kv_cache_state);
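Note: piecing these hunks together, KVCacheState now owns the trim bookkeeping that previously lived in KVCacheTrimManager. Reconstructed from the diff above (members not touched by this commit are elided), the class reads approximately as follows; this is not a verbatim copy of the header.

// Approximate shape of KVCacheState after this commit, reconstructed from the hunks above.
#include <cstddef>
#include <cstdint>
#include <vector>

namespace ov {
namespace genai {

class KVCacheState {
    std::vector<int64_t> state;      // reflection of the tokens contained in the KV cache
public:
    size_t num_tokens_to_trim = 0;   // tokens to cut from the KV cache on the next chat step
    size_t seq_length_axis = 2;      // KV tensor axis that holds the sequence length

    std::vector<int64_t>& get_state() {
        return state;
    }

    // ... members unchanged by this commit are omitted ...

    void reset_state() {
        num_tokens_to_trim = 0;
        state.clear();
    }
};

}  // namespace genai
}  // namespace ov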

src/cpp/src/visual_language/inputs_embedder.cpp

+8 -52

@@ -44,12 +44,6 @@ class InputsEmbedder::IInputsEmbedder {
 public:
     virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) = 0;

-    ov::Tensor get_input_embeddings(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) {
-        ov::Tensor inputs_embeds = get_inputs_embeds(prompt, images, metrics);
-        m_inputs_embeds_size = inputs_embeds.get_shape().at(1);
-        return inputs_embeds;
-    }
-
     virtual std::pair<ov::Tensor, std::optional<int64_t>> get_position_ids(const size_t inputs_embeds_size, const size_t history_size) {
         ov::Tensor position_ids = ov::Tensor{ov::element::i64, { 1, inputs_embeds_size }};
         std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), history_size);

@@ -72,32 +66,6 @@ class InputsEmbedder::IInputsEmbedder {
         m_stop_token_ids = stop_token_ids;
     }

-<<<<<<< HEAD
-=======
-    virtual void update_tokenized_history(const ov::genai::utils::GenerationFinishInfo generation_finish_info, bool is_beam_search, size_t last_answer_len) {
-        if (is_beam_search) {
-            m_kv_history_manager.trusted_history_length = m_tokenized_history.size();
-            m_kv_history_manager.num_tokens_to_remove_from_kv_cache = last_answer_len;
-        } else {
-            m_kv_history_manager.reset();
-        }
-
-        m_last_disappeared_token = generation_finish_info.probably_disappeared_token;
-
-        if (m_is_chat_conversation) {
-            if (generation_finish_info.streaming_finish_status == ov::genai::GenerationStatus::CANCEL) {
-                // let's remove last answer and prompt
-                m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_inputs_embeds_size + last_answer_len;
-                m_tokenized_history = m_prev_tokenized_history;
-                m_kv_history_manager.reset_kv_cache = m_tokenized_history.empty();
-            } else {
-                auto encoded_result = generation_finish_info.results.tokens[0];
-                std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_history));
-            }
-        }
-    }
-
->>>>>>> 19b756cd (update comments)
     void set_apply_chat_template_status(bool apply_chat_template) {
         m_apply_chat_template = apply_chat_template;
     }

@@ -114,19 +82,15 @@ class InputsEmbedder::IInputsEmbedder {
         m_history = {{{"role", "system"}, {"content", system_message}}};
     }

-    void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) {
+    virtual void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) {
+        m_kv_cache_state.num_tokens_to_trim = 0;
         if (generation_finish_status == ov::genai::GenerationStatus::CANCEL) {
             // If chat generation process was cancelled by user, let's rollback to previous state of history
             m_history.pop_back();
-            if (!m_history.empty()) {
-                constexpr bool add_generation_prompt = true;
-                m_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
-            }
         } else {
             // Tail of chat template is missing in KV cache.
             // Find the tail to concatenate it with the next input prompt.
             m_history.push_back({{"role", "assistant"}, {"content", decoded_results}});
-            m_kv_cache_state.num_tokens_to_trim = 0;
         }
     }


@@ -410,9 +374,9 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder {
         return inputs_embeds;
     }

-    virtual void update_tokenized_history(const ov::genai::utils::GenerationFinishInfo generation_finish_info, bool is_beam_search, size_t last_answer_len) {
-        IInputsEmbedder::update_tokenized_history(generation_finish_info, is_beam_search, last_answer_len);
-        if (generation_finish_info.streaming_finish_status == ov::genai::GenerationStatus::CANCEL) {
+    virtual void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) {
+        IInputsEmbedder::update_chat_history(decoded_results, generation_finish_status);
+        if (generation_finish_status == ov::genai::GenerationStatus::CANCEL) {
             m_image_id = m_prev_image_id;
         }
     }

@@ -1562,9 +1526,9 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder {
         m_tokens_per_images.clear();
     }

-    virtual void update_tokenized_history(const ov::genai::utils::GenerationFinishInfo generation_finish_info, bool is_beam_search, size_t last_answer_len) {
-        IInputsEmbedder::update_tokenized_history(generation_finish_info, is_beam_search, last_answer_len);
-        if (generation_finish_info.streaming_finish_status == ov::genai::GenerationStatus::CANCEL)
+    virtual void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) {
+        IInputsEmbedder::update_chat_history(decoded_results, generation_finish_status);
+        if (generation_finish_status == ov::genai::GenerationStatus::CANCEL)
             m_tokens_per_images = m_prev_tokens_per_images;
     }
 };

@@ -2014,10 +1978,6 @@ ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const st
     return m_impl->get_inputs_embeds(prompt, images, metrics);
 }

-ov::Tensor InputsEmbedder::get_input_embeddings(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) {
-    return m_impl->get_input_embeddings(prompt, images, metrics);
-}
-
 std::pair<ov::Tensor, std::optional<int64_t>> InputsEmbedder::get_position_ids(const size_t inputs_embeds_size, const size_t history_size) {
     return m_impl->get_position_ids(inputs_embeds_size, history_size);
 }

@@ -2034,10 +1994,6 @@ KVCacheState& InputsEmbedder::get_kv_cache_state() {
     return m_impl->get_kv_cache_state();
 }

-bool InputsEmbedder::should_reset_kv_cache() const {
-    return m_impl->should_reset_kv_cache();
-}
-
 Tokenizer InputsEmbedder::get_tokenizer() const {
     return m_impl->get_tokenizer();
 }
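Note: update_chat_history() becomes virtual here so that model-specific embedders (MiniCPM, Phi3V) can roll back their own extra state on cancellation while still delegating the shared history and trim bookkeeping to the base class. Below is a self-contained sketch of that override pattern with simplified, hypothetical names; it is not the ov::genai code.

// Sketch only: hypothetical simplified classes illustrating the virtual-override pattern.
#include <cstddef>
#include <iostream>
#include <string>

enum class GenerationStatus { RUNNING, FINISHED, CANCEL };

struct BaseEmbedder {
    size_t num_tokens_to_trim = 0;
    virtual ~BaseEmbedder() = default;

    virtual void update_chat_history(const std::string& answer, GenerationStatus status) {
        num_tokens_to_trim = 0;                  // the pending trim request was consumed this step
        if (status == GenerationStatus::CANCEL)
            std::cout << "rollback history\n";   // drop the cancelled prompt from the chat history
        else
            std::cout << "append answer: " << answer << "\n";
    }
};

struct MiniCpmLikeEmbedder : BaseEmbedder {
    int image_id = 3;
    int prev_image_id = 2;

    void update_chat_history(const std::string& answer, GenerationStatus status) override {
        BaseEmbedder::update_chat_history(answer, status);  // shared bookkeeping first
        if (status == GenerationStatus::CANCEL)
            image_id = prev_image_id;            // then undo the model-specific counter
    }
};

int main() {
    MiniCpmLikeEmbedder embedder;
    embedder.update_chat_history("", GenerationStatus::CANCEL);
    std::cout << "image_id restored to " << embedder.image_id << "\n";  // prints 2
}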

src/cpp/src/visual_language/inputs_embedder.hpp

-6

@@ -36,9 +36,6 @@ class InputsEmbedder {
     // compute input embedding for prompt and multiple images
     ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics);

-    // computes input embedding for prompt and multiple images and saves input_embeddings size
-    ov::Tensor get_input_embeddings(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics);
-
     // compute position ids for language model input
     std::pair<ov::Tensor, std::optional<int64_t>> get_position_ids(const size_t inputs_embeds_size, const size_t history_size);


@@ -53,9 +50,6 @@ class InputsEmbedder {
     // get reflection of tokens contained in the kv cache
     KVCacheState& get_kv_cache_state();

-    // returns true, if we need to remove full kv cache, in that case it's needed to reset it instead of manually updating
-    bool should_reset_kv_cache() const;
-
     // starts chat and adds optional system_message to chat history
     void start_chat(const std::string& system_message);


src/cpp/src/visual_language/pipeline.cpp

+1 -16

@@ -45,8 +45,6 @@ class ov::genai::VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
     std::shared_ptr<InputsEmbedder> m_inputs_embedder;
     // Axis num in kv cache from m_language model, which contains information about history len
     size_t m_kv_cache_seq_length_axis = 2;
-    // Load pipeline time
-    float m_load_time_ms = 0;
     // Component for applying sampling to lm outputs
     Sampler m_sampler;
 public:

@@ -163,23 +161,15 @@ class ov::genai::VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
     m_inputs_embedder->set_apply_chat_template_status(generation_config.apply_chat_template);

     auto start_get_inputs_embeds = std::chrono::steady_clock::now();
-    ov::Tensor inputs_embeds = m_inputs_embedder->get_input_embeddings(prompt, rgbs, perf_metrics);
+    ov::Tensor inputs_embeds = m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics);
     auto end_get_inputs_embeds = std::chrono::steady_clock::now();

-<<<<<<< HEAD
     KVCacheState& kv_cache_state = m_inputs_embedder->get_kv_cache_state();
     if (m_is_chat_conversation)
         if (kv_cache_state.get_state().empty())
             m_language.reset_state();
         else
             ov::genai::utils::trim_kv_cache(m_language, kv_cache_state.num_tokens_to_trim, kv_cache_state.seq_length_axis, std::nullopt);
-=======
-    auto to_remove_from_hist = m_inputs_embedder->get_num_tokens_to_remove_from_hist();
-    if (m_inputs_embedder->should_reset_kv_cache())
-        m_language.reset_state();
-    else
-        ov::genai::utils::trim_kv_cache(m_language, to_remove_from_hist, m_kv_cache_seq_length_axis, std::nullopt);
->>>>>>> 19b756cd (update comments)

     std::vector<SequenceGroup::Ptr> requests;
     size_t request_id = 0;

@@ -228,11 +218,6 @@ class ov::genai::VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
     }
     auto decode_end_time = std::chrono::steady_clock::now();

-<<<<<<< HEAD
-=======
-    m_inputs_embedder->update_tokenized_history(finish_info, generation_config.is_beam_search(), m_language.get_tensor("attention_mask").get_shape()[1] - (history_size + inputs_embeds_size));
-
->>>>>>> 19b756cd (update comments)
     std::string decoded_results = decoded.texts.at(0);
     if (m_is_chat_conversation)
         m_inputs_embedder->update_chat_history(decoded_results, finish_info.streaming_finish_status);
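Note: both pipelines now pass kv_cache_state.num_tokens_to_trim and kv_cache_state.seq_length_axis to ov::genai::utils::trim_kv_cache(). Conceptually, trimming drops the last N positions along the sequence-length axis of every KV tensor. The standalone sketch below shows that idea on a flat buffer with an assumed [batch, heads, seq_len, head_dim] layout (axis 2 = seq_len); the real implementation works on the model's ov::Tensor states.

// Conceptual sketch of trimming a KV cache along the sequence-length axis (axis 2 of an
// assumed [batch, heads, seq_len, head_dim] layout). Not the library's trim_kv_cache().
#include <cstddef>
#include <iostream>
#include <vector>

std::vector<float> trim_seq_axis(const std::vector<float>& kv,
                                 size_t batch, size_t heads, size_t seq_len, size_t head_dim,
                                 size_t num_tokens_to_trim) {
    const size_t kept = seq_len - num_tokens_to_trim;
    std::vector<float> out;
    out.reserve(batch * heads * kept * head_dim);
    for (size_t b = 0; b < batch; ++b)
        for (size_t h = 0; h < heads; ++h)
            for (size_t s = 0; s < kept; ++s)          // keep only the first `kept` positions
                for (size_t d = 0; d < head_dim; ++d)
                    out.push_back(kv[((b * heads + h) * seq_len + s) * head_dim + d]);
    return out;
}

int main() {
    const size_t batch = 1, heads = 2, seq_len = 5, head_dim = 4;
    std::vector<float> kv(batch * heads * seq_len * head_dim, 1.0f);

    auto trimmed = trim_seq_axis(kv, batch, heads, seq_len, head_dim, /*num_tokens_to_trim=*/2);
    std::cout << trimmed.size() << " values remain\n";   // 1 * 2 * 3 * 4 = 24
}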
