
Commit 2237b5e
committed: update according to comments
1 parent d8733cb · commit 2237b5e

13 files changed: +94 -68 lines

src/cpp/src/llm_pipeline_stateful.cpp
+2 -3

@@ -275,11 +275,10 @@ EncodedResults StatefulLLMPipeline::generate(
         "but you have '" + std::to_string(num_inputs) + "' inputs");

     if (is_chat_conversation) {
-        if (m_kv_cache_state.get_state().empty() || m_use_full_chat_history)
+        if (m_use_full_chat_history)
             reset_kv_state();
         else
-            ov::genai::utils::trim_kv_cache(m_model_runner, m_kv_cache_state.num_tokens_to_trim,
-                                            m_kv_cache_state.seq_length_axis, m_adapter_controller);
+            ov::genai::utils::trim_kv_cache(m_model_runner, m_kv_cache_state, m_adapter_controller);
     }

     size_t kv_cache_len = 0;

src/cpp/src/llm_pipeline_stateful.hpp
+1 -1

@@ -27,7 +27,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
     size_t m_max_kv_cache_size = std::numeric_limits<size_t>::max();
     bool m_is_npu = false;
     // include reflection of tokens contained in the kv cache and amount of tokens, which are needed to trim from kv cache on the next step of chat
-    KVCacheState m_kv_cache_state;
+    utils::KVCacheState m_kv_cache_state;

    void reset_kv_state();
public:

src/cpp/src/lm_encoding.cpp
+3 -3

@@ -82,7 +82,7 @@ ov::genai::utils::GenerationFinishInfo get_lm_encoded_results(
     Sampler& sampler,
     std::vector<SequenceGroup::Ptr> sequence_groups,
     std::optional<ov::Tensor> position_ids,
-    KVCacheState& kv_cache_state,
+    utils::KVCacheState& kv_cache_state,
     std::optional<EmbeddingsModel> m_embedding,
     std::optional<int64_t> rope_delta,
     const size_t max_kv_cache_size

@@ -298,7 +298,7 @@ ov::genai::utils::GenerationFinishInfo get_lm_encoded_results(
 }


-TokenizedInputs get_chat_encoded_input(const ov::Tensor& new_chat_tokens, KVCacheState& kv_cache_state) {
+TokenizedInputs get_chat_encoded_input(const ov::Tensor& new_chat_tokens, utils::KVCacheState& kv_cache_state) {
     TokenizedInputs encoded_input;
     size_t kv_cache_len = kv_cache_state.get_state().size();
     if (kv_cache_len == 0) {

@@ -325,7 +325,7 @@ TokenizedInputs get_chat_encoded_input(const ov::Tensor& new_chat_tokens, KVCach
 }


-void align_kv_cache_and_history(const ov::Tensor& new_chat_tokens, KVCacheState& kv_cache_state) {
+void align_kv_cache_and_history(const ov::Tensor& new_chat_tokens, utils::KVCacheState& kv_cache_state) {
     // KV cache in model already contains prompts and answers from previous iterations.
     // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns
     // token_ids = {<bos token>, ...<valuable tokens>}. So if tokenizer applies only to the new prompt,

src/cpp/src/lm_encoding.hpp
+3 -24

@@ -8,37 +8,16 @@
 namespace ov {
 namespace genai {

-class KVCacheState {
-    std::vector<int64_t> state;
-public:
-    size_t num_tokens_to_trim = 0;
-    size_t seq_length_axis = 2;
-
-    std::vector<int64_t>& get_state() {
-        return state;
-    }
-
-    void add_inputs(const ov::Tensor& inputs_ids) {
-        std::copy_n(inputs_ids.data<int64_t>(), inputs_ids.get_size(), std::back_inserter(state));
-    }
-
-    void reset_state() {
-        num_tokens_to_trim = 0;
-        state.clear();
-    }
-};
-
-
 ov::genai::utils::GenerationFinishInfo get_lm_encoded_results(ov::InferRequest& m_llm, const ov::Tensor& input_ids, const ov::Tensor& attention_mask,
     const std::shared_ptr<StreamerBase>& streamer_ptr, Sampler& sampler, std::vector<SequenceGroup::Ptr> sequence_groups,
-    std::optional<ov::Tensor> position_ids, KVCacheState& m_kv_cache_state, std::optional<EmbeddingsModel> m_embedding,
+    std::optional<ov::Tensor> position_ids, utils::KVCacheState& m_kv_cache_state, std::optional<EmbeddingsModel> m_embedding,
     std::optional<int64_t> rope_delta = std::nullopt, const size_t max_kv_cache_size = std::numeric_limits<size_t>::max());


-void align_kv_cache_and_history(const ov::Tensor& new_chat_tokens, KVCacheState& kv_cache_state);
+void align_kv_cache_and_history(const ov::Tensor& new_chat_tokens, utils::KVCacheState& kv_cache_state);


-TokenizedInputs get_chat_encoded_input(const ov::Tensor& new_chat_tokens, KVCacheState& kv_cache_state);
+TokenizedInputs get_chat_encoded_input(const ov::Tensor& new_chat_tokens, utils::KVCacheState& kv_cache_state);

 }
 }

src/cpp/src/utils.cpp
+18 -4

@@ -380,13 +380,27 @@ KVAxesPosition get_kv_axes_pos(std::shared_ptr<const ov::Model> model) {
     return kv_pos;
 }

-void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, size_t seq_length_axis, std::optional<AdapterController> adapter_controller) {
+void trim_kv_cache(ov::InferRequest request, KVCacheState& kv_cache_state, std::optional<AdapterController> adapter_controller) {
+    if (kv_cache_state.get_state().empty()) {
+        if (adapter_controller) {
+            for(auto& state: request.query_state()) {
+                if(!adapter_controller->has_state_name(state.get_name())) {
+                    state.reset();
+                }
+            }
+        } else {
+            request.reset_state();
+        }
+
+        return;
+    }
+
     // nothing to trim in this case
-    if (remove_from_end == 0)
+    if (kv_cache_state.num_tokens_to_trim == 0)
         return;

     auto states = request.query_state();
-
+
     OPENVINO_ASSERT(states.size() > 0, "Request contains no states.");

     for (auto& state : states) {

@@ -396,7 +410,7 @@ void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, size_t se
         ov::Tensor old_tensor = state.get_state();
         // [BATCH_SIZE, num_kv_heads, seq_len, head_size]
         auto shape = old_tensor.get_shape();
-        shape[seq_length_axis] -= remove_from_end;
+        shape[kv_cache_state.seq_length_axis] -= kv_cache_state.num_tokens_to_trim;

         ov::Coordinate new_shape_begin{0, 0, 0, 0};
         ov::Coordinate new_shape_end{shape};

src/cpp/src/utils.hpp
+21 -1

@@ -104,7 +104,27 @@ struct KVAxesPosition {

 KVAxesPosition get_kv_axes_pos(std::shared_ptr<const ov::Model> model);

-void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, size_t seq_length_axis, std::optional<AdapterController> adapter_controller);
+class KVCacheState {
+    std::vector<int64_t> state;
+public:
+    size_t num_tokens_to_trim = 0;
+    size_t seq_length_axis = 2;
+
+    std::vector<int64_t>& get_state() {
+        return state;
+    }
+
+    void add_inputs(const ov::Tensor& inputs_ids) {
+        std::copy_n(inputs_ids.data<int64_t>(), inputs_ids.get_size(), std::back_inserter(state));
+    }
+
+    void reset_state() {
+        num_tokens_to_trim = 0;
+        state.clear();
+    }
+};
+
+void trim_kv_cache(ov::InferRequest request, KVCacheState& kv_cache_state, std::optional<AdapterController> adapter_controller);

 ov::Tensor push_front_inputs(const ov::Tensor& base_tensor, int64_t add_to_front);
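
Side note (not part of the commit): a minimal caller-side sketch of how the relocated utils::KVCacheState and the reworked trim_kv_cache() fit together, mirroring the call sites this commit updates in StatefulLLMPipeline::generate and VLMPipelineImpl::generate. The helper function, its parameters, and the include are illustrative assumptions, not code from the repository.

    // Hedged sketch only; assumes the internal headers of this repository.
    #include <optional>
    #include "utils.hpp"  // ov::genai::utils::KVCacheState, trim_kv_cache

    // Illustrative helper: prepare the KV cache before the next chat turn.
    void prepare_kv_cache_for_next_turn(ov::InferRequest& request,
                                        ov::genai::utils::KVCacheState& kv_cache_state,
                                        std::optional<ov::genai::AdapterController> adapter_controller,
                                        bool use_full_chat_history) {
        if (use_full_chat_history) {
            // The whole chat history is re-encoded each turn, so the cache is rebuilt from scratch.
            request.reset_state();
        } else {
            // After this commit trim_kv_cache() owns the reset-vs-trim decision:
            // - empty tracked state    -> it resets the request state itself,
            //   skipping LoRA adapter states when an AdapterController is given;
            // - num_tokens_to_trim > 0 -> it shrinks seq_length_axis of every KV tensor.
            ov::genai::utils::trim_kv_cache(request, kv_cache_state, adapter_controller);
        }
    }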

src/cpp/src/visual_language/inputs_embedder.cpp
+12 -10

@@ -29,7 +29,6 @@ std::pair<ov::Tensor, std::optional<int64_t>> InputsEmbedder::IInputsEmbedder::g

 void InputsEmbedder::IInputsEmbedder::start_chat(const std::string& system_message) {
     m_is_chat_conversation = true;
-    m_kv_history_trim_manager.reset();
     if (!m_kv_cache_state.get_state().empty()) {
         m_history.clear();
         m_kv_cache_state.reset_state();

@@ -40,17 +39,20 @@ void InputsEmbedder::IInputsEmbedder::start_chat(const std::string& system_messa
         m_history = {{{"role", "system"}, {"content", system_message}}};
 }

-void InputsEmbedder::IInputsEmbedder::update_chat_history(const std::string& decoded_results) {
-    // Tail of chat template is missing in KV cache.
-    // Find the tail to concatenate it with the next input prompt.
-    m_history.push_back({{"role", "assistant"}, {"content", decoded_results}});
-    m_kv_history_trim_manager.reset();
+void InputsEmbedder::IInputsEmbedder::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) {
+    m_kv_cache_state.num_tokens_to_trim = 0;
+    if (generation_finish_status == ov::genai::GenerationStatus::CANCEL) {
+        // If chat generation process was cancelled by user, let's rollback to previous state of history
+        m_history.pop_back();
+    } else {
+        // Tail of chat template is missing in KV cache.
+        // Find the tail to concatenate it with the next input prompt.
+        m_history.push_back({{"role", "assistant"}, {"content", decoded_results}});
+    }
 }

 void InputsEmbedder::IInputsEmbedder::finish_chat() {
     m_is_chat_conversation = false;
-    m_kv_history_trim_manager.reset();
-
     m_history.clear();
     m_kv_cache_state.reset_state();
 }

@@ -123,7 +125,7 @@ ov::Tensor InputsEmbedder::IInputsEmbedder::apply_chat_template_tokenize(const s
 ov::Tensor InputsEmbedder::IInputsEmbedder::update_history(const ov::Tensor& new_chat_tokens) {
     ov::Tensor encoded_inputs;
     if (m_is_chat_conversation) {
-        ov::genai::align_kv_cache_and_history(m_kv_history_trim_manager, new_chat_tokens, m_kv_cache_state);
+        ov::genai::align_kv_cache_and_history(new_chat_tokens, m_kv_cache_state);
         encoded_inputs = get_chat_encoded_input(new_chat_tokens, m_kv_cache_state).input_ids;
     } else {
         encoded_inputs = new_chat_tokens;

@@ -225,7 +227,7 @@ EmbeddingsModel InputsEmbedder::get_embedding_model() const {
     return m_impl->get_embedding_model();
 }

-KVCacheState& InputsEmbedder::get_kv_cache_state() {
+ov::genai::utils::KVCacheState& InputsEmbedder::get_kv_cache_state() {
     return m_impl->get_kv_cache_state();
 }

src/cpp/src/visual_language/inputs_embedder.hpp
+4 -12

@@ -45,7 +45,7 @@ class InputsEmbedder {
     Tokenizer get_tokenizer() const;

     // get reflection of tokens contained in the kv cache
-    KVCacheState& get_kv_cache_state();
+    utils::KVCacheState& get_kv_cache_state();

     // starts chat and adds optional system_message to chat history
     void start_chat(const std::string& system_message);

@@ -77,16 +77,12 @@ class InputsEmbedder {
     bool m_is_chat_conversation = false;
     // Chat history
     ChatHistory m_history;
-    // If sequence contains some symbols, which could be ambiguous encoded by tokenizer, we need to trim kv cache
-    // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history
-    // so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to keep in history
-    ov::genai::KVCacheTrimManager m_kv_history_trim_manager = {0, 2};
     // True if chat template should be applied for non-chat scenario
     bool m_apply_chat_template = true;
     // Finish reason of last generation for chat scenario
     ov::genai::GenerationStatus m_chat_generation_finish_status = ov::genai::GenerationStatus::RUNNING;
     // reflection of tokens contained in the kv cache
-    KVCacheState m_kv_cache_state;
+    utils::KVCacheState m_kv_cache_state;
 public:
     virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) = 0;

@@ -100,21 +96,17 @@ class InputsEmbedder {
         return m_tokenizer;
     }

-    KVCacheState& get_kv_cache_state() {
+    utils::KVCacheState& get_kv_cache_state() {
         return m_kv_cache_state;
     }

-    size_t get_num_tokens_to_remove_from_hist() const {
-        return m_kv_history_trim_manager.num_tokens_to_trim;
-    }
-
     void set_apply_chat_template_status(bool apply_chat_template) {
         m_apply_chat_template = apply_chat_template;
    }

     virtual void start_chat(const std::string& system_message);

-    void update_chat_history(const std::string& decoded_results);
+    virtual void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status);

     virtual void finish_chat();
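
For illustration only, this is the override pattern the now-virtual update_chat_history enables for model-specific embedders; the MiniCPM and Phi-3-vision overrides below are the real instances from this commit. The class and members in this sketch are hypothetical stand-ins, and the constructor plus the pure-virtual get_inputs_embeds are omitted.

    // Hedged sketch of a hypothetical model-specific embedder; only the
    // cancel-aware bookkeeping is shown, everything else is omitted.
    class InputsEmbedderSomeVLM : public InputsEmbedder::IInputsEmbedder {
        size_t m_images_seen = 0;       // hypothetical per-chat counter
        size_t m_prev_images_seen = 0;  // snapshot taken after the last committed turn

    public:
        void update_chat_history(const std::string& decoded_results,
                                 const ov::genai::GenerationStatus generation_finish_status) override {
            // Base implementation clears num_tokens_to_trim and either appends the
            // assistant answer to m_history or, on CANCEL, drops the cancelled entry.
            IInputsEmbedder::update_chat_history(decoded_results, generation_finish_status);
            if (generation_finish_status == ov::genai::GenerationStatus::CANCEL)
                m_images_seen = m_prev_images_seen;   // cancelled: roll bookkeeping back
            else
                m_prev_images_seen = m_images_seen;   // committed: advance the snapshot
        }
    };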

src/cpp/src/visual_language/minicpm/classes.cpp
+8 -0

@@ -667,6 +667,14 @@ ov::Tensor InputsEmbedderMiniCPM::get_inputs_embeds(const std::string& prompt, c
     return inputs_embeds;
 }

+void InputsEmbedderMiniCPM::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) {
+    IInputsEmbedder::update_chat_history(decoded_results, generation_finish_status);
+    if (generation_finish_status == ov::genai::GenerationStatus::CANCEL)
+        m_image_id = m_prev_image_id;
+    else
+        m_prev_image_id = m_image_id;
+}
+
 void InputsEmbedderMiniCPM::start_chat(const std::string& system_message) {
     IInputsEmbedder::start_chat(system_message);
     m_image_id = 0;

src/cpp/src/visual_language/minicpm/classes.hpp
+3 -0

@@ -30,6 +30,7 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder {
     ov::Tensor m_pos_embed_cache;
     // Used to insert <image_id>i</image_id> per image (not a slice).
     size_t m_image_id = 0;
+    size_t m_prev_image_id = 0;

 public:
     InputsEmbedderMiniCPM(

@@ -48,6 +49,8 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder {

     ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override;

+    void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) override;
+
     void start_chat(const std::string& system_message) override;

     void finish_chat() override;

src/cpp/src/visual_language/phi3_vision/classes.cpp
+10 -0

@@ -471,6 +471,8 @@ ov::Tensor insert_image_placeholders(const std::vector<ov::Tensor>& chunks, cons
             length,
             merged.data<int64_t>() + offset
         );
+        if (tokens_per_images.empty())
+            continue;
         offset += length;
         if (offset < merged_length) {
             std::fill_n(

@@ -602,6 +604,14 @@ ov::Tensor InputsEmbedderPhi3V::get_inputs_embeds(const std::string& prompt, con
     return inputs_embeds;
 }

+void InputsEmbedderPhi3V::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) {
+    IInputsEmbedder::update_chat_history(decoded_results, generation_finish_status);
+    if (generation_finish_status == ov::genai::GenerationStatus::CANCEL)
+        m_tokens_per_images = m_prev_tokens_per_images;
+    else
+        m_prev_tokens_per_images = m_tokens_per_images;
+}
+
 void InputsEmbedderPhi3V::start_chat(const std::string& system_message) {
     IInputsEmbedder::start_chat(system_message);
     m_tokens_per_images.clear();

src/cpp/src/visual_language/phi3_vision/classes.hpp
+3 -0

@@ -30,6 +30,8 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder {

     ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override;

+    void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) override;
+
     void start_chat(const std::string& system_message) override;

     void finish_chat() override;

@@ -38,6 +40,7 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder {
     ov::InferRequest m_hd_feature_transformer;
     ov::InferRequest m_vision_projection;
     std::vector<size_t> m_tokens_per_images;
+    std::vector<size_t> m_prev_tokens_per_images;
 };

 } // namespace ov::genai

src/cpp/src/visual_language/pipeline.cpp
+6 -10

@@ -66,8 +66,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         utils::print_compiled_model_properties(compiled_language_model, "VLM language model");
         auto language_model = compiled_language_model.get_runtime_model();

-        KVCacheState& kv_cache_state = m_inputs_embedder->get_kv_cache_state();
-        kv_cache_state.seq_length_axis = ov::genai::utils::get_kv_axes_pos(language_model).seq_len;
+        utils::KVCacheState& kv_cache_state = m_inputs_embedder->get_kv_cache_state();
+        kv_cache_state.seq_length_axis = utils::get_kv_axes_pos(language_model).seq_len;

         m_language = compiled_language_model.create_infer_request();

@@ -140,25 +140,21 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{

         m_inputs_embedder->set_apply_chat_template_status(generation_config.apply_chat_template);

+        utils::KVCacheState& kv_cache_state = m_inputs_embedder->get_kv_cache_state();
+        if (m_is_chat_conversation)
+            utils::trim_kv_cache(m_language, kv_cache_state, std::nullopt);
+
         auto start_get_inputs_embeds = std::chrono::steady_clock::now();
         ov::Tensor inputs_embeds = m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics);
         auto end_get_inputs_embeds = std::chrono::steady_clock::now();

-        KVCacheState& kv_cache_state = m_inputs_embedder->get_kv_cache_state();
-        if (m_is_chat_conversation)
-            if (kv_cache_state.get_state().empty())
-                m_language.reset_state();
-            else
-                ov::genai::utils::trim_kv_cache(m_language, kv_cache_state.num_tokens_to_trim, kv_cache_state.seq_length_axis, std::nullopt);
-
         std::vector<SequenceGroup::Ptr> requests;
         size_t request_id = 0;
         size_t block_size = 1; // not used

         size_t history_size = m_language.get_tensor("attention_mask").get_shape().at(1) - kv_cache_state.num_tokens_to_trim;
         size_t inputs_embeds_size = inputs_embeds.get_shape().at(1);

-
         std::vector<int64_t> tokenized_history = kv_cache_state.get_state();
         ov::Tensor prompt_ids(ov::element::i64, { history_size + inputs_embeds_size });
         OPENVINO_ASSERT(prompt_ids.get_size() >= tokenized_history.size(), "Prompt ids size is less than tokenized history size");
