From 1c238fcdf65373985f85fcc8dc85e2994124a08e Mon Sep 17 00:00:00 2001 From: sbalandi Date: Wed, 12 Feb 2025 16:13:25 +0000 Subject: [PATCH 1/8] Implement CANCEL for streaming with VLM Pipeline --- .../src/visual_language/inputs_embedder.cpp | 8 +- .../src/visual_language/inputs_embedder.hpp | 6 +- src/cpp/src/visual_language/pipeline.cpp | 18 ++-- tests/python_tests/test_vlm_pipeline.py | 86 ++++++++++++++++++- 4 files changed, 104 insertions(+), 14 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index cbc6cf2120..50a212081a 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -229,8 +229,8 @@ KVCacheState& InputsEmbedder::get_kv_cache_state() { return m_impl->get_kv_cache_state(); } -size_t InputsEmbedder::get_num_tokens_to_remove_from_hist() const { - return m_impl->get_num_tokens_to_remove_from_hist(); +bool InputsEmbedder::should_reset_kv_cache() const { + return m_impl->should_reset_kv_cache(); } Tokenizer InputsEmbedder::get_tokenizer() const { @@ -241,8 +241,8 @@ void InputsEmbedder::start_chat(const std::string& system_message) { return m_impl->start_chat(system_message); } -void InputsEmbedder::update_chat_history(const std::string& decoded_results) { - return m_impl->update_chat_history(decoded_results); +void InputsEmbedder::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) { + return m_impl->update_chat_history(decoded_results, generation_finish_status); } void InputsEmbedder::set_apply_chat_template_status(bool apply_chat_template) { diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index 5c8fcbbce9..581564fa05 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -47,14 +47,14 @@ class InputsEmbedder { // get reflection of tokens contained in the kv cache KVCacheState& get_kv_cache_state(); - // returns amount of elements, which need to remove from the end of the KV cache - size_t get_num_tokens_to_remove_from_hist() const; + // returns true, if we need to remove full kv cache, in that case it's needed to reset it instead of manually updating + bool should_reset_kv_cache() const; // starts chat and adds optional system_message to chat history void start_chat(const std::string& system_message); // adds currently generated text to chat history - void update_chat_history(const std::string& decoded_results); + void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status); // set the apply_chat_template flag, which determines whether chat template should be applied for non-chat scenarios void set_apply_chat_template_status(bool apply_chat_template); diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 612f34187f..53ba5fcfc0 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -42,6 +42,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ std::shared_ptr m_inputs_embedder; // Axis num in kv cache from m_language model, which contains information about history len size_t m_kv_cache_seq_length_axis = 2; + // Load pipeline time + float m_load_time_ms = 0; // Component for applying sampling to lm outputs Sampler m_sampler; size_t m_max_kv_cache_size = std::numeric_limits::max(); @@ -93,7 
+95,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ ov::genai::utils::print_compiled_model_properties(compiled_language_model, "VLM language model"); m_language = compiled_language_model.create_infer_request(); - m_kv_cache_seq_length_axis = utils::get_kv_axes_pos(language_model).seq_len; + m_kv_cache_seq_length_axis = kv_pos.seq_len; m_language.get_tensor("attention_mask").set_shape({1, 0}); auto embedder_properties = device_propertes.empty() @@ -186,17 +188,21 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ ov::Tensor inputs_embeds = m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics); auto end_get_inputs_embeds = std::chrono::steady_clock::now(); - auto to_remove_from_hist = m_inputs_embedder->get_num_tokens_to_remove_from_hist(); - utils::trim_kv_cache(m_language, to_remove_from_hist, m_kv_cache_seq_length_axis, std::nullopt); + KVCacheState& kv_cache_state = m_inputs_embedder->get_kv_cache_state(); + if (m_is_chat_conversation) + if (kv_cache_state.get_state().empty()) + m_language.reset_state(); + else + ov::genai::utils::trim_kv_cache(m_language, kv_cache_state.num_tokens_to_trim, kv_cache_state.seq_length_axis, std::nullopt); std::vector requests; size_t request_id = 0; size_t block_size = 1; // not used - size_t history_size = m_language.get_tensor("attention_mask").get_shape().at(1) - to_remove_from_hist; + size_t history_size = m_language.get_tensor("attention_mask").get_shape().at(1) - kv_cache_state.num_tokens_to_trim; size_t inputs_embeds_size = inputs_embeds.get_shape().at(1); - KVCacheState& kv_cache_state = m_inputs_embedder->get_kv_cache_state(); + std::vector tokenized_history = kv_cache_state.get_state(); ov::Tensor prompt_ids(ov::element::i64, { history_size + inputs_embeds_size }); OPENVINO_ASSERT(prompt_ids.get_size() >= tokenized_history.size(), "Prompt ids size is less than tokenized history size"); @@ -237,7 +243,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ std::string decoded_results = decoded.texts.at(0); if (m_is_chat_conversation) - m_inputs_embedder->update_chat_history(decoded_results); + m_inputs_embedder->update_chat_history(decoded_results, finish_info.streaming_finish_status); else kv_cache_state.reset_state(); diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index f2b697285a..49a4fcf027 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -8,7 +8,7 @@ import sys import transformers from optimum.intel.openvino import OVModelForVisualCausalLM -from openvino_genai import VLMPipeline, GenerationConfig, SchedulerConfig, ContinuousBatchingPipeline, GenerationStatus +from openvino_genai import VLMPipeline, GenerationConfig, SchedulerConfig, ContinuousBatchingPipeline, GenerationStatus, StreamingStatus from utils.network import retry_request from utils.generation_config import get_beam_search, get_multinomial_all_parameters, get_greedy @@ -359,3 +359,87 @@ def test_vlm_npu_no_exception(model_id, cache): for link in image_links_for_testing[2]: image = get_image_by_link(link) out = ov_pipe.generate(prompts[0], images=[image], generation_config=generation_config) + + +@pytest.mark.precommit +@pytest.mark.nightly +@pytest.mark.parametrize("model_id", model_ids) +@pytest.mark.parametrize("iteration_images", [image_links_for_testing[1], []]) +def test_vlm_pipeline_chat_streamer_cancel_second_generate(model_id, iteration_images, cache): + callback_questions = [ + '1+1=', + 'Why is the Sun yellow?', + 'What 
is the previous answer?' + ] + + current_iter = 0 + num_iters = 3 + def streamer(subword): + nonlocal current_iter + current_iter += 1 + return StreamingStatus.CANCEL if current_iter == num_iters else StreamingStatus.RUNNING + + + models_path = get_ov_model(model_id, cache) + ov_pipe = VLMPipeline(models_path, "CPU") + generation_config = ov_pipe.get_generation_config() + generation_config.max_new_tokens = 30 + generation_config.set_eos_token_id(ov_pipe.get_tokenizer().get_eos_token_id()) + generation_config.ignore_eos = True + + images = [] + for link in iteration_images: + images.append(get_image_by_link(link)) + + results_with_cancel = "" + ov_pipe.start_chat() + results_with_cancel += ov_pipe.generate(callback_questions[0], images=images, generation_config=generation_config).texts[0] + # doesn't add to results_with_cancel as it should be complitely removed from the history + ov_pipe.generate(callback_questions[1], images=images, generation_config=generation_config, streamer=streamer) + results_with_cancel += ov_pipe.generate(callback_questions[2], images=images, generation_config=generation_config).texts[0] + ov_pipe.finish_chat() + + results = "" + ov_pipe.start_chat() + results += ov_pipe.generate(callback_questions[0], images=images, generation_config=generation_config).texts[0] + results += ov_pipe.generate(callback_questions[2], images=images, generation_config=generation_config).texts[0] + ov_pipe.finish_chat() + + assert results_with_cancel == results + + +@pytest.mark.precommit +@pytest.mark.nightly +@pytest.mark.parametrize("model_id", model_ids) +@pytest.mark.parametrize("iteration_images", [image_links_for_testing[1], []]) +def test_vlm_pipeline_chat_streamer_cancel_first_generate(model_id, iteration_images, cache): + callback_questions = [ + 'Why is the Sun yellow?', + '1+1=', + ] + + current_iter = 0 + num_iters = 3 + def streamer(subword): + nonlocal current_iter + current_iter += 1 + return StreamingStatus.CANCEL if current_iter == num_iters else StreamingStatus.RUNNING + + models_path = get_ov_model(model_id, cache) + ov_pipe = VLMPipeline(models_path, "CPU") + generation_config = ov_pipe.get_generation_config() + generation_config.max_new_tokens = 30 + generation_config.ignore_eos = True + generation_config.set_eos_token_id(ov_pipe.get_tokenizer().get_eos_token_id()) + + images = [] + for link in iteration_images: + images.append(get_image_by_link(link)) + + ov_pipe.start_chat() + res_first = ov_pipe.generate(callback_questions[0], images=images, generation_config=generation_config, streamer=streamer).texts[0] + current_iter = 0 + res_second = ov_pipe.generate(callback_questions[0], images=images, generation_config=generation_config, streamer=streamer).texts[0] + ov_pipe.finish_chat() + + assert res_first == res_second \ No newline at end of file From 3bbfe6cf24fb000ba4da6f5eb02e3e2283c81232 Mon Sep 17 00:00:00 2001 From: sbalandi Date: Fri, 14 Feb 2025 19:01:32 +0000 Subject: [PATCH 2/8] update comments --- .../src/visual_language/inputs_embedder.cpp | 5 +++ .../src/visual_language/inputs_embedder.hpp | 3 ++ src/cpp/src/visual_language/pipeline.cpp | 2 +- tests/python_tests/test_vlm_pipeline.py | 42 ++++++++++++++++++- 4 files changed, 50 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 50a212081a..80f6fa822b 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -21,6 +21,7 @@ namespace ov::genai { // Base 
InputsEmbedder class + std::pair> InputsEmbedder::IInputsEmbedder::get_position_ids(const size_t inputs_embeds_size, const size_t history_size) { ov::Tensor position_ids = ov::Tensor{ov::element::i64, { 1, inputs_embeds_size }}; std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), history_size); @@ -217,6 +218,10 @@ ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const st return m_impl->get_inputs_embeds(prompt, images, metrics); } +ov::Tensor InputsEmbedder::get_input_embeddings(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) { + return m_impl->get_input_embeddings(prompt, images, metrics); +} + std::pair> InputsEmbedder::get_position_ids(const size_t inputs_embeds_size, const size_t history_size) { return m_impl->get_position_ids(inputs_embeds_size, history_size); } diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index 581564fa05..5c271bd1a8 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -35,6 +35,9 @@ class InputsEmbedder { // compute input embedding for prompt and multiple images ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics); + // computes input embedding for prompt and multiple images and saves input_embeddings size + ov::Tensor get_input_embeddings(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics); + // compute position ids for language model input std::pair> get_position_ids(const size_t inputs_embeds_size, const size_t history_size); diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 53ba5fcfc0..ee8daf9ff2 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -185,7 +185,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ m_inputs_embedder->set_apply_chat_template_status(generation_config.apply_chat_template); auto start_get_inputs_embeds = std::chrono::steady_clock::now(); - ov::Tensor inputs_embeds = m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics); + ov::Tensor inputs_embeds = m_inputs_embedder->get_input_embeddings(prompt, rgbs, perf_metrics); auto end_get_inputs_embeds = std::chrono::steady_clock::now(); KVCacheState& kv_cache_state = m_inputs_embedder->get_kv_cache_state(); diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index 49a4fcf027..385d51004b 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -398,6 +398,16 @@ def streamer(subword): ov_pipe.generate(callback_questions[1], images=images, generation_config=generation_config, streamer=streamer) results_with_cancel += ov_pipe.generate(callback_questions[2], images=images, generation_config=generation_config).texts[0] ov_pipe.finish_chat() + + results = "" + ov_pipe.start_chat() + results += ov_pipe.generate(callback_questions[0], images=images, generation_config=generation_config).texts[0] + + generation_config.ignore_eos = True + results += ov_pipe.generate(callback_questions[2], images=images, generation_config=generation_config).texts[0] + ov_pipe.finish_chat() + + assert(results_with_cancel == results) results = "" ov_pipe.start_chat() @@ -442,4 +452,34 @@ def streamer(subword): res_second = ov_pipe.generate(callback_questions[0], images=images, 
generation_config=generation_config, streamer=streamer).texts[0] ov_pipe.finish_chat() - assert res_first == res_second \ No newline at end of file + assert res_first == res_second + + +@pytest.mark.precommit +@pytest.mark.nightly +@pytest.mark.parametrize("model_id", model_ids) +@pytest.mark.parametrize("iteration_images", [[[], image_links_for_testing[1]], [image_links_for_testing[1], image_links_for_testing[1]], [[], image_links_for_testing[1], []]]) +def test_vlm_pipeline_chat_image_combination(model_id, iteration_images, cache): + def streamer(word: str) -> bool: + nonlocal result_from_streamer + result_from_streamer.append(word) + return False + + models_path = get_ov_model(model_id, cache) + ov_pipe = VLMPipeline(models_path, "CPU") + generation_config = ov_pipe.get_generation_config() + generation_config.max_new_tokens = 30 + generation_config.set_eos_token_id(ov_pipe.get_tokenizer().get_eos_token_id()) + + for images_links in iteration_images: + ov_pipe.start_chat() + + images = [] + for link in images_links: + images.append(get_image_by_link(link)) + + result_from_streamer = [] + res = ov_pipe.generate(prompts[0], images=images, generation_config=generation_config, streamer=streamer) + assert res.texts[0] == ''.join(result_from_streamer) + + ov_pipe.finish_chat() From fecc402958f7f3e3cbe26b7703d088d8947b79aa Mon Sep 17 00:00:00 2001 From: sbalandi Date: Mon, 17 Feb 2025 13:33:00 +0000 Subject: [PATCH 3/8] fix test --- .../src/visual_language/inputs_embedder.cpp | 1 - tests/python_tests/test_vlm_pipeline.py | 66 ++++++------------- 2 files changed, 20 insertions(+), 47 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 80f6fa822b..f0976642ca 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -21,7 +21,6 @@ namespace ov::genai { // Base InputsEmbedder class - std::pair> InputsEmbedder::IInputsEmbedder::get_position_ids(const size_t inputs_embeds_size, const size_t history_size) { ov::Tensor position_ids = ov::Tensor{ov::element::i64, { 1, inputs_embeds_size }}; std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), history_size); diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index 385d51004b..bd887c889d 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -224,7 +224,10 @@ def test_vlm_with_scheduler_vs_default(config, cache): @pytest.mark.nightly @pytest.mark.parametrize("model_id", model_ids) @pytest.mark.parametrize("system_message", ["", "You are a helpful assistant."]) -def test_vlm_pipeline_chat(model_id, system_message, cache): +@pytest.mark.parametrize("iteration_images", [[image_links_for_testing[0], image_links_for_testing[0]], [image_links_for_testing[0], image_links_for_testing[2], image_links_for_testing[0]], + [image_links_for_testing[1], image_links_for_testing[1]], [image_links_for_testing[1], image_links_for_testing[1], image_links_for_testing[1]], + [image_links_for_testing[2], image_links_for_testing[1]], [image_links_for_testing[2], image_links_for_testing[0], image_links_for_testing[1]]]) +def test_vlm_pipeline_chat(model_id, system_message, iteration_images, cache): def streamer(word: str) -> bool: nonlocal result_from_streamer result_from_streamer.append(word) @@ -236,23 +239,26 @@ def streamer(word: str) -> bool: generation_config.max_new_tokens = 30 
generation_config.set_eos_token_id(ov_pipe.get_tokenizer().get_eos_token_id()) - for links in image_links_for_testing: + ov_pipe.start_chat(system_message) + + images = [] + for link in iteration_images[0]: + images.append(get_image_by_link(link)) + + result_from_streamer = [] + res = ov_pipe.generate(prompts[0], images=images, generation_config=generation_config, streamer=streamer) + assert res.texts[0] == ''.join(result_from_streamer) + + for image_set in iteration_images[1:]: images = [] - for link in links: + for link in image_set: images.append(get_image_by_link(link)) - ov_pipe.start_chat(system_message) - result_from_streamer = [] - res = ov_pipe.generate(prompts[0], images=images, generation_config=generation_config, streamer=streamer) + res = ov_pipe.generate(prompts[1], images=images, generation_config=generation_config, streamer=streamer) assert res.texts[0] == ''.join(result_from_streamer) - for prompt in prompts[1:]: - result_from_streamer = [] - res = ov_pipe.generate(prompt, generation_config=generation_config, streamer=streamer) - assert res.texts[0] == ''.join(result_from_streamer) - - ov_pipe.finish_chat() + ov_pipe.finish_chat() @pytest.mark.precommit @@ -398,16 +404,14 @@ def streamer(subword): ov_pipe.generate(callback_questions[1], images=images, generation_config=generation_config, streamer=streamer) results_with_cancel += ov_pipe.generate(callback_questions[2], images=images, generation_config=generation_config).texts[0] ov_pipe.finish_chat() - + results = "" ov_pipe.start_chat() results += ov_pipe.generate(callback_questions[0], images=images, generation_config=generation_config).texts[0] - - generation_config.ignore_eos = True results += ov_pipe.generate(callback_questions[2], images=images, generation_config=generation_config).texts[0] ov_pipe.finish_chat() - assert(results_with_cancel == results) + assert results_with_cancel == results results = "" ov_pipe.start_chat() @@ -453,33 +457,3 @@ def streamer(subword): ov_pipe.finish_chat() assert res_first == res_second - - -@pytest.mark.precommit -@pytest.mark.nightly -@pytest.mark.parametrize("model_id", model_ids) -@pytest.mark.parametrize("iteration_images", [[[], image_links_for_testing[1]], [image_links_for_testing[1], image_links_for_testing[1]], [[], image_links_for_testing[1], []]]) -def test_vlm_pipeline_chat_image_combination(model_id, iteration_images, cache): - def streamer(word: str) -> bool: - nonlocal result_from_streamer - result_from_streamer.append(word) - return False - - models_path = get_ov_model(model_id, cache) - ov_pipe = VLMPipeline(models_path, "CPU") - generation_config = ov_pipe.get_generation_config() - generation_config.max_new_tokens = 30 - generation_config.set_eos_token_id(ov_pipe.get_tokenizer().get_eos_token_id()) - - for images_links in iteration_images: - ov_pipe.start_chat() - - images = [] - for link in images_links: - images.append(get_image_by_link(link)) - - result_from_streamer = [] - res = ov_pipe.generate(prompts[0], images=images, generation_config=generation_config, streamer=streamer) - assert res.texts[0] == ''.join(result_from_streamer) - - ov_pipe.finish_chat() From 1a02fe3b212259297c4d8f29f0f5c8dddf7c0590 Mon Sep 17 00:00:00 2001 From: sbalandi Date: Wed, 26 Feb 2025 00:15:02 +0000 Subject: [PATCH 4/8] update according to m_kv_cache_state --- src/cpp/src/llm_pipeline_stateful.cpp | 15 +++++++-------- src/cpp/src/llm_pipeline_stateful.hpp | 6 +----- src/cpp/src/lm_encoding.cpp | 4 ++-- src/cpp/src/lm_encoding.hpp | 17 +++++------------ 
src/cpp/src/visual_language/inputs_embedder.cpp | 8 -------- src/cpp/src/visual_language/inputs_embedder.hpp | 6 ------ src/cpp/src/visual_language/pipeline.cpp | 4 +--- 7 files changed, 16 insertions(+), 44 deletions(-) diff --git a/src/cpp/src/llm_pipeline_stateful.cpp b/src/cpp/src/llm_pipeline_stateful.cpp index 2d8ba48f3c..6b418a51b3 100644 --- a/src/cpp/src/llm_pipeline_stateful.cpp +++ b/src/cpp/src/llm_pipeline_stateful.cpp @@ -60,7 +60,7 @@ StatefulLLMPipeline::StatefulLLMPipeline( } if (!m_use_full_chat_history) - m_kv_history_trim_manager.kv_cache_seq_length_axis = kv_pos.seq_len; + m_kv_cache_state.seq_length_axis = kv_pos.seq_len; auto filtered_properties = extract_adapters_from_properties(properties, &m_generation_config.adapters); if (m_generation_config.adapters) { @@ -143,7 +143,7 @@ DecodedResults StatefulLLMPipeline::generate( if (m_use_full_chat_history) { encoded_input = new_chat_tokens; } else { - ov::genai::align_kv_cache_and_history(m_kv_history_trim_manager, new_chat_tokens.input_ids, m_kv_cache_state); + ov::genai::align_kv_cache_and_history(new_chat_tokens.input_ids, m_kv_cache_state); encoded_input = get_chat_encoded_input(new_chat_tokens.input_ids, m_kv_cache_state); } // TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied @@ -238,7 +238,7 @@ EncodedResults StatefulLLMPipeline::generate( // Tail of previous output in chat mode is missing in KV cache. if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS) { ov::Tensor new_chat_tokens = ov::Tensor{ov::element::i64, {1, m_tokenized_chat_history.size()}, m_tokenized_chat_history.data()}; - ov::genai::align_kv_cache_and_history(m_kv_history_trim_manager, new_chat_tokens, m_kv_cache_state); + ov::genai::align_kv_cache_and_history(new_chat_tokens, m_kv_cache_state); auto encoded_input = get_chat_encoded_input(new_chat_tokens, m_kv_cache_state); input_ids = encoded_input.input_ids; @@ -284,8 +284,8 @@ EncodedResults StatefulLLMPipeline::generate( if (m_kv_cache_state.get_state().empty() || m_use_full_chat_history) reset_kv_state(); else - ov::genai::utils::trim_kv_cache(m_model_runner, m_kv_history_trim_manager.num_tokens_to_trim, - m_kv_history_trim_manager.kv_cache_seq_length_axis, m_adapter_controller); + ov::genai::utils::trim_kv_cache(m_model_runner, m_kv_cache_state.num_tokens_to_trim, + m_kv_cache_state.seq_length_axis, m_adapter_controller); } size_t kv_cache_len = 0; @@ -358,7 +358,7 @@ EncodedResults StatefulLLMPipeline::generate( m_chat_generation_finish_status = finish_info.streaming_finish_status; if (is_chat_conversation) { - m_kv_history_trim_manager.reset(); + m_kv_cache_state.num_tokens_to_trim = 0; if (m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS) { if (m_chat_generation_finish_status == ov::genai::GenerationStatus::CANCEL) { @@ -367,7 +367,7 @@ EncodedResults StatefulLLMPipeline::generate( std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history)); } } else if (config.is_beam_search()) { - m_kv_history_trim_manager.num_tokens_to_trim = m_model_runner.get_tensor("attention_mask").get_shape()[1] - prev_attn_mask_size; + m_kv_cache_state.num_tokens_to_trim = m_model_runner.get_tensor("attention_mask").get_shape()[1] - prev_attn_mask_size; } } @@ -406,7 +406,6 @@ void StatefulLLMPipeline::reset_kv_state() { void StatefulLLMPipeline::finish_chat() { is_chat_conversation = false; - 
m_kv_history_trim_manager.reset(); m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF; bool have_state = 0 != m_model_runner.get_tensor("attention_mask").get_size(); if (!m_kv_cache_state.get_state().empty() || have_state) { diff --git a/src/cpp/src/llm_pipeline_stateful.hpp b/src/cpp/src/llm_pipeline_stateful.hpp index 3558c4c1f3..0f2b5075a5 100644 --- a/src/cpp/src/llm_pipeline_stateful.hpp +++ b/src/cpp/src/llm_pipeline_stateful.hpp @@ -20,17 +20,13 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { ChatHistory m_history; std::vector m_tokenized_chat_history; ov::genai::utils::GenerationChatInputsType m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF; - // If sequence contains some symbols, which could be ambiguously encoded by tokenizer, we need to trim kv cache - // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history - // so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to keep in history - ov::genai::KVCacheTrimManager m_kv_history_trim_manager = {0, 2}; // Finish reason of last generation for chat scenario ov::genai::GenerationStatus m_chat_generation_finish_status = ov::genai::GenerationStatus::RUNNING; // if True, full history will be used as prompt on each chat generation bool m_use_full_chat_history = false; size_t m_max_kv_cache_size = std::numeric_limits::max(); bool m_is_npu = false; - // reflection of tokens contained in the kv cache + // include reflection of tokens contained in the kv cache and amount of tokens, which are needed to trim from kv cache on the next step of chat KVCacheState m_kv_cache_state; void reset_kv_state(); diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp index 61f9a169b5..1985ea5f1e 100644 --- a/src/cpp/src/lm_encoding.cpp +++ b/src/cpp/src/lm_encoding.cpp @@ -325,7 +325,7 @@ TokenizedInputs get_chat_encoded_input(const ov::Tensor& new_chat_tokens, KVCach } -void align_kv_cache_and_history(ov::genai::KVCacheTrimManager& kv_history_manager, const ov::Tensor& new_chat_tokens, KVCacheState& kv_cache_state) { +void align_kv_cache_and_history(const ov::Tensor& new_chat_tokens, KVCacheState& kv_cache_state) { // KV cache in model already contains prompts and answers from previous iterations. // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns // token_ids = {, ...}. So if tokenizer applies only to the new prompt, @@ -343,7 +343,7 @@ void align_kv_cache_and_history(ov::genai::KVCacheTrimManager& kv_history_manage size_t first_diverse_tokens_idx = ov::genai::utils::get_first_history_difference(new_chat_tokens, state); // in the case of beam_search the longest answer is in the kv cache, but the best one is needed // so generated tokens were not added to KVCacheState and num_tokens_to_trim was set to the size of the generated serquence - kv_history_manager.num_tokens_to_trim = kv_history_manager.num_tokens_to_trim > 0 ? kv_history_manager.num_tokens_to_trim : (state.size() - first_diverse_tokens_idx); + kv_cache_state.num_tokens_to_trim = kv_cache_state.num_tokens_to_trim > 0 ? 
kv_cache_state.num_tokens_to_trim : (state.size() - first_diverse_tokens_idx); state.resize(first_diverse_tokens_idx); } diff --git a/src/cpp/src/lm_encoding.hpp b/src/cpp/src/lm_encoding.hpp index c817ef19a6..c7f64b8f6f 100644 --- a/src/cpp/src/lm_encoding.hpp +++ b/src/cpp/src/lm_encoding.hpp @@ -11,6 +11,9 @@ namespace genai { class KVCacheState { std::vector state; public: + size_t num_tokens_to_trim = 0; + size_t seq_length_axis = 2; + std::vector& get_state() { return state; } @@ -20,18 +23,8 @@ class KVCacheState { } void reset_state() { - return state.clear(); - } -}; - - -struct KVCacheTrimManager -{ - size_t num_tokens_to_trim = 0; - size_t kv_cache_seq_length_axis = 2; - - void reset() { num_tokens_to_trim = 0; + state.clear(); } }; @@ -42,7 +35,7 @@ ov::genai::utils::GenerationFinishInfo get_lm_encoded_results(ov::InferRequest& std::optional rope_delta = std::nullopt, const size_t max_kv_cache_size = std::numeric_limits::max()); -void align_kv_cache_and_history(ov::genai::KVCacheTrimManager& kv_history_manager, const ov::Tensor& new_chat_tokens, KVCacheState& kv_cache_state); +void align_kv_cache_and_history(const ov::Tensor& new_chat_tokens, KVCacheState& kv_cache_state); TokenizedInputs get_chat_encoded_input(const ov::Tensor& new_chat_tokens, KVCacheState& kv_cache_state); diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index f0976642ca..8eb81c3f1c 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -217,10 +217,6 @@ ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const st return m_impl->get_inputs_embeds(prompt, images, metrics); } -ov::Tensor InputsEmbedder::get_input_embeddings(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) { - return m_impl->get_input_embeddings(prompt, images, metrics); -} - std::pair> InputsEmbedder::get_position_ids(const size_t inputs_embeds_size, const size_t history_size) { return m_impl->get_position_ids(inputs_embeds_size, history_size); } @@ -233,10 +229,6 @@ KVCacheState& InputsEmbedder::get_kv_cache_state() { return m_impl->get_kv_cache_state(); } -bool InputsEmbedder::should_reset_kv_cache() const { - return m_impl->should_reset_kv_cache(); -} - Tokenizer InputsEmbedder::get_tokenizer() const { return m_impl->get_tokenizer(); } diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index 5c271bd1a8..4dfea5211a 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -35,9 +35,6 @@ class InputsEmbedder { // compute input embedding for prompt and multiple images ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics); - // computes input embedding for prompt and multiple images and saves input_embeddings size - ov::Tensor get_input_embeddings(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics); - // compute position ids for language model input std::pair> get_position_ids(const size_t inputs_embeds_size, const size_t history_size); @@ -50,9 +47,6 @@ class InputsEmbedder { // get reflection of tokens contained in the kv cache KVCacheState& get_kv_cache_state(); - // returns true, if we need to remove full kv cache, in that case it's needed to reset it instead of manually updating - bool should_reset_kv_cache() const; - // starts chat and 
adds optional system_message to chat history void start_chat(const std::string& system_message); diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index ee8daf9ff2..fedd2b4c2c 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -42,8 +42,6 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ std::shared_ptr m_inputs_embedder; // Axis num in kv cache from m_language model, which contains information about history len size_t m_kv_cache_seq_length_axis = 2; - // Load pipeline time - float m_load_time_ms = 0; // Component for applying sampling to lm outputs Sampler m_sampler; size_t m_max_kv_cache_size = std::numeric_limits::max(); @@ -185,7 +183,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ m_inputs_embedder->set_apply_chat_template_status(generation_config.apply_chat_template); auto start_get_inputs_embeds = std::chrono::steady_clock::now(); - ov::Tensor inputs_embeds = m_inputs_embedder->get_input_embeddings(prompt, rgbs, perf_metrics); + ov::Tensor inputs_embeds = m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics); auto end_get_inputs_embeds = std::chrono::steady_clock::now(); KVCacheState& kv_cache_state = m_inputs_embedder->get_kv_cache_state(); From a7693261e665e7e674219c2f1e30585306a41951 Mon Sep 17 00:00:00 2001 From: sbalandi Date: Mon, 3 Mar 2025 16:10:26 +0000 Subject: [PATCH 5/8] update according to comments --- src/cpp/src/llm_pipeline_stateful.cpp | 5 ++-- src/cpp/src/llm_pipeline_stateful.hpp | 2 +- src/cpp/src/lm_encoding.cpp | 7 ++--- src/cpp/src/lm_encoding.hpp | 27 +++---------------- src/cpp/src/utils.cpp | 22 ++++++++++++--- src/cpp/src/utils.hpp | 24 ++++++++++++++++- .../src/visual_language/inputs_embedder.cpp | 22 ++++++++------- .../src/visual_language/inputs_embedder.hpp | 16 +++-------- .../src/visual_language/minicpm/classes.cpp | 8 ++++++ .../src/visual_language/minicpm/classes.hpp | 3 +++ .../visual_language/phi3_vision/classes.cpp | 10 +++++++ .../visual_language/phi3_vision/classes.hpp | 3 +++ src/cpp/src/visual_language/pipeline.cpp | 15 ++++------- 13 files changed, 96 insertions(+), 68 deletions(-) diff --git a/src/cpp/src/llm_pipeline_stateful.cpp b/src/cpp/src/llm_pipeline_stateful.cpp index 6b418a51b3..ec5c9e897e 100644 --- a/src/cpp/src/llm_pipeline_stateful.cpp +++ b/src/cpp/src/llm_pipeline_stateful.cpp @@ -281,11 +281,10 @@ EncodedResults StatefulLLMPipeline::generate( "but you have '" + std::to_string(num_inputs) + "' inputs"); if (is_chat_conversation) { - if (m_kv_cache_state.get_state().empty() || m_use_full_chat_history) + if (m_use_full_chat_history) reset_kv_state(); else - ov::genai::utils::trim_kv_cache(m_model_runner, m_kv_cache_state.num_tokens_to_trim, - m_kv_cache_state.seq_length_axis, m_adapter_controller); + ov::genai::utils::trim_kv_cache(m_model_runner, m_kv_cache_state, m_adapter_controller); } size_t kv_cache_len = 0; diff --git a/src/cpp/src/llm_pipeline_stateful.hpp b/src/cpp/src/llm_pipeline_stateful.hpp index 0f2b5075a5..04c510a0c9 100644 --- a/src/cpp/src/llm_pipeline_stateful.hpp +++ b/src/cpp/src/llm_pipeline_stateful.hpp @@ -27,7 +27,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { size_t m_max_kv_cache_size = std::numeric_limits::max(); bool m_is_npu = false; // include reflection of tokens contained in the kv cache and amount of tokens, which are needed to trim from kv cache on the next step of chat - KVCacheState m_kv_cache_state; + utils::KVCacheState 
m_kv_cache_state; void reset_kv_state(); public: diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp index 1985ea5f1e..c3d649a95b 100644 --- a/src/cpp/src/lm_encoding.cpp +++ b/src/cpp/src/lm_encoding.cpp @@ -82,7 +82,7 @@ ov::genai::utils::GenerationFinishInfo get_lm_encoded_results( Sampler& sampler, std::vector sequence_groups, std::optional position_ids, - KVCacheState& kv_cache_state, + utils::KVCacheState& kv_cache_state, std::optional m_embedding, std::optional rope_delta, const size_t max_kv_cache_size @@ -298,7 +298,7 @@ ov::genai::utils::GenerationFinishInfo get_lm_encoded_results( } -TokenizedInputs get_chat_encoded_input(const ov::Tensor& new_chat_tokens, KVCacheState& kv_cache_state) { +TokenizedInputs get_chat_encoded_input(const ov::Tensor& new_chat_tokens, utils::KVCacheState& kv_cache_state) { TokenizedInputs encoded_input; size_t kv_cache_len = kv_cache_state.get_state().size(); if (kv_cache_len == 0) { @@ -325,7 +325,7 @@ TokenizedInputs get_chat_encoded_input(const ov::Tensor& new_chat_tokens, KVCach } -void align_kv_cache_and_history(const ov::Tensor& new_chat_tokens, KVCacheState& kv_cache_state) { +void align_kv_cache_and_history(const ov::Tensor& new_chat_tokens, utils::KVCacheState& kv_cache_state) { // KV cache in model already contains prompts and answers from previous iterations. // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns // token_ids = {, ...}. So if tokenizer applies only to the new prompt, @@ -345,6 +345,7 @@ void align_kv_cache_and_history(const ov::Tensor& new_chat_tokens, KVCacheState& // so generated tokens were not added to KVCacheState and num_tokens_to_trim was set to the size of the generated serquence kv_cache_state.num_tokens_to_trim = kv_cache_state.num_tokens_to_trim > 0 ? 
kv_cache_state.num_tokens_to_trim : (state.size() - first_diverse_tokens_idx); state.resize(first_diverse_tokens_idx); + kv_cache_state.reset_mem_state = state.empty(); } } // namespace genai diff --git a/src/cpp/src/lm_encoding.hpp b/src/cpp/src/lm_encoding.hpp index c7f64b8f6f..69a787713f 100644 --- a/src/cpp/src/lm_encoding.hpp +++ b/src/cpp/src/lm_encoding.hpp @@ -8,37 +8,16 @@ namespace ov { namespace genai { -class KVCacheState { - std::vector state; -public: - size_t num_tokens_to_trim = 0; - size_t seq_length_axis = 2; - - std::vector& get_state() { - return state; - } - - void add_inputs(const ov::Tensor& inputs_ids) { - std::copy_n(inputs_ids.data(), inputs_ids.get_size(), std::back_inserter(state)); - } - - void reset_state() { - num_tokens_to_trim = 0; - state.clear(); - } -}; - - ov::genai::utils::GenerationFinishInfo get_lm_encoded_results(ov::InferRequest& m_llm, const ov::Tensor& input_ids, const ov::Tensor& attention_mask, const std::shared_ptr& streamer_ptr, Sampler& sampler, std::vector sequence_groups, - std::optional position_ids, KVCacheState& m_kv_cache_state, std::optional m_embedding, + std::optional position_ids, utils::KVCacheState& m_kv_cache_state, std::optional m_embedding, std::optional rope_delta = std::nullopt, const size_t max_kv_cache_size = std::numeric_limits::max()); -void align_kv_cache_and_history(const ov::Tensor& new_chat_tokens, KVCacheState& kv_cache_state); +void align_kv_cache_and_history(const ov::Tensor& new_chat_tokens, utils::KVCacheState& kv_cache_state); -TokenizedInputs get_chat_encoded_input(const ov::Tensor& new_chat_tokens, KVCacheState& kv_cache_state); +TokenizedInputs get_chat_encoded_input(const ov::Tensor& new_chat_tokens, utils::KVCacheState& kv_cache_state); } } diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index aca1693562..9c75a537d6 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -325,13 +325,27 @@ KVAxesPosition get_kv_axes_pos(std::shared_ptr model) { return kv_pos; } -void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, size_t seq_length_axis, std::optional adapter_controller) { +void trim_kv_cache(ov::InferRequest request, KVCacheState& kv_cache_state, std::optional adapter_controller) { + if (kv_cache_state.reset_mem_state) { + if (adapter_controller) { + for(auto& state: request.query_state()) { + if(!adapter_controller->has_state_name(state.get_name())) { + state.reset(); + } + } + } else { + request.reset_state(); + } + + return; + } + // nothing to trim in this case - if (remove_from_end == 0) + if (kv_cache_state.num_tokens_to_trim == 0) return; auto states = request.query_state(); - + OPENVINO_ASSERT(states.size() > 0, "Request contains no states."); for (auto& state : states) { @@ -341,7 +355,7 @@ void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, size_t se ov::Tensor old_tensor = state.get_state(); // [BATCH_SIZE, num_kv_heads, seq_len, head_size] auto shape = old_tensor.get_shape(); - shape[seq_length_axis] -= remove_from_end; + shape[kv_cache_state.seq_length_axis] -= kv_cache_state.num_tokens_to_trim; ov::Coordinate new_shape_begin{0, 0, 0, 0}; ov::Coordinate new_shape_end{shape}; diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index 4c8453b97b..9b60875b22 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -102,7 +102,29 @@ struct KVAxesPosition { KVAxesPosition get_kv_axes_pos(std::shared_ptr model); -void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, size_t seq_length_axis, std::optional 
adapter_controller); +class KVCacheState { + std::vector state; +public: + size_t num_tokens_to_trim = 0; + size_t seq_length_axis = 2; + bool reset_mem_state = false; + + std::vector& get_state() { + return state; + } + + void add_inputs(const ov::Tensor& inputs_ids) { + std::copy_n(inputs_ids.data(), inputs_ids.get_size(), std::back_inserter(state)); + } + + void reset_state() { + reset_mem_state = false; + num_tokens_to_trim = 0; + state.clear(); + } +}; + +void trim_kv_cache(ov::InferRequest request, KVCacheState& kv_cache_state, std::optional adapter_controller); ov::Tensor push_front_inputs(const ov::Tensor& base_tensor, int64_t add_to_front); diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 8eb81c3f1c..87165f76b4 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -29,7 +29,6 @@ std::pair> InputsEmbedder::IInputsEmbedder::g void InputsEmbedder::IInputsEmbedder::start_chat(const std::string& system_message) { m_is_chat_conversation = true; - m_kv_history_trim_manager.reset(); if (!m_kv_cache_state.get_state().empty()) { m_history.clear(); m_kv_cache_state.reset_state(); @@ -40,17 +39,20 @@ void InputsEmbedder::IInputsEmbedder::start_chat(const std::string& system_messa m_history = {{{"role", "system"}, {"content", system_message}}}; } -void InputsEmbedder::IInputsEmbedder::update_chat_history(const std::string& decoded_results) { - // Tail of chat template is missing in KV cache. - // Find the tail to concatenate it with the next input prompt. - m_history.push_back({{"role", "assistant"}, {"content", decoded_results}}); - m_kv_history_trim_manager.reset(); +void InputsEmbedder::IInputsEmbedder::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) { + m_kv_cache_state.num_tokens_to_trim = 0; + if (generation_finish_status == ov::genai::GenerationStatus::CANCEL) { + // If chat generation process was cancelled by user, let's rollback to previous state of history + m_history.pop_back(); + } else { + // Tail of chat template is missing in KV cache. + // Find the tail to concatenate it with the next input prompt. 
+ m_history.push_back({{"role", "assistant"}, {"content", decoded_results}}); + } } void InputsEmbedder::IInputsEmbedder::finish_chat() { m_is_chat_conversation = false; - m_kv_history_trim_manager.reset(); - m_history.clear(); m_kv_cache_state.reset_state(); } @@ -123,7 +125,7 @@ ov::Tensor InputsEmbedder::IInputsEmbedder::apply_chat_template_tokenize(const s ov::Tensor InputsEmbedder::IInputsEmbedder::update_history(const ov::Tensor& new_chat_tokens) { ov::Tensor encoded_inputs; if (m_is_chat_conversation) { - ov::genai::align_kv_cache_and_history(m_kv_history_trim_manager, new_chat_tokens, m_kv_cache_state); + ov::genai::align_kv_cache_and_history(new_chat_tokens, m_kv_cache_state); encoded_inputs = get_chat_encoded_input(new_chat_tokens, m_kv_cache_state).input_ids; } else { encoded_inputs = new_chat_tokens; @@ -225,7 +227,7 @@ EmbeddingsModel InputsEmbedder::get_embedding_model() const { return m_impl->get_embedding_model(); } -KVCacheState& InputsEmbedder::get_kv_cache_state() { +ov::genai::utils::KVCacheState& InputsEmbedder::get_kv_cache_state() { return m_impl->get_kv_cache_state(); } diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index 4dfea5211a..5645db32a0 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -45,7 +45,7 @@ class InputsEmbedder { Tokenizer get_tokenizer() const; // get reflection of tokens contained in the kv cache - KVCacheState& get_kv_cache_state(); + utils::KVCacheState& get_kv_cache_state(); // starts chat and adds optional system_message to chat history void start_chat(const std::string& system_message); @@ -77,16 +77,12 @@ class InputsEmbedder { bool m_is_chat_conversation = false; // Chat history ChatHistory m_history; - // If sequence contains some symbols, which could be ambiguous encoded by tokenizer, we need to trim kv cache - // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history - // so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to keep in history - ov::genai::KVCacheTrimManager m_kv_history_trim_manager = {0, 2}; // True if chat template should be applied for non-chat scenario bool m_apply_chat_template = true; // Finish reason of last generation for chat scenario ov::genai::GenerationStatus m_chat_generation_finish_status = ov::genai::GenerationStatus::RUNNING; // reflection of tokens contained in the kv cache - KVCacheState m_kv_cache_state; + utils::KVCacheState m_kv_cache_state; public: virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) = 0; @@ -100,21 +96,17 @@ class InputsEmbedder { return m_tokenizer; } - KVCacheState& get_kv_cache_state() { + utils::KVCacheState& get_kv_cache_state() { return m_kv_cache_state; } - size_t get_num_tokens_to_remove_from_hist() const { - return m_kv_history_trim_manager.num_tokens_to_trim; - } - void set_apply_chat_template_status(bool apply_chat_template) { m_apply_chat_template = apply_chat_template; } virtual void start_chat(const std::string& system_message); - void update_chat_history(const std::string& decoded_results); + virtual void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status); virtual void finish_chat(); diff --git a/src/cpp/src/visual_language/minicpm/classes.cpp 
b/src/cpp/src/visual_language/minicpm/classes.cpp index 697ea64e50..e24cd22438 100644 --- a/src/cpp/src/visual_language/minicpm/classes.cpp +++ b/src/cpp/src/visual_language/minicpm/classes.cpp @@ -667,6 +667,14 @@ ov::Tensor InputsEmbedderMiniCPM::get_inputs_embeds(const std::string& prompt, c return inputs_embeds; } +void InputsEmbedderMiniCPM::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) { + IInputsEmbedder::update_chat_history(decoded_results, generation_finish_status); + if (generation_finish_status == ov::genai::GenerationStatus::CANCEL) + m_image_id = m_prev_image_id; + else + m_prev_image_id = m_image_id; +} + void InputsEmbedderMiniCPM::start_chat(const std::string& system_message) { IInputsEmbedder::start_chat(system_message); m_image_id = 0; diff --git a/src/cpp/src/visual_language/minicpm/classes.hpp b/src/cpp/src/visual_language/minicpm/classes.hpp index 0ddc160231..99e71faf44 100644 --- a/src/cpp/src/visual_language/minicpm/classes.hpp +++ b/src/cpp/src/visual_language/minicpm/classes.hpp @@ -30,6 +30,7 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { ov::Tensor m_pos_embed_cache; // Used to insert i per image (not a slice). size_t m_image_id = 0; + size_t m_prev_image_id = 0; public: InputsEmbedderMiniCPM( @@ -48,6 +49,8 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override; + void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) override; + void start_chat(const std::string& system_message) override; void finish_chat() override; diff --git a/src/cpp/src/visual_language/phi3_vision/classes.cpp b/src/cpp/src/visual_language/phi3_vision/classes.cpp index f1f094d1d6..fa5adfaf7f 100644 --- a/src/cpp/src/visual_language/phi3_vision/classes.cpp +++ b/src/cpp/src/visual_language/phi3_vision/classes.cpp @@ -471,6 +471,8 @@ ov::Tensor insert_image_placeholders(const std::vector& chunks, cons length, merged.data() + offset ); + if (tokens_per_images.empty()) + continue; offset += length; if (offset < merged_length) { std::fill_n( @@ -602,6 +604,14 @@ ov::Tensor InputsEmbedderPhi3V::get_inputs_embeds(const std::string& prompt, con return inputs_embeds; } +void InputsEmbedderPhi3V::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) { + IInputsEmbedder::update_chat_history(decoded_results, generation_finish_status); + if (generation_finish_status == ov::genai::GenerationStatus::CANCEL) + m_tokens_per_images = m_prev_tokens_per_images; + else + m_prev_tokens_per_images = m_tokens_per_images; +} + void InputsEmbedderPhi3V::start_chat(const std::string& system_message) { IInputsEmbedder::start_chat(system_message); m_tokens_per_images.clear(); diff --git a/src/cpp/src/visual_language/phi3_vision/classes.hpp b/src/cpp/src/visual_language/phi3_vision/classes.hpp index 6fd922125e..006429723a 100644 --- a/src/cpp/src/visual_language/phi3_vision/classes.hpp +++ b/src/cpp/src/visual_language/phi3_vision/classes.hpp @@ -30,6 +30,8 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override; + void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus 
generation_finish_status) override; + void start_chat(const std::string& system_message) override; void finish_chat() override; @@ -38,6 +40,7 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { ov::InferRequest m_hd_feature_transformer; ov::InferRequest m_vision_projection; std::vector m_tokens_per_images; + std::vector m_prev_tokens_per_images; }; } // namespace ov::genai diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index fedd2b4c2c..2e3e264680 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -40,8 +40,6 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ bool m_is_chat_conversation = false; // InputsEmbedder std::shared_ptr m_inputs_embedder; - // Axis num in kv cache from m_language model, which contains information about history len - size_t m_kv_cache_seq_length_axis = 2; // Component for applying sampling to lm outputs Sampler m_sampler; size_t m_max_kv_cache_size = std::numeric_limits::max(); @@ -63,7 +61,6 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ auto language_model_path = models_dir / "openvino_language_model.xml"; auto language_model = utils::singleton_core().read_model(language_model_path, {}, properties_copy); auto kv_pos = ov::genai::utils::get_kv_axes_pos(language_model); - m_kv_cache_seq_length_axis = kv_pos.seq_len; // In case user provided properties per-device // { @@ -93,7 +90,9 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ ov::genai::utils::print_compiled_model_properties(compiled_language_model, "VLM language model"); m_language = compiled_language_model.create_infer_request(); - m_kv_cache_seq_length_axis = kv_pos.seq_len; + + utils::KVCacheState& kv_cache_state = m_inputs_embedder->get_kv_cache_state(); + kv_cache_state.seq_length_axis = kv_pos.seq_len; m_language.get_tensor("attention_mask").set_shape({1, 0}); auto embedder_properties = device_propertes.empty() @@ -186,12 +185,9 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ ov::Tensor inputs_embeds = m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics); auto end_get_inputs_embeds = std::chrono::steady_clock::now(); - KVCacheState& kv_cache_state = m_inputs_embedder->get_kv_cache_state(); + utils::KVCacheState& kv_cache_state = m_inputs_embedder->get_kv_cache_state(); if (m_is_chat_conversation) - if (kv_cache_state.get_state().empty()) - m_language.reset_state(); - else - ov::genai::utils::trim_kv_cache(m_language, kv_cache_state.num_tokens_to_trim, kv_cache_state.seq_length_axis, std::nullopt); + utils::trim_kv_cache(m_language, kv_cache_state, std::nullopt); std::vector requests; size_t request_id = 0; @@ -200,7 +196,6 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ size_t history_size = m_language.get_tensor("attention_mask").get_shape().at(1) - kv_cache_state.num_tokens_to_trim; size_t inputs_embeds_size = inputs_embeds.get_shape().at(1); - std::vector tokenized_history = kv_cache_state.get_state(); ov::Tensor prompt_ids(ov::element::i64, { history_size + inputs_embeds_size }); OPENVINO_ASSERT(prompt_ids.get_size() >= tokenized_history.size(), "Prompt ids size is less than tokenized history size"); From cb2cdec12108d8621ba0dce480712d58803557e9 Mon Sep 17 00:00:00 2001 From: sbalandi Date: Tue, 4 Mar 2025 19:31:18 +0000 Subject: [PATCH 6/8] update --- src/cpp/src/lm_encoding.cpp | 3 ++- src/cpp/src/visual_language/inputs_embedder.cpp | 7 ++++--- 
 src/cpp/src/visual_language/inputs_embedder.hpp     | 4 ++--
 src/cpp/src/visual_language/minicpm/classes.cpp     | 4 ++--
 src/cpp/src/visual_language/minicpm/classes.hpp     | 2 +-
 src/cpp/src/visual_language/phi3_vision/classes.cpp | 4 ++--
 src/cpp/src/visual_language/phi3_vision/classes.hpp | 2 +-
 src/cpp/src/visual_language/pipeline.cpp            | 2 +-
 8 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp
index c3d649a95b..29bf5e4221 100644
--- a/src/cpp/src/lm_encoding.cpp
+++ b/src/cpp/src/lm_encoding.cpp
@@ -344,7 +344,8 @@ void align_kv_cache_and_history(const ov::Tensor& new_chat_tokens, utils::KVCach
     // in the case of beam_search the longest answer is in the kv cache, but the best one is needed
     // so generated tokens were not added to KVCacheState and num_tokens_to_trim was set to the size of the generated sequence
     kv_cache_state.num_tokens_to_trim = kv_cache_state.num_tokens_to_trim > 0 ? kv_cache_state.num_tokens_to_trim : (state.size() - first_diverse_tokens_idx);
-    state.resize(first_diverse_tokens_idx);
+    OPENVINO_ASSERT(state.size() >= kv_cache_state.num_tokens_to_trim, "Size of kv cache is less than the number of tokens that should be trimmed from the kv cache.");
+    state.resize(state.size() - kv_cache_state.num_tokens_to_trim);
     kv_cache_state.reset_mem_state = state.empty();
 }
diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp
index 87165f76b4..c19927b22e 100644
--- a/src/cpp/src/visual_language/inputs_embedder.cpp
+++ b/src/cpp/src/visual_language/inputs_embedder.cpp
@@ -39,11 +39,12 @@ void InputsEmbedder::IInputsEmbedder::start_chat(const std::string& system_messa
     m_history = {{{"role", "system"}, {"content", system_message}}};
 }
-void InputsEmbedder::IInputsEmbedder::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) {
+void InputsEmbedder::IInputsEmbedder::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status, size_t processed_tokens_amount) {
     m_kv_cache_state.num_tokens_to_trim = 0;
     if (generation_finish_status == ov::genai::GenerationStatus::CANCEL) {
         // If chat generation process was cancelled by user, let's rollback to previous state of history
         m_history.pop_back();
+        m_kv_cache_state.num_tokens_to_trim = processed_tokens_amount;
     } else {
         // Tail of chat template is missing in KV cache.
         // Find the tail to concatenate it with the next input prompt.
@@ -239,8 +240,8 @@ void InputsEmbedder::start_chat(const std::string& system_message) {
     return m_impl->start_chat(system_message);
 }
-void InputsEmbedder::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) {
-    return m_impl->update_chat_history(decoded_results, generation_finish_status);
+void InputsEmbedder::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status, size_t processed_tokens_amount) {
+    return m_impl->update_chat_history(decoded_results, generation_finish_status, processed_tokens_amount);
 }
 void InputsEmbedder::set_apply_chat_template_status(bool apply_chat_template) {
diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp
index 5645db32a0..d5b86cfd1f 100644
--- a/src/cpp/src/visual_language/inputs_embedder.hpp
+++ b/src/cpp/src/visual_language/inputs_embedder.hpp
@@ -51,7 +51,7 @@ class InputsEmbedder {
     void start_chat(const std::string& system_message);
     // adds currently generated text to chat history
-    void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status);
+    void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status, size_t processed_tokens_amount);
     // set the apply_chat_template flag, which determines whether chat template should be applied for non-chat scenarios
     void set_apply_chat_template_status(bool apply_chat_template);
@@ -106,7 +106,7 @@ class InputsEmbedder {
     virtual void start_chat(const std::string& system_message);
-    virtual void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status);
+    virtual void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status, size_t processed_tokens_amount);
     virtual void finish_chat();
diff --git a/src/cpp/src/visual_language/minicpm/classes.cpp b/src/cpp/src/visual_language/minicpm/classes.cpp
index e24cd22438..a5198f68c7 100644
--- a/src/cpp/src/visual_language/minicpm/classes.cpp
+++ b/src/cpp/src/visual_language/minicpm/classes.cpp
@@ -667,8 +667,8 @@ ov::Tensor InputsEmbedderMiniCPM::get_inputs_embeds(const std::string& prompt, c
     return inputs_embeds;
 }
-void InputsEmbedderMiniCPM::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) {
-    IInputsEmbedder::update_chat_history(decoded_results, generation_finish_status);
+void InputsEmbedderMiniCPM::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status, size_t processed_tokens_amount) {
+    IInputsEmbedder::update_chat_history(decoded_results, generation_finish_status, processed_tokens_amount);
     if (generation_finish_status == ov::genai::GenerationStatus::CANCEL)
         m_image_id = m_prev_image_id;
     else
diff --git a/src/cpp/src/visual_language/minicpm/classes.hpp b/src/cpp/src/visual_language/minicpm/classes.hpp
index 99e71faf44..ed23667b44 100644
--- a/src/cpp/src/visual_language/minicpm/classes.hpp
+++ b/src/cpp/src/visual_language/minicpm/classes.hpp
@@ -49,7 +49,7 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder {
     ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override;
-    void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) override;
+    void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status, size_t processed_tokens_amount) override;
     void start_chat(const std::string& system_message) override;
diff --git a/src/cpp/src/visual_language/phi3_vision/classes.cpp b/src/cpp/src/visual_language/phi3_vision/classes.cpp
index fa5adfaf7f..0086f89f55 100644
--- a/src/cpp/src/visual_language/phi3_vision/classes.cpp
+++ b/src/cpp/src/visual_language/phi3_vision/classes.cpp
@@ -604,8 +604,8 @@ ov::Tensor InputsEmbedderPhi3V::get_inputs_embeds(const std::string& prompt, con
     return inputs_embeds;
 }
-void InputsEmbedderPhi3V::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) {
-    IInputsEmbedder::update_chat_history(decoded_results, generation_finish_status);
+void InputsEmbedderPhi3V::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status, size_t processed_tokens_amount) {
+    IInputsEmbedder::update_chat_history(decoded_results, generation_finish_status, processed_tokens_amount);
     if (generation_finish_status == ov::genai::GenerationStatus::CANCEL)
         m_tokens_per_images = m_prev_tokens_per_images;
     else
diff --git a/src/cpp/src/visual_language/phi3_vision/classes.hpp b/src/cpp/src/visual_language/phi3_vision/classes.hpp
index 006429723a..3b83fb2002 100644
--- a/src/cpp/src/visual_language/phi3_vision/classes.hpp
+++ b/src/cpp/src/visual_language/phi3_vision/classes.hpp
@@ -30,7 +30,7 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder {
     ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override;
-    void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) override;
+    void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status, size_t processed_tokens_amount) override;
     void start_chat(const std::string& system_message) override;
diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp
index 2e3e264680..3da182f5c8 100644
--- a/src/cpp/src/visual_language/pipeline.cpp
+++ b/src/cpp/src/visual_language/pipeline.cpp
@@ -236,7 +236,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         std::string decoded_results = decoded.texts.at(0);
         if (m_is_chat_conversation)
-            m_inputs_embedder->update_chat_history(decoded_results, finish_info.streaming_finish_status);
+            m_inputs_embedder->update_chat_history(decoded_results, finish_info.streaming_finish_status, m_language.get_tensor("attention_mask").get_shape().at(1) - history_size);
         else
             kv_cache_state.reset_state();

From b7cbdbd7f0c3c4a619fd2d53b92ee4e8fe23e7cd Mon Sep 17 00:00:00 2001
From: sbalandi
Date: Wed, 5 Mar 2025 00:06:04 +0000
Subject: [PATCH 7/8] fix CI so that beam search and cancel with an image both work

---
 src/cpp/src/lm_encoding.cpp                     | 5 ++---
 src/cpp/src/visual_language/inputs_embedder.cpp | 4 ++++
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp
index 29bf5e4221..53d95df3f6 100644
--- a/src/cpp/src/lm_encoding.cpp
+++ b/src/cpp/src/lm_encoding.cpp
@@ -343,9 +343,8 @@ void align_kv_cache_and_history(const ov::Tensor& new_chat_tokens, utils::KVCach
     size_t first_diverse_tokens_idx = ov::genai::utils::get_first_history_difference(new_chat_tokens, state);
     // in the case of beam_search the longest answer is in the kv cache, but the best one is needed
     // so generated tokens were not added to KVCacheState and num_tokens_to_trim was set to the size of the generated sequence
-    kv_cache_state.num_tokens_to_trim = kv_cache_state.num_tokens_to_trim > 0 ? kv_cache_state.num_tokens_to_trim : (state.size() - first_diverse_tokens_idx);
-    OPENVINO_ASSERT(state.size() >= kv_cache_state.num_tokens_to_trim, "Size of kv cache is less than the number of tokens that should be trimmed from the kv cache.");
-    state.resize(state.size() - kv_cache_state.num_tokens_to_trim);
+    kv_cache_state.num_tokens_to_trim += state.size() - first_diverse_tokens_idx;
+    state.resize(first_diverse_tokens_idx);
     kv_cache_state.reset_mem_state = state.empty();
 }
diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp
index c19927b22e..46a7d24cf1 100644
--- a/src/cpp/src/visual_language/inputs_embedder.cpp
+++ b/src/cpp/src/visual_language/inputs_embedder.cpp
@@ -44,6 +44,10 @@ void InputsEmbedder::IInputsEmbedder::update_chat_history(const std::string& dec
     if (generation_finish_status == ov::genai::GenerationStatus::CANCEL) {
         // If chat generation process was cancelled by user, let's rollback to previous state of history
         m_history.pop_back();
+
+        std::vector<int64_t>& state = m_kv_cache_state.get_state();
+        state.resize(state.size() - processed_tokens_amount);
+        m_kv_cache_state.reset_mem_state = state.empty();
         m_kv_cache_state.num_tokens_to_trim = processed_tokens_amount;
     } else {
         // Tail of chat template is missing in KV cache.
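Note: the user-visible chat-cancel flow that these patches keep consistent can be sketched as follows. This is an illustrative sketch only and is not part of the patch series; the model directory and prompts are placeholders, and it assumes the openvino_genai Python API already used in tests/python_tests/test_vlm_pipeline.py (VLMPipeline, StreamingStatus, start_chat/generate/finish_chat) and that a streamer returning StreamingStatus.CANCEL cancels the current chat turn.

# Illustrative sketch (hypothetical model path): cancelling one chat turn must
# leave both the chat history and the KV cache as if that turn never happened.
import openvino_genai as ov_genai

pipe = ov_genai.VLMPipeline("./MiniCPM-V-2_6-ov", "CPU")  # placeholder model dir

def cancel_streamer(subword: str) -> ov_genai.StreamingStatus:
    # Returning CANCEL stops generation and asks the pipeline to roll back:
    # the prompt is popped from the chat history and the tokens appended for
    # this turn are trimmed from the KV cache state.
    return ov_genai.StreamingStatus.CANCEL

pipe.start_chat()
pipe.generate("Why is the Sun yellow?", streamer=cancel_streamer)  # cancelled turn
res = pipe.generate("1+1=")  # should behave as if the cancelled turn never ran
pipe.finish_chat()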
From b0305a255e0c4ce948a4e8ddd5dede1745914da7 Mon Sep 17 00:00:00 2001
From: sbalandi
Date: Thu, 6 Mar 2025 16:25:11 +0000
Subject: [PATCH 8/8] rebase + update

---
 src/cpp/src/llm_pipeline_stateful.cpp               |  1 -
 src/cpp/src/visual_language/inputs_embedder.cpp     | 12 +++++++-----
 src/cpp/src/visual_language/inputs_embedder.hpp     |  6 ++++--
 src/cpp/src/visual_language/minicpm/classes.cpp     |  4 ++--
 src/cpp/src/visual_language/minicpm/classes.hpp     |  2 +-
 src/cpp/src/visual_language/phi3_vision/classes.cpp |  5 +++--
 src/cpp/src/visual_language/phi3_vision/classes.hpp |  2 +-
 src/cpp/src/visual_language/pipeline.cpp            |  8 ++++----
 tests/python_tests/test_vlm_pipeline.py             |  8 +++++---
 9 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/src/cpp/src/llm_pipeline_stateful.cpp b/src/cpp/src/llm_pipeline_stateful.cpp
index ec5c9e897e..5050bae790 100644
--- a/src/cpp/src/llm_pipeline_stateful.cpp
+++ b/src/cpp/src/llm_pipeline_stateful.cpp
@@ -212,7 +212,6 @@ EncodedResults StatefulLLMPipeline::generate(
         reset_kv_state();
         m_model_runner.get_tensor("attention_mask").set_shape({1, 0});
         m_kv_cache_state.reset_state();
-        m_kv_history_trim_manager.reset();
     }
     auto start_time = std::chrono::steady_clock::now();
diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp
index 46a7d24cf1..2d48db0ff1 100644
--- a/src/cpp/src/visual_language/inputs_embedder.cpp
+++ b/src/cpp/src/visual_language/inputs_embedder.cpp
@@ -39,16 +39,17 @@ void InputsEmbedder::IInputsEmbedder::start_chat(const std::string& system_messa
     m_history = {{{"role", "system"}, {"content", system_message}}};
 }
-void InputsEmbedder::IInputsEmbedder::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status, size_t processed_tokens_amount) {
+void InputsEmbedder::IInputsEmbedder::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) {
     m_kv_cache_state.num_tokens_to_trim = 0;
     if (generation_finish_status == ov::genai::GenerationStatus::CANCEL) {
         // If chat generation process was cancelled by user, let's rollback to previous state of history
         m_history.pop_back();
         std::vector<int64_t>& state = m_kv_cache_state.get_state();
-        state.resize(state.size() - processed_tokens_amount);
+
+        m_kv_cache_state.num_tokens_to_trim = state.size() - m_prev_hist_length;
+        state.resize(m_prev_hist_length);
         m_kv_cache_state.reset_mem_state = state.empty();
-        m_kv_cache_state.num_tokens_to_trim = processed_tokens_amount;
     } else {
         // Tail of chat template is missing in KV cache.
         // Find the tail to concatenate it with the next input prompt.
@@ -142,6 +143,7 @@ ov::Tensor InputsEmbedder::IInputsEmbedder::update_history(const ov::Tensor& new
 ov::Tensor InputsEmbedder::IInputsEmbedder::get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics) {
     const auto new_chat_tokens = apply_chat_template_tokenize(prompt, metrics);
     auto new_input_ids = update_history(new_chat_tokens);
+    m_prev_hist_length = m_kv_cache_state.get_state().size();
     m_kv_cache_state.add_inputs(new_input_ids);
     return new_input_ids;
@@ -244,8 +246,8 @@ void InputsEmbedder::start_chat(const std::string& system_message) {
     return m_impl->start_chat(system_message);
 }
-void InputsEmbedder::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status, size_t processed_tokens_amount) {
-    return m_impl->update_chat_history(decoded_results, generation_finish_status, processed_tokens_amount);
+void InputsEmbedder::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) {
+    return m_impl->update_chat_history(decoded_results, generation_finish_status);
 }
 void InputsEmbedder::set_apply_chat_template_status(bool apply_chat_template) {
diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp
index d5b86cfd1f..5eec6cd41e 100644
--- a/src/cpp/src/visual_language/inputs_embedder.hpp
+++ b/src/cpp/src/visual_language/inputs_embedder.hpp
@@ -51,7 +51,7 @@ class InputsEmbedder {
     void start_chat(const std::string& system_message);
     // adds currently generated text to chat history
-    void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status, size_t processed_tokens_amount);
+    void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status);
     // set the apply_chat_template flag, which determines whether chat template should be applied for non-chat scenarios
     void set_apply_chat_template_status(bool apply_chat_template);
@@ -83,6 +83,8 @@ class InputsEmbedder {
     ov::genai::GenerationStatus m_chat_generation_finish_status = ov::genai::GenerationStatus::RUNNING;
     // reflection of tokens contained in the kv cache
     utils::KVCacheState m_kv_cache_state;
+    // length of attention_mask/kv cache at the beginning of generation()
+    size_t m_prev_hist_length = 0;
 public:
     virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) = 0;
@@ -106,7 +108,7 @@ class InputsEmbedder {
     virtual void start_chat(const std::string& system_message);
-    virtual void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status, size_t processed_tokens_amount);
+    virtual void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status);
     virtual void finish_chat();
diff --git a/src/cpp/src/visual_language/minicpm/classes.cpp b/src/cpp/src/visual_language/minicpm/classes.cpp
index a5198f68c7..e24cd22438 100644
--- a/src/cpp/src/visual_language/minicpm/classes.cpp
+++ b/src/cpp/src/visual_language/minicpm/classes.cpp
@@ -667,8 +667,8 @@ ov::Tensor InputsEmbedderMiniCPM::get_inputs_embeds(const std::string& prompt, c
     return inputs_embeds;
 }
-void InputsEmbedderMiniCPM::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status, size_t processed_tokens_amount) {
-    IInputsEmbedder::update_chat_history(decoded_results, generation_finish_status, processed_tokens_amount);
+void InputsEmbedderMiniCPM::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) {
+    IInputsEmbedder::update_chat_history(decoded_results, generation_finish_status);
     if (generation_finish_status == ov::genai::GenerationStatus::CANCEL)
         m_image_id = m_prev_image_id;
     else
diff --git a/src/cpp/src/visual_language/minicpm/classes.hpp b/src/cpp/src/visual_language/minicpm/classes.hpp
index ed23667b44..99e71faf44 100644
--- a/src/cpp/src/visual_language/minicpm/classes.hpp
+++ b/src/cpp/src/visual_language/minicpm/classes.hpp
@@ -49,7 +49,7 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder {
     ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override;
-    void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status, size_t processed_tokens_amount) override;
+    void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) override;
     void start_chat(const std::string& system_message) override;
diff --git a/src/cpp/src/visual_language/phi3_vision/classes.cpp b/src/cpp/src/visual_language/phi3_vision/classes.cpp
index 0086f89f55..ac0d8adbaa 100644
--- a/src/cpp/src/visual_language/phi3_vision/classes.cpp
+++ b/src/cpp/src/visual_language/phi3_vision/classes.cpp
@@ -550,6 +550,7 @@ ov::Tensor InputsEmbedderPhi3V::get_inputs_embeds(const std::string& prompt, con
     }
     ov::Tensor new_merged_tokens = insert_image_placeholders(new_chat_tokens, m_tokens_per_images);
     ov::Tensor new_tokens = update_history(new_merged_tokens);
+    m_prev_hist_length = m_kv_cache_state.get_state().size();
     m_kv_cache_state.add_inputs(new_tokens);
     std::vector tokens = drop_image_placeholders(new_tokens);
@@ -604,8 +605,8 @@ ov::Tensor InputsEmbedderPhi3V::get_inputs_embeds(const std::string& prompt, con
     return inputs_embeds;
 }
-void InputsEmbedderPhi3V::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status, size_t processed_tokens_amount) {
-    IInputsEmbedder::update_chat_history(decoded_results, generation_finish_status, processed_tokens_amount);
+void InputsEmbedderPhi3V::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) {
+    IInputsEmbedder::update_chat_history(decoded_results, generation_finish_status);
     if (generation_finish_status == ov::genai::GenerationStatus::CANCEL)
         m_tokens_per_images = m_prev_tokens_per_images;
     else
diff --git a/src/cpp/src/visual_language/phi3_vision/classes.hpp b/src/cpp/src/visual_language/phi3_vision/classes.hpp
index 3b83fb2002..006429723a 100644
--- a/src/cpp/src/visual_language/phi3_vision/classes.hpp
+++ b/src/cpp/src/visual_language/phi3_vision/classes.hpp
@@ -30,7 +30,7 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder {
     ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override;
-    void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status, size_t processed_tokens_amount) override;
+    void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) override;
     void start_chat(const std::string& system_message) override;
diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp
index 3da182f5c8..57f325d7fa 100644
--- a/src/cpp/src/visual_language/pipeline.cpp
+++ b/src/cpp/src/visual_language/pipeline.cpp
@@ -90,9 +90,6 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         ov::genai::utils::print_compiled_model_properties(compiled_language_model, "VLM language model");
         m_language = compiled_language_model.create_infer_request();
-
-        utils::KVCacheState& kv_cache_state = m_inputs_embedder->get_kv_cache_state();
-        kv_cache_state.seq_length_axis = kv_pos.seq_len;
         m_language.get_tensor("attention_mask").set_shape({1, 0});
         auto embedder_properties = device_propertes.empty()
@@ -102,6 +99,9 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         m_tokenizer = m_inputs_embedder->get_tokenizer();
         m_embedding = m_inputs_embedder->get_embedding_model();
+        utils::KVCacheState& kv_cache_state = m_inputs_embedder->get_kv_cache_state();
+        kv_cache_state.seq_length_axis = kv_pos.seq_len;
+
         // If eos_token_id was not provided, take value
         if (m_generation_config.eos_token_id == -1) {
             m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id());
@@ -236,7 +236,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         std::string decoded_results = decoded.texts.at(0);
         if (m_is_chat_conversation)
-            m_inputs_embedder->update_chat_history(decoded_results, finish_info.streaming_finish_status, m_language.get_tensor("attention_mask").get_shape().at(1) - history_size);
+            m_inputs_embedder->update_chat_history(decoded_results, finish_info.streaming_finish_status);
         else
             kv_cache_state.reset_state();
diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py
index bd887c889d..575c848388 100644
--- a/tests/python_tests/test_vlm_pipeline.py
+++ b/tests/python_tests/test_vlm_pipeline.py
@@ -224,9 +224,11 @@ def test_vlm_with_scheduler_vs_default(config, cache):
 @pytest.mark.nightly
 @pytest.mark.parametrize("model_id", model_ids)
 @pytest.mark.parametrize("system_message", ["", "You are a helpful assistant."])
-@pytest.mark.parametrize("iteration_images", [[image_links_for_testing[0], image_links_for_testing[0]], [image_links_for_testing[0], image_links_for_testing[2], image_links_for_testing[0]],
-                                              [image_links_for_testing[1], image_links_for_testing[1]], [image_links_for_testing[1], image_links_for_testing[1], image_links_for_testing[1]],
-                                              [image_links_for_testing[2], image_links_for_testing[1]], [image_links_for_testing[2], image_links_for_testing[0], image_links_for_testing[1]]])
+@pytest.mark.parametrize("iteration_images", [[image_links_for_testing[0], image_links_for_testing[0]],  # generation with text input only
+                                              [image_links_for_testing[0], image_links_for_testing[2], image_links_for_testing[0]],  # combination of generations with text input and text + image input, empty image first
+                                              [image_links_for_testing[2], image_links_for_testing[1]],  # generation with text + image input
+                                              [image_links_for_testing[2], image_links_for_testing[0], image_links_for_testing[1]]]  # combination of generations with text input and text + image input, image input first
+                                              )
 def test_vlm_pipeline_chat(model_id, system_message, iteration_images, cache):
     def streamer(word: str) -> bool:
         nonlocal result_from_streamer