Enable LM part of VLM to work on NPU

TolyaTalamanov · TolyaTalamanov · commit 17d92697d44b · 2025-03-03T12:51:16.000Z
diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
@@ -118,9 +118,9 @@ ov::genai::LLMPipeline::LLMPipeline(
     const std::string& device,
     const ov::AnyMap& user_properties) {
     auto start_time = std::chrono::steady_clock::now();
-
     auto [properties, attention_backend] = extract_attention_backend(user_properties);
 
+
     // If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues
     if (explicitly_requires_paged_attention(properties)) {
         auto [device_properties, scheduler_config] = utils::extract_scheduler_config(properties, get_latency_oriented_scheduler_config());
diff --git a/src/cpp/src/llm_pipeline_stateful.cpp b/src/cpp/src/llm_pipeline_stateful.cpp
@@ -71,7 +71,7 @@ StatefulLLMPipeline::StatefulLLMPipeline(
     if (m_is_npu) {
         utils::KVDesc kv_desc;
         std::tie(compiled_model, kv_desc) = utils::compile_decoder_for_npu(
-            model, *filtered_properties, kv_pos, models_path
+            model, *filtered_properties, kv_pos, models_path / "openvino_model.xml"
         );
         m_max_kv_cache_size = kv_desc.max_prompt_len + kv_desc.min_response_len;
     } else {
diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
@@ -116,7 +116,9 @@ StatefulLLMPipeline::StatefulLLMPipeline(
 ) : LLMPipelineImplBase(tokenizer, generation_config),
     m_sampler(m_tokenizer) {
     auto kv_pos = ov::genai::utils::get_kv_axes_pos(model);
-    auto [compiled, kv_desc] = utils::compile_decoder_for_npu(model, properties, kv_pos, models_path);
+    auto [compiled, kv_desc] = utils::compile_decoder_for_npu(
+        model, properties, kv_pos, models_path / "openvino_model.xml"
+    );
     m_max_prompt_len = kv_desc.max_prompt_len;
     m_kvcache_total = kv_desc.max_prompt_len + kv_desc.min_response_len;
     m_request = compiled.create_infer_request();
diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp
@@ -492,7 +492,7 @@ compile_decoder_for_npu(const std::shared_ptr<ov::Model>& model,
             properties[ov::cache_mode.name()] = CacheMode::OPTIMIZE_SPEED;
             compiled = ov::genai::utils::singleton_core().compile_model(model, "NPU", properties);
         } else {
-            compiled = ov::genai::utils::singleton_core().compile_model(model_path / "openvino_model.xml", "NPU", properties);
+            compiled = ov::genai::utils::singleton_core().compile_model(model_path, "NPU", properties);
         }
         // Also export compiled model if required
         if (export_blob) {
diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp
@@ -44,6 +44,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
     size_t m_kv_cache_seq_length_axis = 2;
     // Component for applying sampling to lm outputs
     Sampler m_sampler;
+    size_t m_max_kv_cache_size = std::numeric_limits<size_t>::max();
+    bool m_is_npu = false;
 public:
     VLMPipelineImpl(
         const std::filesystem::path& models_dir,
@@ -54,23 +56,53 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
             utils::from_config_json_if_exists<GenerationConfig>(
                 models_dir, "generation_config.json"
             )
-        } {
-        m_inputs_embedder = std::make_shared<InputsEmbedder>(models_dir, device, properties);
-
-        m_tokenizer = m_inputs_embedder->get_tokenizer();
-        m_embedding = m_inputs_embedder->get_embedding_model();
-
-        auto compiled_language_model = utils::singleton_core().compile_model(
-            models_dir / "openvino_language_model.xml", device, properties
+        },
+        m_is_chat_conversation{false} {
+        m_is_npu = device.find("NPU") != std::string::npos;
+        auto properties_copy = properties;
+        auto language_model_path = models_dir / "openvino_language_model.xml";
+        auto language_model =  utils::singleton_core().read_model(language_model_path, {}, properties_copy);
+        auto kv_pos = ov::genai::utils::get_kv_axes_pos(language_model);
+        m_kv_cache_seq_length_axis = kv_pos.seq_len;
+
+        // User provided properties in the following format:
+        // {
+        //     ov::device::properties("NPU", ...),
+        //     ov::device::properties("CPU", ...)
+        // }
+        auto device_propertes = utils::pop_or_default<ov::AnyMap>(
+            properties_copy, ov::device::properties.name(), { }
         );
-        utils::print_compiled_model_properties(compiled_language_model, "VLM language model");
-        auto language_model = compiled_language_model.get_runtime_model();
-        m_kv_cache_seq_length_axis = utils::get_kv_axes_pos(language_model).seq_len;
+        // Otherwise, the same properties are used for all models
+        auto lm_properties = device_propertes.empty()
+            ? properties_copy
+            : utils::pop_or_default<ov::AnyMap>(device_propertes, device, {});
+
+        ov::CompiledModel compiled_language_model;
+        auto embedder_device = device;
+        if (m_is_npu) {
+            embedder_device = "CPU";
+            utils::KVDesc kv_desc;
+            std::tie(compiled_language_model, kv_desc) = utils::compile_decoder_for_npu(
+                language_model, lm_properties, kv_pos, language_model_path
+            );
+            m_max_kv_cache_size = kv_desc.max_prompt_len + kv_desc.min_response_len;
+        } else {
+            compiled_language_model = utils::singleton_core().compile_model(language_model, device, lm_properties);
+        }
+        ov::genai::utils::print_compiled_model_properties(compiled_language_model, "VLM language model");
 
         m_language = compiled_language_model.create_infer_request();
-
+        m_kv_cache_seq_length_axis = utils::get_kv_axes_pos(language_model).seq_len;
         m_language.get_tensor("attention_mask").set_shape({1, 0});
 
+        auto embedder_properties = device_propertes.empty()
+            ? properties_copy
+            : utils::pop_or_default<ov::AnyMap>(device_propertes, embedder_device, {});
+        m_inputs_embedder = std::make_shared<InputsEmbedder>(models_dir, embedder_device, embedder_properties);
+        m_tokenizer = m_inputs_embedder->get_tokenizer();
+        m_embedding = m_inputs_embedder->get_embedding_model();
+
         // If eos_token_id was not provided, take value
         if (m_generation_config.eos_token_id == -1) {
             m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id());
@@ -80,7 +112,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         m_sampler.set_seed(m_generation_config.rng_seed);
     }
 
-    
+
     VLMPipelineImpl(
         const ModelsMap& models_map,
         const Tokenizer& tokenizer,
@@ -90,6 +122,10 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         const GenerationConfig& generation_config
     ) :
         m_generation_config{generation_config} {
+        m_is_npu = device.find("NPU") != std::string::npos;
+        // This constructor is not supported on NPU (see message), so the check must pass only when the device is NOT NPU.
+        OPENVINO_ASSERT(!m_is_npu && "VLMPipeline initialization from string isn't supported for NPU device");
+
         m_inputs_embedder = std::make_shared<InputsEmbedder>(models_map, tokenizer, config_dir_path, device, properties);
 
         m_tokenizer = m_inputs_embedder->get_tokenizer();
@@ -179,9 +215,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
             m_sampler.set_seed(generation_config.rng_seed);
         }
 
-        utils::GenerationFinishInfo finish_info = get_lm_encoded_results(m_language, inputs_embeds, new_atten_mask, streamer_ptr, m_sampler, requests,
-                                                                                               position_ids, kv_cache_state, m_embedding, rope_delta);
-
+        ov::genai::utils::GenerationFinishInfo finish_info = ov::genai::get_lm_encoded_results(m_language, inputs_embeds, new_atten_mask, streamer_ptr, m_sampler, requests,
+                                                                                               position_ids, kv_cache_state, m_embedding, rope_delta, m_max_kv_cache_size);
         EncodedResults& encoded_result = finish_info.results;
 
         auto decode_start_time = std::chrono::steady_clock::now();
@@ -208,7 +243,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         res_raw_counters.generate_durations.emplace_back(PerfMetrics::get_microsec(generate_end_time - generate_start_time));
         res_raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_end_time - decode_start_time));
         res_raw_counters.tokenization_durations.insert(res_raw_counters.tokenization_durations.end(), raw_counters.tokenization_durations.begin(), raw_counters.tokenization_durations.end());
-        
+
         // VLM specific perf metrics
         decoded.perf_metrics.vlm_raw_metrics.prepare_embeddings_durations.emplace_back(PerfMetrics::get_microsec(end_get_inputs_embeds - start_get_inputs_embeds));
 
@@ -220,6 +255,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
     }
 
     void start_chat(const std::string& system_message) override {
+        OPENVINO_ASSERT(!m_is_npu && "start_chat() isn't supported in VLMPipeline for NPU device");
         m_is_chat_conversation = true;
         bool have_state = 0 != m_language.get_tensor("attention_mask").get_size();
         if (have_state) {
@@ -232,6 +268,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
     }
 
     void finish_chat() override {
+        OPENVINO_ASSERT(!m_is_npu && "finish_chat() isn't supported in VLMPipeline for NPU device");
         m_is_chat_conversation = false;
         // Resetting state may be slow.
         m_language.reset_state();
@@ -276,8 +313,8 @@ VLMPipeline::VLMPipeline(
 ) {
     auto start_time = std::chrono::steady_clock::now();
 
-    if (properties.find(scheduler_config.name()) != properties.end() || 
-        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || 
+    if (properties.find(scheduler_config.name()) != properties.end() ||
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
         properties.find(prompt_lookup.name()) != properties.end()) {
         auto [plugin_config, scheduler_config] = utils::extract_scheduler_config(properties);
         m_pimpl = std::make_unique<VLMContinuousBatchingAdapter>(models_dir, scheduler_config, device, plugin_config);
@@ -298,8 +335,8 @@ VLMPipeline::VLMPipeline(
     const GenerationConfig& generation_config
 ) {
     auto start_time = std::chrono::steady_clock::now();
-    if (properties.find(scheduler_config.name()) != properties.end() || 
-        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || 
+    if (properties.find(scheduler_config.name()) != properties.end() ||
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
         properties.find(prompt_lookup.name()) != properties.end()) {
         auto [plugin_config, scheduler_config] = utils::extract_scheduler_config(properties);
         m_pimpl = std::make_unique<VLMContinuousBatchingAdapter>(models_map, tokenizer, config_dir_path, scheduler_config, device, plugin_config, generation_config);

Original file line number	Diff line number	Diff line change
`@@ -492,7 +492,7 @@ compile_decoder_for_npu(const std::shared_ptr<ov::Model>& model,`
`492`	`492`	`properties[ov::cache_mode.name()] = CacheMode::OPTIMIZE_SPEED;`
`493`	`493`	`compiled = ov::genai::utils::singleton_core().compile_model(model, "NPU", properties);`
`494`	`494`	`} else {`
`495`		`- compiled = ov::genai::utils::singleton_core().compile_model(model_path / "openvino_model.xml", "NPU", properties);`
	`495`	`+ compiled = ov::genai::utils::singleton_core().compile_model(model_path, "NPU", properties);`
`496`	`496`	`}`
`497`	`497`	`// Also export compiled model if required`
`498`	`498`	`if (export_blob) {`