Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

StaticLLMPipeline: Simplify compile_model call logic #1915

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 3 additions & 7 deletions src/cpp/src/llm_pipeline_stateful.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,15 @@ StatefulLLMPipeline::StatefulLLMPipeline(
tokenizer,
device,
properties,
utils::from_config_json_if_exists(models_path),
models_path
utils::from_config_json_if_exists(models_path)
} {}

StatefulLLMPipeline::StatefulLLMPipeline(
const std::shared_ptr<ov::Model>& model,
const ov::genai::Tokenizer& tokenizer,
const std::string& device,
const ov::AnyMap& properties,
const ov::genai::GenerationConfig& generation_config,
const std::filesystem::path& models_path)
const ov::genai::GenerationConfig& generation_config)
: LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) {
utils::apply_slice_before_matmul_transformation(model);
auto kv_pos = ov::genai::utils::get_kv_axes_pos(model);
Expand All @@ -70,9 +68,7 @@ StatefulLLMPipeline::StatefulLLMPipeline(
ov::CompiledModel compiled_model;
if (m_is_npu) {
utils::KVDesc kv_desc;
std::tie(compiled_model, kv_desc) = utils::compile_decoder_for_npu(
model, *filtered_properties, kv_pos, models_path / "openvino_model.xml"
);
std::tie(compiled_model, kv_desc) = utils::compile_decoder_for_npu(model, *filtered_properties, kv_pos);
m_max_kv_cache_size = kv_desc.max_prompt_len + kv_desc.min_response_len;
} else {
compiled_model = utils::singleton_core().compile_model(model, device, *filtered_properties);
Expand Down
3 changes: 1 addition & 2 deletions src/cpp/src/llm_pipeline_stateful.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
const ov::genai::Tokenizer& tokenizer,
const std::string& device,
const ov::AnyMap& config,
const ov::genai::GenerationConfig& generation_config,
const std::filesystem::path& models_path = {}
const ov::genai::GenerationConfig& generation_config
);

StatefulLLMPipeline(
Expand Down
16 changes: 5 additions & 11 deletions src/cpp/src/llm_pipeline_static.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,23 +102,19 @@ StatefulLLMPipeline::StatefulLLMPipeline(
): StatefulLLMPipeline(
genai::utils::singleton_core().read_model(models_path / "openvino_model.xml", {}, config),
tokenizer, config,
utils::from_config_json_if_exists(models_path),
models_path
utils::from_config_json_if_exists(models_path)
) {
}

StatefulLLMPipeline::StatefulLLMPipeline(
const std::shared_ptr<ov::Model>& model,
const ov::genai::Tokenizer& tokenizer,
const ov::AnyMap& properties,
const ov::genai::GenerationConfig& generation_config,
const std::filesystem::path& models_path
const ov::genai::GenerationConfig& generation_config
) : LLMPipelineImplBase(tokenizer, generation_config),
m_sampler(m_tokenizer) {
auto kv_pos = ov::genai::utils::get_kv_axes_pos(model);
auto [compiled, kv_desc] = utils::compile_decoder_for_npu(
model, properties, kv_pos, models_path / "openvino_model.xml"
);
auto [compiled, kv_desc] = utils::compile_decoder_for_npu(model, properties, kv_pos);
m_max_prompt_len = kv_desc.max_prompt_len;
m_kvcache_total = kv_desc.max_prompt_len + kv_desc.min_response_len;
m_request = compiled.create_infer_request();
Expand Down Expand Up @@ -358,16 +354,14 @@ LLMPipelineFactory::create(const std::filesystem::path& models_path,
std::unique_ptr<LLMPipelineImplBase> LLMPipelineFactory::create(const std::shared_ptr<ov::Model>& model,
const ov::genai::Tokenizer& tokenizer,
const ov::AnyMap& properties,
const ov::genai::GenerationConfig& generation_config,
const std::filesystem::path& models_path) {
const ov::genai::GenerationConfig& generation_config) {
auto properties_copy = properties;
const auto pipeline_mode = str_to_pipeline(utils::pop_or_default(properties_copy, "STATIC_PIPELINE", std::string("STATEFUL")));
if (pipeline_mode == StaticPipelineKind::STATEFUL) {
return std::make_unique<ov::genai::static_llm::StatefulLLMPipeline>(model,
tokenizer,
properties_copy,
generation_config,
models_path);
generation_config);
}
OPENVINO_ASSERT(false);
}
Expand Down
6 changes: 2 additions & 4 deletions src/cpp/src/llm_pipeline_static.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ struct LLMPipelineFactory {
static std::unique_ptr<LLMPipelineImplBase> create(const std::shared_ptr<ov::Model>& model,
const ov::genai::Tokenizer& tokenizer,
const ov::AnyMap& properties,
const ov::genai::GenerationConfig& generation_config,
const std::filesystem::path& models_path = {});
const ov::genai::GenerationConfig& generation_config);
};

class StatefulLLMPipeline : public LLMPipelineImplBase {
Expand All @@ -39,8 +38,7 @@ class StatefulLLMPipeline : public LLMPipelineImplBase {
const std::shared_ptr<ov::Model>& model,
const ov::genai::Tokenizer& tokenizer,
const ov::AnyMap& properties,
const ov::genai::GenerationConfig& generation_config,
const std::filesystem::path& path = {}
const ov::genai::GenerationConfig& generation_config
);

DecodedResults generate(
Expand Down
15 changes: 2 additions & 13 deletions src/cpp/src/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -418,8 +418,7 @@ void print_compiled_model_properties(ov::CompiledModel& compiled_Model, const ch
std::pair<ov::CompiledModel, KVDesc>
compile_decoder_for_npu(const std::shared_ptr<ov::Model>& model,
const ov::AnyMap& config,
const KVAxesPosition& kv_pos,
const std::filesystem::path& model_path) {
const KVAxesPosition& kv_pos) {
ov::CompiledModel compiled;
ov::AnyMap properties = config;
KVDesc kv_desc;
Expand All @@ -443,17 +442,7 @@ compile_decoder_for_npu(const std::shared_ptr<ov::Model>& model,
kv_desc.max_prompt_len = pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(1024u);
kv_desc.min_response_len = pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(128u);
update_npu_config(properties, model, kv_pos, kv_desc);
auto cache_mode = get_option<CacheMode>(config, ov::cache_mode.name());
        // NB: Select OPTIMIZE_SPEED when model_path isn't provided
if ((cache_mode.has_value() && *cache_mode == CacheMode::OPTIMIZE_SPEED)) {
compiled = ov::genai::utils::singleton_core().compile_model(model, "NPU", properties);
} else if (model_path.empty()) {
// Set config to OPTIMIZE_SPEED
properties[ov::cache_mode.name()] = CacheMode::OPTIMIZE_SPEED;
compiled = ov::genai::utils::singleton_core().compile_model(model, "NPU", properties);
} else {
compiled = ov::genai::utils::singleton_core().compile_model(model_path, "NPU", properties);
}
compiled = ov::genai::utils::singleton_core().compile_model(model, "NPU", properties);
// Also export compiled model if required
if (export_blob) {
if (blob_path.empty()) {
Expand Down
3 changes: 1 addition & 2 deletions src/cpp/src/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,7 @@ struct KVDesc {

std::pair<ov::CompiledModel, KVDesc> compile_decoder_for_npu(const std::shared_ptr<ov::Model>& model,
const ov::AnyMap& config,
const KVAxesPosition& kv_pos,
const std::filesystem::path& path = {});
const KVAxesPosition& kv_pos);

/// @brief SharedOptional is a wrapper around a reference to an existing object and an optional shared alternative value.
/// The difference from std::optional is that the default state is not empty and contains a reference to an existing object outside the class.
Expand Down
4 changes: 1 addition & 3 deletions src/cpp/src/visual_language/pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
if (m_is_npu) {
embedder_device = "CPU";
utils::KVDesc kv_desc;
std::tie(compiled_language_model, kv_desc) = utils::compile_decoder_for_npu(
language_model, lm_properties, kv_pos, language_model_path
);
std::tie(compiled_language_model, kv_desc) = utils::compile_decoder_for_npu(language_model, lm_properties, kv_pos);
m_max_kv_cache_size = kv_desc.max_prompt_len + kv_desc.min_response_len;
} else {
compiled_language_model = utils::singleton_core().compile_model(language_model, device, lm_properties);
Expand Down
Loading