@@ -690,19 +690,38 @@ namespace static_llm {
 StatefulLLMPipeline::StatefulLLMPipeline(
     const std::filesystem::path& models_path,
     const ov::genai::Tokenizer& tokenizer,
-    const std::string&,
+    const std::string& device,
     const ov::AnyMap& config
 ) : LLMPipelineImplBase(tokenizer,
                         utils::from_config_json_if_exists(models_path)),
     m_sampler(m_tokenizer) {
-
-    auto model = genai::utils::singleton_core().read_model(models_path / "openvino_model.xml", {}, config);
-    ModelConfigDesc model_desc = get_modeldesc_from_json(models_path / "config.json");
     ov::AnyMap properties = config;
-
-    auto compiled = setupAndCompileModel(model, model_desc, properties);
-    m_request = compiled->create_infer_request();
-    m_sampler.set_seed(m_generation_config.rng_seed);
+    const auto use_blob = pop_or_default(properties, "USE_BLOB", false);
+    if (use_blob) {
+        auto blob_path = pop_or_default(properties, "BLOB_PATH", std::string{});
+        if (blob_path.empty()) {
+            blob_path = (models_path / "openvino_model.blob").string();
+        }
+        if (!std::filesystem::exists(blob_path)) {
+            OPENVINO_THROW("Blob file is not found at: " + blob_path);
+        }
+        std::ifstream fin(blob_path, std::ios::in | std::ios::binary);
+        if (!fin.is_open()) {
+            OPENVINO_THROW("Blob file can't be opened: " + blob_path);
+        }
+        auto compiled = genai::utils::singleton_core().import_model(fin, device, {});
+        m_max_prompt_len = compiled.get_property("NPUW_LLM_MAX_PROMPT_LEN").as<uint32_t>();
+        auto min_resp_len = compiled.get_property("NPUW_LLM_MIN_RESPONSE_LEN").as<uint32_t>();
+        m_kvcache_total = m_max_prompt_len + min_resp_len;
+        m_request = compiled.create_infer_request();
+    } else {
+        auto model = genai::utils::singleton_core().read_model(models_path / "openvino_model.xml", {}, config);
+        ModelConfigDesc model_desc = get_modeldesc_from_json(models_path / "config.json");
+        ov::AnyMap properties = config;
+        auto compiled = setupAndCompileModel(model, model_desc, properties);
+        m_request = compiled->create_infer_request();
+        m_sampler.set_seed(m_generation_config.rng_seed);
+    }
 }
 
 
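For context on how the new branch is reached, here is a caller-side sketch (not part of this change) of passing the USE_BLOB / BLOB_PATH entries. It assumes the ov::AnyMap handed to ov::genai::LLMPipeline is forwarded unchanged to this static NPU pipeline; the "model_dir" paths are hypothetical.

// Caller-side sketch (assumption: these properties reach StatefulLLMPipeline as-is).
#include <iostream>
#include <string>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    ov::AnyMap properties = {
        {"USE_BLOB", true},                                              // take the import_model() path above
        {"BLOB_PATH", std::string("model_dir/openvino_model.blob")}      // optional: defaults to <models_path>/openvino_model.blob
    };
    ov::genai::LLMPipeline pipe("model_dir", "NPU", properties);

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 32;
    std::string result = pipe.generate("What is OpenVINO?", config);
    std::cout << result << std::endl;
    return 0;
}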
@@ -721,11 +740,9 @@ StatefulLLMPipeline::StatefulLLMPipeline(
     m_sampler.set_seed(m_generation_config.rng_seed);
 }
 
-std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
-    const std::shared_ptr<ov::Model>& model,
+void StatefulLLMPipeline::updateStatefulConfig(
     const ModelConfigDesc& model_desc,
     ov::AnyMap& pipeline_config) {
-
     const uint32_t kMaxPromptLen = pop_int_and_cast(pipeline_config, "MAX_PROMPT_LEN").value_or(1024u);
     const uint32_t kMinResponseLen = pop_int_and_cast(pipeline_config, "MIN_RESPONSE_LEN").value_or(128u);
     m_max_prompt_len = kMaxPromptLen;
@@ -755,6 +772,13 @@ std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
 
     // Replace CACHE_DIR option if NPUW is enabled
     set_npuw_cache_dir(pipeline_config);
+}
+
+std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
+    const std::shared_ptr<ov::Model>& model,
+    const ModelConfigDesc& model_desc,
+    ov::AnyMap& pipeline_config) {
+    updateStatefulConfig(model_desc, pipeline_config);
 
     return std::make_shared<ov::CompiledModel>(genai::utils::singleton_core().compile_model(model, "NPU", pipeline_config));
 }
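The import branch above expects a blob that already exposes NPUW_LLM_MAX_PROMPT_LEN / NPUW_LLM_MIN_RESPONSE_LEN as queryable properties. A minimal export sketch with the stock OpenVINO API follows; it assumes the NPUW_LLM_* keys are accepted by compile_model in this form and that export_model()/import_model() round-trip on the same NPU driver, so treat it as an illustration rather than the supported export flow.

// Export sketch (assumptions: NPUW_LLM_* keys accepted as shown; the blob is
// re-imported on the same device/driver that produced it).
#include <fstream>
#include "openvino/openvino.hpp"

int main() {
    ov::Core core;
    ov::AnyMap props = {
        {"NPUW_LLM_MAX_PROMPT_LEN", 1024u},   // mirrors the MAX_PROMPT_LEN default above
        {"NPUW_LLM_MIN_RESPONSE_LEN", 128u}   // mirrors the MIN_RESPONSE_LEN default above
    };
    auto compiled = core.compile_model("model_dir/openvino_model.xml", "NPU", props);

    std::ofstream blob("model_dir/openvino_model.blob", std::ios::out | std::ios::binary);
    compiled.export_model(blob);  // this is the stream import_model() reads when USE_BLOB is set
    return 0;
}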