Commit 2e5c2a1
[LLM] [NPU] StaticLLMPipeline: Import model if blob file is present in Stateful pipeline (#1494)

Depends on openvinotoolkit/openvino#27915
Depends on openvinotoolkit/openvino#28420

Co-authored-by: Anatoliy Talamanov <anatoliy.talamanov@intel.com>

1 parent 34ca4e0 · commit 2e5c2a1
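
Editor's note: with this change, the Stateful NPU pipeline can skip on-device compilation by importing a precompiled blob. A minimal sketch of how the new options might be passed from user code, assuming the public ov::genai::LLMPipeline entry point; the USE_BLOB and BLOB_PATH keys come from the diff below, while the model directory and blob location are hypothetical:

    #include <iostream>
    #include "openvino/genai/llm_pipeline.hpp"

    int main() {
        // Opt into the new import path; both keys are consumed (popped) by the
        // constructor in the diff below and never reach the NPU plugin.
        ov::AnyMap properties = {
            {"USE_BLOB", true},
            {"BLOB_PATH", std::string("models/openvino_model.blob")}  // hypothetical path
        };
        ov::genai::LLMPipeline pipe("models", "NPU", properties);
        std::cout << pipe.generate("What is OpenVINO?") << std::endl;
        return 0;
    }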

File tree

src/cpp/src/llm_pipeline_static.cpp
src/cpp/src/llm_pipeline_static.hpp

2 files changed: +39 −11 lines changed

src/cpp/src/llm_pipeline_static.cpp (+35 −11)
@@ -690,19 +690,38 @@ namespace static_llm {
 StatefulLLMPipeline::StatefulLLMPipeline(
     const std::filesystem::path& models_path,
     const ov::genai::Tokenizer& tokenizer,
-    const std::string&,
+    const std::string& device,
     const ov::AnyMap& config
 ) : LLMPipelineImplBase(tokenizer,
                         utils::from_config_json_if_exists(models_path)),
     m_sampler(m_tokenizer) {
-
-    auto model = genai::utils::singleton_core().read_model(models_path / "openvino_model.xml", {}, config);
-    ModelConfigDesc model_desc = get_modeldesc_from_json(models_path / "config.json");
     ov::AnyMap properties = config;
-
-    auto compiled = setupAndCompileModel(model, model_desc, properties);
-    m_request = compiled->create_infer_request();
-    m_sampler.set_seed(m_generation_config.rng_seed);
+    const auto use_blob = pop_or_default(properties, "USE_BLOB", false);
+    if (use_blob) {
+        auto blob_path = pop_or_default(properties, "BLOB_PATH", std::string{});
+        if (blob_path.empty()) {
+            blob_path = (models_path / "openvino_model.blob").string();
+        }
+        if (!std::filesystem::exists(blob_path)) {
+            OPENVINO_THROW("Blob file is not found at: " + blob_path);
+        }
+        std::ifstream fin(blob_path, std::ios::in | std::ios::binary);
+        if (!fin.is_open()) {
+            OPENVINO_THROW("Blob file can't be opened: " + blob_path);
+        }
+        auto compiled = genai::utils::singleton_core().import_model(fin, device, {});
+        m_max_prompt_len = compiled.get_property("NPUW_LLM_MAX_PROMPT_LEN").as<uint32_t>();
+        auto min_resp_len = compiled.get_property("NPUW_LLM_MIN_RESPONSE_LEN").as<uint32_t>();
+        m_kvcache_total = m_max_prompt_len + min_resp_len;
+        m_request = compiled.create_infer_request();
+    } else {
+        auto model = genai::utils::singleton_core().read_model(models_path / "openvino_model.xml", {}, config);
+        ModelConfigDesc model_desc = get_modeldesc_from_json(models_path / "config.json");
+        ov::AnyMap properties = config;
+        auto compiled = setupAndCompileModel(model, model_desc, properties);
+        m_request = compiled->create_infer_request();
+        m_sampler.set_seed(m_generation_config.rng_seed);
+    }
 }
 
 
@@ -721,11 +740,9 @@ StatefulLLMPipeline::StatefulLLMPipeline(
     m_sampler.set_seed(m_generation_config.rng_seed);
 }
 
-std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
-    const std::shared_ptr<ov::Model>& model,
+void StatefulLLMPipeline::updateStatefulConfig(
     const ModelConfigDesc& model_desc,
     ov::AnyMap& pipeline_config) {
-
     const uint32_t kMaxPromptLen = pop_int_and_cast(pipeline_config, "MAX_PROMPT_LEN").value_or(1024u);
     const uint32_t kMinResponseLen = pop_int_and_cast(pipeline_config, "MIN_RESPONSE_LEN").value_or(128u);
     m_max_prompt_len = kMaxPromptLen;
@@ -755,6 +772,13 @@ std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
 
     // Replace CACHE_DIR option if NPUW is enabled
     set_npuw_cache_dir(pipeline_config);
+}
+
+std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
+    const std::shared_ptr<ov::Model>& model,
+    const ModelConfigDesc& model_desc,
+    ov::AnyMap& pipeline_config) {
+    updateStatefulConfig(model_desc, pipeline_config);
 
     return std::make_shared<ov::CompiledModel>(genai::utils::singleton_core().compile_model(model, "NPU", pipeline_config));
 }
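
Editor's note: the import branch above is the read side of OpenVINO's standard export/import pair. A hedged sketch of producing a blob with ov::CompiledModel::export_model(std::ostream&) (standard OpenVINO runtime API); this commit itself adds no export path, and the empty property map is a placeholder — a real stateful-LLM blob would carry the NPUW_LLM_* configuration that setupAndCompileModel() sets, which the import path reads back via get_property():

    // Sketch only: compile once for NPU, then serialize the result so the
    // USE_BLOB / BLOB_PATH path above can import_model() it on later runs.
    #include <fstream>
    #include "openvino/runtime/core.hpp"

    int main() {
        ov::Core core;
        // Placeholder property map; not the pipeline's real NPUW_LLM_* config.
        auto compiled = core.compile_model("models/openvino_model.xml", "NPU", ov::AnyMap{});
        std::ofstream fout("models/openvino_model.blob", std::ios::out | std::ios::binary);
        compiled.export_model(fout);  // counterpart of the import_model() call in the diff
        return 0;
    }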

src/cpp/src/llm_pipeline_static.hpp (+4 −0)
@@ -59,6 +59,10 @@ class StatefulLLMPipeline : public LLMPipelineImplBase {
         const ModelConfigDesc& model_desc,
         ov::AnyMap& pipeline_config);
 
+    void updateStatefulConfig(
+        const ModelConfigDesc& model_desc,
+        ov::AnyMap& pipeline_config);
+
     DecodedResults generate(
         StringInputs inputs,
         OptionalGenerationConfig generation_config,
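
Editor's note: the header gains the updateStatefulConfig declaration so option handling can run separately from compilation, which lets the blob branch bypass compilation entirely. Both constructor branches rely on consume-and-erase option helpers; below is a sketch of the pop_or_default contract as the .cpp diff uses it. The real helper lives in the repository's utils and may differ, so this reconstruction is an assumption:

    #include <string>
    #include "openvino/core/any.hpp"

    // Assumed reconstruction of pop_or_default(): return the typed value for
    // `key` and erase it from the map (so USE_BLOB / BLOB_PATH are not
    // forwarded to the device plugin), or return `default_value` if absent.
    template <typename T>
    T pop_or_default(ov::AnyMap& config, const std::string& key, const T& default_value) {
        auto it = config.find(key);
        if (it == config.end()) {
            return default_value;
        }
        T value = it->second.as<T>();
        config.erase(it);
        return value;
    }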
