Commit 2e08d78 (parent: cba97fd)

Switch NPU LLM execution to ov::genai::StatefulLLMPipeline (#1677)
Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>

10 files changed: +342 −756 lines

src/cpp/src/llm_pipeline.cpp (+18 −9)
@@ -128,7 +128,9 @@ ov::genai::LLMPipeline::LLMPipeline(
     }
 
     if (m_pimpl == nullptr && device == "NPU") {
-        m_pimpl = static_llm::LLMPipelineFactory::create(models_path, tokenizer, device, properties);
+        m_pimpl = properties.count("STATIC_PIPELINE")
+            ? static_llm::LLMPipelineFactory::create(models_path, tokenizer, properties)
+            : std::make_unique<StatefulLLMPipeline>(models_path, tokenizer, device, properties);
     }
 
     // try to call CB adapter one more time, but with safe guard to silent exception
@@ -166,7 +168,9 @@ ov::genai::LLMPipeline::LLMPipeline(
     }
 
     if (m_pimpl == nullptr && device == "NPU") {
-        m_pimpl = static_llm::LLMPipelineFactory::create(models_path, device, properties);
+        m_pimpl = properties.count("STATIC_PIPELINE")
+            ? static_llm::LLMPipelineFactory::create(models_path, properties)
+            : std::make_unique<StatefulLLMPipeline>(models_path, device, properties);
     }
 
     // try to call CB adapter one more time, but with safe guard to silent exception
@@ -208,13 +212,18 @@ ov::genai::LLMPipeline::LLMPipeline(
     }
 
     if (m_pimpl == nullptr && device == "NPU") {
-        m_pimpl = static_llm::LLMPipelineFactory::create(
-            utils::singleton_core().read_model(model_str, weights_tensor),
-            tokenizer,
-            device,
-            properties,
-            generation_config
-        );
+        m_pimpl = properties.count("STATIC_PIPELINE")
+            ? static_llm::LLMPipelineFactory::create(
+                  utils::singleton_core().read_model(model_str, weights_tensor),
+                  tokenizer,
+                  properties,
+                  generation_config)
+            : std::make_unique<StatefulLLMPipeline>(
+                  utils::singleton_core().read_model(model_str, weights_tensor),
+                  tokenizer,
+                  device,
+                  properties,
+                  generation_config);
     }
 
     // try to call CB adapter one more time, but with safe guard to silent exception

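Note on the dispatch above: only the presence of the "STATIC_PIPELINE" key is checked (properties.count), not its value. A minimal usage sketch of what this means for callers, assuming the public ov::genai::LLMPipeline constructor that takes a properties map; the model directory name is hypothetical:

    #include <iostream>

    #include "openvino/genai/llm_pipeline.hpp"

    int main() {
        // After this change, "NPU" selects the stateful pipeline by default.
        ov::genai::LLMPipeline stateful_pipe("TinyLlama-1.1B-Chat-v1.0", "NPU");

        // Any value for "STATIC_PIPELINE" opts back into the static pipeline,
        // since only properties.count("STATIC_PIPELINE") is checked.
        ov::genai::LLMPipeline static_pipe(
            "TinyLlama-1.1B-Chat-v1.0", "NPU",
            ov::AnyMap{{"STATIC_PIPELINE", true}});

        ov::genai::GenerationConfig config;
        config.max_new_tokens = 64;
        std::cout << stateful_pipe.generate("OpenVINO is", config) << '\n';
    }
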
src/cpp/src/llm_pipeline_stateful.cpp (+41 −8)
@@ -17,7 +17,17 @@ StatefulLLMPipeline::StatefulLLMPipeline(
     const ov::genai::Tokenizer& tokenizer,
     OptionalGenerationConfig generation_config)
     : LLMPipelineImplBase(tokenizer, generation_config.has_value() ? *generation_config : GenerationConfig()),
-    m_model_runner(request) {}
+    m_model_runner(request) {
+    auto compiled_model = m_model_runner.get_compiled_model();
+    auto execution_devices = compiled_model.get_property(ov::execution_devices);
+    if (execution_devices[0].find("NPU") != std::string::npos) {
+        OPENVINO_ASSERT(execution_devices.size() == 1u);
+        m_is_npu = true;
+        const auto max_prompt_len = compiled_model.get_property("NPUW_LLM_MAX_PROMPT_LEN").as<uint32_t>();
+        const auto min_response_len = compiled_model.get_property("NPUW_LLM_MIN_RESPONSE_LEN").as<uint32_t>();
+        m_max_kv_cache_size = max_prompt_len + min_response_len;
+    }
+}
 
 StatefulLLMPipeline::StatefulLLMPipeline(
     const std::filesystem::path& models_path,
@@ -29,30 +39,44 @@ StatefulLLMPipeline::StatefulLLMPipeline(
     tokenizer,
     device,
     properties,
-    utils::from_config_json_if_exists(models_path)
+    utils::from_config_json_if_exists(models_path),
+    models_path
 } {}
 
 StatefulLLMPipeline::StatefulLLMPipeline(
     const std::shared_ptr<ov::Model>& model,
     const ov::genai::Tokenizer& tokenizer,
     const std::string& device,
     const ov::AnyMap& properties,
-    const ov::genai::GenerationConfig& generation_config)
+    const ov::genai::GenerationConfig& generation_config,
+    const std::filesystem::path& models_path)
     : LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) {
     utils::apply_slice_before_matmul_transformation(model);
+    auto kv_pos = ov::genai::utils::get_kv_axes_pos(model);
 
-    if (device.find("NPU") != std::string::npos)
+    if (device.find("NPU") != std::string::npos) {
+        m_is_npu = true;
         m_use_full_chat_history = true;
+    }
 
     if (!m_use_full_chat_history)
-        m_kv_history_trim_manager.kv_cache_seq_length_axis = ov::genai::utils::get_kv_axes_pos(model).seq_len;
+        m_kv_history_trim_manager.kv_cache_seq_length_axis = kv_pos.seq_len;
 
     auto filtered_properties = extract_adapters_from_properties(properties, &m_generation_config.adapters);
     if (m_generation_config.adapters) {
         m_generation_config.adapters->set_tensor_name_prefix("base_model.model.");
         m_adapter_controller = AdapterController(model, *m_generation_config.adapters, device);  // TODO: Make the prefix name configurable
     }
-    ov::CompiledModel compiled_model = utils::singleton_core().compile_model(model, device, *filtered_properties);
+    ov::CompiledModel compiled_model;
+    if (m_is_npu) {
+        utils::KVDesc kv_desc;
+        std::tie(compiled_model, kv_desc) = utils::compile_decoder_for_npu(
+            model, *filtered_properties, kv_pos, models_path
+        );
+        m_max_kv_cache_size = kv_desc.max_prompt_len + kv_desc.min_response_len;
+    } else {
+        compiled_model = utils::singleton_core().compile_model(model, device, *filtered_properties);
+    }
     m_model_runner = compiled_model.create_infer_request();
     ov::genai::utils::print_compiled_model_properties(compiled_model, "Stateful LLM model");
 
@@ -225,12 +249,21 @@ EncodedResults StatefulLLMPipeline::generate(
     config.set_eos_token_id(m_generation_config.eos_token_id);
     config.validate();
 
+    auto batch_size = input_ids.get_shape().at(0);
+
+    if (m_is_npu) {
+        OPENVINO_ASSERT(batch_size == 1u, "Currently only batch size equal to 1 is supported for NPU device!");
+        OPENVINO_ASSERT(config.is_greedy_decoding() || config.is_multinomial(),
+            "Currently only greedy and multinomial decoding are supported for NPU device!");
+        OPENVINO_ASSERT(config.num_return_sequences == 1u,
+            "Currently only \"num_return_sequences\" equal to 1 is supported for NPU device!");
+    }
+
     // Stateful pipeline does not provide logprobs for prompt tokens
     OPENVINO_ASSERT(config.echo == false, "Echo is not supported in the stateful pipeline");
 
     std::shared_ptr<StreamerBase> streamer_ptr = ov::genai::utils::create_streamer(streamer, m_tokenizer);
 
-    auto batch_size = input_ids.get_shape().at(0);
     OPENVINO_ASSERT(streamer_ptr == nullptr || batch_size == 1 && config.num_return_sequences == 1 &&
         (config.is_greedy_decoding() || config.is_multinomial()),
         "Currently streaming is possible only with batch size=1 and only for greedy or multinomial decoding");
@@ -314,7 +347,7 @@ EncodedResults StatefulLLMPipeline::generate(
     }
 
     ov::genai::utils::GenerationFinishInfo finish_info = get_lm_encoded_results(m_model_runner, input_ids, concatenated_attention_mask, streamer_ptr, m_sampler,
-                                                                                requests, position_ids, m_kv_cache_state, std::nullopt, std::nullopt);
+                                                                                requests, position_ids, m_kv_cache_state, std::nullopt, std::nullopt, m_max_kv_cache_size);
     ov::genai::EncodedResults& result = finish_info.results;
     m_chat_generation_finish_status = finish_info.streaming_finish_status;

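The two NPUW_LLM_* properties read back from the compiled model above bound the KV cache: m_max_kv_cache_size = max_prompt_len + min_response_len. A sketch of sizing this at construction time; the property names are taken from the diff, but forwarding them through the pipeline's properties map (and the model directory name) are assumptions:

    #include <iostream>

    #include "openvino/genai/llm_pipeline.hpp"

    int main() {
        // Assumption: NPUW_LLM_* options are forwarded to NPU model compilation.
        // The pipeline would then cap the KV cache at 1024 + 128 = 1152 tokens.
        ov::AnyMap properties{
            {"NPUW_LLM_MAX_PROMPT_LEN", 1024u},    // longest prompt, in tokens
            {"NPUW_LLM_MIN_RESPONSE_LEN", 128u},   // guaranteed response budget
        };
        ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "NPU", properties);

        // generate() on NPU asserts batch size 1, num_return_sequences == 1,
        // and greedy or multinomial decoding only.
        ov::genai::GenerationConfig config;
        config.max_new_tokens = 100;  // greedy decoding is the default mode
        config.num_return_sequences = 1;
        std::cout << pipe.generate("An NPU is", config) << '\n';
    }
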
src/cpp/src/llm_pipeline_stateful.hpp (+6 −1)
@@ -2,6 +2,8 @@
 // SPDX-License-Identifier: Apache-2.0
 
 
+#include <limits>
+
 #include "llm_pipeline_base.hpp"
 #include "lm_encoding.hpp"
 #include "sampler.hpp"
@@ -26,6 +28,8 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
     ov::genai::GenerationStatus m_chat_generation_finish_status = ov::genai::GenerationStatus::RUNNING;
     // if True, full history will be used as prompt on each chat generation
     bool m_use_full_chat_history = false;
+    size_t m_max_kv_cache_size = std::numeric_limits<size_t>::max();
+    bool m_is_npu = false;
     // reflection of tokens contained in the kv cache
     KVCacheState m_kv_cache_state;
 
@@ -50,7 +54,8 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
     const ov::genai::Tokenizer& tokenizer,
     const std::string& device,
     const ov::AnyMap& config,
-    const ov::genai::GenerationConfig& generation_config
+    const ov::genai::GenerationConfig& generation_config,
+    const std::filesystem::path& models_path = {}
 );
 
 StatefulLLMPipeline(

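The new m_max_kv_cache_size member defaults to std::numeric_limits<size_t>::max(), so on non-NPU devices the extra argument passed to get_lm_encoded_results imposes no effective bound; only the NPU paths overwrite it with max_prompt_len + min_response_len. A minimal standalone illustration of that sentinel pattern (a sketch, not the library's code):

    #include <cstddef>
    #include <limits>

    // Sketch: a "no limit" sentinel default, overridden only when a device
    // (here, NPU) imposes a fixed KV-cache budget.
    struct KvBudget {
        std::size_t max_kv_cache_size = std::numeric_limits<std::size_t>::max();

        bool would_exceed(std::size_t generated_tokens) const {
            // Effectively never true at the sentinel default.
            return generated_tokens >= max_kv_cache_size;
        }
    };

    int main() {
        KvBudget cpu_budget;                // unbounded: sentinel default
        KvBudget npu_budget{1024u + 128u};  // max_prompt_len + min_response_len
        return cpu_budget.would_exceed(4096) ? 1
             : npu_budget.would_exceed(4096) ? 2
             : 0;
    }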