@@ -17,7 +17,17 @@ StatefulLLMPipeline::StatefulLLMPipeline(
     const ov::genai::Tokenizer& tokenizer,
     OptionalGenerationConfig generation_config)
     : LLMPipelineImplBase(tokenizer, generation_config.has_value() ? *generation_config : GenerationConfig()),
-    m_model_runner(request) {}
+    m_model_runner(request) {
+    auto compiled_model = m_model_runner.get_compiled_model();
+    auto execution_devices = compiled_model.get_property(ov::execution_devices);
+    if (execution_devices[0].find("NPU") != std::string::npos) {
+        OPENVINO_ASSERT(execution_devices.size() == 1u);
+        m_is_npu = true;
+        const auto max_prompt_len = compiled_model.get_property("NPUW_LLM_MAX_PROMPT_LEN").as<uint32_t>();
+        const auto min_response_len = compiled_model.get_property("NPUW_LLM_MIN_RESPONSE_LEN").as<uint32_t>();
+        m_max_kv_cache_size = max_prompt_len + min_response_len;
+    }
+}
 
 StatefulLLMPipeline::StatefulLLMPipeline(
     const std::filesystem::path& models_path,
@@ -29,30 +39,44 @@ StatefulLLMPipeline::StatefulLLMPipeline(
         tokenizer,
         device,
         properties,
-        utils::from_config_json_if_exists(models_path)
+        utils::from_config_json_if_exists(models_path),
+        models_path
     } {}
 
 StatefulLLMPipeline::StatefulLLMPipeline(
     const std::shared_ptr<ov::Model>& model,
     const ov::genai::Tokenizer& tokenizer,
     const std::string& device,
     const ov::AnyMap& properties,
-    const ov::genai::GenerationConfig& generation_config)
+    const ov::genai::GenerationConfig& generation_config,
+    const std::filesystem::path& models_path)
     : LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) {
     utils::apply_slice_before_matmul_transformation(model);
+    auto kv_pos = ov::genai::utils::get_kv_axes_pos(model);
 
-    if (device.find("NPU") != std::string::npos)
+    if (device.find("NPU") != std::string::npos) {
+        m_is_npu = true;
         m_use_full_chat_history = true;
+    }
 
     if (!m_use_full_chat_history)
-        m_kv_history_trim_manager.kv_cache_seq_length_axis = ov::genai::utils::get_kv_axes_pos(model).seq_len;
+        m_kv_history_trim_manager.kv_cache_seq_length_axis = kv_pos.seq_len;
 
     auto filtered_properties = extract_adapters_from_properties(properties, &m_generation_config.adapters);
     if (m_generation_config.adapters) {
         m_generation_config.adapters->set_tensor_name_prefix("base_model.model.");
         m_adapter_controller = AdapterController(model, *m_generation_config.adapters, device);  // TODO: Make the prefix name configurable
     }
-    ov::CompiledModel compiled_model = utils::singleton_core().compile_model(model, device, *filtered_properties);
+    ov::CompiledModel compiled_model;
+    if (m_is_npu) {
+        utils::KVDesc kv_desc;
+        std::tie(compiled_model, kv_desc) = utils::compile_decoder_for_npu(
+            model, *filtered_properties, kv_pos, models_path
+        );
+        m_max_kv_cache_size = kv_desc.max_prompt_len + kv_desc.min_response_len;
+    } else {
+        compiled_model = utils::singleton_core().compile_model(model, device, *filtered_properties);
+    }
     m_model_runner = compiled_model.create_infer_request();
     ov::genai::utils::print_compiled_model_properties(compiled_model, "Stateful LLM model");
 
@@ -225,12 +249,21 @@ EncodedResults StatefulLLMPipeline::generate(
         config.set_eos_token_id(m_generation_config.eos_token_id);
     config.validate();
 
+    auto batch_size = input_ids.get_shape().at(0);
+
+    if (m_is_npu) {
+        OPENVINO_ASSERT(batch_size == 1u, "Currently only batch size equal to 1 is supported for NPU device!");
+        OPENVINO_ASSERT(config.is_greedy_decoding() || config.is_multinomial(),
+            "Currently only greedy and multinomial decoding are supported for NPU device!");
+        OPENVINO_ASSERT(config.num_return_sequences == 1u,
+            "Currently only \"num_return_sequences\" equal to 1 is supported for NPU device!");
+    }
+
     // Stateful pipeline does not provide logprobs for prompt tokens
     OPENVINO_ASSERT(config.echo == false, "Echo is not supported in the stateful pipeline");
 
     std::shared_ptr<StreamerBase> streamer_ptr = ov::genai::utils::create_streamer(streamer, m_tokenizer);
 
-    auto batch_size = input_ids.get_shape().at(0);
     OPENVINO_ASSERT(streamer_ptr == nullptr || batch_size == 1 && config.num_return_sequences == 1 &&
         (config.is_greedy_decoding() || config.is_multinomial()),
         "Currently streaming is possible only with batch size=1 and only for greedy or multinomial decoding");
@@ -314,7 +347,7 @@ EncodedResults StatefulLLMPipeline::generate(
     }
 
     ov::genai::utils::GenerationFinishInfo finish_info = get_lm_encoded_results(m_model_runner, input_ids, concatenated_attention_mask, streamer_ptr, m_sampler,
-                                                                                requests, position_ids, m_kv_cache_state, std::nullopt, std::nullopt);
+                                                                                requests, position_ids, m_kv_cache_state, std::nullopt, std::nullopt, m_max_kv_cache_size);
     ov::genai::EncodedResults& result = finish_info.results;
     m_chat_generation_finish_status = finish_info.streaming_finish_status;
 
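For context on what this diff enables: on NPU the decoder is compiled with a fixed maximum prompt length and a minimum response length, and their sum becomes the KV-cache capacity (m_max_kv_cache_size) that is later passed to get_lm_encoded_results. Below is a minimal usage sketch of the public API under assumed settings; the MAX_PROMPT_LEN / MIN_RESPONSE_LEN property names, the model directory, and the prompt are illustrative assumptions and are not taken from this commit.

// Hypothetical usage sketch (assumptions, not part of this commit): running
// the stateful LLM pipeline on NPU with static prompt/response limits.
#include <iostream>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    // Assumed NPU configuration: the KV-cache capacity would then be
    // 1024 + 128 tokens, mirroring max_prompt_len + min_response_len
    // as computed in the constructor above.
    ov::AnyMap pipeline_config = {
        {"MAX_PROMPT_LEN", 1024u},
        {"MIN_RESPONSE_LEN", 128u}
    };
    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "NPU", pipeline_config);

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 100;  // greedy decoding, batch size 1, num_return_sequences == 1
    std::cout << pipe.generate("The Sun is yellow because", config) << std::endl;
    return 0;
}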