@@ -44,6 +44,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
44
44
size_t m_kv_cache_seq_length_axis = 2 ;
45
45
// Component for applying sampling to lm outputs
46
46
Sampler m_sampler;
47
+ size_t m_max_kv_cache_size = std::numeric_limits<size_t >::max();
48
+ bool m_is_npu = false ;
47
49
public:
48
50
VLMPipelineImpl (
49
51
const std::filesystem::path& models_dir,
@@ -54,23 +56,53 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
54
56
utils::from_config_json_if_exists<GenerationConfig>(
55
57
models_dir, " generation_config.json"
56
58
)
57
- } {
58
- m_inputs_embedder = std::make_shared<InputsEmbedder>(models_dir, device, properties);
59
-
60
- m_tokenizer = m_inputs_embedder->get_tokenizer ();
61
- m_embedding = m_inputs_embedder->get_embedding_model ();
62
-
63
- auto compiled_language_model = utils::singleton_core ().compile_model (
64
- models_dir / " openvino_language_model.xml" , device, properties
59
+ },
60
+ m_is_chat_conversation{false } {
61
+ m_is_npu = device.find (" NPU" ) != std::string::npos;
62
+ auto properties_copy = properties;
63
+ auto language_model_path = models_dir / " openvino_language_model.xml" ;
64
+ auto language_model = utils::singleton_core ().read_model (language_model_path, {}, properties_copy);
65
+ auto kv_pos = ov::genai::utils::get_kv_axes_pos (language_model);
66
+ m_kv_cache_seq_length_axis = kv_pos.seq_len ;
67
+
68
+ // User provided properties in the following format:
69
+ // {
70
+ // ov::device::properties("NPU", ...),
71
+ // ov::device::properties("CPU", ...)
72
+ // }
73
+ auto device_propertes = utils::pop_or_default<ov::AnyMap>(
74
+ properties_copy, ov::device::properties.name (), { }
65
75
);
66
- utils::print_compiled_model_properties (compiled_language_model, " VLM language model" );
67
- auto language_model = compiled_language_model.get_runtime_model ();
68
- m_kv_cache_seq_length_axis = utils::get_kv_axes_pos (language_model).seq_len ;
76
+ // Otherwise, the same properties are used for all models
77
+ auto lm_properties = device_propertes.empty ()
78
+ ? properties_copy
79
+ : utils::pop_or_default<ov::AnyMap>(device_propertes, device, {});
80
+
81
+ ov::CompiledModel compiled_language_model;
82
+ auto embedder_device = device;
83
+ if (m_is_npu) {
84
+ embedder_device = " CPU" ;
85
+ utils::KVDesc kv_desc;
86
+ std::tie (compiled_language_model, kv_desc) = utils::compile_decoder_for_npu (
87
+ language_model, lm_properties, kv_pos, language_model_path
88
+ );
89
+ m_max_kv_cache_size = kv_desc.max_prompt_len + kv_desc.min_response_len ;
90
+ } else {
91
+ compiled_language_model = utils::singleton_core ().compile_model (language_model, device, lm_properties);
92
+ }
93
+ ov::genai::utils::print_compiled_model_properties (compiled_language_model, " VLM language model" );
69
94
70
95
m_language = compiled_language_model.create_infer_request ();
71
-
96
+ m_kv_cache_seq_length_axis = utils::get_kv_axes_pos (language_model). seq_len ;
72
97
m_language.get_tensor (" attention_mask" ).set_shape ({1 , 0 });
73
98
99
+ auto embedder_properties = device_propertes.empty ()
100
+ ? properties_copy
101
+ : utils::pop_or_default<ov::AnyMap>(device_propertes, embedder_device, {});
102
+ m_inputs_embedder = std::make_shared<InputsEmbedder>(models_dir, embedder_device, embedder_properties);
103
+ m_tokenizer = m_inputs_embedder->get_tokenizer ();
104
+ m_embedding = m_inputs_embedder->get_embedding_model ();
105
+
74
106
// If eos_token_id was not provided, take value
75
107
if (m_generation_config.eos_token_id == -1 ) {
76
108
m_generation_config.set_eos_token_id (m_tokenizer.get_eos_token_id ());
@@ -80,7 +112,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
80
112
m_sampler.set_seed (m_generation_config.rng_seed );
81
113
}
82
114
83
-
115
+
84
116
VLMPipelineImpl (
85
117
const ModelsMap& models_map,
86
118
const Tokenizer& tokenizer,
@@ -90,6 +122,10 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
90
122
const GenerationConfig& generation_config
91
123
) :
92
124
m_generation_config{generation_config} {
125
+ m_is_npu = device.find (" NPU" ) != std::string::npos;
126
+ OPENVINO_ASSERT (!m_is_npu, // must fail only when an NPU device is requested, not for every other device
127
+ " VLMPipeline initialization from string isn't supported for NPU device" );
128
+
93
129
m_inputs_embedder = std::make_shared<InputsEmbedder>(models_map, tokenizer, config_dir_path, device, properties);
94
130
95
131
m_tokenizer = m_inputs_embedder->get_tokenizer ();
@@ -179,9 +215,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
179
215
m_sampler.set_seed (generation_config.rng_seed );
180
216
}
181
217
182
- utils::GenerationFinishInfo finish_info = get_lm_encoded_results (m_language, inputs_embeds, new_atten_mask, streamer_ptr, m_sampler, requests,
183
- position_ids, kv_cache_state, m_embedding, rope_delta);
184
-
218
+ ov::genai::utils::GenerationFinishInfo finish_info = ov::genai::get_lm_encoded_results (m_language, inputs_embeds, new_atten_mask, streamer_ptr, m_sampler, requests,
219
+ position_ids, kv_cache_state, m_embedding, rope_delta, m_max_kv_cache_size);
185
220
EncodedResults& encoded_result = finish_info.results ;
186
221
187
222
auto decode_start_time = std::chrono::steady_clock::now ();
@@ -208,7 +243,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
208
243
res_raw_counters.generate_durations .emplace_back (PerfMetrics::get_microsec (generate_end_time - generate_start_time));
209
244
res_raw_counters.detokenization_durations .emplace_back (PerfMetrics::get_microsec (decode_end_time - decode_start_time));
210
245
res_raw_counters.tokenization_durations .insert (res_raw_counters.tokenization_durations .end (), raw_counters.tokenization_durations .begin (), raw_counters.tokenization_durations .end ());
211
-
246
+
212
247
// VLM specific perf metrics
213
248
decoded.perf_metrics .vlm_raw_metrics .prepare_embeddings_durations .emplace_back (PerfMetrics::get_microsec (end_get_inputs_embeds - start_get_inputs_embeds));
214
249
@@ -220,6 +255,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
220
255
}
221
256
222
257
void start_chat (const std::string& system_message) override {
258
+ OPENVINO_ASSERT (!m_is_npu, " start_chat() isn't supported in VLMPipeline for NPU device" ); // message passed as the assert explanation, not as part of the condition
223
259
m_is_chat_conversation = true ;
224
260
bool have_state = 0 != m_language.get_tensor (" attention_mask" ).get_size ();
225
261
if (have_state) {
@@ -232,6 +268,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
232
268
}
233
269
234
270
void finish_chat () override {
271
+ OPENVINO_ASSERT (!m_is_npu, " finish_chat() isn't supported in VLMPipeline for NPU device" ); // message passed as the assert explanation, not as part of the condition
235
272
m_is_chat_conversation = false ;
236
273
// Resetting state may be slow.
237
274
m_language.reset_state ();
@@ -276,8 +313,8 @@ VLMPipeline::VLMPipeline(
276
313
) {
277
314
auto start_time = std::chrono::steady_clock::now ();
278
315
279
- if (properties.find (scheduler_config.name ()) != properties.end () ||
280
- properties.find (utils::DRAFT_MODEL_ARG_NAME) != properties.end () ||
316
+ if (properties.find (scheduler_config.name ()) != properties.end () ||
317
+ properties.find (utils::DRAFT_MODEL_ARG_NAME) != properties.end () ||
281
318
properties.find (prompt_lookup.name ()) != properties.end ()) {
282
319
auto [plugin_config, scheduler_config] = utils::extract_scheduler_config (properties);
283
320
m_pimpl = std::make_unique<VLMContinuousBatchingAdapter>(models_dir, scheduler_config, device, plugin_config);
@@ -298,8 +335,8 @@ VLMPipeline::VLMPipeline(
298
335
const GenerationConfig& generation_config
299
336
) {
300
337
auto start_time = std::chrono::steady_clock::now ();
301
- if (properties.find (scheduler_config.name ()) != properties.end () ||
302
- properties.find (utils::DRAFT_MODEL_ARG_NAME) != properties.end () ||
338
+ if (properties.find (scheduler_config.name ()) != properties.end () ||
339
+ properties.find (utils::DRAFT_MODEL_ARG_NAME) != properties.end () ||
303
340
properties.find (prompt_lookup.name ()) != properties.end ()) {
304
341
auto [plugin_config, scheduler_config] = utils::extract_scheduler_config (properties);
305
342
m_pimpl = std::make_unique<VLMContinuousBatchingAdapter>(models_map, tokenizer, config_dir_path, scheduler_config, device, plugin_config, generation_config);
0 commit comments