@@ -44,6 +44,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
     size_t m_kv_cache_seq_length_axis = 2;
     // Component for applying sampling to lm outputs
     Sampler m_sampler;
+    size_t m_max_kv_cache_size = std::numeric_limits<size_t>::max();
+    bool m_is_npu = false;
 public:
     VLMPipelineImpl(
         const std::filesystem::path& models_dir,
@@ -55,22 +57,52 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
             models_dir, "generation_config.json"
         )
     } {
-        m_inputs_embedder = std::make_shared<InputsEmbedder>(models_dir, device, properties);
-
-        m_tokenizer = m_inputs_embedder->get_tokenizer();
-        m_embedding = m_inputs_embedder->get_embedding_model();
-
-        auto compiled_language_model = utils::singleton_core().compile_model(
-            models_dir / "openvino_language_model.xml", device, properties
+        m_is_npu = device.find("NPU") != std::string::npos;
+
+        auto properties_copy = properties;
+        auto language_model_path = models_dir / "openvino_language_model.xml";
+        auto language_model = utils::singleton_core().read_model(language_model_path, {}, properties_copy);
+        auto kv_pos = ov::genai::utils::get_kv_axes_pos(language_model);
+        m_kv_cache_seq_length_axis = kv_pos.seq_len;
+
+        // In case the user provided per-device properties:
+        // {
+        //     ov::device::properties("NPU", ...),
+        //     ov::device::properties("CPU", ...)
+        // }
+        auto device_propertes = utils::pop_or_default<ov::AnyMap>(
+            properties_copy, ov::device::properties.name(), {}
         );
-        utils::print_compiled_model_properties(compiled_language_model, "VLM language model");
-        auto language_model = compiled_language_model.get_runtime_model();
-        m_kv_cache_seq_length_axis = utils::get_kv_axes_pos(language_model).seq_len;
+        // Otherwise, the same properties are used for all models and devices
+        auto lm_properties = device_propertes.empty()
+            ? properties_copy
+            : utils::pop_or_default<ov::AnyMap>(device_propertes, device, {});
+
+        ov::CompiledModel compiled_language_model;
+        auto embedder_device = device;
+        if (m_is_npu) {
+            embedder_device = "CPU";
+            utils::KVDesc kv_desc;
+            std::tie(compiled_language_model, kv_desc) = utils::compile_decoder_for_npu(
+                language_model, lm_properties, kv_pos, language_model_path
+            );
+            m_max_kv_cache_size = kv_desc.max_prompt_len + kv_desc.min_response_len;
+        } else {
+            compiled_language_model = utils::singleton_core().compile_model(language_model, device, lm_properties);
+        }
+        ov::genai::utils::print_compiled_model_properties(compiled_language_model, "VLM language model");
 
         m_language = compiled_language_model.create_infer_request();
-
+        m_kv_cache_seq_length_axis = utils::get_kv_axes_pos(language_model).seq_len;
         m_language.get_tensor("attention_mask").set_shape({1, 0});
 
+        auto embedder_properties = device_propertes.empty()
+            ? properties_copy
+            : utils::pop_or_default<ov::AnyMap>(device_propertes, embedder_device, {});
+        m_inputs_embedder = std::make_shared<InputsEmbedder>(models_dir, embedder_device, embedder_properties);
+        m_tokenizer = m_inputs_embedder->get_tokenizer();
+        m_embedding = m_inputs_embedder->get_embedding_model();
+
         // If eos_token_id was not provided, take value
         if (m_generation_config.eos_token_id == -1) {
             m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id());
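
A minimal caller-side sketch of the per-device property routing added above, assuming the variadic VLMPipeline constructor forwards properties as in the comment block inside the hunk; the model directory and the chosen property values are illustrative, while ov::device::properties, ov::cache_dir and ov::inference_num_threads are existing OpenVINO API:

    #include "openvino/genai/visual_language/pipeline.hpp"

    int main() {
        // Properties keyed by device: the "NPU" entry reaches the language
        // model, the "CPU" entry reaches the inputs embedder, which is
        // forced to CPU when the pipeline device is NPU.
        ov::genai::VLMPipeline pipe(
            "model_dir/",  // illustrative path to an exported VLM
            "NPU",
            ov::device::properties("NPU", ov::cache_dir("vlm_cache")),
            ov::device::properties("CPU", ov::inference_num_threads(4))
        );
    }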
@@ -80,7 +112,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         m_sampler.set_seed(m_generation_config.rng_seed);
     }
 
-    
+
     VLMPipelineImpl(
         const ModelsMap& models_map,
         const Tokenizer& tokenizer,
@@ -90,6 +122,10 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         const GenerationConfig& generation_config
     ) :
         m_generation_config{generation_config} {
+        m_is_npu = device.find("NPU") != std::string::npos;
+        OPENVINO_ASSERT(!m_is_npu,
+            "VLMPipeline initialization from string isn't supported for NPU device");
+
         m_inputs_embedder = std::make_shared<InputsEmbedder>(models_map, tokenizer, config_dir_path, device, properties);
 
         m_tokenizer = m_inputs_embedder->get_tokenizer();
@@ -136,6 +172,14 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         generation_config.set_eos_token_id(m_generation_config.eos_token_id);
         generation_config.validate();
 
+        if (m_is_npu) {
+            OPENVINO_ASSERT(rgbs.size() == 1u, "Currently only batch size equal to 1 is supported for NPU device!");
+            OPENVINO_ASSERT(generation_config.is_greedy_decoding() || generation_config.is_multinomial(),
+                "Currently only greedy and multinomial decoding are supported for NPU device!");
+            OPENVINO_ASSERT(generation_config.num_return_sequences == 1u,
+                "Currently only \"num_return_sequences\" equal to 1 is supported for NPU device!");
+        }
+
         m_inputs_embedder->set_apply_chat_template_status(generation_config.apply_chat_template);
 
         auto start_get_inputs_embeds = std::chrono::steady_clock::now();
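
Given the three NPU-only assertions above, a generate() call that satisfies them would look like this sketch; pipe and rgb are assumed to exist (a pipeline built for "NPU" and a single decoded image tensor), and ov::genai::image / ov::genai::generation_config are the property helpers used by the GenAI VLM samples:

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 100;      // default sampling settings => greedy decoding
    config.num_return_sequences = 1;  // exactly one returned sequence, as asserted
    auto result = pipe.generate(
        "Describe this image",
        ov::genai::image(rgb),                  // one image => batch size 1
        ov::genai::generation_config(config)
    );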
@@ -179,9 +223,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
             m_sampler.set_seed(generation_config.rng_seed);
         }
 
-        utils::GenerationFinishInfo finish_info = get_lm_encoded_results(m_language, inputs_embeds, new_atten_mask, streamer_ptr, m_sampler, requests,
-                                                                         position_ids, kv_cache_state, m_embedding, rope_delta);
-
+        ov::genai::utils::GenerationFinishInfo finish_info = ov::genai::get_lm_encoded_results(m_language, inputs_embeds, new_atten_mask, streamer_ptr, m_sampler, requests,
+                                                                                               position_ids, kv_cache_state, m_embedding, rope_delta, m_max_kv_cache_size);
         EncodedResults& encoded_result = finish_info.results;
 
         auto decode_start_time = std::chrono::steady_clock::now();
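
For intuition on the new m_max_kv_cache_size argument threaded into get_lm_encoded_results above: on NPU the decoder is compiled with static shapes, so the total cache length is bounded by the compile-time budget max_prompt_len + min_response_len, while on other devices the member keeps its std::numeric_limits<size_t>::max() default, i.e. no cap. A hypothetical numeric sketch (values are illustrative, not defaults from this PR):

    constexpr size_t max_prompt_len   = 1024;  // compile-time prompt budget (illustrative)
    constexpr size_t min_response_len = 128;   // compile-time response budget (illustrative)
    // Generation stops once the KV-cache holds this many positions:
    constexpr size_t max_kv_cache_size = max_prompt_len + min_response_len;  // 1152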
@@ -208,7 +251,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         res_raw_counters.generate_durations.emplace_back(PerfMetrics::get_microsec(generate_end_time - generate_start_time));
         res_raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_end_time - decode_start_time));
         res_raw_counters.tokenization_durations.insert(res_raw_counters.tokenization_durations.end(), raw_counters.tokenization_durations.begin(), raw_counters.tokenization_durations.end());
-    
+
         // VLM specific perf metrics
         decoded.perf_metrics.vlm_raw_metrics.prepare_embeddings_durations.emplace_back(PerfMetrics::get_microsec(end_get_inputs_embeds - start_get_inputs_embeds));
 
@@ -220,6 +263,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
     }
 
     void start_chat(const std::string& system_message) override {
+        OPENVINO_ASSERT(!m_is_npu, "start_chat() isn't supported in VLMPipeline for NPU device");
         m_is_chat_conversation = true;
         bool have_state = 0 != m_language.get_tensor("attention_mask").get_size();
         if (have_state) {
@@ -232,6 +276,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
     }
 
     void finish_chat() override {
+        OPENVINO_ASSERT(!m_is_npu, "finish_chat() isn't supported in VLMPipeline for NPU device");
         m_is_chat_conversation = false;
         // Resetting state may be slow.
         m_language.reset_state();
@@ -276,8 +321,8 @@ VLMPipeline::VLMPipeline(
 ) {
     auto start_time = std::chrono::steady_clock::now();
 
-    if (properties.find(scheduler_config.name()) != properties.end() || 
-        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || 
+    if (properties.find(scheduler_config.name()) != properties.end() ||
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
         properties.find(prompt_lookup.name()) != properties.end()) {
         auto [plugin_config, scheduler_config] = utils::extract_scheduler_config(properties);
         m_pimpl = std::make_unique<VLMContinuousBatchingAdapter>(models_dir, scheduler_config, device, plugin_config);
@@ -298,8 +343,8 @@ VLMPipeline::VLMPipeline(
     const GenerationConfig& generation_config
 ) {
     auto start_time = std::chrono::steady_clock::now();
-    if (properties.find(scheduler_config.name()) != properties.end() || 
-        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || 
+    if (properties.find(scheduler_config.name()) != properties.end() ||
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
         properties.find(prompt_lookup.name()) != properties.end()) {
         auto [plugin_config, scheduler_config] = utils::extract_scheduler_config(properties);
         m_pimpl = std::make_unique<VLMContinuousBatchingAdapter>(models_map, tokenizer, config_dir_path, scheduler_config, device, plugin_config, generation_config);
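
Both VLMPipeline constructors above dispatch to the continuous-batching backend whenever the properties carry a scheduler config, a draft model, or the prompt-lookup flag. A sketch of triggering that branch; the field value and model directory are illustrative, while ov::genai::scheduler_config and SchedulerConfig are existing GenAI API:

    ov::genai::SchedulerConfig scheduler;
    scheduler.max_num_batched_tokens = 256;  // illustrative value
    // The "scheduler_config" entry routes construction to
    // VLMContinuousBatchingAdapter instead of VLMPipelineImpl.
    ov::genai::VLMPipeline pipe("model_dir/", "CPU",
                                ov::AnyMap{ov::genai::scheduler_config(scheduler)});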