
Commit 17d9269

committed Mar 3, 2025
Enable LM part of VLM to work on NPU
1 parent 2e08d78 commit 17d9269

File tree

- src/cpp/src/llm_pipeline.cpp
- src/cpp/src/llm_pipeline_stateful.cpp
- src/cpp/src/llm_pipeline_static.cpp
- src/cpp/src/utils.cpp
- src/cpp/src/visual_language/pipeline.cpp

5 files changed: +64 -25 lines changed

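The diffs below route the VLM's language model through the NPU-specific decoder compilation path while keeping the inputs embedder on CPU. As a rough illustration of how the feature is exercised from the public API, here is a minimal sketch assuming the usual ov::genai::VLMPipeline constructor and the image/generation_config properties used in the repository's samples; the model directory and the dummy image tensor are placeholders, not part of this commit.

#include <filesystem>
#include <iostream>

#include "openvino/genai/visual_language/pipeline.hpp"

int main() {
    // Hypothetical directory holding an exported VLM (openvino_language_model.xml, embedder, etc.).
    std::filesystem::path models_dir = "path/to/exported_vlm";

    // With this commit, "NPU" compiles the language model via compile_decoder_for_npu,
    // while the inputs embedder is created on CPU.
    ov::genai::VLMPipeline pipe(models_dir, "NPU");

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 64;

    // Dummy RGB tensor just to exercise the call; replace with a real image loaded into an ov::Tensor.
    ov::Tensor image(ov::element::u8, {1, 448, 448, 3});

    auto result = pipe.generate("Describe the image.",
                                ov::genai::image(image),
                                ov::genai::generation_config(config));
    std::cout << result.texts[0] << std::endl;
    return 0;
}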
 

src/cpp/src/llm_pipeline.cpp

+1 -1

@@ -118,9 +118,9 @@ ov::genai::LLMPipeline::LLMPipeline(
     const std::string& device,
     const ov::AnyMap& user_properties) {
     auto start_time = std::chrono::steady_clock::now();
-
     auto [properties, attention_backend] = extract_attention_backend(user_properties);

+
     // If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues
     if (explicitly_requires_paged_attention(properties)) {
         auto [device_properties, scheduler_config] = utils::extract_scheduler_config(properties, get_latency_oriented_scheduler_config());

src/cpp/src/llm_pipeline_stateful.cpp

+1 -1

@@ -71,7 +71,7 @@ StatefulLLMPipeline::StatefulLLMPipeline(
     if (m_is_npu) {
         utils::KVDesc kv_desc;
         std::tie(compiled_model, kv_desc) = utils::compile_decoder_for_npu(
-            model, *filtered_properties, kv_pos, models_path
+            model, *filtered_properties, kv_pos, models_path / "openvino_model.xml"
         );
         m_max_kv_cache_size = kv_desc.max_prompt_len + kv_desc.min_response_len;
     } else {

src/cpp/src/llm_pipeline_static.cpp

+3 -1

@@ -116,7 +116,9 @@ StatefulLLMPipeline::StatefulLLMPipeline(
 ) : LLMPipelineImplBase(tokenizer, generation_config),
     m_sampler(m_tokenizer) {
     auto kv_pos = ov::genai::utils::get_kv_axes_pos(model);
-    auto [compiled, kv_desc] = utils::compile_decoder_for_npu(model, properties, kv_pos, models_path);
+    auto [compiled, kv_desc] = utils::compile_decoder_for_npu(
+        model, properties, kv_pos, models_path / "openvino_model.xml"
+    );
     m_max_prompt_len = kv_desc.max_prompt_len;
     m_kvcache_total = kv_desc.max_prompt_len + kv_desc.min_response_len;
     m_request = compiled.create_infer_request();

src/cpp/src/utils.cpp

+1 -1

@@ -492,7 +492,7 @@ compile_decoder_for_npu(const std::shared_ptr<ov::Model>& model,
         properties[ov::cache_mode.name()] = CacheMode::OPTIMIZE_SPEED;
         compiled = ov::genai::utils::singleton_core().compile_model(model, "NPU", properties);
     } else {
-        compiled = ov::genai::utils::singleton_core().compile_model(model_path / "openvino_model.xml", "NPU", properties);
+        compiled = ov::genai::utils::singleton_core().compile_model(model_path, "NPU", properties);
     }
     // Also export compiled model if required
     if (export_blob) {

src/cpp/src/visual_language/pipeline.cpp

+58 -21

@@ -44,6 +44,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
     size_t m_kv_cache_seq_length_axis = 2;
     // Component for applying sampling to lm outputs
     Sampler m_sampler;
+    size_t m_max_kv_cache_size = std::numeric_limits<size_t>::max();
+    bool m_is_npu = false;
 public:
     VLMPipelineImpl(
         const std::filesystem::path& models_dir,
@@ -54,23 +56,53 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         utils::from_config_json_if_exists<GenerationConfig>(
             models_dir, "generation_config.json"
         )
-    } {
-        m_inputs_embedder = std::make_shared<InputsEmbedder>(models_dir, device, properties);
-
-        m_tokenizer = m_inputs_embedder->get_tokenizer();
-        m_embedding = m_inputs_embedder->get_embedding_model();
-
-        auto compiled_language_model = utils::singleton_core().compile_model(
-            models_dir / "openvino_language_model.xml", device, properties
+    },
+    m_is_chat_conversation{false} {
+        m_is_npu = device.find("NPU") != std::string::npos;
+        auto properties_copy = properties;
+        auto language_model_path = models_dir / "openvino_language_model.xml";
+        auto language_model = utils::singleton_core().read_model(language_model_path, {}, properties_copy);
+        auto kv_pos = ov::genai::utils::get_kv_axes_pos(language_model);
+        m_kv_cache_seq_length_axis = kv_pos.seq_len;
+
+        // User provided properties in the following format:
+        // {
+        //     ov::device::properties("NPU", ...),
+        //     ov::device::properties("CPU", ...)
+        // }
+        auto device_propertes = utils::pop_or_default<ov::AnyMap>(
+            properties_copy, ov::device::properties.name(), { }
         );
-        utils::print_compiled_model_properties(compiled_language_model, "VLM language model");
-        auto language_model = compiled_language_model.get_runtime_model();
-        m_kv_cache_seq_length_axis = utils::get_kv_axes_pos(language_model).seq_len;
+        // Otherwise, the same properties are used for all models
+        auto lm_properties = device_propertes.empty()
+            ? properties_copy
+            : utils::pop_or_default<ov::AnyMap>(device_propertes, device, {});
+
+        ov::CompiledModel compiled_language_model;
+        auto embedder_device = device;
+        if (m_is_npu) {
+            embedder_device = "CPU";
+            utils::KVDesc kv_desc;
+            std::tie(compiled_language_model, kv_desc) = utils::compile_decoder_for_npu(
+                language_model, lm_properties, kv_pos, language_model_path
+            );
+            m_max_kv_cache_size = kv_desc.max_prompt_len + kv_desc.min_response_len;
+        } else {
+            compiled_language_model = utils::singleton_core().compile_model(language_model, device, lm_properties);
+        }
+        ov::genai::utils::print_compiled_model_properties(compiled_language_model, "VLM language model");

         m_language = compiled_language_model.create_infer_request();
-
+        m_kv_cache_seq_length_axis = utils::get_kv_axes_pos(language_model).seq_len;
         m_language.get_tensor("attention_mask").set_shape({1, 0});

+        auto embedder_properties = device_propertes.empty()
+            ? properties_copy
+            : utils::pop_or_default<ov::AnyMap>(device_propertes, embedder_device, {});
+        m_inputs_embedder = std::make_shared<InputsEmbedder>(models_dir, embedder_device, embedder_properties);
+        m_tokenizer = m_inputs_embedder->get_tokenizer();
+        m_embedding = m_inputs_embedder->get_embedding_model();
+
         // If eos_token_id was not provided, take value
         if (m_generation_config.eos_token_id == -1) {
             m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id());
@@ -80,7 +112,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
             m_sampler.set_seed(m_generation_config.rng_seed);
         }

-
+
     VLMPipelineImpl(
         const ModelsMap& models_map,
         const Tokenizer& tokenizer,
@@ -90,6 +122,10 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         const GenerationConfig& generation_config
     ) :
         m_generation_config{generation_config} {
+        m_is_npu = device.find("NPU") != std::string::npos;
+        OPENVINO_ASSERT(m_is_npu &&
+            "VLMPipeline initialization from string isn't supported for NPU device");
+
         m_inputs_embedder = std::make_shared<InputsEmbedder>(models_map, tokenizer, config_dir_path, device, properties);

         m_tokenizer = m_inputs_embedder->get_tokenizer();
@@ -179,9 +215,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
             m_sampler.set_seed(generation_config.rng_seed);
         }

-        utils::GenerationFinishInfo finish_info = get_lm_encoded_results(m_language, inputs_embeds, new_atten_mask, streamer_ptr, m_sampler, requests,
-                                                                         position_ids, kv_cache_state, m_embedding, rope_delta);
-
+        ov::genai::utils::GenerationFinishInfo finish_info = ov::genai::get_lm_encoded_results(m_language, inputs_embeds, new_atten_mask, streamer_ptr, m_sampler, requests,
+                                                                                               position_ids, kv_cache_state, m_embedding, rope_delta, m_max_kv_cache_size);
         EncodedResults& encoded_result = finish_info.results;

         auto decode_start_time = std::chrono::steady_clock::now();
@@ -208,7 +243,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         res_raw_counters.generate_durations.emplace_back(PerfMetrics::get_microsec(generate_end_time - generate_start_time));
         res_raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_end_time - decode_start_time));
         res_raw_counters.tokenization_durations.insert(res_raw_counters.tokenization_durations.end(), raw_counters.tokenization_durations.begin(), raw_counters.tokenization_durations.end());
-
+
         // VLM specific perf metrics
         decoded.perf_metrics.vlm_raw_metrics.prepare_embeddings_durations.emplace_back(PerfMetrics::get_microsec(end_get_inputs_embeds - start_get_inputs_embeds));

@@ -220,6 +255,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         }

     void start_chat(const std::string& system_message) override {
+        OPENVINO_ASSERT(!m_is_npu && "start_chat() isn't supported in VLMPipeline for NPU device");
         m_is_chat_conversation = true;
         bool have_state = 0 != m_language.get_tensor("attention_mask").get_size();
         if (have_state) {
@@ -232,6 +268,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         }

     void finish_chat() override {
+        OPENVINO_ASSERT(!m_is_npu && "finish_chat() isn't supported in VLMPipeline for NPU device");
         m_is_chat_conversation = false;
         // Resetting state may be slow.
         m_language.reset_state();
@@ -276,8 +313,8 @@ VLMPipeline::VLMPipeline(
 ) {
     auto start_time = std::chrono::steady_clock::now();

-    if (properties.find(scheduler_config.name()) != properties.end() ||
-        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
+    if (properties.find(scheduler_config.name()) != properties.end() ||
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
         properties.find(prompt_lookup.name()) != properties.end()) {
         auto [plugin_config, scheduler_config] = utils::extract_scheduler_config(properties);
         m_pimpl = std::make_unique<VLMContinuousBatchingAdapter>(models_dir, scheduler_config, device, plugin_config);
@@ -298,8 +335,8 @@ VLMPipeline::VLMPipeline(
     const GenerationConfig& generation_config
 ) {
     auto start_time = std::chrono::steady_clock::now();
-    if (properties.find(scheduler_config.name()) != properties.end() ||
-        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
+    if (properties.find(scheduler_config.name()) != properties.end() ||
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
         properties.find(prompt_lookup.name()) != properties.end()) {
         auto [plugin_config, scheduler_config] = utils::extract_scheduler_config(properties);
         m_pimpl = std::make_unique<VLMContinuousBatchingAdapter>(models_map, tokenizer, config_dir_path, scheduler_config, device, plugin_config, generation_config);
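The constructor change in this file reads per-model settings from an entry keyed ov::device::properties.name() and then picks the sub-map for the language-model device and for the embedder device. Below is a hedged sketch of such a map built explicitly (the equivalent of the ov::device::properties("NPU", ...) / ("CPU", ...) form mentioned in the added comment); MAX_PROMPT_LEN and MIN_RESPONSE_LEN are assumed NPU static-pipeline knobs and are not introduced by this commit.

#include <filesystem>

#include "openvino/genai/visual_language/pipeline.hpp"
#include "openvino/runtime/properties.hpp"

int main() {
    // Per-device property map in the layout the pipeline now unpacks:
    // one entry keyed ov::device::properties.name(), holding one AnyMap per device.
    // The nested values are illustrative assumptions only.
    ov::AnyMap properties{
        {ov::device::properties.name(), ov::AnyMap{
            {"NPU", ov::AnyMap{{"MAX_PROMPT_LEN", 1024}, {"MIN_RESPONSE_LEN", 256}}},  // language model
            {"CPU", ov::AnyMap{}}                                                      // inputs embedder
        }}
    };

    // Hypothetical model directory; the language model is compiled for NPU,
    // while the embedder falls back to CPU as in the diff above.
    ov::genai::VLMPipeline pipe(std::filesystem::path("path/to/exported_vlm"), "NPU", properties);
    return 0;
}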
