
Commit 2bdc318

Extend VLM to run LM on NPU (#1783)
Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
1 parent 370ed78 commit 2bdc318

File tree: 8 files changed (+107 −27 lines changed)

samples/cpp/visual_language_chat/visual_language_chat.cpp (+3 −1)

@@ -16,7 +16,9 @@ int main(int argc, char* argv[]) try {
 
     std::vector<ov::Tensor> rgbs = utils::load_images(argv[2]);
 
-    std::string device = "CPU"; // GPU can be used as well
+    // GPU and NPU can be used as well.
+    // Note: If NPU selected, only language model will be run on NPU
+    std::string device = "CPU";
     ov::AnyMap enable_compile_cache;
     if (device == "GPU") {
         // Cache compiled models on disk for GPU to save time on the
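To exercise the new path from this sample only the device string needs to change. A minimal sketch, assuming the remainder of the sample (argument parsing and the VLMPipeline constructor call) stays as shipped:

// Sketch: run the language model on NPU; the pipeline itself compiles the
// image embedder for CPU, as the note above says.
std::string device = "NPU";
ov::AnyMap enable_compile_cache;  // GPU-only cache settings are skipped for NPU
ov::genai::VLMPipeline pipe(argv[1], device, enable_compile_cache);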

samples/python/visual_language_chat/visual_language_chat.py (+3 −1)

@@ -55,7 +55,9 @@ def main():
 
     rgbs = read_images(args.image_dir)
 
-    device = 'CPU' # GPU can be used as well
+    # GPU and NPU can be used as well.
+    # Note: If NPU selected, only language model will be run on NPU
+    device = 'CPU'
     enable_compile_cache = dict()
     if "GPU" == device:
         # Cache compiled models on disk for GPU to save time on the

src/cpp/src/llm_pipeline.cpp (−1)

@@ -118,7 +118,6 @@ ov::genai::LLMPipeline::LLMPipeline(
     const std::string& device,
     const ov::AnyMap& user_properties) {
     auto start_time = std::chrono::steady_clock::now();
-
     auto [properties, attention_backend] = extract_attention_backend(user_properties);
 
     // If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues

src/cpp/src/llm_pipeline_stateful.cpp (+1 −1)

@@ -71,7 +71,7 @@ StatefulLLMPipeline::StatefulLLMPipeline(
     if (m_is_npu) {
         utils::KVDesc kv_desc;
         std::tie(compiled_model, kv_desc) = utils::compile_decoder_for_npu(
-            model, *filtered_properties, kv_pos, models_path
+            model, *filtered_properties, kv_pos, models_path / "openvino_model.xml"
         );
         m_max_kv_cache_size = kv_desc.max_prompt_len + kv_desc.min_response_len;
     } else {

src/cpp/src/llm_pipeline_static.cpp (+3 −1)

@@ -116,7 +116,9 @@ StatefulLLMPipeline::StatefulLLMPipeline(
 ) : LLMPipelineImplBase(tokenizer, generation_config),
     m_sampler(m_tokenizer) {
     auto kv_pos = ov::genai::utils::get_kv_axes_pos(model);
-    auto [compiled, kv_desc] = utils::compile_decoder_for_npu(model, properties, kv_pos, models_path);
+    auto [compiled, kv_desc] = utils::compile_decoder_for_npu(
+        model, properties, kv_pos, models_path / "openvino_model.xml"
+    );
     m_max_prompt_len = kv_desc.max_prompt_len;
     m_kvcache_total = kv_desc.max_prompt_len + kv_desc.min_response_len;
     m_request = compiled.create_infer_request();

src/cpp/src/utils.cpp (+1 −1)

@@ -492,7 +492,7 @@ compile_decoder_for_npu(const std::shared_ptr<ov::Model>& model,
         properties[ov::cache_mode.name()] = CacheMode::OPTIMIZE_SPEED;
         compiled = ov::genai::utils::singleton_core().compile_model(model, "NPU", properties);
     } else {
-        compiled = ov::genai::utils::singleton_core().compile_model(model_path / "openvino_model.xml", "NPU", properties);
+        compiled = ov::genai::utils::singleton_core().compile_model(model_path, "NPU", properties);
     }
     // Also export compiled model if required
     if (export_blob) {
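compile_decoder_for_npu now receives the full path to the decoder IR instead of the containing directory, so the directory-to-file join moves to the callers (see llm_pipeline_stateful.cpp and llm_pipeline_static.cpp above, and the VLM pipeline below). A minimal sketch of the updated call, with illustrative variable names:

// Sketch only: pass the full XML path, not the model directory.
ov::CompiledModel compiled;
ov::genai::utils::KVDesc kv_desc;
std::tie(compiled, kv_desc) = ov::genai::utils::compile_decoder_for_npu(
    language_model,                      // std::shared_ptr<ov::Model>
    properties,                          // ov::AnyMap forwarded to the NPU plugin
    kv_pos,                              // from ov::genai::utils::get_kv_axes_pos(language_model)
    models_dir / "openvino_model.xml");  // full IR path (previously just models_dir)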

src/cpp/src/visual_language/pipeline.cpp (+65 −20)

@@ -44,6 +44,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
     size_t m_kv_cache_seq_length_axis = 2;
     // Component for applying sampling to lm outputs
     Sampler m_sampler;
+    size_t m_max_kv_cache_size = std::numeric_limits<size_t>::max();
+    bool m_is_npu = false;
 public:
     VLMPipelineImpl(
         const std::filesystem::path& models_dir,

@@ -55,22 +57,52 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
             models_dir, "generation_config.json"
         )
     } {
-        m_inputs_embedder = std::make_shared<InputsEmbedder>(models_dir, device, properties);
-
-        m_tokenizer = m_inputs_embedder->get_tokenizer();
-        m_embedding = m_inputs_embedder->get_embedding_model();
-
-        auto compiled_language_model = utils::singleton_core().compile_model(
-            models_dir / "openvino_language_model.xml", device, properties
+        m_is_npu = device.find("NPU") != std::string::npos;
+
+        auto properties_copy = properties;
+        auto language_model_path = models_dir / "openvino_language_model.xml";
+        auto language_model = utils::singleton_core().read_model(language_model_path, {}, properties_copy);
+        auto kv_pos = ov::genai::utils::get_kv_axes_pos(language_model);
+        m_kv_cache_seq_length_axis = kv_pos.seq_len;
+
+        // In case user provided properties per-device
+        // {
+        //     ov::device::properties("NPU", ...),
+        //     ov::device::properties("CPU", ...)
+        // }
+        auto device_propertes = utils::pop_or_default<ov::AnyMap>(
+            properties_copy, ov::device::properties.name(), { }
         );
-        utils::print_compiled_model_properties(compiled_language_model, "VLM language model");
-        auto language_model = compiled_language_model.get_runtime_model();
-        m_kv_cache_seq_length_axis = utils::get_kv_axes_pos(language_model).seq_len;
+        // Otherwise, the same properties are used for all models and devices
+        auto lm_properties = device_propertes.empty()
+            ? properties_copy
+            : utils::pop_or_default<ov::AnyMap>(device_propertes, device, {});
+
+        ov::CompiledModel compiled_language_model;
+        auto embedder_device = device;
+        if (m_is_npu) {
+            embedder_device = "CPU";
+            utils::KVDesc kv_desc;
+            std::tie(compiled_language_model, kv_desc) = utils::compile_decoder_for_npu(
+                language_model, lm_properties, kv_pos, language_model_path
+            );
+            m_max_kv_cache_size = kv_desc.max_prompt_len + kv_desc.min_response_len;
+        } else {
+            compiled_language_model = utils::singleton_core().compile_model(language_model, device, lm_properties);
+        }
+        ov::genai::utils::print_compiled_model_properties(compiled_language_model, "VLM language model");
 
         m_language = compiled_language_model.create_infer_request();
-
+        m_kv_cache_seq_length_axis = utils::get_kv_axes_pos(language_model).seq_len;
         m_language.get_tensor("attention_mask").set_shape({1, 0});
 
+        auto embedder_properties = device_propertes.empty()
+            ? properties_copy
+            : utils::pop_or_default<ov::AnyMap>(device_propertes, embedder_device, {});
+        m_inputs_embedder = std::make_shared<InputsEmbedder>(models_dir, embedder_device, embedder_properties);
+        m_tokenizer = m_inputs_embedder->get_tokenizer();
+        m_embedding = m_inputs_embedder->get_embedding_model();
+
         // If eos_token_id was not provided, take value
         if (m_generation_config.eos_token_id == -1) {
             m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id());

@@ -80,7 +112,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         m_sampler.set_seed(m_generation_config.rng_seed);
     }
 
-
+
     VLMPipelineImpl(
         const ModelsMap& models_map,
         const Tokenizer& tokenizer,
@@ -90,6 +122,10 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         const GenerationConfig& generation_config
     ) :
         m_generation_config{generation_config} {
+        m_is_npu = device.find("NPU") != std::string::npos;
+        OPENVINO_ASSERT(!m_is_npu,
+            "VLMPipeline initialization from string isn't supported for NPU device");
+
         m_inputs_embedder = std::make_shared<InputsEmbedder>(models_map, tokenizer, config_dir_path, device, properties);
 
         m_tokenizer = m_inputs_embedder->get_tokenizer();
@@ -136,6 +172,14 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         generation_config.set_eos_token_id(m_generation_config.eos_token_id);
         generation_config.validate();
 
+        if (m_is_npu) {
+            OPENVINO_ASSERT(rgbs.size() == 1u, "Currently only batch size equal to 1 is supported for NPU device!");
+            OPENVINO_ASSERT(generation_config.is_greedy_decoding() || generation_config.is_multinomial(),
+                "Currently only greedy and multinomial decoding are supported for NPU device!");
+            OPENVINO_ASSERT(generation_config.num_return_sequences == 1u,
+                "Currently only \"num_return_sequences\" equal to 1 is supported for NPU device!");
+        }
+
         m_inputs_embedder->set_apply_chat_template_status(generation_config.apply_chat_template);
 
         auto start_get_inputs_embeds = std::chrono::steady_clock::now();

@@ -179,9 +223,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
             m_sampler.set_seed(generation_config.rng_seed);
         }
 
-        utils::GenerationFinishInfo finish_info = get_lm_encoded_results(m_language, inputs_embeds, new_atten_mask, streamer_ptr, m_sampler, requests,
-                                                                         position_ids, kv_cache_state, m_embedding, rope_delta);
-
+        ov::genai::utils::GenerationFinishInfo finish_info = ov::genai::get_lm_encoded_results(m_language, inputs_embeds, new_atten_mask, streamer_ptr, m_sampler, requests,
+                                                                                               position_ids, kv_cache_state, m_embedding, rope_delta, m_max_kv_cache_size);
         EncodedResults& encoded_result = finish_info.results;
 
         auto decode_start_time = std::chrono::steady_clock::now();

@@ -208,7 +251,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         res_raw_counters.generate_durations.emplace_back(PerfMetrics::get_microsec(generate_end_time - generate_start_time));
         res_raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_end_time - decode_start_time));
         res_raw_counters.tokenization_durations.insert(res_raw_counters.tokenization_durations.end(), raw_counters.tokenization_durations.begin(), raw_counters.tokenization_durations.end());
-
+
         // VLM specific perf metrics
         decoded.perf_metrics.vlm_raw_metrics.prepare_embeddings_durations.emplace_back(PerfMetrics::get_microsec(end_get_inputs_embeds - start_get_inputs_embeds));

@@ -220,6 +263,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
     }
 
     void start_chat(const std::string& system_message) override {
+        OPENVINO_ASSERT(!m_is_npu, "start_chat() isn't supported in VLMPipeline for NPU device");
         m_is_chat_conversation = true;
         bool have_state = 0 != m_language.get_tensor("attention_mask").get_size();
         if (have_state) {

@@ -232,6 +276,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
     }
 
     void finish_chat() override {
+        OPENVINO_ASSERT(!m_is_npu, "finish_chat() isn't supported in VLMPipeline for NPU device");
         m_is_chat_conversation = false;
         // Resetting state may be slow.
         m_language.reset_state();

@@ -276,8 +321,8 @@ VLMPipeline::VLMPipeline(
 ) {
     auto start_time = std::chrono::steady_clock::now();
 
-    if (properties.find(scheduler_config.name()) != properties.end() ||
-        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
+    if (properties.find(scheduler_config.name()) != properties.end() ||
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
         properties.find(prompt_lookup.name()) != properties.end()) {
         auto [plugin_config, scheduler_config] = utils::extract_scheduler_config(properties);
         m_pimpl = std::make_unique<VLMContinuousBatchingAdapter>(models_dir, scheduler_config, device, plugin_config);

@@ -298,8 +343,8 @@ VLMPipeline::VLMPipeline(
     const GenerationConfig& generation_config
 ) {
     auto start_time = std::chrono::steady_clock::now();
-    if (properties.find(scheduler_config.name()) != properties.end() ||
-        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
+    if (properties.find(scheduler_config.name()) != properties.end() ||
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
         properties.find(prompt_lookup.name()) != properties.end()) {
         auto [plugin_config, scheduler_config] = utils::extract_scheduler_config(properties);
         m_pimpl = std::make_unique<VLMContinuousBatchingAdapter>(models_map, tokenizer, config_dir_path, scheduler_config, device, plugin_config, generation_config);
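With this change a VLMPipeline created on "NPU" compiles only the language model for NPU (via compile_decoder_for_npu) and moves the inputs embedder, including the vision encoder, to CPU; properties can be scoped per device through ov::device::properties ("DEVICE_PROPERTIES"). A minimal end-to-end sketch, not part of this commit: the NPUW settings mirror the new Python test below, while the model directory, image path, and the load_image helper from the C++ sample are illustrative.

// sketch.cpp - minimal sketch of the new VLM-on-NPU path.
#include "openvino/genai/visual_language/pipeline.hpp"
#include "load_image.hpp"  // sample helper, assumed available
#include <openvino/openvino.hpp>
#include <iostream>

int main() {
    // Nested per-device map; keys match what VLMPipelineImpl pops via
    // ov::device::properties.name() and the device names.
    ov::AnyMap properties{
        {ov::device::properties.name(), ov::AnyMap{
            // Language-model target; NPUW settings taken from the new test.
            {"NPU", ov::AnyMap{{"NPUW_DEVICES", "CPU"}, {"NPUW_ONLINE_PIPELINE", "NONE"}}},
            // Embedder/vision encoder runs on CPU when "NPU" is requested.
            {"CPU", ov::AnyMap{}}
        }}
    };

    ov::genai::VLMPipeline pipe("./model_dir", "NPU", properties);

    ov::genai::GenerationConfig config = pipe.get_generation_config();
    config.max_new_tokens = 30;

    // NPU constraints added in generate(): one image, greedy or multinomial
    // sampling, num_return_sequences == 1; start_chat()/finish_chat() throw.
    ov::Tensor image = utils::load_image("cat.png");  // path illustrative
    auto result = pipe.generate("Describe the image",
                                ov::genai::images(std::vector<ov::Tensor>{image}),
                                ov::genai::generation_config(config));
    std::cout << result.texts[0] << std::endl;
    return 0;
}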

tests/python_tests/test_vlm_pipeline.py (+31 −1)

@@ -4,6 +4,8 @@
 import openvino_tokenizers
 import openvino
 import pytest
+import platform
+import sys
 import transformers
 from optimum.intel.openvino import OVModelForVisualCausalLM
 from openvino_genai import VLMPipeline, GenerationConfig, SchedulerConfig, ContinuousBatchingPipeline, GenerationStatus

@@ -92,7 +94,7 @@ def streamer(word: str) -> bool:
     images = []
     for link in links:
         images.append(get_image_by_link(link))
-
+
     result_from_streamer = []
     res = ov_pipe.generate(prompts[0], images=images, generation_config=generation_config, streamer=streamer)
     assert res.texts[0] == ''.join(result_from_streamer)

@@ -329,3 +331,31 @@ def test_perf_metrics(cache):
     mean_dur, std_dur = perf_metrics.get_prepare_embeddings_duration()
     assert np.allclose(mean_dur, np.mean(raw_dur))
     assert np.allclose(std_dur, np.std(raw_dur))
+
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+# FIXME: katuni4ka/tiny-random-qwen2vl - fails on NPU
+@pytest.mark.parametrize("model_id", model_ids[:-1])
+@pytest.mark.skipif(
+    sys.platform == "darwin" or platform.machine() in ["aarch64", "arm64", "ARM64"],
+    reason="NPU plugin is available only on Linux and Windows x86_64",
+)
+def test_vlm_npu_no_exception(model_id, cache):
+    models_path = get_ov_model(model_ids[0], cache)
+    properties = {
+        "DEVICE_PROPERTIES":
+        {
+            "NPU": { "NPUW_DEVICES": "CPU", "NPUW_ONLINE_PIPELINE": "NONE" }
+        }
+    }
+
+    ov_pipe = VLMPipeline(models_path, "NPU", config=properties)
+
+    generation_config = ov_pipe.get_generation_config()
+    generation_config.max_new_tokens = 30
+    generation_config.set_eos_token_id(ov_pipe.get_tokenizer().get_eos_token_id())
+
+    for link in image_links_for_testing[2]:
+        image = get_image_by_link(link)
+        out = ov_pipe.generate(prompts[0], images=[image], generation_config=generation_config)
