Extend VLM to run LM on NPU #1783

Merged 20 commits into master from at/vlm-pipeline on Mar 6, 2025
Changes from 1 commit
Commits (20)
17d9269  Enable LM part of VLM to work on NPU (TolyaTalamanov, Mar 3, 2025)
cefb3af  Merge branch 'master' into at/vlm-pipeline (TolyaTalamanov, Mar 4, 2025)
6ac899d  Add test and clean up (TolyaTalamanov, Mar 4, 2025)
52b60b2  Merge branch 'at/vlm-pipeline' of https://github.com/TolyaTalamanov/o… (TolyaTalamanov, Mar 4, 2025)
5737b0f  Update src/cpp/src/visual_language/pipeline.cpp (TolyaTalamanov, Mar 4, 2025)
2ed3a76  Update src/cpp/src/visual_language/pipeline.cpp (TolyaTalamanov, Mar 4, 2025)
ec0aad3  Update src/cpp/src/visual_language/pipeline.cpp (TolyaTalamanov, Mar 4, 2025)
23c1316  Merge branch 'at/vlm-pipeline' of https://github.com/TolyaTalamanov/o… (TolyaTalamanov, Mar 4, 2025)
ada0a05  Add tests for NPU VLM (TolyaTalamanov, Mar 4, 2025)
74ce19c  Comment sample (TolyaTalamanov, Mar 4, 2025)
db537a2  Merge branch 'master' into at/vlm-pipeline (ilya-lavrenov, Mar 4, 2025)
570c96c  Change vlm test for NPU (TolyaTalamanov, Mar 5, 2025)
6168bd7  Add comment about NPU into python VLM sample (TolyaTalamanov, Mar 5, 2025)
0c8b236  Merge branch 'at/vlm-pipeline' of https://github.com/TolyaTalamanov/o… (TolyaTalamanov, Mar 5, 2025)
f4fdfef  Merge branch 'master' into at/vlm-pipeline (TolyaTalamanov, Mar 5, 2025)
700e16d  Update test_vlm_pipeline.py (TolyaTalamanov, Mar 5, 2025)
37936ae  Update test_vlm_pipeline.py (TolyaTalamanov, Mar 5, 2025)
fa12185  Update test_vlm_pipeline.py (TolyaTalamanov, Mar 5, 2025)
9cf0c5f  Merge branch 'master' into at/vlm-pipeline (TolyaTalamanov, Mar 5, 2025)
4c85465  Apply suggestions from code review (ilya-lavrenov, Mar 5, 2025)
Viewing changes from commit 6ac899d3df7acd2ba306331908fcbcf062e86bd8: "Add test and clean up"
TolyaTalamanov committed Mar 4, 2025
1 change: 0 additions & 1 deletion in src/cpp/src/llm_pipeline.cpp

@@ -120,7 +120,6 @@ ov::genai::LLMPipeline::LLMPipeline(
     auto start_time = std::chrono::steady_clock::now();
     auto [properties, attention_backend] = extract_attention_backend(user_properties);
 
-
     // If CB is invoked explicitly, create CB adapter as is and re-throw in case of internal issues
     if (explicitly_requires_paged_attention(properties)) {
         auto [device_properties, scheduler_config] = utils::extract_scheduler_config(properties, get_latency_oriented_scheduler_config());
16 changes: 12 additions & 4 deletions in src/cpp/src/visual_language/pipeline.cpp

@@ -56,24 +56,24 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         utils::from_config_json_if_exists<GenerationConfig>(
             models_dir, "generation_config.json"
         )
-    },
-    m_is_chat_conversation{false} {
+    } {
+        m_is_npu = device.find("NPU") != std::string::npos;
 
         auto properties_copy = properties;
         auto language_model_path = models_dir / "openvino_language_model.xml";
         auto language_model = utils::singleton_core().read_model(language_model_path, {}, properties_copy);
         auto kv_pos = ov::genai::utils::get_kv_axes_pos(language_model);
         m_kv_cache_seq_length_axis = kv_pos.seq_len;
 
-        // User provided properties in the following format:
+        // In case the user provided properties per-device:
         // {
         //     ov::device::properties("NPU", ...),
         //     ov::device::properties("CPU", ...)
         // }
         auto device_properties = utils::pop_or_default<ov::AnyMap>(
             properties_copy, ov::device::properties.name(), { }
         );
-        // Otherwise, the same properties are used for all models
+        // Otherwise, the same properties are used for all models and devices
         auto lm_properties = device_properties.empty()
             ? properties_copy
             : utils::pop_or_default<ov::AnyMap>(device_properties, device, {});
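
For readers unfamiliar with the two property styles handled in the hunk above, here is a hedged C++ sketch (not part of this PR) of how a caller might construct the pipeline either way; the model directory and property values are made up for illustration:

    #include "openvino/openvino.hpp"
    #include "openvino/genai/visual_language/pipeline.hpp"

    int main() {
        // Style 1: per-device buckets. With the code above, the LM part running
        // on "NPU" receives only the matching ov::device::properties bucket.
        ov::genai::VLMPipeline per_device_pipe(
            "./MiniCPM-V-2_6",  // hypothetical model directory
            "NPU",
            ov::device::properties("NPU", ov::cache_dir("./npu_cache")),
            ov::device::properties("CPU", ov::num_streams(2)));

        // Style 2: one flat property set, applied to all models and devices.
        ov::genai::VLMPipeline flat_pipe(
            "./MiniCPM-V-2_6",  // hypothetical model directory
            "NPU",
            ov::cache_dir("./cache"));
    }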
@@ -172,6 +172,14 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         generation_config.set_eos_token_id(m_generation_config.eos_token_id);
         generation_config.validate();
 
+        if (m_is_npu) {
+            OPENVINO_ASSERT(rgbs.size() == 1u, "Currently only batch size equal to 1 is supported for NPU device!");
+            OPENVINO_ASSERT(generation_config.is_greedy_decoding() || generation_config.is_multinomial(),
+                "Currently only greedy and multinomial decoding are supported for NPU device!");
+            OPENVINO_ASSERT(generation_config.num_return_sequences == 1u,
+                "Currently only \"num_return_sequences\" equal to 1 is supported for NPU device!");
+        }
+
         m_inputs_embedder->set_apply_chat_template_status(generation_config.apply_chat_template);
 
         auto start_get_inputs_embeds = std::chrono::steady_clock::now();
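
As a reading aid (not from the PR), a minimal sketch of a GenerationConfig that satisfies the three NPU checks introduced above; the helper name is hypothetical, while the fields are documented ov::genai::GenerationConfig members:

    #include "openvino/genai/generation_config.hpp"

    // Passes the NPU asserts: greedy decoding by default (num_beams == 1,
    // do_sample == false) and a single returned sequence; the caller must
    // additionally pass exactly one image so that rgbs.size() == 1.
    ov::genai::GenerationConfig make_npu_friendly_config() {
        ov::genai::GenerationConfig config;
        config.max_new_tokens = 100;
        config.num_return_sequences = 1;  // required on NPU
        // config.do_sample = true;       // multinomial sampling is also allowed
        // Beam search (config.num_beams > 1) would trip the decoding assert.
        return config;
    }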
26 changes: 25 additions & 1 deletion in tests/python_tests/test_vlm_pipeline.py

@@ -92,7 +92,7 @@ def streamer(word: str) -> bool:
     images = []
     for link in links:
         images.append(get_image_by_link(link))
 
     result_from_streamer = []
     res = ov_pipe.generate(prompts[0], images=images, generation_config=generation_config, streamer=streamer)
     assert res.texts[0] == ''.join(result_from_streamer)
@@ -328,3 +328,27 @@ def test_perf_metrics(cache):
     mean_dur, std_dur = perf_metrics.get_prepare_embeddings_duration()
     assert np.allclose(mean_dur, np.mean(raw_dur))
     assert np.allclose(std_dur, np.std(raw_dur))
+
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+@pytest.mark.parametrize("model_id", model_ids)
+def test_vlm_cpu_vs_npuw_cpu(model_id, cache):
+    models_path = get_ov_model(model_id, cache)
+
+    cpu_pipe = VLMPipeline(models_path, "CPU")
+    npu_pipe = VLMPipeline(models_path, "NPU")
+
+    generation_config = cpu_pipe.get_generation_config()
+    generation_config.max_new_tokens = 30
+    generation_config.set_eos_token_id(cpu_pipe.get_tokenizer().get_eos_token_id())
+
+    for links in image_links_for_testing[2]:
+        # NPU only works with single image input
+        assert len(links) == 1
+        image = get_image_by_link(links[0])
+
+        ref_out = cpu_pipe.generate(prompts[0], images=[image], generation_config=generation_config)
+        actual_out = npu_pipe.generate(prompts[0], images=[image], generation_config=generation_config)
+
+        assert ref_out.texts[0] == actual_out.texts[0]
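
For completeness, the same CPU-vs-NPU comparison can be sketched against the C++ API; this is illustrative only (not part of the PR), and the model path, image loader, and prompt are assumptions borrowed from the openvino.genai samples:

    #include <iostream>
    #include "openvino/genai/visual_language/pipeline.hpp"
    #include "load_image.hpp"  // sample helper; assumed available

    int main() {
        std::filesystem::path models_path = "./MiniCPM-V-2_6";  // hypothetical
        ov::Tensor image = utils::load_image("./cat.png");      // hypothetical

        ov::genai::VLMPipeline cpu_pipe(models_path, "CPU");
        ov::genai::VLMPipeline npu_pipe(models_path, "NPU");

        ov::genai::GenerationConfig config;
        config.max_new_tokens = 30;

        // Greedy decoding on both devices should produce identical text.
        auto ref = cpu_pipe.generate("Describe the image.",
                                     ov::genai::image(image),
                                     ov::genai::generation_config(config));
        auto out = npu_pipe.generate("Describe the image.",
                                     ov::genai::image(image),
                                     ov::genai::generation_config(config));

        std::cout << (ref.texts[0] == out.texts[0] ? "match" : "mismatch") << "\n";
    }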