Skip to content

Commit 90b1ab1

Browse files
authored
Add sampling to vlm pipeline by Sampler (openvinotoolkit#950)
CVS-152890
1 parent 1778e50 commit 90b1ab1

File tree

8 files changed

+221
-219
lines changed

8 files changed

+221
-219
lines changed

.github/workflows/causal_lm_cpp.yml

+8-2
Original file line numberDiff line numberDiff line change
@@ -710,7 +710,8 @@ jobs:
710710
python -m pip install -U "optimum<1.23" --no-dependencies
711711
source ./ov/setupvars.sh
712712
optimum-cli export openvino -m openbmb/MiniCPM-V-2_6 MiniCPM-V-2_6 --trust-remote-code
713-
wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --output-document cat.jpg
713+
mkdir cat_img
714+
wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --output-document cat_img/cat.jpg
714715
- name: Generate reference
715716
shell: python
716717
run: |
@@ -741,6 +742,11 @@ jobs:
741742
&& timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./MiniCPM-V-2_6/ lines.png
742743
<<< $'What is unusual on this image?' | tee cpp.txt
743744
- run: diff cpp.txt ref.txt
745+
- name: Run visual_language_chat C++ sample with dir - MiniCPM-V-2_6
746+
run: >
747+
source ./ov/setupvars.sh
748+
&& timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./MiniCPM-V-2_6/ cat_img
749+
<<< $'What is unusual on this image?'
744750
- name: Download and convert LLaVa 1.5 model and an image
745751
run: |
746752
source ./ov/setupvars.sh
@@ -768,7 +774,7 @@ jobs:
768774
source ./ov/setupvars.sh
769775
export PYTHONPATH=./build/:$PYTHONPATH
770776
printf 'What is on the image?\nWhat is special on the image?\n' > ./input.txt
771-
timeout 120s python ./samples/python/visual_language_chat/visual_language_chat.py ./MiniCPM-V-2_6/ cat.jpg < input.txt > ./pred.txt
777+
timeout 120s python ./samples/python/visual_language_chat/visual_language_chat.py ./MiniCPM-V-2_6/ cat_img/cat.jpg < input.txt > ./pred.txt
772778
773779
cpp-continuous-batching-ubuntu:
774780
runs-on: ubuntu-20.04-8-cores

samples/cpp/visual_language_chat/load_image.cpp

+22
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,28 @@
66
#include "stb_image.h"
#include "load_image.hpp"
#include <algorithm>
88

9+
namespace fs = std::filesystem;
10+
11+
std::vector<ov::Tensor> utils::load_images(const std::filesystem::path& input_path) {
12+
std::vector<ov::Tensor> images;
13+
if (!input_path.empty() && fs::exists(input_path)) {
14+
if (fs::is_directory(input_path)) {
15+
for (const auto& dir_entry : fs::directory_iterator(input_path)) {
16+
ov::Tensor image = utils::load_image(dir_entry.path());
17+
images.push_back(std::move(image));
18+
}
19+
} else if (fs::is_regular_file(input_path)) {
20+
ov::Tensor image = utils::load_image(input_path);
21+
images.push_back(std::move(image));
22+
}
23+
}
24+
25+
if (images.empty())
26+
throw std::runtime_error(std::string{"No images were found in path "} + input_path.string());
27+
28+
return images;
29+
}
30+
931
ov::Tensor utils::load_image(const std::filesystem::path& image_path) {
1032
int x = 0, y = 0, channels_in_file = 0;
1133
constexpr int desired_channels = 3;

samples/cpp/visual_language_chat/load_image.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,5 @@
99

1010
namespace utils {
1111
ov::Tensor load_image(const std::filesystem::path& image_path);
12+
std::vector<ov::Tensor> load_images(const std::filesystem::path& image_path);
1213
}

samples/cpp/visual_language_chat/visual_language_chat.cpp

+16-8
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
#include "load_image.hpp"
55
#include <openvino/genai/visual_language/pipeline.hpp>
6+
#include <filesystem>
67
#include <openvino/runtime/intel_gpu/properties.hpp>
78

89
bool print_subword(std::string&& subword) {
@@ -11,9 +12,14 @@ bool print_subword(std::string&& subword) {
1112

1213
int main(int argc, char* argv[]) try {
1314
if (3 != argc) {
14-
throw std::runtime_error(std::string{"Usage "} + argv[0] + " <MODEL_DIR> <IMAGE_FILE>");
15+
throw std::runtime_error(std::string{"Usage "} + argv[0] + " <MODEL_DIR> <IMAGE_FILE OR DIR_WITH_IMAGES>");
1516
}
16-
ov::Tensor image = utils::load_image(argv[2]);
17+
18+
std::vector<ov::Tensor> images = utils::load_images(argv[2]);
19+
20+
ov::genai::GenerationConfig generation_config;
21+
generation_config.max_new_tokens = 200;
22+
1723
std::string device = "CPU"; // GPU can be used as well
1824
ov::AnyMap enable_compile_cache;
1925
if ("GPU" == device) {
@@ -26,16 +32,18 @@ int main(int argc, char* argv[]) try {
2632

2733
pipe.start_chat();
2834
std::cout << "question:\n";
35+
2936
std::getline(std::cin, prompt);
30-
pipe.generate(
31-
prompt,
32-
ov::genai::image(image),
33-
ov::genai::streamer(print_subword)
34-
);
37+
pipe.generate(prompt,
38+
ov::genai::images(images),
39+
ov::genai::generation_config(generation_config),
40+
ov::genai::streamer(print_subword));
3541
std::cout << "\n----------\n"
3642
"question:\n";
3743
while (std::getline(std::cin, prompt)) {
38-
pipe.generate(prompt, ov::genai::streamer(print_subword));
44+
pipe.generate(prompt,
45+
ov::genai::generation_config(generation_config),
46+
ov::genai::streamer(print_subword));
3947
std::cout << "\n----------\n"
4048
"question:\n";
4149
}

src/cpp/src/sampler.cpp

+25
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,22 @@ Sampler::GroupBeamSearcher::GroupBeamSearcher(SequenceGroup::Ptr sequence_group,
230230
}
231231
}
232232

233+
234+
std::vector<int32_t> Sampler::GroupBeamSearcher::get_beam_idxs() {
235+
std::vector<int32_t> next_beams;
236+
237+
for (Group& group : m_groups) {
238+
if (!group.done) {
239+
for (Beam& beam : group.ongoing) {
240+
next_beams.push_back(beam.m_global_beam_idx);
241+
}
242+
}
243+
}
244+
245+
return next_beams;
246+
}
247+
248+
233249
void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutput& sampler_output) {
234250
assert(m_parameters.num_beams % m_parameters.num_beam_groups == 0 &&
235251
"number of beams should be divisible by number of groups");
@@ -581,6 +597,15 @@ void register_new_token(const Token& sampled_token_id,
581597
}
582598
};
583599

600+
// Returns the beam indices for the given sequence group.
//
// If a beam searcher is registered for the group's request id, its beam
// indices are returned. Otherwise the result holds one zero per currently
// running sequence of the group.
std::vector<int32_t> Sampler::get_beam_idxs(SequenceGroup::CPtr sequence_group) {
    // Single lookup: the iterator serves both the presence check and the access.
    const auto beam_searcher = m_beam_search_info.find(sequence_group->get_request_id());
    if (beam_searcher == m_beam_search_info.end()) {
        return std::vector<int32_t>(sequence_group->num_running_seqs(), 0);
    }
    return beam_searcher->second.get_beam_idxs();
}
608+
584609
std::list<uint64_t>
585610
create_n_forked_sequences(SequenceGroup::Ptr sequence_group,
586611
LogitProcessor& logit_processor,

src/cpp/src/sampler.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ class Sampler {
6565
SamplerOutput sample(std::vector<SequenceGroup::Ptr> & sequence_groups, ov::Tensor logits, bool is_validation_mode_enabled = false);
6666
void set_seed(size_t seed) { rng_engine.seed(seed); }
6767
void clear_beam_search_info(uint64_t request_id);
68+
std::vector<int32_t> get_beam_idxs(SequenceGroup::CPtr sequence_group);
6869
};
6970

7071
class Sampler::GroupBeamSearcher {
@@ -109,5 +110,6 @@ class Sampler::GroupBeamSearcher {
109110

110111
void select_next_tokens(const ov::Tensor& logits, SamplerOutput& sampler_output);
111112
void finalize(SamplerOutput& sampler_output);
113+
std::vector<int32_t> get_beam_idxs();
112114
};
113115
}

0 commit comments

Comments
 (0)