Skip to content

Commit 6d2763a

Browse files
Wovchena, wenyi5608, yangsu2022, yatarkan, akladiev
authored
Multiple images miniCPM-V-2_6 (openvinotoolkit#919)
TODO: - [ ] Remove `ov::Core` from constructors. - [ ] Hide files and API. --------- Co-authored-by: wenyi5608 <93560477+wenyi5608@users.noreply.github.com> Co-authored-by: Yang,Su <su.yang@intel.com> Co-authored-by: Yaroslav Tarkan <yaroslav.tarkan@intel.com> Co-authored-by: Alina Kladieva <alina.kladieva@intel.com> Co-authored-by: Pavel Esir <pavel.esir@intel.com> Co-authored-by: Pavel Esir <pavel.esir@gmail.com> Co-authored-by: Artur Paniukov <chgk1101@gmail.com> Co-authored-by: Ekaterina Aidova <ekaterina.aidova@intel.com> Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com> Co-authored-by: Mikhail Ryzhov <mikhail.ryzhov@intel.com>
1 parent a1feff9 commit 6d2763a

14 files changed

+426
-146
lines changed

.github/workflows/causal_lm_cpp.yml

+2-1
Original file line numberDiff line numberDiff line change
@@ -683,7 +683,7 @@ jobs:
683683
diff pred2.txt ref.txt
684684
echo "Chat sample python" passed
685685
686-
py-vlm_chat_sample-ubuntu:
686+
visual_language_chat_sample-ubuntu:
687687
runs-on: ubuntu-22.04-16-cores
688688
steps:
689689
- uses: actions/checkout@v4
@@ -859,6 +859,7 @@ jobs:
859859
cpp-beam_search_causal_lm-Qwen-7B-Chat, cpp-beam_search_causal_lm-Qwen1_5-7B-Chat, cpp-beam_search_causal_lm-Phi-2,
860860
cpp-beam_search_causal_lm-notus-7b-v1, cpp-speculative_decoding_lm-ubuntu, cpp-prompt_lookup_decoding_lm-ubuntu,
861861
cpp-Phi-1_5, cpp-greedy_causal_lm-redpajama-3b-chat, cpp-chat_sample-ubuntu, cpp-continuous-batching-ubuntu,
862+
visual_language_chat_sample-ubuntu,
862863
cpp-continuous-batching-windows, cpp-continuous-batching-macos]
863864
if: ${{ always() }}
864865
runs-on: ubuntu-latest

samples/CMakeLists.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@ install(DIRECTORY
2525
cpp/greedy_causal_lm
2626
cpp/multinomial_causal_lm
2727
# Don't install prompt_lookup_decoding_lm and speculative_decoding_lm because they don't use openvino_genai library and aren't verified yet.
28-
# Don't install continuous_batching_accuracy and continuous_batching_benchmark because they depend on json.
28+
# Don't install continuous_batching_accuracy and continuous_batching_benchmark because CB isn't ready.
29+
cpp/visual_language_chat
2930
cpp/whisper_speech_recognition
3031
cpp/text2image
3132
cpp/lora_greedy_causal_lm

samples/cpp/visual_language_chat/CMakeLists.txt

+7-5
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
# Copyright (C) 2023-2024 Intel Corporation
22
# SPDX-License-Identifier: Apache-2.0
33

4-
find_package(OpenVINOGenAI REQUIRED PATHS
5-
"${CMAKE_BINARY_DIR}" # Reuse the package from the build.
6-
${OpenVINO_DIR} # GenAI may be installed alongside OpenVINO.
4+
find_package(OpenVINOGenAI REQUIRED
5+
PATHS
6+
"${CMAKE_BINARY_DIR}" # Reuse the package from the build.
7+
${OpenVINO_DIR} # GenAI may be installed alongside OpenVINO.
8+
NO_CMAKE_FIND_ROOT_PATH
79
)
810

911
file(DOWNLOAD
@@ -14,11 +16,11 @@ file(DOWNLOAD
1416
add_executable(visual_language_chat visual_language_chat.cpp load_image.cpp)
1517
target_include_directories(visual_language_chat PRIVATE "${CMAKE_CURRENT_SOUCE_DIR}" "${CMAKE_BINARY_DIR}")
1618
target_link_libraries(visual_language_chat PRIVATE openvino::genai)
19+
1720
set_target_properties(visual_language_chat PROPERTIES
18-
COMPILE_PDB_NAME chat_sample
21+
COMPILE_PDB_NAME visual_language_chat
1922
# Ensure out of box LC_RPATH on macOS with SIP
2023
INSTALL_RPATH_USE_LINK_PATH ON)
21-
target_compile_features(visual_language_chat PRIVATE cxx_std_11)
2224

2325
install(TARGETS visual_language_chat
2426
RUNTIME DESTINATION samples_bin/

samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py

+247-13
Large diffs are not rendered by default.

samples/cpp/visual_language_chat/load_image.cpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ ov::Tensor utils::load_image(const std::filesystem::path& image_path) {
1313
image_path.string().c_str(),
1414
&x, &y, &channels_in_file, desired_channels);
1515
if (!image) {
16-
throw std::runtime_error{"Failed to load the image"};
16+
throw std::runtime_error{"Failed to load the image."};
1717
}
1818
struct SharedImageAllocator {
1919
unsigned char* image;
@@ -22,11 +22,11 @@ ov::Tensor utils::load_image(const std::filesystem::path& image_path) {
2222
if (channels * height * width == bytes) {
2323
return image;
2424
}
25-
throw std::runtime_error{"Unexpected number of bytes was requested to allocate"};
25+
throw std::runtime_error{"Unexpected number of bytes was requested to allocate."};
2626
}
2727
void deallocate(void*, size_t bytes, size_t) {
2828
if (channels * height * width != bytes) {
29-
throw std::runtime_error{"Unexpected number of bytes was requested to deallocate"};
29+
throw std::runtime_error{"Unexpected number of bytes was requested to deallocate."};
3030
}
3131
std::free(image);
3232
image = nullptr;

samples/requirements.txt

-1
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,4 @@ optimum[openvino]==1.22.0
33
einops==0.8.0 # For Qwen
44
transformers_stream_generator==0.0.5 # For Qwen
55
diffusers==0.30.3
6-
pillow
76
torchvision # needed for mini-CPM export script. Need to remove when we switch to exporting with optimum-intel.

src/cpp/include/openvino/genai/processor_config.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ namespace ov::genai {
1414
/// preprocessor_config.json.
1515
class OPENVINO_GENAI_EXPORTS ProcessorConfig {
1616
public:
17+
size_t image_size = 980;
1718
/// @brief Dimensions of the smaller, non-overlapping patches that the
1819
/// input image is divided into before being fed into the
1920
/// transformer model. Used to divide image height and width.

src/cpp/src/clip.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ struct clip_ctx {
2525
std::vector<uint8_t> buf_compute_meta;
2626

2727
projector_type proj_type = PROJECTOR_TYPE_RESAMPLER;
28+
size_t patch_size = 0;
29+
size_t image_size = 0;
2830
};
2931

3032
// RGB uint8 image

src/cpp/src/llm_pipeline.cpp

+1-19
Original file line numberDiff line numberDiff line change
@@ -18,24 +18,6 @@
1818
#include "openvino/genai/lora_adapter.hpp"
1919
#include "lora_helper.hpp"
2020

21-
namespace {
22-
23-
ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& fisrt, const ov::genai::TokenizedInputs& second){
24-
auto first_size = fisrt.input_ids.get_size();
25-
auto second_size = second.input_ids.get_size();
26-
ov::Shape new_shape{1, first_size - second_size};
27-
28-
ov::Tensor new_input_ids(ov::element::i64, new_shape);
29-
auto data_ptr = fisrt.input_ids.data<int64_t>();
30-
std::copy(data_ptr + second_size, data_ptr + first_size, new_input_ids.data<int64_t>());
31-
32-
ov::Tensor new_attention_mask(ov::element::i64, new_shape);
33-
std::fill_n(new_attention_mask.data<int64_t>(), new_shape[1], 1);
34-
35-
return {new_input_ids, new_attention_mask};
36-
}
37-
}
38-
3921
namespace ov {
4022
namespace genai {
4123

@@ -153,7 +135,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
153135
encoded_input = new_chat_tokens;
154136
} else {
155137
auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(add_special_tokens_));
156-
encoded_input = subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens);
138+
encoded_input = utils::subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens);
157139
}
158140
m_templated_chat_history = new_templated_chat_history;
159141
// TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied

src/cpp/src/utils.hpp

+14
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,20 @@ ProcessorConfig from_any_map(
8686

8787
std::pair<ov::AnyMap, ov::AnyMap> split_core_complile_config(const ov::AnyMap& plugin_config);
8888

89+
/// Returns the tokenized suffix that `first` adds on top of `second`:
/// copies first.input_ids[second_size:] into a fresh {1, N} i64 tensor and
/// pairs it with an all-ones attention mask of the same shape. Used in chat
/// mode to feed the model only the newly appended prompt tokens.
/// @note Assumes second's token sequence is a prefix of first's, i.e.
///       first_size >= second_size (size_t subtraction underflows otherwise)
///       — TODO confirm this invariant holds at all call sites.
inline ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& first, const ov::genai::TokenizedInputs& second){
    auto first_size = first.input_ids.get_size();
    auto second_size = second.input_ids.get_size();
    ov::Shape new_shape{1, first_size - second_size};

    ov::Tensor new_input_ids(ov::element::i64, new_shape);
    auto data_ptr = first.input_ids.data<int64_t>();
    std::copy(data_ptr + second_size, data_ptr + first_size, new_input_ids.data<int64_t>());

    ov::Tensor new_attention_mask(ov::element::i64, new_shape);
    std::fill_n(new_attention_mask.data<int64_t>(), new_shape[1], 1);

    return {new_input_ids, new_attention_mask};
}
89103
} // namespace utils
90104
} // namespace genai
91105
} // namespace ov

src/cpp/src/vision_encoder.cpp

+71-16
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,65 @@ ov::Tensor preprocess_for_encoder(const ov::Tensor& images, size_t kernel) {
216216
return permuted_tensor;
217217
}
218218

219+
// Equivalent of torch.bucketize(fractional_coords, boundaries, right=True):
// for each coordinate, returns the index of the first boundary that is
// strictly greater than it (== the number of boundaries <= the coordinate).
std::vector<int64_t> bucket_size_right(const std::vector<float>& fractional_coords, const std::vector<float>& boundaries) {
    std::vector<int64_t> bucket_coords;
    bucket_coords.reserve(fractional_coords.size());
    for (float coord : fractional_coords) {
        auto first_greater = std::upper_bound(boundaries.begin(), boundaries.end(), coord);
        bucket_coords.push_back(std::distance(boundaries.begin(), first_greater));
    }
    return bucket_coords;
}
227+
228+
// Builds the "position_ids" input of the vision encoder: for every image in
// the batch, each real patch gets the flattened (row-bucket, col-bucket) index
// on a num_patches_per_side x num_patches_per_side grid; padded positions
// (beyond the image's own patch grid) keep id 0.
// NOTE(review): patch_attention_mask is currently unused here — padding is
// handled by the zero-fill below; confirm whether the mask should be consulted.
ov::Tensor prepare_vis_position_ids(
    const ov::Tensor& pixel_values,
    const ov::Tensor& patch_attention_mask,
    const std::vector<HeightWidth> tgt_sizes,
    size_t patch_size,
    size_t num_patches_per_side
) {
    size_t batch_size = pixel_values.get_shape().at(0);
    size_t max_im_h = pixel_values.get_shape().at(2), max_im_w = pixel_values.get_shape().at(3);
    size_t max_nb_patches_h = max_im_h / patch_size, max_nb_patches_w = max_im_w / patch_size;
    // Bucket boundaries split [0, 1) into num_patches_per_side equal buckets:
    // 1/n, 2/n, ..., (n-1)/n.
    std::vector<float> boundaries(num_patches_per_side - 1);
    std::generate(boundaries.begin(), boundaries.end(), [num_patches_per_side, val = 0.0f]() mutable {
        val += 1.0f / num_patches_per_side;
        return val;
    });
    size_t position_ids_batch_elem = max_nb_patches_h * max_nb_patches_w;
    ov::Tensor position_ids{ov::element::i64, {batch_size, position_ids_batch_elem}};
    int64_t* res_data = position_ids.data<int64_t>();
    // Zero-fill so that padded patch positions default to id 0.
    std::fill_n(res_data, position_ids.get_size(), 0);

    for (size_t batch_idx = 0; batch_idx < batch_size; ++batch_idx) {
        size_t nb_patches_h = tgt_sizes.at(batch_idx).height;
        size_t nb_patches_w = tgt_sizes.at(batch_idx).width;

        // Left-edge fractional coordinate of every patch row/column:
        // 0, 1/k, 2/k, ..., (k-1)/k for k patches.
        std::vector<float> fractional_coords_h(nb_patches_h);
        std::generate(fractional_coords_h.begin(), fractional_coords_h.end(), [nb_patches_h, val = -1.0f / nb_patches_h]() mutable {
            val += 1.0f / nb_patches_h;
            return val;
        });
        std::vector<float> fractional_coords_w(nb_patches_w);
        std::generate(fractional_coords_w.begin(), fractional_coords_w.end(), [nb_patches_w, val = -1.0f / nb_patches_w]() mutable {
            val += 1.0f / nb_patches_w;
            return val;
        });

        std::vector<int64_t> bucket_coords_h = bucket_size_right(fractional_coords_h, boundaries);
        std::vector<int64_t> bucket_coords_w = bucket_size_right(fractional_coords_w, boundaries);

        // Flatten (h_bucket, w_bucket) into a single grid index per patch.
        std::vector<int64_t> pos_ids(bucket_coords_h.size() * bucket_coords_w.size());
        for (size_t col = 0; col < bucket_coords_h.size(); ++col) {
            for (size_t row = 0; row < bucket_coords_w.size(); ++row) {
                pos_ids.at(col * bucket_coords_w.size() + row) = bucket_coords_h.at(col) * num_patches_per_side + bucket_coords_w.at(row);
            }
        }
        std::copy(pos_ids.begin(), pos_ids.end(), res_data + batch_idx * position_ids_batch_elem);
    }
    return position_ids;
}
277+
219278
EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const ov::Tensor& img, ov::InferRequest& encoder, int max_slice_nums, int scale_resolution, size_t patch_size, bool never_split) {
220279
clip_image_u8 source{
221280
int(img.get_shape().at(3)),
@@ -244,14 +303,11 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
244303
ov::Tensor patch_attention_mask{ov::element::boolean, {pixel_values.get_shape().at(0), 1, resized_source_size.height * resized_source_size.width}};
245304
std::fill_n(patch_attention_mask.data<bool>(), patch_attention_mask.get_size(), true);
246305
encoder.set_tensor("patch_attention_mask", patch_attention_mask);
247-
ov::Tensor tgt_sizes{ov::element::i64, {1, 2}};
248-
int64_t* tgt_sizes_data = tgt_sizes.data<int64_t>();
249-
tgt_sizes_data[0] = resized_source_size.height;
250-
tgt_sizes_data[1] = resized_source_size.width;
251-
encoder.set_tensor("tgt_sizes", tgt_sizes);
306+
ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {resized_source_size}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size);
307+
encoder.set_tensor("position_ids", position_ids);
252308
encoder.infer();
253309
const ov::Tensor& output_tensor = encoder.get_output_tensor();
254-
ov::Tensor resized_source{output_tensor.get_element_type(), output_tensor.get_shape()};
310+
ov::Tensor resized_source{ov::element::f32, output_tensor.get_shape()};
255311
output_tensor.copy_to(resized_source);
256312

257313
if (1 == preprocessed.size()) {
@@ -266,27 +322,24 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
266322
size_t n_patches = size.height / patch_size * size.width / patch_size,
267323
old_hidden_size = resized_source.get_shape().at(2);
268324
ov::Tensor encoded_slices{ov::element::f32, {preprocessed.size() - 1, preprocessed.at(1).size(), n_patches, old_hidden_size}};
269-
// там внутри есть какая-то операция которая констант фолдит батч и из-за этого нельзя использовать отличный от того что был при экспорте
270-
// констант фолдит она его в торч скрипте
271-
// Even though batch can't be used, it's still possible to use async.
272325
for (size_t row = 1; row < preprocessed.size(); ++row) {
273326
for (size_t col = 0; col < preprocessed.at(row).size(); ++col) {
274327
clip_image_f32& elem = preprocessed.at(row).at(col);
275328
sliced_sizes.push_back({elem.ny / patch_size, elem.nx / patch_size});
276-
encoder.set_tensor("pixel_values", preprocess_for_encoder(
329+
ov::Tensor pixel_values = preprocess_for_encoder(
277330
{ov::element::f32, {1, 3, size_t(elem.ny), size_t(elem.nx)}, elem.buf.data()},
278331
patch_size
279-
));
332+
);
333+
encoder.set_tensor("pixel_values", pixel_values);
280334
ov::Tensor patch_attention_mask{ov::element::boolean, {1, 1, sliced_sizes.back().height * sliced_sizes.back().width}};
281335
std::fill_n(patch_attention_mask.data<bool>(), patch_attention_mask.get_size(), true);
282336
encoder.set_tensor("patch_attention_mask", patch_attention_mask);
283-
ov::Tensor tgt_sizes{ov::element::i64, {1, 2}};
284-
int64_t* tgt_sizes_data = tgt_sizes.data<int64_t>();
285-
tgt_sizes_data[0] = sliced_sizes.back().height;
286-
tgt_sizes_data[1] = sliced_sizes.back().width;
287-
encoder.set_tensor("tgt_sizes", tgt_sizes);
337+
ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {sliced_sizes.back()}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size);
338+
encoder.set_tensor("position_ids", position_ids);
339+
const ov::Tensor& old = encoder.get_output_tensor();
288340
encoder.set_output_tensor({ov::element::f32, {1, n_patches, old_hidden_size}, encoded_slices.data<float>() + ((row - 1) * preprocessed.at(row).size() + col) * n_patches * old_hidden_size});
289341
encoder.infer();
342+
encoder.set_output_tensor(old);
290343
}
291344
}
292345
return {resized_source, resized_source_size, encoded_slices, sliced_sizes};
@@ -305,6 +358,8 @@ VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const std::
305358

306359
EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ProcessorConfig& config) {
307360
clip_ctx ctx_clip;
361+
ctx_clip.patch_size = m_processor_config.patch_size;
362+
ctx_clip.image_size = m_processor_config.image_size;
308363
std::copy(config.norm_mean.begin(), config.norm_mean.end(), ctx_clip.image_mean);
309364
std::copy(config.norm_std.begin(), config.norm_std.end(), ctx_clip.image_std);
310365
return llava_image_embed_make_with_bytes_slice(ctx_clip, image, m_encoder, config.max_slice_nums, config.scale_resolution, config.patch_size, 0 == config.max_slice_nums);

0 commit comments

Comments
 (0)