
Commit be13a23

popovaan, Wovchena, and ilya-lavrenov authored
Store EncodedImage's in VLM CB chat history. (openvinotoolkit#1901)
Storing EncodedImage objects in the VLM continuous batching (CB) chat history, instead of the original image tensors, reduces generate() time by ~10% on the 2nd and subsequent chat iterations, since history images no longer have to be re-encoded on every turn.

Time measurements for 3 CB chat iterations with MiniCPM-V-2_6 and [this image](https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11):

Master:
- Generate 1 Time: 19625.116645 ms
- Generate 2 Time: 58074.136806 ms
- Generate 3 Time: 57504.088475 ms

This branch:
- Generate 1 Time: 19716.716223 ms
- Generate 2 Time: 51544.187465 ms
- Generate 3 Time: 51619.265177 ms

---------

Co-authored-by: Vladimir Zlobin <vladimir.zlobin@intel.com>
Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
1 parent fcefe9f · commit be13a23

17 files changed: +90 −59 lines
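For context, the win shows up in multi-turn visual chat, where the same image stays in the history across turns. A minimal sketch, assuming openvino_genai's public `VLMPipeline` chat API (the model path, the dummy tensor, and the `images` property usage below are illustrative, not taken from this commit):

```cpp
#include "openvino/genai/visual_language/pipeline.hpp"

int main() {
    ov::genai::VLMPipeline pipe("MiniCPM-V-2_6", "CPU");
    // Placeholder image tensor; in practice, load and convert a real RGB image.
    ov::Tensor image(ov::element::u8, {1, 448, 448, 3});

    pipe.start_chat();
    pipe.generate("Describe the image.", ov::genai::images(std::vector<ov::Tensor>{image}));
    // Before this commit, the raw tensor sat in the chat history and was re-run
    // through the vision encoder on every later turn; now the cached
    // EncodedImage is reused, which is where the ~10% saving comes from.
    pipe.generate("What colors dominate?");
    pipe.finish_chat();
}
```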

src/cpp/src/icontinuous_batching.cpp (+2 −2)

```diff
@@ -165,8 +165,8 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
         prompt_with_tags = add_image_tags_to_prompt(prompt_with_tags, rgbs, m_history_images.size());
     }
     m_history.push_back({{"role", "user"}, {"content", prompt_with_tags}});
-    // TODO: save embeddings, instead of image tensors and compare performance
-    m_history_images.insert(m_history_images.end(), rgbs.begin(), rgbs.end());
+    const auto encoded_images = m_inputs_embedder->encode_images(rgbs);
+    m_history_images.insert(m_history_images.end(), encoded_images.begin(), encoded_images.end());
     std::string templated_history = m_tokenizer.apply_chat_template(m_history, true);

     m_inputs_embedder->set_apply_chat_template_status(false);
```
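The core of the change is above: the history now stores the vision encoder's output rather than its input, so each image crosses the encoder exactly once. A self-contained toy illustration of the same pattern (stand-in types, not the real ov::genai classes):

```cpp
#include <iostream>
#include <string>
#include <vector>

struct Image   { std::string name; };       // stands in for ov::Tensor
struct Encoded { std::string features; };   // stands in for ov::genai::EncodedImage

Encoded encode(const Image& img) {          // the expensive vision-encoder run
    std::cout << "encoding " << img.name << '\n';
    return {img.name + ":features"};
}

int main() {
    std::vector<Encoded> history;           // was effectively std::vector<Image>
    for (int turn = 1; turn <= 3; ++turn) {
        std::vector<Image> new_images;
        if (turn == 1) new_images = {{"cat.png"}};  // image sent on turn 1 only
        for (const Image& img : new_images)
            history.push_back(encode(img)); // encode each image exactly once
        // generate(prompt, history) would consume the cached encodings here;
        // "encoding cat.png" prints once rather than once per turn.
    }
}
```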

src/cpp/src/icontinuous_batching.hpp (+1 −1)

```diff
@@ -49,7 +49,7 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline {

     bool m_is_chat_conversation = false;
     ChatHistory m_history;
-    std::vector<ov::Tensor> m_history_images;
+    std::vector<ov::genai::EncodedImage> m_history_images;

     float m_load_time_ms = 0.0f;
     // to access m_load_time_ms
```

src/cpp/src/visual_language/inputs_embedder.cpp (+21)

```diff
@@ -187,6 +187,19 @@ std::vector<ov::Tensor> InputsEmbedder::IInputsEmbedder::to_single_image_tensors
     return single_image_tensors;
 }

+std::vector<ov::genai::EncodedImage> InputsEmbedder::IInputsEmbedder::encode_images(const std::vector<ov::Tensor>& images) {
+    std::vector<EncodedImage> embeds;
+    std::vector<ov::Tensor> single_images = to_single_image_tensors(images);
+    for (const ov::Tensor& image : single_images) {
+        embeds.emplace_back(m_vision_encoder->encode(image));
+    }
+    return embeds;
+}
+
+ov::Tensor InputsEmbedder::IInputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) {
+    return get_inputs_embeds(prompt, encode_images(images), metrics);
+}
+
 /// Public InputsEmbedder class

 InputsEmbedder::InputsEmbedder(const std::filesystem::path& model_dir,
@@ -239,6 +252,14 @@ ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const st
     return m_impl->get_inputs_embeds(prompt, images, metrics);
 }

+ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) {
+    return m_impl->get_inputs_embeds(prompt, images, metrics);
+}
+
+std::vector<ov::genai::EncodedImage> InputsEmbedder::encode_images(const std::vector<ov::Tensor>& images) {
+    return m_impl->encode_images(images);
+}
+
 std::pair<ov::Tensor, std::optional<int64_t>> InputsEmbedder::get_position_ids(const size_t inputs_embeds_size, const size_t history_size) {
     return m_impl->get_position_ids(inputs_embeds_size, history_size);
 }
```
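The layering added here is worth noting: the raw-tensor overload of get_inputs_embeds() is now a thin non-virtual adapter that encodes and forwards to the EncodedImage overload, the only one subclasses implement. A miniature of that shape (stand-in types, not the real signatures):

```cpp
#include <string>
#include <vector>

struct Raw {};     // stands in for ov::Tensor
struct Cooked {};  // stands in for ov::genai::EncodedImage

class Embedder {
public:
    virtual ~Embedder() = default;

    // The hook subclasses implement, mirroring the pure-virtual
    // EncodedImage overload of get_inputs_embeds().
    virtual std::string embed(const std::vector<Cooked>& inputs) = 0;

    // Overridable preprocessing, mirroring encode_images(): the base runs a
    // default encode loop; models like LLaVA override it to pass extra config.
    virtual std::vector<Cooked> cook(const std::vector<Raw>& inputs) {
        return std::vector<Cooked>(inputs.size());
    }

    // Non-virtual convenience, mirroring the tensor overload: encode once,
    // then forward to the virtual overload above.
    std::string embed(const std::vector<Raw>& inputs) {
        return embed(cook(inputs));
    }
};
```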

src/cpp/src/visual_language/inputs_embedder.hpp (+10 −1)

```diff
@@ -35,6 +35,10 @@ class InputsEmbedder {
     // compute input embedding for prompt and multiple images
     ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics);

+    ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics);
+
+    std::vector<ov::genai::EncodedImage> encode_images(const std::vector<ov::Tensor>& images);
+
     // compute position ids for language model input
     std::pair<ov::Tensor, std::optional<int64_t>> get_position_ids(const size_t inputs_embeds_size, const size_t history_size);

@@ -90,8 +94,13 @@ class InputsEmbedder {
     // Verifies no previous image is referenced.
     // InputsEmbedderMiniCPM Uses to insert <image_id>i</image_id> per image (not a slice).
     size_t m_image_id = 0;
+
 public:
-    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) = 0;
+    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) = 0;
+
+    ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics);
+
+    virtual std::vector<ov::genai::EncodedImage> encode_images(const std::vector<ov::Tensor>& images);

     virtual std::pair<ov::Tensor, std::optional<int64_t>> get_position_ids(const size_t inputs_embeds_size, const size_t history_size);

```
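One C++ subtlety in the header above: a derived class that overrides only the EncodedImage overload hides the inherited tensor overload inside its own scope, since name hiding applies to the whole overload set. Calls routed through the base interface, as the pimpl wrapper does via m_impl, still see both. A self-contained demonstration of the rule:

```cpp
#include <iostream>

struct Base {
    virtual ~Base() = default;
    virtual void f(int)  { std::cout << "Base::f(int)\n"; }
    void f(const char*)  { std::cout << "Base::f(const char*)\n"; }
};

struct Derived : Base {
    void f(int) override { std::cout << "Derived::f(int)\n"; }
    // Without `using Base::f;` the const char* overload is hidden here.
};

int main() {
    Derived d;
    Base& b = d;
    b.f("hi");     // OK: Base's scope still holds f(const char*)
    // d.f("hi");  // error: only f(int) is visible in Derived's scope
}
```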

src/cpp/src/visual_language/internvl_chat/classes.cpp (+3 −6)

```diff
@@ -226,19 +226,16 @@ InputsEmbedderInternVLChat::InputsEmbedderInternVLChat(
     const ov::AnyMap device_config) :
     IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { }

-ov::Tensor InputsEmbedderInternVLChat::get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) {
+ov::Tensor InputsEmbedderInternVLChat::get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) {
     std::string image_start_token = m_vlm_config.image_start_token;
     std::string image_context_token = m_vlm_config.image_context_token;
     std::string image_end_token = m_vlm_config.image_end_token;
-
-    std::vector<ov::Tensor> single_images = to_single_image_tensors(images);

     std::string formatted_prompt;
     std::vector<ov::Tensor> image_embeds;
-    image_embeds.reserve(single_images.size());
+    image_embeds.reserve(images.size());

-    for (const auto& image : single_images) {
-        EncodedImage encoded_image = m_vision_encoder->encode(image);
+    for (const auto& encoded_image : images) {
         ov::Tensor single_image_embeds = encoded_image.resized_source;

         const size_t num_patches = single_image_embeds.get_shape().at(0);
```
src/cpp/src/visual_language/internvl_chat/classes.hpp (+1 −1)

```diff
@@ -35,7 +35,7 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder {
         const std::string& device,
         const ov::AnyMap device_config);

-    ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override;
+    ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) override;
 };

 } // namespace ov::genai
```

src/cpp/src/visual_language/llava/classes.cpp (+14 −7)

```diff
@@ -103,18 +103,25 @@ InputsEmbedderLLaVA::InputsEmbedderLLaVA(
     const ov::AnyMap device_config) :
     IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { }

-ov::Tensor InputsEmbedderLLaVA::get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) {
-    std::string image_token = m_vlm_config.im_start;
-
+std::vector<ov::genai::EncodedImage> InputsEmbedderLLaVA::encode_images(const std::vector<ov::Tensor>& images) {
+    std::vector<EncodedImage> embeds;
+    ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}};
     std::vector<ov::Tensor> single_images = to_single_image_tensors(images);
+    embeds.reserve(single_images.size());
+    for (const ov::Tensor& image : single_images) {
+        embeds.emplace_back(m_vision_encoder->encode(image, vision_config));
+    }
+    return embeds;
+}
+
+ov::Tensor InputsEmbedderLLaVA::get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) {
+    std::string image_token = m_vlm_config.im_start;

     std::string formatted_prompt;
     std::vector<ov::Tensor> image_embeds;
-    image_embeds.reserve(single_images.size());
+    image_embeds.reserve(images.size());

-    for (const auto& image : single_images) {
-        ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}};
-        EncodedImage encoded_image = m_vision_encoder->encode(image, vision_config);
+    for (const auto& encoded_image : images) {
         for (size_t idx = 0; idx < encoded_image.resized_source.get_shape().at(1); ++idx) {
             formatted_prompt += image_token;
         }
```
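Note why encode_images() is virtual rather than living only in the base class: LLaVA must pass model-specific options (patch_size) to the vision encoder at encode time, as the override above shows. ov::AnyMap is OpenVINO's real string-to-ov::Any map; the encoder type and the value 14 below are illustrative stand-ins:

```cpp
#include <openvino/core/any.hpp>
#include <iostream>

struct ToyVisionEncoder {  // stand-in for ov::genai's VisionEncoder
    void encode(const ov::AnyMap& config) {
        auto it = config.find("patch_size");
        if (it != config.end())
            std::cout << "patch_size = " << it->second.as<int>() << '\n';
    }
};

int main() {
    ToyVisionEncoder encoder;
    encoder.encode({{"patch_size", 14}});  // 14 is a typical ViT patch size, not from the diff
}
```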

src/cpp/src/visual_language/llava/classes.hpp (+2 −1)

```diff
@@ -35,8 +35,9 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder {
         const std::string& device,
         const ov::AnyMap device_config);

-    ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override;
+    ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) override;

+    std::vector<ov::genai::EncodedImage> encode_images(const std::vector<ov::Tensor>& images) override;
 protected:
     ov::Tensor merge_text_and_image_embeddings_llava(
         const ov::Tensor& input_ids,
```

src/cpp/src/visual_language/llava_next/classes.cpp (+16 −14)

```diff
@@ -75,6 +75,7 @@ EncodedImage VisionEncoderLLaVANext::encode(const ov::Tensor& image, const ov::A
     encoded_image.resized_source = std::move(image_features);
     encoded_image.resized_source_size = resized_source_size;
     encoded_image.patches_grid = {num_patches_h, num_patches_w};
+    encoded_image.original_image_size = original_image_size;
     return encoded_image;
 }

@@ -262,7 +263,6 @@ ov::Tensor add_image_newline(const ov::Tensor& image_feature, const ov::Tensor&
 */
 ov::Tensor pack_image_features_llava_next(
     const EncodedImage& encoded_image,
-    const ImageSize& original_image_size,
     const ov::Tensor& image_newline) {
     auto image_feature = encoded_image.resized_source;
     auto image_feature_shape = image_feature.get_shape();
@@ -295,7 +295,7 @@ ov::Tensor pack_image_features_llava_next(

     ov::Tensor reshaped_image_feature = reshape_and_rearrange_image_feature(patches_image_feature, num_patch_height, num_patch_width, height, width);

-    ov::Tensor unpadded_image_feature = unpad_image(reshaped_image_feature, original_image_size);
+    ov::Tensor unpadded_image_feature = unpad_image(reshaped_image_feature, encoded_image.original_image_size);

     ov::Tensor image_feature_with_newline = add_image_newline(unpadded_image_feature, image_newline);

@@ -333,31 +333,33 @@

 } // namespace

-ov::Tensor InputsEmbedderLLaVANext::get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) {
-    std::string image_token = m_vlm_config.im_start;
-
+std::vector<ov::genai::EncodedImage> InputsEmbedderLLaVANext::encode_images(const std::vector<ov::Tensor>& images) {
+    std::vector<EncodedImage> embeds;
+    ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}};
     std::vector<ov::Tensor> single_images = to_single_image_tensors(images);
+    for (const ov::Tensor& image : single_images) {
+        embeds.emplace_back(m_vision_encoder->encode(image, vision_config));
+    }
+    return embeds;
+}
+
+ov::Tensor InputsEmbedderLLaVANext::get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) {
+    std::string image_token = m_vlm_config.im_start;

     std::string formatted_prompt;
     std::vector<ov::Tensor> image_embeds;
-    image_embeds.reserve(single_images.size());
-
+    image_embeds.reserve(images.size());
     ov::Tensor image_newline;

-    for (const auto& image : single_images) {
-        ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}};
-        EncodedImage encoded_image = m_vision_encoder->encode(image, vision_config);
-
+    for (const auto& encoded_image : images) {
         if (!image_newline) {
             size_t embed_dim = encoded_image.resized_source.get_shape().at(2);
             image_newline = ov::Tensor(encoded_image.resized_source.get_element_type(), {embed_dim});
             float* image_newline_data = image_newline.data<float>();
             std::copy(m_vlm_config.image_newline.begin(), m_vlm_config.image_newline.end(), image_newline_data);
         }

-        ImageSize original_image_size{image.get_shape().at(1), image.get_shape().at(2)}; // [height, width]
-
-        ov::Tensor packed_features = pack_image_features_llava_next(encoded_image, original_image_size, image_newline);
+        ov::Tensor packed_features = pack_image_features_llava_next(encoded_image, image_newline);
         for (size_t idx = 0; idx < packed_features.get_shape().at(1); ++idx) {
             formatted_prompt += image_token;
         }
```
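A second thing happens in this file: unpad_image() needs the original image's height and width, which the old code recomputed from the raw tensor inside get_inputs_embeds(). Since raw tensors no longer reach that function, the size is now recorded once at encode time in the new EncodedImage::original_image_size field (added to the struct at the end of this diff). A minimal sketch of that data flow (stand-in types):

```cpp
#include <cstddef>
#include <iostream>

struct ImageSize { size_t height, width; };   // mirrors ov::genai::ImageSize

struct EncodedImage {                         // abbreviated: feature tensors elided
    ImageSize original_image_size;            // captured inside encode()
};

EncodedImage encode(size_t h, size_t w) {
    return {{h, w}};                          // size recorded exactly once
}

void unpad(const EncodedImage& e) {           // stand-in for unpad_image()
    std::cout << "unpad to " << e.original_image_size.height << 'x'
              << e.original_image_size.width << '\n';
}

int main() {
    EncodedImage cached = encode(480, 640);   // turn 1: raw image still available
    unpad(cached);                            // later turns: raw image is gone,
}                                             // the size travels with the cache
```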

src/cpp/src/visual_language/llava_next/classes.hpp (+3 −1)

```diff
@@ -22,7 +22,9 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA {
 public:
     using InputsEmbedderLLaVA::InputsEmbedderLLaVA;

-    ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override;
+    ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) override;
+
+    std::vector<ov::genai::EncodedImage> encode_images(const std::vector<ov::Tensor>& images) override;
 };

 } // namespace ov::genai
```

src/cpp/src/visual_language/minicpm/classes.cpp (+4 −12)

```diff
@@ -580,30 +580,22 @@ InputsEmbedderMiniCPM::InputsEmbedderMiniCPM(
     m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70});
 }

-ov::Tensor InputsEmbedderMiniCPM::get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) {
-    std::string images_prompt;
-    std::vector<EncodedImage> embeds;
-
-    std::vector<ov::Tensor> single_images = to_single_image_tensors(images);
+ov::Tensor InputsEmbedderMiniCPM::get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) {
     auto [unified_prompt, images_sequence] = unify_prompt(
         prompt,
         NATIVE_TAG,
         '(' + NATIVE_TAG + ")\n",
-        single_images.size(),
+        images.size(),
         m_image_id
     );

-    for (const ov::Tensor& image : single_images) {
-        embeds.push_back(m_vision_encoder->encode(image));
-    }
-
     std::string unk64;
     for (size_t idx = 0; idx < m_vlm_config.query_num; ++idx) {
         unk64 += m_vlm_config.unk;
     }

     for (size_t new_image_id : images_sequence) {
-        const EncodedImage& encoded_image = embeds.at(new_image_id - m_prev_image_id);
+        const EncodedImage& encoded_image = images.at(new_image_id - m_prev_image_id);
         std::string expanded_tag;
         if (m_vlm_config.use_image_id) {
             expanded_tag += m_vlm_config.im_id_start + std::to_string(new_image_id) + m_vlm_config.im_id_end;
@@ -655,7 +647,7 @@ ov::Tensor InputsEmbedderMiniCPM::get_inputs_embeds(const std::string& prompt, c
     int64_t* end = ids + encoded_input_size;
     float* inputs_embeds_data = inputs_embeds.data<float>();
     for (size_t image_id : images_sequence) {
-        const EncodedImage& encoded_image = embeds.at(image_id - m_prev_image_id);
+        const EncodedImage& encoded_image = images.at(image_id - m_prev_image_id);
         const ov::Tensor& resampled_source = resample(encoded_image.resized_source, {encoded_image.resized_source_size});
         float* emb = resampled_source.data<float>();
         ids = std::find(ids, end, im_start_id);
```

src/cpp/src/visual_language/minicpm/classes.hpp (+1 −1)

```diff
@@ -45,7 +45,7 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder {
         const std::string& device,
         const ov::AnyMap device_config);

-    ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override;
+    ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) override;

     void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) override;
```

src/cpp/src/visual_language/phi3_vision/classes.cpp (+2 −3)

```diff
@@ -529,15 +529,14 @@ InputsEmbedderPhi3V::InputsEmbedderPhi3V(
     });
 }

-ov::Tensor InputsEmbedderPhi3V::get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) {
+ov::Tensor InputsEmbedderPhi3V::get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) {
     std::vector<ov::Tensor> images_features_proj;
     std::stringstream images_prompt;
     CircularBufferQueueElementGuard<ov::InferRequest> hd_feature_transformer_ireq_guard(this->m_ireq_queue_hd_feature_transformer.get());
     CircularBufferQueueElementGuard<ov::InferRequest> vision_projection_ireq_guard(this->m_ireq_queue_vision_projection.get());
     ov::InferRequest& hd_feature_transformer = hd_feature_transformer_ireq_guard.get();
     ov::InferRequest& vision_projection = vision_projection_ireq_guard.get();
-    for (const ov::Tensor& image : to_single_image_tensors(images)) {
-        EncodedImage encoded_image = m_vision_encoder->encode(image);
+    for (const ov::genai::EncodedImage& encoded_image : images) {
         images_features_proj.push_back(hd_feature_transform(encoded_image, hd_feature_transformer, m_vlm_config.sub_GN, m_vlm_config.glb_GN, vision_projection));
         m_tokens_per_images.push_back(images_features_proj.back().get_shape().at(1));
         images_prompt << "<|image_" << m_tokens_per_images.size() << "|>\n";
```

src/cpp/src/visual_language/phi3_vision/classes.hpp (+1 −1)

```diff
@@ -28,7 +28,7 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder {
         const ov::AnyMap device_config
     );

-    ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override;
+    ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) override;

     void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) override;
```

src/cpp/src/visual_language/qwen2vl/classes.cpp (+5 −7)

```diff
@@ -281,16 +281,14 @@ InputsEmbedderQwen2VL::InputsEmbedderQwen2VL(
     });
 }

-ov::Tensor InputsEmbedderQwen2VL::get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) {
-    std::vector<ov::Tensor> single_images = to_single_image_tensors(images);
-    auto [unified_prompt, images_sequence] = unify_prompt(prompt, NATIVE_TAG, NATIVE_TAG, single_images.size(), m_image_id);
+ov::Tensor InputsEmbedderQwen2VL::get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) {
+    auto [unified_prompt, images_sequence] = unify_prompt(prompt, NATIVE_TAG, NATIVE_TAG, images.size(), m_image_id);
     std::vector<ov::Tensor> image_embeds;
     std::vector<std::array<size_t, 3>> images_grid_thw;
-    image_embeds.reserve(single_images.size());
-    images_grid_thw.reserve(single_images.size());
+    image_embeds.reserve(images.size());
+    images_grid_thw.reserve(images.size());

-    for (const auto& image : single_images) {
-        EncodedImage encoded_image = m_vision_encoder->encode(image);
+    for (const auto& encoded_image : images) {
         ov::Tensor single_image_embeds = encoded_image.resized_source;
         image_embeds.push_back(std::move(single_image_embeds));
```

src/cpp/src/visual_language/qwen2vl/classes.hpp (+1 −1)

```diff
@@ -46,7 +46,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder {
         const std::string& device,
         const ov::AnyMap device_config);

-    ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override;
+    ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) override;

     std::pair<ov::Tensor, std::optional<int64_t>> get_position_ids(const size_t inputs_embeds_size, const size_t history_size) override;
```

src/cpp/src/visual_language/vision_encoder.hpp (+3)

```diff
@@ -42,6 +42,9 @@ struct EncodedImage {
     /// @brief Patches grid after llava_next preprocessing.
     /// Format: [num_patches_height, num_patches_width]
     std::pair<int, int> patches_grid;
+
+    /// @brief Original size of the image
+    ImageSize original_image_size;
 };

 /// @brief A class used to infer embeddings of an image using
```
