@@ -75,6 +75,7 @@ EncodedImage VisionEncoderLLaVANext::encode(const ov::Tensor& image, const ov::A
     encoded_image.resized_source = std::move(image_features);
     encoded_image.resized_source_size = resized_source_size;
     encoded_image.patches_grid = {num_patches_h, num_patches_w};
+    encoded_image.original_image_size = original_image_size;
     return encoded_image;
 }
@@ -262,7 +263,6 @@ ov::Tensor add_image_newline(const ov::Tensor& image_feature, const ov::Tensor&
 */
 ov::Tensor pack_image_features_llava_next(
     const EncodedImage& encoded_image,
-    const ImageSize& original_image_size,
     const ov::Tensor& image_newline) {
     auto image_feature = encoded_image.resized_source;
     auto image_feature_shape = image_feature.get_shape();
@@ -295,7 +295,7 @@ ov::Tensor pack_image_features_llava_next(
 
         ov::Tensor reshaped_image_feature = reshape_and_rearrange_image_feature(patches_image_feature, num_patch_height, num_patch_width, height, width);
 
-        ov::Tensor unpadded_image_feature = unpad_image(reshaped_image_feature, original_image_size);
+        ov::Tensor unpadded_image_feature = unpad_image(reshaped_image_feature, encoded_image.original_image_size);
 
         ov::Tensor image_feature_with_newline = add_image_newline(unpadded_image_feature, image_newline);
 
@@ -333,31 +333,33 @@ ov::Tensor pack_image_features_llava_next(
 
 } // namespace
 
-ov::Tensor InputsEmbedderLLaVANext::get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) {
-    std::string image_token = m_vlm_config.im_start;
-
+std::vector<ov::genai::EncodedImage> InputsEmbedderLLaVANext::encode_images(const std::vector<ov::Tensor>& images) {
+    std::vector<EncodedImage> embeds;
+    ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}};
     std::vector<ov::Tensor> single_images = to_single_image_tensors(images);
+    for (const ov::Tensor& image : single_images) {
+        embeds.emplace_back(m_vision_encoder->encode(image, vision_config));
+    }
+    return embeds;
+}
+
+ov::Tensor InputsEmbedderLLaVANext::get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) {
+    std::string image_token = m_vlm_config.im_start;
 
     std::string formatted_prompt;
     std::vector<ov::Tensor> image_embeds;
-    image_embeds.reserve(single_images.size());
-
+    image_embeds.reserve(images.size());
     ov::Tensor image_newline;
 
-    for (const auto& image : single_images) {
-        ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}};
-        EncodedImage encoded_image = m_vision_encoder->encode(image, vision_config);
-
+    for (const auto& encoded_image : images) {
         if (!image_newline) {
             size_t embed_dim = encoded_image.resized_source.get_shape().at(2);
             image_newline = ov::Tensor(encoded_image.resized_source.get_element_type(), {embed_dim});
             float* image_newline_data = image_newline.data<float>();
             std::copy(m_vlm_config.image_newline.begin(), m_vlm_config.image_newline.end(), image_newline_data);
         }
 
-        ImageSize original_image_size{image.get_shape().at(1), image.get_shape().at(2)}; // [height, width]
-
-        ov::Tensor packed_features = pack_image_features_llava_next(encoded_image, original_image_size, image_newline);
+        ov::Tensor packed_features = pack_image_features_llava_next(encoded_image, image_newline);
         for (size_t idx = 0; idx < packed_features.get_shape().at(1); ++idx) {
             formatted_prompt += image_token;
         }