@@ -242,7 +242,6 @@ ov::Tensor prepare_vis_position_ids(
242
242
});
243
243
size_t position_ids_batch_elem = max_nb_patches_h * max_nb_patches_w;
244
244
ov::Tensor position_ids{ov::element::i64, {batch_size, position_ids_batch_elem}};
245
- // throw std::runtime_error("");
246
245
int64_t * res_data = position_ids.data <int64_t >();
247
246
std::fill_n (res_data, position_ids.get_size (), 0 );
248
247
@@ -285,66 +284,84 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
285
284
std::vector<std::vector<ov::Tensor>> results;
286
285
std::vector<std::vector<ImageSize>> sizes;
287
286
288
- // std::vector<clip_image_f32*> img_res_v; // format N x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
289
287
std::vector<std::vector<clip_image_f32>> preprocessed{imgs.size ()};
290
- std::transform (imgs.begin (), imgs.end (), preprocessed.begin (), [&ctx_clip](const std::vector<clip_image_u8>& row) {
288
+ size_t max_h = 0 , max_w = 0 , n_images = 0 ;
289
+ std::transform (imgs.begin (), imgs.end (), preprocessed.begin (), [&ctx_clip, &max_h, &max_w, &n_images](const std::vector<clip_image_u8>& row) {
291
290
std::vector<clip_image_f32> processed_row{row.size ()};
292
- std::transform (row.begin (), row.end (), processed_row.begin (), [&ctx_clip](const clip_image_u8& raw) {
293
- return clip_image_preprocess (ctx_clip, raw);
291
+ std::transform (row.begin (), row.end (), processed_row.begin (), [&ctx_clip, &max_h, &max_w, &n_images](const clip_image_u8& raw) {
292
+ clip_image_f32 im = clip_image_preprocess (ctx_clip, raw);
293
+ max_h = std::max (size_t (im.ny ), max_h);
294
+ max_w = std::max (size_t (im.nx ), max_w);
295
+ ++n_images;
296
+ return im;
294
297
});
295
298
return processed_row;
296
299
});
297
300
301
+ ov::Tensor batched_images{ov::element::f32, {n_images, 3 , max_h, max_w}};
302
+ float * batched_data = batched_images.data <float >();
298
303
const clip_image_f32& resized_preprocessed = preprocessed.at (0 ).at (0 );
299
- ImageSize resized_source_size{resized_preprocessed.ny / patch_size, resized_preprocessed.nx / patch_size};
300
- ov::Tensor input_tensor{ov::element::f32, {1 , 3 , size_t (resized_preprocessed.ny ), size_t (resized_preprocessed.nx )}, (void *)(resized_preprocessed.buf .data ())};
301
- ov::Tensor pixel_values = preprocess_for_encoder (input_tensor, patch_size);
304
+ std::copy (resized_preprocessed.buf .begin (), resized_preprocessed.buf .end (), batched_data);
305
+ if (1 < preprocessed.size ()) {
306
+ for (size_t row = 1 ; row < preprocessed.size (); ++row) {
307
+ size_t n_slices = preprocessed.at (row).size ();
308
+ for (size_t col = 0 ; col < n_slices; ++col) {
309
+ const clip_image_f32& elem = preprocessed.at (row).at (col);
310
+ std::copy (elem.buf .begin (), elem.buf .end (), batched_data + ((row - 1 ) * n_slices + col + 1 ) * 3 * max_h * max_w);
311
+ }
312
+ }
313
+ }
314
+ ov::Tensor pixel_values = preprocess_for_encoder (batched_images, patch_size);
302
315
encoder.set_tensor (" pixel_values" , pixel_values);
303
- ov::Tensor patch_attention_mask{ov::element::f32, {pixel_values.get_shape ().at (0 ), 1 , resized_source_size.height * resized_source_size.width }};
304
- std::fill_n (patch_attention_mask.data <float >(), patch_attention_mask.get_size (), 1 .0f );
316
+
317
+ ov::Tensor patch_attention_mask{ov::element::f32, {pixel_values.get_shape ().at (0 ), 1 , max_h / patch_size * max_w / patch_size}};
318
+ float * attention_data = patch_attention_mask.data <float >();
319
+ std::fill_n (attention_data, patch_attention_mask.get_size (), 0 .0f );
320
+ std::fill_n (attention_data, resized_preprocessed.ny / patch_size * resized_preprocessed.nx / patch_size, 1 .0f );
321
+ if (1 < preprocessed.size ()) {
322
+ for (size_t row = 1 ; row < preprocessed.size (); ++row) {
323
+ size_t n_slices = preprocessed.at (row).size ();
324
+ for (size_t col = 0 ; col < n_slices; ++col) {
325
+ const clip_image_f32& elem = preprocessed.at (row).at (col);
326
+ std::fill_n (attention_data + ((row - 1 ) * n_slices + col + 1 ) * max_h / patch_size * max_w / patch_size, elem.ny / patch_size * elem.nx / patch_size, 1 .0f );
327
+ }
328
+ }
329
+ }
305
330
encoder.set_tensor (" patch_attention_mask" , patch_attention_mask);
306
- ov::Tensor position_ids = prepare_vis_position_ids (pixel_values, patch_attention_mask, {resized_source_size}, ctx_clip.patch_size , ctx_clip.image_size / ctx_clip.patch_size );
331
+
332
+ ImageSize resized_source_size{resized_preprocessed.ny / patch_size, resized_preprocessed.nx / patch_size};
333
+ std::vector<ImageSize> tgt_sizes{resized_source_size};
334
+ if (1 < preprocessed.size ()) {
335
+ for (const std::vector<clip_image_f32>& row : preprocessed) {
336
+ for (const clip_image_f32& elem : row) {
337
+ tgt_sizes.push_back ({elem.ny / patch_size, elem.nx / patch_size});
338
+ }
339
+ }
340
+ }
341
+ ov::Tensor position_ids = prepare_vis_position_ids (pixel_values, patch_attention_mask, tgt_sizes, patch_size, ctx_clip.image_size / patch_size);
307
342
encoder.set_tensor (" position_ids" , position_ids);
308
343
encoder.infer ();
309
344
const ov::Tensor& output_tensor = encoder.get_output_tensor ();
310
- ov::Tensor resized_source{ov::element::f32, output_tensor.get_shape ()};
311
- output_tensor.copy_to (resized_source);
312
345
313
346
if (1 == preprocessed.size ()) {
347
+ ov::Tensor resized_source{ov::element::f32, output_tensor.get_shape ()};
348
+ output_tensor.copy_to (resized_source);
314
349
return {std::move (resized_source), resized_source_size};
315
350
}
316
351
317
- ImageSize raw_size{
318
- size_t (preprocessed.at (1 ).at (0 ).ny ),
319
- size_t (preprocessed.at (1 ).at (0 ).nx )
320
- };
321
- ImageSize slices_size{
322
- raw_size.height / patch_size,
323
- raw_size.width / patch_size
324
- };
325
- size_t n_patches = slices_size.height * slices_size.width ,
326
- old_hidden_size = resized_source.get_shape ().at (2 );
352
+ size_t old_hidden_size = output_tensor.get_shape ().at (2 );
353
+ const float * out = output_tensor.data <float >();
354
+ ov::Tensor resized_source{ov::element::f32, {1 , resized_source_size.height * resized_source_size.width , old_hidden_size}};
355
+ std::copy_n (out, resized_source.get_size (), resized_source.data <float >());
356
+
357
+ size_t n_patches = tgt_sizes.at (1 ).height * tgt_sizes.at (1 ).width ;
327
358
ov::Tensor encoded_slices{ov::element::f32, {preprocessed.size () - 1 , preprocessed.at (1 ).size (), n_patches, old_hidden_size}};
328
- for (size_t row = 1 ; row < preprocessed.size (); ++row) {
329
- for (size_t col = 0 ; col < preprocessed.at (row).size (); ++col) {
330
- clip_image_f32& elem = preprocessed.at (row).at (col);
331
- ov::Tensor pixel_values = preprocess_for_encoder (
332
- {ov::element::f32, {1 , 3 , size_t (elem.ny ), size_t (elem.nx )}, elem.buf .data ()},
333
- patch_size
334
- );
335
- encoder.set_tensor (" pixel_values" , pixel_values);
336
- ov::Tensor patch_attention_mask{ov::element::f32, {1 , 1 , slices_size.height * slices_size.width }};
337
- std::fill_n (patch_attention_mask.data <float >(), patch_attention_mask.get_size (), 1 .0f );
338
- encoder.set_tensor (" patch_attention_mask" , patch_attention_mask);
339
- ov::Tensor position_ids = prepare_vis_position_ids (pixel_values, patch_attention_mask, {slices_size}, ctx_clip.patch_size , ctx_clip.image_size / ctx_clip.patch_size );
340
- encoder.set_tensor (" position_ids" , position_ids);
341
- const ov::Tensor& old = encoder.get_output_tensor ();
342
- encoder.set_output_tensor ({ov::element::f32, {1 , n_patches, old_hidden_size}, encoded_slices.data <float >() + ((row - 1 ) * preprocessed.at (row).size () + col) * n_patches * old_hidden_size});
343
- encoder.infer ();
344
- encoder.set_output_tensor (old);
359
+ for (size_t col = 0 ; col < preprocessed.size () - 1 ; ++col) {
360
+ for (size_t row = 0 ; row < preprocessed.at (1 ).size (); ++row) {
361
+ std::copy_n (out + (col * preprocessed.at (1 ).size () + row + 1 ) * n_patches * old_hidden_size, n_patches * old_hidden_size, encoded_slices.data <float >() + (col * preprocessed.at (1 ).size () + row) * n_patches * old_hidden_size);
345
362
}
346
363
}
347
- return {resized_source, resized_source_size, encoded_slices, slices_size };
364
+ return {resized_source, resized_source_size, encoded_slices, tgt_sizes. at ( 1 ) };
348
365
}
349
366
350
367
ProcessorConfig from_any_map (
@@ -504,7 +521,6 @@ EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ov::AnyMap& co
504
521
505
522
EncodedImage VisionEncoder::encode_minicpm (const ov::Tensor& image, const ProcessorConfig& config) {
506
523
clip_ctx ctx_clip;
507
- ctx_clip.patch_size = m_processor_config.patch_size ;
508
524
ctx_clip.image_size = m_processor_config.image_size ;
509
525
std::copy (config.norm_mean .begin (), config.norm_mean .end (), ctx_clip.image_mean );
510
526
std::copy (config.norm_std .begin (), config.norm_std .end (), ctx_clip.image_std );
0 commit comments