@@ -216,6 +216,65 @@ ov::Tensor preprocess_for_encoder(const ov::Tensor& images, size_t kernel) {
return permuted_tensor;
}
+ // torch.bucketize(fractional_coords, boundaries, right=True)
+ std::vector<int64_t> bucket_size_right(const std::vector<float>& fractional_coords, const std::vector<float>& boundaries) {
+     std::vector<int64_t> bucket_coords(fractional_coords.size());
+     std::transform(fractional_coords.begin(), fractional_coords.end(), bucket_coords.begin(), [&boundaries](float fractional_coord) {
+         return std::distance(boundaries.begin(), std::upper_bound(boundaries.begin(), boundaries.end(), fractional_coord));
+     });
+     return bucket_coords;
+ }
+
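+ // Computes flattened per-patch position ids for the vision encoder: every patch of every
+ // image in the batch is mapped onto a num_patches_per_side x num_patches_per_side grid and
+ // gets the id grid_row * num_patches_per_side + grid_col; padded patches keep id 0.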
+ ov::Tensor prepare_vis_position_ids(
+     const ov::Tensor& pixel_values,
+     const ov::Tensor& patch_attention_mask,
+     const std::vector<HeightWidth> tgt_sizes,
+     size_t patch_size,
+     size_t num_patches_per_side
+ ) {
+     size_t batch_size = pixel_values.get_shape().at(0);
+     size_t max_im_h = pixel_values.get_shape().at(2), max_im_w = pixel_values.get_shape().at(3);
+     size_t max_nb_patches_h = max_im_h / patch_size, max_nb_patches_w = max_im_w / patch_size;
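+     // Bucket boundaries {1/N, 2/N, ..., (N-1)/N}, where N = num_patches_per_side.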
+     std::vector<float> boundaries(1.0f * num_patches_per_side - 1);
+     std::generate(boundaries.begin(), boundaries.end(), [num_patches_per_side, val = 0.0f]() mutable {
+         val += 1.0f / num_patches_per_side;
+         return val;
+     });
+     size_t position_ids_batch_elem = max_nb_patches_h * max_nb_patches_w;
+     ov::Tensor position_ids{ov::element::i64, {batch_size, position_ids_batch_elem}};
+     int64_t* res_data = position_ids.data<int64_t>();
+     std::fill_n(res_data, position_ids.get_size(), 0);
+
+     for (size_t batch_idx = 0; batch_idx < batch_size; ++batch_idx) {
+         size_t nb_patches_h = tgt_sizes.at(batch_idx).height;
+         size_t nb_patches_w = tgt_sizes.at(batch_idx).width;
+
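+         // Fractional coordinates {0, 1/nb, ..., (nb-1)/nb} of the patches actually
+         // present in this image (per tgt_sizes), along height and width.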
+         std::vector<float> fractional_coords_h(nb_patches_h);
+         std::generate(fractional_coords_h.begin(), fractional_coords_h.end(), [nb_patches_h, val = -1.0f / nb_patches_h]() mutable {
+             val += 1.0f / nb_patches_h;
+             return val;
+         });
+         std::vector<float> fractional_coords_w(nb_patches_w);
+         std::generate(fractional_coords_w.begin(), fractional_coords_w.end(), [nb_patches_w, val = -1.0f / nb_patches_w]() mutable {
+             val += 1.0f / nb_patches_w;
+             return val;
+         });
+
+         std::vector<int64_t> bucket_coords_h = bucket_size_right(fractional_coords_h, boundaries);
+         std::vector<int64_t> bucket_coords_w = bucket_size_right(fractional_coords_w, boundaries);
+
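+         // Each patch gets the id grid_row * num_patches_per_side + grid_col, in row-major order.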
+         std::vector<int64_t> pos_ids(bucket_coords_h.size() * bucket_coords_w.size());
+         for (size_t col = 0; col < bucket_coords_h.size(); ++col) {
+             for (size_t row = 0; row < bucket_coords_w.size(); ++row) {
+                 pos_ids.at(col * bucket_coords_w.size() + row) = bucket_coords_h.at(col) * num_patches_per_side + bucket_coords_w.at(row);
+             }
+         }
+         std::copy(pos_ids.begin(), pos_ids.end(), res_data + batch_idx * position_ids_batch_elem);
+     }
+     return position_ids;
+ }
+
EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const ov::Tensor& img, ov::InferRequest& encoder, int max_slice_nums, int scale_resolution, size_t patch_size, bool never_split) {
    clip_image_u8 source{
        int(img.get_shape().at(3)),
@@ -244,14 +303,11 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
    ov::Tensor patch_attention_mask{ov::element::boolean, {pixel_values.get_shape().at(0), 1, resized_source_size.height * resized_source_size.width}};
    std::fill_n(patch_attention_mask.data<bool>(), patch_attention_mask.get_size(), true);
    encoder.set_tensor("patch_attention_mask", patch_attention_mask);
-     ov::Tensor tgt_sizes{ov::element::i64, {1, 2}};
-     int64_t* tgt_sizes_data = tgt_sizes.data<int64_t>();
-     tgt_sizes_data[0] = resized_source_size.height;
-     tgt_sizes_data[1] = resized_source_size.width;
-     encoder.set_tensor("tgt_sizes", tgt_sizes);
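+     // Position ids are now computed on the host and replace the encoder's former tgt_sizes input.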
+     ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {resized_source_size}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size);
+     encoder.set_tensor("position_ids", position_ids);
    encoder.infer();
    const ov::Tensor& output_tensor = encoder.get_output_tensor();
-     ov::Tensor resized_source{output_tensor.get_element_type(), output_tensor.get_shape()};
+     ov::Tensor resized_source{ov::element::f32, output_tensor.get_shape()};
    output_tensor.copy_to(resized_source);

    if (1 == preprocessed.size()) {
@@ -266,27 +322,24 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
    size_t n_patches = size.height / patch_size * size.width / patch_size,
        old_hidden_size = resized_source.get_shape().at(2);
    ov::Tensor encoded_slices{ov::element::f32, {preprocessed.size() - 1, preprocessed.at(1).size(), n_patches, old_hidden_size}};
-     // Somewhere inside there is an operation that constant-folds the batch size, so a batch different from the one used at export can't be used.
-     // The constant folding happens in the TorchScript.
-     // Even though batch can't be used, it's still possible to use async.
    for (size_t row = 1; row < preprocessed.size(); ++row) {
        for (size_t col = 0; col < preprocessed.at(row).size(); ++col) {
            clip_image_f32& elem = preprocessed.at(row).at(col);
            sliced_sizes.push_back({elem.ny / patch_size, elem.nx / patch_size});
-             encoder.set_tensor("pixel_values", preprocess_for_encoder(
+             ov::Tensor pixel_values = preprocess_for_encoder(
                {ov::element::f32, {1, 3, size_t(elem.ny), size_t(elem.nx)}, elem.buf.data()},
                patch_size
-             ));
+             );
+             encoder.set_tensor("pixel_values", pixel_values);
            ov::Tensor patch_attention_mask{ov::element::boolean, {1, 1, sliced_sizes.back().height * sliced_sizes.back().width}};
            std::fill_n(patch_attention_mask.data<bool>(), patch_attention_mask.get_size(), true);
            encoder.set_tensor("patch_attention_mask", patch_attention_mask);
-             ov::Tensor tgt_sizes{ov::element::i64, {1, 2}};
-             int64_t* tgt_sizes_data = tgt_sizes.data<int64_t>();
-             tgt_sizes_data[0] = sliced_sizes.back().height;
-             tgt_sizes_data[1] = sliced_sizes.back().width;
-             encoder.set_tensor("tgt_sizes", tgt_sizes);
+             ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {sliced_sizes.back()}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size);
+             encoder.set_tensor("position_ids", position_ids);
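+             // Write this slice's embeddings straight into encoded_slices by temporarily
+             // redirecting the encoder output tensor; the original output tensor is restored after infer().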
+             const ov::Tensor& old = encoder.get_output_tensor();
            encoder.set_output_tensor({ov::element::f32, {1, n_patches, old_hidden_size}, encoded_slices.data<float>() + ((row - 1) * preprocessed.at(row).size() + col) * n_patches * old_hidden_size});
            encoder.infer();
+             encoder.set_output_tensor(old);
        }
    }
    return {resized_source, resized_source_size, encoded_slices, sliced_sizes};
@@ -305,6 +358,8 @@ VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const std::

EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ProcessorConfig& config) {
    clip_ctx ctx_clip;
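+     // Needed by llava_image_embed_make_with_bytes_slice() to build position ids for the vision encoder.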
+     ctx_clip.patch_size = m_processor_config.patch_size;
+     ctx_clip.image_size = m_processor_config.image_size;
    std::copy(config.norm_mean.begin(), config.norm_mean.end(), ctx_clip.image_mean);
    std::copy(config.norm_std.begin(), config.norm_std.end(), ctx_clip.image_std);
    return llava_image_embed_make_with_bytes_slice(ctx_clip, image, m_encoder, config.max_slice_nums, config.scale_resolution, config.patch_size, 0 == config.max_slice_nums);