@@ -59,9 +59,9 @@ std::vector<std::vector<clip_image_u8>> slice_image(const clip_image_u8& img, co
59
59
const std::pair<int , int > original_size{img.nx , img.ny };
60
60
const int original_width = img.nx ;
61
61
const int original_height = img.ny ;
62
- const float log_ratio = log (1.0 * original_width / original_height);
63
- const float ratio = 1.0 * original_width * original_height / (scale_resolution * scale_resolution);
64
- const int multiple = fmin (ceil (ratio), max_slice_nums);
62
+ const float log_ratio = log (1 .0f * original_width / original_height);
63
+ const float ratio = 1 .0f * original_width * original_height / (scale_resolution * scale_resolution);
64
+ const int multiple = std::min (ceil (ratio), max_slice_nums);
65
65
66
66
std::vector<std::vector<clip_image_u8>> images;
67
67
images.push_back (std::vector<clip_image_u8>{});
@@ -140,7 +140,176 @@ std::vector<std::vector<clip_image_u8>> slice_image(const clip_image_u8& img, co
140
140
return images;
141
141
}
142
142
143
- EncodedImage llava_image_embed_make_with_bytes_slice (clip_ctx& ctx_clip, const ov::Tensor& img, ov::InferRequest& encoder, int max_slice_nums, int scale_resolution, size_t patch_size, bool never_split) {
143
+ ov::Tensor concatenate (const ov::Tensor& first, const ov::Tensor& second) {
144
+ size_t res_d_0 = first.get_shape ().at (0 );
145
+ size_t res_d_1 = first.get_shape ().at (1 );
146
+ size_t res_d_2 = first.get_shape ().at (2 ) * 2 ;
147
+ ov::Tensor res{first.get_element_type (), {res_d_0, res_d_1, res_d_2}};
148
+ float * first_data = first.data <float >();
149
+ float * second_data = second.data <float >();
150
+ float * res_data = res.data <float >();
151
+ for (size_t i = 0 ; i < res_d_0; ++i) {
152
+ for (size_t j = 0 ; j < res_d_1; ++j) {
153
+ size_t k = 0 ;
154
+ for (; k < first.get_shape ().at (2 ); ++k) {
155
+ res_data[i * res_d_1 * res_d_2 + j * res_d_2 + k]
156
+ = first_data[i * res_d_1 * first.get_shape ().at (2 ) + j * first.get_shape ().at (2 ) + k];
157
+ }
158
+ for (size_t l = 0 ; l < second.get_shape ().at (2 ); ++l, ++k) {
159
+ res_data[i * res_d_1 * res_d_2 + j * res_d_2 + k]
160
+ = second_data[i * res_d_1 * second.get_shape ().at (2 ) + j * second.get_shape ().at (2 ) + l];
161
+ }
162
+ }
163
+ }
164
+ return res;
165
+ }
166
+
167
+ // / embed_dim: output dimension for each position
168
+ // / pos: a list of positions to be encoded: size (H, W)
169
+ // / out: (H, W, D)
170
+ ov::Tensor get_1d_sincos_pos_embed_from_grid_new (size_t embed_dim, const ov::Tensor& pos) {
171
+ OPENVINO_ASSERT (embed_dim % 2 == 0 );
172
+ OPENVINO_ASSERT (pos.get_shape ().size () == 3 );
173
+ OPENVINO_ASSERT (pos.get_shape ().at (0 ) == 1 );
174
+ size_t d0 = pos.get_shape ().at (1 );
175
+ size_t d1 = pos.get_shape ().at (2 );
176
+ size_t d2 = embed_dim / 2 ;
177
+ std::vector<float > omega (d2);
178
+ for (size_t idx = 0 ; idx < omega.size (); ++idx) {
179
+ omega.at (idx) = idx / (embed_dim / 2 .0f );
180
+ omega.at (idx) = 1 .0f / std::pow (10000 .0f , omega.at (idx)); // (D/2,)
181
+ }
182
+ const float * const pos_data = pos.data <float >();
183
+ ov::Tensor out (ov::element::f32, {d0, d1, d2}); // (H, W, D/2), outer product
184
+ float * out_data = out.data <float >();
185
+ for (size_t i = 0 ; i < d0; ++i) {
186
+ for (size_t j = 0 ; j < d1; ++j) {
187
+ for (size_t k = 0 ; k < d2; ++k) {
188
+ out_data[i * d1 * d2 + j * d2 + k]
189
+ = pos_data[i * d1 + j] * omega[k];
190
+ }
191
+ }
192
+ }
193
+
194
+ ov::Tensor emb_sin{out.get_element_type (), out.get_shape ()}; // (H, W, D/2)
195
+ float * emb_sin_data = emb_sin.data <float >();
196
+ std::transform (out_data, out_data + out.get_size (), emb_sin_data, [](float arg) {
197
+ return std::sin (arg);
198
+ });
199
+ ov::Tensor emb_cos{out.get_element_type (), out.get_shape ()}; // (H, W, D/2)
200
+ float * emb_cos_data = emb_cos.data <float >();
201
+ std::transform (out_data, out_data + out.get_size (), emb_cos_data, [](float arg) {
202
+ return std::cos (arg);
203
+ });
204
+ return concatenate (emb_sin, emb_cos); // (H, W, D)
205
+ }
206
+
207
+ ov::Tensor get_2d_sincos_pos_embed_from_grid (size_t embed_dim, const ov::Tensor& grid) {
208
+ OPENVINO_ASSERT (embed_dim % 2 == 0 );
209
+ // use half of dimensions to encode grid_h
210
+ ov::Coordinate begin_h{0 , 0 , 0 };
211
+ ov::Coordinate end_h{grid.get_shape ()};
212
+ end_h.at (0 ) = 1 ;
213
+ ov::Coordinate begin_w{1 , 0 , 0 };
214
+ ov::Coordinate end_w{grid.get_shape ()};
215
+ end_w.at (0 ) = 2 ;
216
+ ov::Tensor emb_h = get_1d_sincos_pos_embed_from_grid_new (embed_dim / 2 , ov::Tensor{grid, begin_h, end_h}); // (H, W, D/2)
217
+ ov::Tensor emb_w = get_1d_sincos_pos_embed_from_grid_new (embed_dim / 2 , ov::Tensor{grid, begin_w, end_w}); // (H, W, D/2)
218
+ return concatenate (emb_h, emb_w);
219
+ }
220
+
221
+ // / image_size: image_size or (image_height, image_width)
222
+ // / return:
223
+ // / pos_embed: [image_height, image_width, embed_dim]
224
+ ov::Tensor get_2d_sincos_pos_embed (size_t embed_dim, const HeightWidth& image_size) {
225
+ size_t grid_h_size = image_size.height , grid_w_size = image_size.width ;
226
+ ov::Tensor grid (ov::element::f32, {2 , grid_h_size, grid_w_size});
227
+ float * data = grid.data <float >();
228
+ for (size_t y = 0 ; y < grid_h_size; ++y) {
229
+ std::iota (data, data + grid_w_size, 0 .0f );
230
+ data += grid_w_size;
231
+ }
232
+ for (float y = 0 .0f ; y < grid_h_size; ++y) {
233
+ std::fill (data, data + grid_w_size, y);
234
+ data += grid_w_size;
235
+ }
236
+ return get_2d_sincos_pos_embed_from_grid (embed_dim, grid);
237
+ }
238
+
239
+ void adjust_pos_cache (
240
+ const std::vector<HeightWidth>& target_sizes,
241
+ size_t hidden_size,
242
+ ov::Tensor& pos_embed_cache
243
+ ) {
244
+ size_t max_h = std::max_element (target_sizes.begin (), target_sizes.end (), [](const HeightWidth& left, const HeightWidth& right) {
245
+ return left.height < right.height ;
246
+ })->height ;
247
+ size_t max_w = std::max_element (target_sizes.begin (), target_sizes.end (), [](const HeightWidth& left, const HeightWidth& right) {
248
+ return left.width < right.width ;
249
+ })->width ;
250
+ size_t allocated_height, allocated_width;
251
+ if (pos_embed_cache) {
252
+ const ov::Shape& allocated_shape = pos_embed_cache.get_shape ();
253
+ allocated_height = allocated_shape.at (0 );
254
+ allocated_width = allocated_shape.at (1 );
255
+ } else {
256
+ allocated_height = allocated_width = 70 ;
257
+ }
258
+ if (max_h > allocated_height || max_w > allocated_width) {
259
+ allocated_height = std::max (max_h, allocated_height);
260
+ allocated_width = std::max (max_w, allocated_width);
261
+ pos_embed_cache = get_2d_sincos_pos_embed (
262
+ hidden_size, {allocated_height, allocated_width}
263
+ );
264
+ }
265
+ }
266
+
267
+ ov::Tensor resample (VisionEncoder& vision, const ov::Tensor& encoded_image, const std::vector<HeightWidth>& target_sizes) {
268
+ size_t bs = encoded_image.get_shape ().at (0 );
269
+ std::vector<size_t > patch_len{target_sizes.size ()};
270
+ std::transform (target_sizes.begin (), target_sizes.end (), patch_len.begin (), [](const HeightWidth& height_width) {
271
+ return height_width.height * height_width.width ;
272
+ });
273
+ adjust_pos_cache (
274
+ target_sizes,
275
+ vision.m_vlm_config .hidden_size ,
276
+ vision.m_pos_embed_cache
277
+ );
278
+ size_t max_patch_len = *std::max_element (patch_len.begin (), patch_len.end ());
279
+ ov::Tensor key_padding_mask (ov::element::boolean, {bs, max_patch_len});
280
+ bool * mask_data = key_padding_mask.data <bool >();
281
+ size_t embed_len = vision.m_pos_embed_cache .get_shape ().at (2 );
282
+ ov::Tensor pos_embed (ov::element::f32, {max_patch_len, bs, embed_len}); // BLD => L * B * D
283
+ float * pos_embed_data = pos_embed.data <float >();
284
+ float * cache_data = vision.m_pos_embed_cache .data <float >();
285
+ size_t _d0 = vision.m_pos_embed_cache .get_shape ().at (0 );
286
+ size_t _d1 = vision.m_pos_embed_cache .get_shape ().at (1 );
287
+ for (size_t i = 0 ; i < bs; ++i) {
288
+ size_t target_h = target_sizes.at (i).height ;
289
+ size_t target_w = target_sizes.at (i).width ;
290
+ for (size_t h_idx = 0 ; h_idx < target_h; ++h_idx) {
291
+ for (size_t w_idx = 0 ; w_idx < target_w; ++w_idx) {
292
+ std::copy_n (
293
+ cache_data + h_idx * _d1 + w_idx,
294
+ embed_len,
295
+ pos_embed_data + (h_idx * target_w + w_idx) * bs * embed_len + i * embed_len
296
+ );
297
+ }
298
+ }
299
+ for (size_t flat = target_h * target_w; flat < max_patch_len; ++flat) {
300
+ std::fill_n (pos_embed_data + flat * bs * embed_len + i * embed_len, embed_len, 0 .0f );
301
+ }
302
+ std::fill_n (mask_data + i * max_patch_len, patch_len[i], false );
303
+ std::fill_n (mask_data + i * max_patch_len + patch_len[i], max_patch_len - patch_len[i], true );
304
+ }
305
+ vision.m_resampler .set_tensor (" x" , encoded_image); // [N, H*W, old_hidden_size]
306
+ vision.m_resampler .set_tensor (" pos_embed" , pos_embed); // [H*W, N, new_hidden_size]
307
+ vision.m_resampler .set_tensor (" key_padding_mask" , key_padding_mask); // [N, H*W]
308
+ vision.m_resampler .infer ();
309
+ return pipe .m_resampler .get_output_tensor (); // [N, query_num, new_hidden_size]
310
+ }
311
+
312
+ EncodedImage llava_image_embed_make_with_bytes_slice (VisionEncoder& vision, clip_ctx& ctx_clip, const ov::Tensor& img, ov::InferRequest& encoder, int max_slice_nums, int scale_resolution, size_t patch_size, bool never_split) {
144
313
clip_image_u8 source{
145
314
int (img.get_shape ().at (3 )),
146
315
int (img.get_shape ().at (2 )),
@@ -168,35 +337,43 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
168
337
ov::Tensor resized_source{output_tensor.get_element_type (), output_tensor.get_shape ()};
169
338
output_tensor.copy_to (resized_source);
170
339
HeightWidth resized_source_size{resized_preprocessed.ny / patch_size, resized_preprocessed.nx / patch_size};
171
-
172
- HeightWidth size{
173
- size_t (preprocessed.at (1 ).at (0 ).ny ),
174
- size_t (preprocessed.at (1 ).at (0 ).nx )
175
- };
176
- ov::Tensor batched{ov::element::f32, {(preprocessed.size () - 1 ) * preprocessed.at (1 ).size (), 3 , size.height , size.width }};
177
- float * batched_data = batched.data <float >();
178
- size_t batch_offset = 0 ;
179
- size_t values_in_elem = 3 * size.height * size.width ;
180
- std::vector<HeightWidth> sliced_sizes;
181
- for (size_t row = 1 ; row < preprocessed.size (); ++row) {
182
- for (const clip_image_f32& elem : preprocessed.at (row)) {
183
- std::copy_n (elem.buf .begin (), values_in_elem, batched_data + batch_offset);
184
- sliced_sizes.push_back ({elem.ny / patch_size, elem.nx / patch_size});
185
- batch_offset += values_in_elem;
186
- }
187
- }
188
- encoder.set_input_tensor (batched);
189
- encoder.infer ();
190
- const ov::Tensor& encoded = encoder.get_output_tensor ();
191
- const ov::Shape& plain = encoded.get_shape ();
192
340
struct SharedTensorAllocator {
193
341
const ov::Tensor tensor;
194
342
void * allocate (size_t bytes, size_t ) {return bytes <= tensor.get_byte_size () ? tensor.data () : nullptr ;}
195
343
void deallocate (void *, size_t , size_t ) {}
196
344
bool is_equal (const SharedTensorAllocator& other) const noexcept {return this == &other;}
197
345
};
198
- ov::Tensor reshaped{encoded.get_element_type (), {preprocessed.size () - 1 , preprocessed.at (1 ).size (), plain.at (1 ), plain.at (2 )}, SharedTensorAllocator{encoded}};
199
- return {resized_source, resized_source_size, reshaped, sliced_sizes};
346
+ ov::Tensor resampled_resized = resample (vision, encoder.get_output_tensor (), {resized_preprocessed.ny , resize_preprocessed.nx });
347
+ ov::Tensor owner{resampled_resized.get_element_type (), resized_resampled.get_shape ()};
348
+ resampled_resized.copy_to (owner);
349
+ owner.set_shape (owner.get_shape ().at (1 ), owner.get_shape ().at (2 ));
350
+
351
+ ov::Tensor resampled_slices;
352
+ if (1 < preprocessed.size ()) {
353
+ HeightWidth size{
354
+ size_t (preprocessed.at (1 ).at (0 ).ny ),
355
+ size_t (preprocessed.at (1 ).at (0 ).nx )
356
+ };
357
+ ov::Tensor batched{ov::element::f32, {(preprocessed.size () - 1 ) * preprocessed.at (1 ).size (), 3 , size.height , size.width }};
358
+ float * batched_data = batched.data <float >();
359
+ size_t batch_offset = 0 ;
360
+ size_t values_in_elem = 3 * size.height * size.width ;
361
+ std::vector<HeightWidth> sliced_sizes;
362
+ for (size_t row = 1 ; row < preprocessed.size (); ++row) {
363
+ for (const clip_image_f32& elem : preprocessed.at (row)) {
364
+ std::copy_n (elem.buf .begin (), values_in_elem, batched_data + batch_offset);
365
+ sliced_sizes.push_back ({elem.ny / patch_size, elem.nx / patch_size});
366
+ batch_offset += values_in_elem;
367
+ }
368
+ }
369
+ encoder.set_input_tensor (batched);
370
+ encoder.infer ();
371
+ ov::Tensor resampled_batched = resample (vision, encoder.get_output_tensor (), size);
372
+ const ov::Tensor& encoded = encoder.get_output_tensor ();
373
+ const ov::Shape& plain = encoded.get_shape ();
374
+ resampled_slices{encoded.get_element_type (), {preprocessed.size () - 1 , preprocessed.at (1 ).size (), plain.at (1 ), plain.at (2 )}, SharedTensorAllocator{encoded}};
375
+ }
376
+ return {owner, resampled_slices};
200
377
}
201
378
}
202
379
0 commit comments