@@ -412,39 +412,37 @@ DecodedResults VLMPipeline::generate(
412
412
4 == special_tokens.get_shape ().at (1 ),
413
413
" Every special token must be represented with a single int."
414
414
);
415
- size_t im_start_id = special_tokens.data <int64_t >()[0 ];
416
- size_t im_end_id = special_tokens.data <int64_t >()[1 ];
417
- size_t slice_start_id = special_tokens.data <int64_t >()[2 ];
418
- size_t slice_end_id = special_tokens.data <int64_t >()[3 ];
419
- size_t im_start_pos = 0 , slice_start_pos = 0 ;
415
+ int64_t im_start_id = special_tokens.data <int64_t >()[0 ];
416
+ int64_t im_end_id = special_tokens.data <int64_t >()[1 ];
417
+ int64_t slice_start_id = special_tokens.data <int64_t >()[2 ];
418
+ int64_t slice_end_id = special_tokens.data <int64_t >()[3 ];
419
+ int64_t im_start_pos = 0 , slice_start_pos = 0 ;
420
420
int64_t * begin = encoded_input.data <int64_t >();
421
421
int64_t * ids = begin;
422
422
size_t encoded_input_size = encoded_input.get_size ();
423
- const int64_t * end = ids + encoded_input_size;
424
- float * input_embeds_data = input_embeds .data <float >();
423
+ int64_t * end = ids + encoded_input_size;
424
+ float * inputs_embeds_data = inputs_embeds .data <float >();
425
425
for (const EncodedImage& encoded_image : embeds) {
426
426
const ov::Tensor& resampled_source = resample (*this , encoded_image.resized_source , {encoded_image.resized_source_size });
427
427
float * emb = resampled_source.data <float >();
428
428
ids = std::find (ids, end, im_start_id);
429
- if (end == ids) {
430
- break ;
431
- }
432
- ids = std::copy_n (emb, resampled_source.get_size (), input_embeds_data + std::distance (begin, ids) * m_vlm_config.hidden_size );
433
- if (embeds.slices ) {
429
+ OPENVINO_ASSERT (end != ids);
430
+ std::copy_n (emb, resampled_source.get_size (), inputs_embeds_data + std::distance (begin, ids) * m_vlm_config.hidden_size );
431
+ ids += m_vlm_config.hidden_size ;
432
+ if (encoded_image.slices ) {
434
433
size_t token_idx = 0 ;
435
- const ov::Shape& slices_shape = embeds .slices .get_shape ();
436
- const std::vector<HeightWidth>& sliced_sizes = embeds .slices_sizes ;
434
+ const ov::Shape& slices_shape = encoded_image .slices .get_shape ();
435
+ const std::vector<HeightWidth>& sliced_sizes = encoded_image .slices_sizes ;
437
436
for (size_t i = 0 ; i < slices_shape.at (0 ); ++i) {
438
437
for (size_t ja = 0 ; ja < slices_shape.at (1 ); ++ja) {
439
438
size_t d2 = slices_shape.at (2 );
440
439
size_t d3 = slices_shape.at (3 );
441
- ov::Tensor encoded_view{ov::element::f32, {1 , d2, d3}, embeds .slices .data <float >() + (i * slices_shape.at (1 ) + ja) * d2 * d3};
440
+ ov::Tensor encoded_view{ov::element::f32, {1 , d2, d3}, encoded_image .slices .data <float >() + (i * slices_shape.at (1 ) + ja) * d2 * d3};
442
441
const ov::Tensor& vision_embed_tensor_i_j = resample (*this , encoded_view, {sliced_sizes.at (i * slices_shape.at (1 ) + ja)});
443
442
ids = std::find (ids, end, slice_start_id);
444
- if (end == ids) {
445
- break ;
446
- }
447
- ids = std::copy_n (vision_embed_tensor_i_j.data <float >(), vision_embed_tensor_i_j.get_size (), input_embeds_data + std::distance (begin, ids) * m_vlm_config.hidden_size );
443
+ OPENVINO_ASSERT (end != ids);
444
+ std::copy_n (vision_embed_tensor_i_j.data <float >(), vision_embed_tensor_i_j.get_size (), inputs_embeds_data + std::distance (begin, ids) * m_vlm_config.hidden_size );
445
+ ids += m_vlm_config.hidden_size ;
448
446
}
449
447
}
450
448
}
0 commit comments