@@ -342,7 +342,7 @@ DecodedResults VLMPipeline::generate(
342
342
for (const ov::Tensor& rgb : rgbs) {
343
343
EncodedImage encoded_image = m_vision_encoder.encode (rgb);
344
344
if (m_vlm_config.use_image_id ) {
345
- images_prompt = m_vlm_config.im_id_start + std::to_string (image_id) + m_vlm_config.im_id_end ;
345
+ images_prompt + = m_vlm_config.im_id_start + std::to_string (image_id) + m_vlm_config.im_id_end ;
346
346
++image_id;
347
347
}
348
348
std::string unk64;
@@ -380,7 +380,6 @@ DecodedResults VLMPipeline::generate(
380
380
m_history.push_back ({{" role" , " user" }, {" content" , images_prompt}});
381
381
constexpr bool add_generation_prompt = true ;
382
382
std::string new_templated_chat_history = m_tokenizer.apply_chat_template (m_history, add_generation_prompt);
383
- std::cout << new_templated_chat_history << ' \n ' ;
384
383
ov::Tensor new_chat_tokens = m_tokenizer.encode (new_templated_chat_history).input_ids ;
385
384
if (0 == m_language.get_tensor (" attention_mask" ).get_shape ().at (1 )) {
386
385
encoded_input = new_chat_tokens;
@@ -419,10 +418,6 @@ DecodedResults VLMPipeline::generate(
419
418
int64_t slice_end_id = special_tokens.data <int64_t >()[3 ];
420
419
int64_t im_start_pos = 0 , slice_start_pos = 0 ;
421
420
int64_t * begin = encoded_input.data <int64_t >();
422
- for (size_t cont = 0 ; cont < encoded_input.get_size (); ++cont) {
423
- std::cout << begin[cont] << " , " ;
424
- }
425
- std::cout << ' \n ' ;
426
421
int64_t * ids = begin;
427
422
size_t encoded_input_size = encoded_input.get_size ();
428
423
int64_t * end = ids + encoded_input_size;
@@ -431,11 +426,9 @@ DecodedResults VLMPipeline::generate(
431
426
const ov::Tensor& resampled_source = resample (*this , encoded_image.resized_source , {encoded_image.resized_source_size });
432
427
float * emb = resampled_source.data <float >();
433
428
ids = std::find (ids, end, im_start_id);
434
- std::cout << std::distance (begin, ids) << ' \n ' ;
435
429
OPENVINO_ASSERT (end != ids);
436
430
std::copy_n (emb, resampled_source.get_size (), inputs_embeds_data + std::distance (begin, ids) * m_vlm_config.hidden_size );
437
431
ids += m_vlm_config.query_num ;
438
- std::cout << std::distance (begin, ids) << ' \n ' ;
439
432
if (encoded_image.slices ) {
440
433
size_t token_idx = 0 ;
441
434
const ov::Shape& slices_shape = encoded_image.slices .get_shape ();
@@ -447,11 +440,9 @@ DecodedResults VLMPipeline::generate(
447
440
ov::Tensor encoded_view{ov::element::f32, {1 , d2, d3}, encoded_image.slices .data <float >() + (i * slices_shape.at (1 ) + ja) * d2 * d3};
448
441
const ov::Tensor& vision_embed_tensor_i_j = resample (*this , encoded_view, {sliced_sizes.at (i * slices_shape.at (1 ) + ja)});
449
442
ids = std::find (ids, end, slice_start_id);
450
- std::cout << std::distance (begin, ids) << ' \n ' ;
451
443
OPENVINO_ASSERT (end != ids);
452
444
std::copy_n (vision_embed_tensor_i_j.data <float >(), vision_embed_tensor_i_j.get_size (), inputs_embeds_data + std::distance (begin, ids) * m_vlm_config.hidden_size );
453
445
ids += m_vlm_config.query_num ;
454
- std::cout << std::distance (begin, ids) << ' \n ' ;
455
446
}
456
447
}
457
448
}
0 commit comments