@@ -380,6 +380,7 @@ DecodedResults VLMPipeline::generate(
380
380
m_history.push_back ({{" role" , " user" }, {" content" , images_prompt}});
381
381
constexpr bool add_generation_prompt = true ;
382
382
std::string new_templated_chat_history = m_tokenizer.apply_chat_template (m_history, add_generation_prompt);
383
+ std::cout << new_templated_chat_history << ' \n ' ;
383
384
ov::Tensor new_chat_tokens = m_tokenizer.encode (new_templated_chat_history).input_ids ;
384
385
if (0 == m_language.get_tensor (" attention_mask" ).get_shape ().at (1 )) {
385
386
encoded_input = new_chat_tokens;
@@ -418,6 +419,10 @@ DecodedResults VLMPipeline::generate(
418
419
int64_t slice_end_id = special_tokens.data <int64_t >()[3 ];
419
420
int64_t im_start_pos = 0 , slice_start_pos = 0 ;
420
421
int64_t * begin = encoded_input.data <int64_t >();
422
+ for (size_t cont = 0 ; cont < encoded_input.get_size (); ++cont) {
423
+ std::cout << begin[cont] << " , " ;
424
+ }
425
+ std::cout << ' \n ' ;
421
426
int64_t * ids = begin;
422
427
size_t encoded_input_size = encoded_input.get_size ();
423
428
int64_t * end = ids + encoded_input_size;
@@ -426,9 +431,11 @@ DecodedResults VLMPipeline::generate(
426
431
const ov::Tensor& resampled_source = resample (*this , encoded_image.resized_source , {encoded_image.resized_source_size });
427
432
float * emb = resampled_source.data <float >();
428
433
ids = std::find (ids, end, im_start_id);
434
+ std::cout << std::distance (begin, ids) << ' \n ' ;
429
435
OPENVINO_ASSERT (end != ids);
430
436
std::copy_n (emb, resampled_source.get_size (), inputs_embeds_data + std::distance (begin, ids) * m_vlm_config.hidden_size );
431
- ids += m_vlm_config.hidden_size ;
437
+ ids += m_vlm_config.query_num ;
438
+ std::cout << std::distance (begin, ids) << ' \n ' ;
432
439
if (encoded_image.slices ) {
433
440
size_t token_idx = 0 ;
434
441
const ov::Shape& slices_shape = encoded_image.slices .get_shape ();
@@ -440,9 +447,11 @@ DecodedResults VLMPipeline::generate(
440
447
ov::Tensor encoded_view{ov::element::f32, {1 , d2, d3}, encoded_image.slices .data <float >() + (i * slices_shape.at (1 ) + ja) * d2 * d3};
441
448
const ov::Tensor& vision_embed_tensor_i_j = resample (*this , encoded_view, {sliced_sizes.at (i * slices_shape.at (1 ) + ja)});
442
449
ids = std::find (ids, end, slice_start_id);
450
+ std::cout << std::distance (begin, ids) << ' \n ' ;
443
451
OPENVINO_ASSERT (end != ids);
444
452
std::copy_n (vision_embed_tensor_i_j.data <float >(), vision_embed_tensor_i_j.get_size (), inputs_embeds_data + std::distance (begin, ids) * m_vlm_config.hidden_size );
445
- ids += m_vlm_config.hidden_size ;
453
+ ids += m_vlm_config.query_num ;
454
+ std::cout << std::distance (begin, ids) << ' \n ' ;
446
455
}
447
456
}
448
457
}
0 commit comments