@@ -298,6 +298,11 @@ ov::Tensor merge_text_and_image_embeddings_llava(
 
     return merged_embeds;
 }
+
+ov::Core singleton_core() {
+    static ov::Core core;
+    return core;
+}
 }
 
 class ov::genai::VLMPipeline::VLMPipelineImpl {
@@ -345,30 +350,31 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
             )
         },
         m_tokenizer{Tokenizer(model_dir.string(), device_config)},
-        m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config, ov::Core{}),
+        m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config, singleton_core()),
         m_is_chat_conversation{false},
         m_image_id{0} {
+            ov::Core core = singleton_core();
             if (m_vlm_config.model_type == VLMModelType::MINICPM) {
-                m_resampler = ov::Core{}.compile_model(
+                m_resampler = core.compile_model(
                     model_dir / "openvino_resampler_model.xml", device, device_config
                 ).create_infer_request();
 
-                m_embedding = ov::Core{}.compile_model(
+                m_embedding = core.compile_model(
                     model_dir / "openvino_text_embeddings_model.xml", device, device_config
                 ).create_infer_request();
 
-                m_language = ov::Core{}.compile_model(
+                m_language = core.compile_model(
                     model_dir / "openvino_language_model.xml", device, device_config
                 ).create_infer_request();
 
                 m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70});
             } else if (m_vlm_config.model_type == VLMModelType::LLAVA) {
-                m_language = ov::Core{}.compile_model(
+                m_language = core.compile_model(
                     model_dir / "openvino_language_model.xml", device, device_config
                 ).create_infer_request();
 
                 // Reusing the same m_embedding for llava text_embeddings model
-                m_embedding = ov::Core{}.compile_model(
+                m_embedding = core.compile_model(
                     model_dir / "openvino_text_embeddings_model.xml", device, device_config
                 ).create_infer_request();
             }
@@ -407,8 +413,6 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
 
         int64_t sequence_len = m_language.get_tensor("logits").get_shape().at(1) - 1;
         size_t vocab_size = m_language.get_tensor("logits").get_shape().back();
-        float* logits = m_language.get_tensor("logits").data<float>() + sequence_len * vocab_size;
-        int64_t out_token = std::max_element(logits, logits + vocab_size) - logits;
 
         m_language.get_tensor("inputs_embeds").set_shape({BATCH_SIZE, 1, m_vlm_config.hidden_size});
         m_language.get_tensor("position_ids").set_shape({ BATCH_SIZE, 1 });
@@ -431,6 +435,16 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         }, streamer);
         std::vector<int64_t> generated;
         while (true) {  // (out_token != eos_token_id)
+            float* logits = m_language.get_tensor("logits").data<float>();
+            int64_t out_token = std::max_element(logits, logits + vocab_size) - logits;
+            generated.push_back(out_token);
+            // if (streamer_ptr && streamer_ptr->put(out_token)) {
+            //     break;
+            // }
+            std::cout << out_token << ", ";
+            if (out_token == eos_token_id) {
+                break;
+            }
             m_embedding.get_input_tensor().data<int64_t>()[0] = out_token;
             m_embedding.infer();
             const ov::Tensor& embed_prompt_tensor = m_embedding.get_output_tensor();
@@ -445,17 +459,6 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
            m_language.get_tensor("position_ids").data<int64_t>()[0] = int64_t(m_language.get_tensor("attention_mask").get_size() - 1);
 
            m_language.infer();
-
-            generated.push_back(out_token);
-            if (streamer_ptr && streamer_ptr->put(out_token)) {
-                break;
-            }
-            logits = m_language.get_tensor("logits").data<float>();
-
-            out_token = std::max_element(logits, logits + vocab_size) - logits;
-            if (out_token == eos_token_id) {
-                break;
-            }
        }
 
        if (streamer_ptr) {
@@ -474,6 +477,9 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
            }
            m_language.get_tensor("attention_mask").set_shape({1, 0});
        }
+        std::cout << '\n';
+        std::cout << eos_token_id << '\n';
+        std::cout << decoded_results << '\n';
        return {{std::move(decoded_results)}};
    }
 
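
Note: the recurring change in this diff is replacing per-call ov::Core{} temporaries with one process-wide Core obtained from singleton_core(). Below is a minimal standalone sketch of that pattern, not the repository's code; the model file names, the "CPU" device, and the main() harness are placeholders. Because ov::Core is a shared handle, returning it by value as singleton_core() does still reuses the same underlying Core (plugins and any model cache) across calls.

    // Sketch only: shared-Core pattern, analogous to the diff above.
    #include <openvino/openvino.hpp>

    ov::Core singleton_core() {
        static ov::Core core;  // constructed once on first use; copies share the same impl
        return core;
    }

    int main() {
        ov::Core core = singleton_core();
        // Both models are compiled through the same Core, so plugin loading
        // and caching happen once instead of per compile_model() call.
        ov::CompiledModel language = core.compile_model("openvino_language_model.xml", "CPU");
        ov::CompiledModel embeddings = core.compile_model("openvino_text_embeddings_model.xml", "CPU");
        ov::InferRequest request = language.create_infer_request();
        return 0;
    }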