Skip to content

Commit 34d3c91

Browse files
committed
Temp
1 parent ff4f4be commit 34d3c91

File tree

1 file changed

+25
-19
lines changed

1 file changed

+25
-19
lines changed

src/cpp/src/visual_language/pipeline.cpp

+25-19
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,11 @@ ov::Tensor merge_text_and_image_embeddings_llava(
298298

299299
return merged_embeds;
300300
}
301+
302+
/// Accessor for the process-wide shared ov::Core.
/// The function-local static is constructed on first call and reused by
/// every subsequent caller (initialization is thread-safe per C++11 magic
/// statics), so the pipeline avoids repeatedly building fresh ov::Core{}
/// instances as the pre-patch code did.
/// NOTE(review): returned by value — assumes ov::Core is a cheap handle
/// whose copies share one underlying implementation; confirm against the
/// OpenVINO API before relying on identity semantics.
ov::Core singleton_core() {
    static ov::Core shared_core;
    return shared_core;
}
301306
}
302307

303308
class ov::genai::VLMPipeline::VLMPipelineImpl {
@@ -345,30 +350,31 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
345350
)
346351
},
347352
m_tokenizer{Tokenizer(model_dir.string(), device_config)},
348-
m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config, ov::Core{}),
353+
m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config, singleton_core()),
349354
m_is_chat_conversation{false},
350355
m_image_id{0} {
356+
ov::Core core = singleton_core();
351357
if (m_vlm_config.model_type == VLMModelType::MINICPM) {
352-
m_resampler = ov::Core{}.compile_model(
358+
m_resampler = core.compile_model(
353359
model_dir / "openvino_resampler_model.xml", device, device_config
354360
).create_infer_request();
355361

356-
m_embedding = ov::Core{}.compile_model(
362+
m_embedding = core.compile_model(
357363
model_dir / "openvino_text_embeddings_model.xml", device, device_config
358364
).create_infer_request();
359365

360-
m_language = ov::Core{}.compile_model(
366+
m_language = core.compile_model(
361367
model_dir / "openvino_language_model.xml", device, device_config
362368
).create_infer_request();
363369

364370
m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70});
365371
} else if (m_vlm_config.model_type == VLMModelType::LLAVA) {
366-
m_language = ov::Core{}.compile_model(
372+
m_language = core.compile_model(
367373
model_dir / "openvino_language_model.xml", device, device_config
368374
).create_infer_request();
369375

370376
// Reusing the same m_embedding for llava text_embeddings model
371-
m_embedding = ov::Core{}.compile_model(
377+
m_embedding = core.compile_model(
372378
model_dir / "openvino_text_embeddings_model.xml", device, device_config
373379
).create_infer_request();
374380
}
@@ -407,8 +413,6 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
407413

408414
int64_t sequence_len = m_language.get_tensor("logits").get_shape().at(1) - 1;
409415
size_t vocab_size = m_language.get_tensor("logits").get_shape().back();
410-
float* logits = m_language.get_tensor("logits").data<float>() + sequence_len * vocab_size;
411-
int64_t out_token = std::max_element(logits, logits + vocab_size) - logits;
412416

413417
m_language.get_tensor("inputs_embeds").set_shape({BATCH_SIZE, 1, m_vlm_config.hidden_size});
414418
m_language.get_tensor("position_ids").set_shape({ BATCH_SIZE, 1 });
@@ -431,6 +435,16 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
431435
}, streamer);
432436
std::vector<int64_t> generated;
433437
while (true) { //(out_token != eos_token_id)
438+
float *logits = m_language.get_tensor("logits").data<float>();
439+
int64_t out_token = std::max_element(logits, logits + vocab_size) - logits;
440+
generated.push_back(out_token);
441+
// if (streamer_ptr && streamer_ptr->put(out_token)) {
442+
// break;
443+
// }
444+
std::cout << out_token << ", ";
445+
if (out_token == eos_token_id) {
446+
break;
447+
}
434448
m_embedding.get_input_tensor().data<int64_t>()[0] = out_token;
435449
m_embedding.infer();
436450
const ov::Tensor& embed_prompt_tensor = m_embedding.get_output_tensor();
@@ -445,17 +459,6 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
445459
m_language.get_tensor("position_ids").data<int64_t>()[0] = int64_t(m_language.get_tensor("attention_mask").get_size() - 1);
446460

447461
m_language.infer();
448-
449-
generated.push_back(out_token);
450-
if (streamer_ptr && streamer_ptr->put(out_token)) {
451-
break;
452-
}
453-
logits = m_language.get_tensor("logits").data<float>();
454-
455-
out_token = std::max_element(logits, logits + vocab_size) - logits;
456-
if (out_token == eos_token_id) {
457-
break;
458-
}
459462
}
460463

461464
if (streamer_ptr) {
@@ -474,6 +477,9 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
474477
}
475478
m_language.get_tensor("attention_mask").set_shape({1, 0});
476479
}
480+
std::cout << '\n';
481+
std::cout << eos_token_id << '\n';
482+
std::cout << decoded_results << '\n';
477483
return {{std::move(decoded_results)}};
478484
}
479485

0 commit comments

Comments
 (0)