diff --git a/src/cpp/src/visual_language/embedding_model.cpp b/src/cpp/src/visual_language/embedding_model.cpp
index 9559391b3e..3402d22691 100644
--- a/src/cpp/src/visual_language/embedding_model.cpp
+++ b/src/cpp/src/visual_language/embedding_model.cpp
@@ -78,7 +78,8 @@ ov::Tensor EmbeddingsModel::infer(const ov::Tensor& input_idx, bool return_remot
     } else {
         req.ireq.set_output_tensor(req.cpu_tensor);
     }
-    req.ireq.infer();
+    req.ireq.start_async();
+    req.ireq.wait();
     return req.ireq.get_output_tensor();
 }

diff --git a/src/cpp/src/visual_language/internvl_chat/classes.cpp b/src/cpp/src/visual_language/internvl_chat/classes.cpp
index 3f34c68077..b8b837ad02 100644
--- a/src/cpp/src/visual_language/internvl_chat/classes.cpp
+++ b/src/cpp/src/visual_language/internvl_chat/classes.cpp
@@ -136,7 +136,8 @@ EncodedImage VisionEncoderInternVLChat::encode(const ov::Tensor& image, const ov
     ov::Tensor pixel_values = get_pixel_values_internvl(image, config);

     encoder.set_tensor("pixel_values", pixel_values);
-    encoder.infer();
+    encoder.start_async();
+    encoder.wait();

     const ov::Tensor& infer_output = encoder.get_output_tensor();
     ov::Tensor image_features(infer_output.get_element_type(), infer_output.get_shape());
diff --git a/src/cpp/src/visual_language/llava/classes.cpp b/src/cpp/src/visual_language/llava/classes.cpp
index 88d35070ca..9a71a55837 100644
--- a/src/cpp/src/visual_language/llava/classes.cpp
+++ b/src/cpp/src/visual_language/llava/classes.cpp
@@ -76,7 +76,8 @@ EncodedImage VisionEncoderLLaVA::encode( const ov::Tensor& image, const ov::AnyM
     ov::Tensor pixel_values = get_pixel_values_llava(image, config);

     encoder.set_tensor("pixel_values", pixel_values);
-    encoder.infer();
+    encoder.start_async();
+    encoder.wait();

     const ov::Tensor& infer_output = encoder.get_output_tensor();
     ov::Tensor image_features(infer_output.get_element_type(), infer_output.get_shape());
diff --git a/src/cpp/src/visual_language/llava_next/classes.cpp b/src/cpp/src/visual_language/llava_next/classes.cpp
index 0ea08cf39f..6fa310ae3d 100644
--- a/src/cpp/src/visual_language/llava_next/classes.cpp
+++ b/src/cpp/src/visual_language/llava_next/classes.cpp
@@ -57,7 +57,8 @@ EncodedImage VisionEncoderLLaVANext::encode(const ov::Tensor& image, const ov::A
     ov::Tensor pixel_values = get_pixel_values_llava_next(image, config);

     encoder.set_tensor("pixel_values", pixel_values);
-    encoder.infer();
+    encoder.start_async();
+    encoder.wait();

     const ov::Tensor& infer_output = encoder.get_output_tensor();
     ov::Tensor image_features(infer_output.get_element_type(), infer_output.get_shape());
diff --git a/src/cpp/src/visual_language/minicpm/classes.cpp b/src/cpp/src/visual_language/minicpm/classes.cpp
index 358d4e25a1..bbb68be3c6 100644
--- a/src/cpp/src/visual_language/minicpm/classes.cpp
+++ b/src/cpp/src/visual_language/minicpm/classes.cpp
@@ -377,7 +377,8 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
     }
     ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, tgt_sizes, patch_size, ctx_clip.image_size / patch_size);
     encoder.set_tensor("position_ids", position_ids);
-    encoder.infer();
+    encoder.start_async();
+    encoder.wait();
     const ov::Tensor& output_tensor = encoder.get_output_tensor();

     if (1 == preprocessed.size()) {
@@ -746,7 +747,8 @@ ov::Tensor InputsEmbedderMiniCPM::resample(const ov::Tensor& encoded_image, cons
     resampler.set_tensor("image_feature", encoded_image); // [N, H*W, old_hidden_size]
     resampler.set_tensor("pos_embed", pos_embed); // [H*W, N, new_hidden_size]
     resampler.set_tensor("key_padding_mask", key_padding_mask); // [N, H*W]
-    resampler.infer();
+    resampler.start_async();
+    resampler.wait();

     return resampler.get_output_tensor(); // [N, query_num, new_hidden_size]
 }
diff --git a/src/cpp/src/visual_language/phi3_vision/classes.cpp b/src/cpp/src/visual_language/phi3_vision/classes.cpp
index 115f44f86e..d8139358ad 100644
--- a/src/cpp/src/visual_language/phi3_vision/classes.cpp
+++ b/src/cpp/src/visual_language/phi3_vision/classes.cpp
@@ -265,7 +265,8 @@ EncodedImage VisionEncoderPhi3V::encode(const ov::Tensor& image, const ov::AnyMa
     encoder.set_input_tensor(pixel_values);
     ov::Tensor res{ov::element::f32, encoder.get_output_tensor().get_shape()};
     encoder.set_output_tensor(res);
-    encoder.infer();
+    encoder.start_async();
+    encoder.wait();
     return {std::move(res), image_size};
 }

@@ -410,7 +411,8 @@ ov::Tensor reshape_hd_patches_2x2merge(const ov::Tensor& image_features, size_t
     hd_feature_transformer.set_input_tensor(1, height);
     ov::Tensor width{ov::element::i32, {}, &w_crop};
     hd_feature_transformer.set_input_tensor(2, width);
-    hd_feature_transformer.infer();
+    hd_feature_transformer.start_async();
+    hd_feature_transformer.wait();
     return hd_feature_transformer.get_output_tensor();
 }

@@ -477,7 +479,8 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest
     ov::Tensor sub_image_features_hd_newline = add_image_newline(sub_image_features_hd, sub_GN); // [1,h_crop*12*(w_crop*12+1), 4096]
     ov::Tensor image_embeddings = concatenate_2d(sub_image_features_hd_newline, glb_GN, global_image_features_hd_newline); // [1,l,4096]
     vision_projection.set_input_tensor(image_embeddings);
-    vision_projection.infer();
+    vision_projection.start_async();
+    vision_projection.wait();
     ov::Tensor out = vision_projection.get_output_tensor();
     ov::Tensor res{out.get_element_type(), out.get_shape()};
     out.copy_to(res);
diff --git a/src/cpp/src/visual_language/qwen2vl/classes.cpp b/src/cpp/src/visual_language/qwen2vl/classes.cpp
index 8c8288a5dc..f17be3d8a1 100644
--- a/src/cpp/src/visual_language/qwen2vl/classes.cpp
+++ b/src/cpp/src/visual_language/qwen2vl/classes.cpp
@@ -233,7 +233,8 @@ EncodedImage VisionEncoderQwen2VL::encode(const ov::Tensor& image, const ov::Any
     std::memcpy(flattened_patches.data(), transposed_patches.data(), transposed_patches.get_byte_size());

     encoder.set_tensor("hidden_states", flattened_patches);
-    encoder.infer();
+    encoder.start_async();
+    encoder.wait();

     const ov::Tensor& infer_output = encoder.get_output_tensor();
     ov::Tensor image_features(infer_output.get_element_type(), infer_output.get_shape());
@@ -435,7 +436,8 @@ ov::Tensor InputsEmbedderQwen2VL::merge_text_and_image_embeddings_qwen2vl(
     vision_embeddings_merger.set_tensor("hidden_states", concatenated_images);
     vision_embeddings_merger.set_tensor("attention_mask", attention_mask);
     vision_embeddings_merger.set_tensor("rotary_pos_emb", rotary_pos_emb);
-    vision_embeddings_merger.infer();
+    vision_embeddings_merger.start_async();
+    vision_embeddings_merger.wait();
     ov::Tensor processed_vision_embeds = vision_embeddings_merger.get_output_tensor();

     ov::Tensor merged_embeds(text_embeds.get_element_type(), text_embeds.get_shape());
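
The change is mechanical throughout: every blocking ov::InferRequest::infer() call is split into start_async() followed by wait(), which is equivalent for a single request but routes it through OpenVINO's asynchronous pipeline. Below is a minimal standalone sketch of the pattern, not part of this patch; the "model.xml" path, "CPU" device, and input shape are placeholders, not taken from this repository.

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    // Placeholder model and device; any compiled model exposes the same InferRequest API.
    ov::CompiledModel compiled = core.compile_model("model.xml", "CPU");
    ov::InferRequest req = compiled.create_infer_request();

    ov::Tensor input{ov::element::f32, {1, 3, 224, 224}}; // placeholder shape
    req.set_input_tensor(input);

    // Before this patch: req.infer();  // synchronous, blocks the calling thread
    // After this patch:
    req.start_async(); // enqueue the request on the device's async pipeline
    req.wait();        // block until this request completes

    ov::Tensor output = req.get_output_tensor();
    return 0;
}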