Skip to content

Commit 3ce470c

Browse files
authored
VLM: change infer to start_async/wait (#1947)
1 parent c9118e5 commit 3ce470c

File tree

7 files changed

+22
-11
lines changed

7 files changed

+22
-11
lines changed

src/cpp/src/visual_language/embedding_model.cpp

+2-1
Original file line number | Diff line number | Diff line change
@@ -78,7 +78,8 @@ ov::Tensor EmbeddingsModel::infer(const ov::Tensor& input_idx, bool return_remot
7878
} else {
7979
req.ireq.set_output_tensor(req.cpu_tensor);
8080
}
81-
req.ireq.infer();
81+
req.ireq.start_async();
82+
req.ireq.wait();
8283
return req.ireq.get_output_tensor();
8384
}
8485

src/cpp/src/visual_language/internvl_chat/classes.cpp

+2-1
Original file line number | Diff line number | Diff line change
@@ -136,7 +136,8 @@ EncodedImage VisionEncoderInternVLChat::encode(const ov::Tensor& image, const ov
136136
ov::Tensor pixel_values = get_pixel_values_internvl(image, config);
137137

138138
encoder.set_tensor("pixel_values", pixel_values);
139-
encoder.infer();
139+
encoder.start_async();
140+
encoder.wait();
140141

141142
const ov::Tensor& infer_output = encoder.get_output_tensor();
142143
ov::Tensor image_features(infer_output.get_element_type(), infer_output.get_shape());

src/cpp/src/visual_language/llava/classes.cpp

+2-1
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,8 @@ EncodedImage VisionEncoderLLaVA::encode( const ov::Tensor& image, const ov::AnyM
7676
ov::Tensor pixel_values = get_pixel_values_llava(image, config);
7777

7878
encoder.set_tensor("pixel_values", pixel_values);
79-
encoder.infer();
79+
encoder.start_async();
80+
encoder.wait();
8081

8182
const ov::Tensor& infer_output = encoder.get_output_tensor();
8283
ov::Tensor image_features(infer_output.get_element_type(), infer_output.get_shape());

src/cpp/src/visual_language/llava_next/classes.cpp

+2-1
Original file line number | Diff line number | Diff line change
@@ -57,7 +57,8 @@ EncodedImage VisionEncoderLLaVANext::encode(const ov::Tensor& image, const ov::A
5757
ov::Tensor pixel_values = get_pixel_values_llava_next(image, config);
5858

5959
encoder.set_tensor("pixel_values", pixel_values);
60-
encoder.infer();
60+
encoder.start_async();
61+
encoder.wait();
6162

6263
const ov::Tensor& infer_output = encoder.get_output_tensor();
6364
ov::Tensor image_features(infer_output.get_element_type(), infer_output.get_shape());

src/cpp/src/visual_language/minicpm/classes.cpp

+4-2
Original file line number | Diff line number | Diff line change
@@ -377,7 +377,8 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
377377
}
378378
ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, tgt_sizes, patch_size, ctx_clip.image_size / patch_size);
379379
encoder.set_tensor("position_ids", position_ids);
380-
encoder.infer();
380+
encoder.start_async();
381+
encoder.wait();
381382
const ov::Tensor& output_tensor = encoder.get_output_tensor();
382383

383384
if (1 == preprocessed.size()) {
@@ -746,7 +747,8 @@ ov::Tensor InputsEmbedderMiniCPM::resample(const ov::Tensor& encoded_image, cons
746747
resampler.set_tensor("image_feature", encoded_image); // [N, H*W, old_hidden_size]
747748
resampler.set_tensor("pos_embed", pos_embed); // [H*W, N, new_hidden_size]
748749
resampler.set_tensor("key_padding_mask", key_padding_mask); // [N, H*W]
749-
resampler.infer();
750+
resampler.start_async();
751+
resampler.wait();
750752
return resampler.get_output_tensor(); // [N, query_num, new_hidden_size]
751753
}
752754

src/cpp/src/visual_language/phi3_vision/classes.cpp

+6-3
Original file line number | Diff line number | Diff line change
@@ -220,7 +220,8 @@ EncodedImage VisionEncoderPhi3V::encode(const ov::Tensor& image, const ov::AnyMa
220220
encoder.set_input_tensor(pixel_values);
221221
ov::Tensor res{ov::element::f32, encoder.get_output_tensor().get_shape()};
222222
encoder.set_output_tensor(res);
223-
encoder.infer();
223+
encoder.start_async();
224+
encoder.wait();
224225
return {std::move(res), image_size};
225226
}
226227

@@ -365,7 +366,8 @@ ov::Tensor reshape_hd_patches_2x2merge(const ov::Tensor& image_features, size_t
365366
hd_feature_transformer.set_input_tensor(1, height);
366367
ov::Tensor width{ov::element::i32, {}, &w_crop};
367368
hd_feature_transformer.set_input_tensor(2, width);
368-
hd_feature_transformer.infer();
369+
hd_feature_transformer.start_async();
370+
hd_feature_transformer.wait();
369371
return hd_feature_transformer.get_output_tensor();
370372
}
371373

@@ -432,7 +434,8 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest
432434
ov::Tensor sub_image_features_hd_newline = add_image_newline(sub_image_features_hd, sub_GN); // [1,h_crop*12*(w_crop*12+1), 4096]
433435
ov::Tensor image_embeddings = concatenate_2d(sub_image_features_hd_newline, glb_GN, global_image_features_hd_newline); // [1,l,4096]
434436
vision_projection.set_input_tensor(image_embeddings);
435-
vision_projection.infer();
437+
vision_projection.start_async();
438+
vision_projection.wait();
436439
ov::Tensor out = vision_projection.get_output_tensor();
437440
ov::Tensor res{out.get_element_type(), out.get_shape()};
438441
out.copy_to(res);

src/cpp/src/visual_language/qwen2vl/classes.cpp

+4-2
Original file line number | Diff line number | Diff line change
@@ -233,7 +233,8 @@ EncodedImage VisionEncoderQwen2VL::encode(const ov::Tensor& image, const ov::Any
233233
std::memcpy(flattened_patches.data(), transposed_patches.data(), transposed_patches.get_byte_size());
234234

235235
encoder.set_tensor("hidden_states", flattened_patches);
236-
encoder.infer();
236+
encoder.start_async();
237+
encoder.wait();
237238

238239
const ov::Tensor& infer_output = encoder.get_output_tensor();
239240
ov::Tensor image_features(infer_output.get_element_type(), infer_output.get_shape());
@@ -435,7 +436,8 @@ ov::Tensor InputsEmbedderQwen2VL::merge_text_and_image_embeddings_qwen2vl(
435436
vision_embeddings_merger.set_tensor("hidden_states", concatenated_images);
436437
vision_embeddings_merger.set_tensor("attention_mask", attention_mask);
437438
vision_embeddings_merger.set_tensor("rotary_pos_emb", rotary_pos_emb);
438-
vision_embeddings_merger.infer();
439+
vision_embeddings_merger.start_async();
440+
vision_embeddings_merger.wait();
439441
ov::Tensor processed_vision_embeds = vision_embeddings_merger.get_output_tensor();
440442

441443
ov::Tensor merged_embeds(text_embeds.get_element_type(), text_embeds.get_shape());

0 commit comments

Comments
 (0)