Skip to content

Commit 0eb693e

Browse files

File tree

3 files changed

+11
-18
lines changed

3 files changed

+11
-18
lines changed
 

‎.github/workflows/causal_lm_cpp.yml

+5-5
Original file line numberDiff line numberDiff line change
@@ -708,18 +708,18 @@ jobs:
       - run: >
          LD_LIBRARY_PATH=${{ github.workspace }}/ov/runtime/3rdparty/tbb/lib/:$LD_LIBRARY_PATH
          cmake --build ./build/ --config Release --target visual_language_chat -j
+      - run: >
+          LD_LIBRARY_PATH=${{ github.workspace }}/ov/runtime/3rdparty/tbb/lib/:$LD_LIBRARY_PATH
+          ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ d5fbbd1a-d484-415c-88cb-9986625b7b11
+          <<< $'What is on the image?\nWhat is special on the image?'
+        timeout-minutes: 2
       - run: >
          source ./ov/setupvars.sh
          && python -m pip install --upgrade-strategy eager ./thirdparty/openvino_tokenizers/[transformers] -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
       - run: >
          source ./ov/setupvars.sh
          && python ./samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py ./miniCPM-V-2_6/
       - run: wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11
-      - run: >
-          LD_LIBRARY_PATH=${{ github.workspace }}/ov/runtime/3rdparty/tbb/lib/:$LD_LIBRARY_PATH
-          ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ d5fbbd1a-d484-415c-88cb-9986625b7b11
-          <<< $'What is on the image?\nWhat is special on the image?'
-        timeout-minutes: 2

   cpp-continuous-batching-ubuntu:
     runs-on: ubuntu-20.04-8-cores

‎src/cpp/src/vision_encoder.cpp

+5-3
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,9 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
         tgt_sizes_data[1] = resized_source_size.width;
         encoder.set_tensor("tgt_sizes", tgt_sizes);
         encoder.infer();
-        const ov::Tensor& resized_source = encoder.get_output_tensor();
+        const ov::Tensor& output_tensor = encoder.get_output_tensor();
+        ov::Tensor resized_source{ov::element::f32, output_tensor.get_shape()};
+        output_tensor.copy_to(resized_source);

         if (1 == preprocessed.size()) {
             return {std::move(resized_source), resized_source_size};
@@ -280,12 +282,12 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
                 tgt_sizes_data[0] = sliced_sizes.back().height;
                 tgt_sizes_data[1] = sliced_sizes.back().width;
                 encoder.set_tensor("tgt_sizes", tgt_sizes);
+                const ov::Tensor& old = encoder.get_output_tensor();
                 encoder.set_output_tensor({ov::element::f32, {1, n_patches, old_hidden_size}, encoded_slices.data<float>() + ((row - 1) * preprocessed.at(row).size() + col) * n_patches * old_hidden_size});
                 encoder.infer();
+                encoder.set_output_tensor(old);
             }
         }
-        // Override prev output tensor that doesn't own memory.
-        encoder.set_output_tensor(ov::Tensor{ov::element::f32, {0, 0, old_hidden_size}});
         return {resized_source, resized_source_size, encoded_slices, sliced_sizes};
     }
 }

‎src/cpp/src/vlm_pipeline.cpp

+1-10
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,7 @@ DecodedResults VLMPipeline::generate(
     for (const ov::Tensor& rgb : rgbs) {
         EncodedImage encoded_image = m_vision_encoder.encode(rgb);
         if (m_vlm_config.use_image_id) {
-            images_prompt = m_vlm_config.im_id_start + std::to_string(image_id) + m_vlm_config.im_id_end;
+            images_prompt += m_vlm_config.im_id_start + std::to_string(image_id) + m_vlm_config.im_id_end;
             ++image_id;
         }
         std::string unk64;
@@ -380,7 +380,6 @@ DecodedResults VLMPipeline::generate(
     m_history.push_back({{"role", "user"}, {"content", images_prompt}});
     constexpr bool add_generation_prompt = true;
     std::string new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
-    std::cout << new_templated_chat_history << '\n';
     ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history).input_ids;
     if (0 == m_language.get_tensor("attention_mask").get_shape().at(1)) {
         encoded_input = new_chat_tokens;
@@ -419,10 +418,6 @@ DecodedResults VLMPipeline::generate(
     int64_t slice_end_id = special_tokens.data<int64_t>()[3];
     int64_t im_start_pos = 0, slice_start_pos = 0;
     int64_t* begin = encoded_input.data<int64_t>();
-    for (size_t cont = 0; cont < encoded_input.get_size(); ++cont) {
-        std::cout << begin[cont] << ", ";
-    }
-    std::cout << '\n';
     int64_t* ids = begin;
     size_t encoded_input_size = encoded_input.get_size();
     int64_t* end = ids + encoded_input_size;
@@ -431,11 +426,9 @@ DecodedResults VLMPipeline::generate(
     const ov::Tensor& resampled_source = resample(*this, encoded_image.resized_source, {encoded_image.resized_source_size});
     float* emb = resampled_source.data<float>();
     ids = std::find(ids, end, im_start_id);
-    std::cout << std::distance(begin, ids) << '\n';
     OPENVINO_ASSERT(end != ids);
     std::copy_n(emb, resampled_source.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size);
     ids += m_vlm_config.query_num;
-    std::cout << std::distance(begin, ids) << '\n';
     if (encoded_image.slices) {
         size_t token_idx = 0;
         const ov::Shape& slices_shape = encoded_image.slices.get_shape();
@@ -447,11 +440,9 @@ DecodedResults VLMPipeline::generate(
             ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data<float>() + (i * slices_shape.at(1) + ja) * d2 * d3};
             const ov::Tensor& vision_embed_tensor_i_j = resample(*this, encoded_view, {sliced_sizes.at(i * slices_shape.at(1) + ja)});
             ids = std::find(ids, end, slice_start_id);
-            std::cout << std::distance(begin, ids) << '\n';
             OPENVINO_ASSERT(end != ids);
             std::copy_n(vision_embed_tensor_i_j.data<float>(), vision_embed_tensor_i_j.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size);
             ids += m_vlm_config.query_num;
-            std::cout << std::distance(begin, ids) << '\n';
         }
     }
 }

0 commit comments

Comments
 (0)
Please sign in to comment.