Skip to content

Commit 57bb044

Browse files
committed
Add macos
1 parent 3d0e5ba commit 57bb044

File tree

3 files changed

+39
-35
lines changed

3 files changed

+39
-35
lines changed

.github/workflows/causal_lm_cpp.yml

+19 -14
Original file line number | Diff line number | Diff line change
@@ -681,35 +681,40 @@ jobs:
681681
diff pred2.txt ref.txt
682682
echo "Chat sample python" passed
683683
684-
py-vlm_chat_sample-ubuntu:
685-
runs-on: ubuntu-22.04-16-cores
684+
visual_language_sample:
685+
strategy:
686+
fail-fast: false
687+
matrix:
688+
runs-on: [ubuntu-20.04-16-core, macos-12]
689+
runs-on: ${{ matrix.runs-on }}
686690
steps:
687691
- uses: actions/checkout@v4
688692
with:
689693
submodules: recursive
690694
- uses: actions/setup-python@v4
691695
with:
692-
python-version: 3.11
693-
- name: Install OpenVINO
694-
run: |
695-
mkdir ./ov/
696+
python-version: 3.12
697+
- run: mkdir ./ov/
698+
- if: ubuntu-20.04-16-core == ${{ matrix.runs-on }}
699+
run: >
696700
curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
697-
sudo ./ov/install_dependencies/install_openvino_dependencies.sh
698-
- name: Build app
699-
run: |
700-
source ./ov/setupvars.sh
701-
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
702-
cmake --build ./build/ --config Release --target visual_language_chat -j
701+
&& sudo ./ov/install_dependencies/install_openvino_dependencies.sh
702+
- if: macos-12 == ${{ matrix.runs-on }}
703+
run: >
704+
curl ${{ env.m_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
705+
&& brew install coreutils scons
706+
- run: cmake -DOpenVINO_DIR=./ov/runtime/cmake/ -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
707+
- run: cmake --build ./build/ --config Release --target visual_language_chat -j
703708
- name: Download and convert a model and an image
704709
run: |
705710
source ./ov/setupvars.sh
706711
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
707712
python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
708713
python ./samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py ./miniCPM-V-2_6/
709714
wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11
710-
- run: |
715+
- run: >
711716
source ./ov/setupvars.sh
712-
timeout 2m ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ d5fbbd1a-d484-415c-88cb-9986625b7b11
717+
&& timeout 2m ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ d5fbbd1a-d484-415c-88cb-9986625b7b11
713718
<<< $'What is on the image?\nWhat is special on the image?'
714719
715720
cpp-continuous-batching-ubuntu:

samples/cpp/visual_language_chat/visual_language_chat.cpp

+3 -2
Original file line number | Diff line number | Diff line change
@@ -10,10 +10,11 @@ bool print_subword(std::string&& subword) {
1010
}
1111

1212
int main(int argc, char* argv[]) {
13-
if (3 != argc) {
13+
if (4 != argc) {
1414
throw std::runtime_error(std::string{"Usage "} + argv[0] + " <MODEL_DIR> <IMAGE_FILE>");
1515
}
1616
ov::Tensor image = utils::load_image(argv[2]);
17+
ov::Tensor image2 = utils::load_image(argv[3]);
1718
std::string device = "CPU"; // GPU can be used as well
1819
ov::AnyMap enable_compile_cache;
1920
if ("GPU" == device) {
@@ -31,7 +32,7 @@ int main(int argc, char* argv[]) {
3132
}
3233
pipe.generate(
3334
prompt,
34-
ov::genai::images(std::vector{image, image}),
35+
ov::genai::images(std::vector{image2, image}),
3536
ov::genai::streamer(print_subword)
3637
);
3738
std::cout << "\n----------\n"

src/cpp/src/vlm_pipeline.cpp

+17 -19
Original file line number | Diff line number | Diff line change
@@ -412,39 +412,37 @@ DecodedResults VLMPipeline::generate(
412412
4 == special_tokens.get_shape().at(1),
413413
"Every special token must be represented with a single int."
414414
);
415-
size_t im_start_id = special_tokens.data<int64_t>()[0];
416-
size_t im_end_id = special_tokens.data<int64_t>()[1];
417-
size_t slice_start_id = special_tokens.data<int64_t>()[2];
418-
size_t slice_end_id = special_tokens.data<int64_t>()[3];
419-
size_t im_start_pos = 0, slice_start_pos = 0;
415+
int64_t im_start_id = special_tokens.data<int64_t>()[0];
416+
int64_t im_end_id = special_tokens.data<int64_t>()[1];
417+
int64_t slice_start_id = special_tokens.data<int64_t>()[2];
418+
int64_t slice_end_id = special_tokens.data<int64_t>()[3];
419+
int64_t im_start_pos = 0, slice_start_pos = 0;
420420
int64_t* begin = encoded_input.data<int64_t>();
421421
int64_t* ids = begin;
422422
size_t encoded_input_size = encoded_input.get_size();
423-
const int64_t* end = ids + encoded_input_size;
424-
float* input_embeds_data = input_embeds.data<float>();
423+
int64_t* end = ids + encoded_input_size;
424+
float* inputs_embeds_data = inputs_embeds.data<float>();
425425
for (const EncodedImage& encoded_image : embeds) {
426426
const ov::Tensor& resampled_source = resample(*this, encoded_image.resized_source, {encoded_image.resized_source_size});
427427
float* emb = resampled_source.data<float>();
428428
ids = std::find(ids, end, im_start_id);
429-
if (end == ids) {
430-
break;
431-
}
432-
ids = std::copy_n(emb, resampled_source.get_size(), input_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size);
433-
if (embeds.slices) {
429+
OPENVINO_ASSERT(end != ids);
430+
std::copy_n(emb, resampled_source.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size);
431+
ids += m_vlm_config.hidden_size;
432+
if (encoded_image.slices) {
434433
size_t token_idx = 0;
435-
const ov::Shape& slices_shape = embeds.slices.get_shape();
436-
const std::vector<HeightWidth>& sliced_sizes = embeds.slices_sizes;
434+
const ov::Shape& slices_shape = encoded_image.slices.get_shape();
435+
const std::vector<HeightWidth>& sliced_sizes = encoded_image.slices_sizes;
437436
for (size_t i = 0; i < slices_shape.at(0); ++i) {
438437
for (size_t ja = 0; ja < slices_shape.at(1); ++ja) {
439438
size_t d2 = slices_shape.at(2);
440439
size_t d3 = slices_shape.at(3);
441-
ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, embeds.slices.data<float>() + (i * slices_shape.at(1) + ja) * d2 * d3};
440+
ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data<float>() + (i * slices_shape.at(1) + ja) * d2 * d3};
442441
const ov::Tensor& vision_embed_tensor_i_j = resample(*this, encoded_view, {sliced_sizes.at(i * slices_shape.at(1) + ja)});
443442
ids = std::find(ids, end, slice_start_id);
444-
if (end == ids) {
445-
break;
446-
}
447-
ids = std::copy_n(vision_embed_tensor_i_j.data<float>(), vision_embed_tensor_i_j.get_size(), input_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size);
443+
OPENVINO_ASSERT(end != ids);
444+
std::copy_n(vision_embed_tensor_i_j.data<float>(), vision_embed_tensor_i_j.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size);
445+
ids += m_vlm_config.hidden_size;
448446
}
449447
}
450448
}

0 commit comments

Comments
 (0)