Commit 330f122

Merge branch 'add-VLM-matching-test'

2 parents: d6e56a7 + f90725e

File tree: 4 files changed, +100 −67 lines

.github/workflows/causal_lm_cpp.yml
src/cpp/src/visual_language/clip.hpp
src/cpp/src/visual_language/pipeline.cpp
src/cpp/src/visual_language/vision_encoder.cpp

.github/workflows/causal_lm_cpp.yml (+28 −3)
@@ -702,19 +702,44 @@ jobs:
       run: |
         source ./ov/setupvars.sh
         python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
-        python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+        python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt opencv-python --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
     - name: Download and convert MiniCPM-V-2_6 model and an image
       run: |
         python -m pip install git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv
         python -m pip install -U "optimum<1.23" --no-dependencies
         source ./ov/setupvars.sh
         optimum-cli export openvino -m openbmb/MiniCPM-V-2_6 MiniCPM-V-2_6 --trust-remote-code
         wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --output-document cat.jpg
+    - name: Generate reference
+      shell: python
+      run: |
+        from optimum.intel.openvino import OVModelForVisualCausalLM
+        from transformers import AutoProcessor
+        from PIL import Image
+        import cv2
+        import numpy as np
+        res = 448, 448
+        lines = np.arange(res[0] * res[1] * 3, dtype=np.uint8) % 255
+        lines = lines.reshape([*res, 3])
+        cv2.imwrite("lines.png", lines)
+        lines = Image.open("lines.png").convert('RGB')
+        model_id = "openbmb/MiniCPM-V-2_6"
+        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+        prompt = processor.tokenizer.apply_chat_template([{"role": "user", "content": "(<image>./</image>)\nWhat is unusual on this image?"}], tokenize=False, add_generation_prompt=True)
+        inputs = processor([prompt], [lines], return_tensors="pt")
+        model = OVModelForVisualCausalLM.from_pretrained("MiniCPM-V-2_6", device="CPU", trust_remote_code=True)
+        result = model.generate(**inputs, max_new_tokens=200)
+        decoded = processor.tokenizer.batch_decode(result[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]
+        print(decoded)
+        with open("ref.txt", "w") as f:
+            f.write(f"question:\n{decoded}\n----------\nquestion:\n")
+
     - name: Run visual_language_chat C++ sample - MiniCPM-V-2_6
       run: >
         source ./ov/setupvars.sh
-        && timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./MiniCPM-V-2_6/ cat.jpg
-        <<< $'What is on the image?\nWhat is special on the image?'
+        && timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./MiniCPM-V-2_6/ lines.png
+        <<< $'What is unusual on this image?' | tee cpp.txt
+    - run: diff cpp.txt ref.txt
     - name: Download and convert LLaVa 1.5 model and an image
       run: |
         source ./ov/setupvars.sh
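
This turns the MiniCPM-V-2_6 smoke test into an exact-match test: both sides now consume the same deterministic gradient image instead of a downloaded photo, the Optimum/Transformers reference run writes ref.txt using the same "question:" framing the C++ sample prints, the sample's stdout is captured with tee, and `diff cpp.txt ref.txt` fails the job on any divergence. A random or uniform image would make the comparison flaky or trivial; the `arange % 255` gradient is reproducible byte-for-byte while still giving the model a non-degenerate input. For illustration, the same reference image could be produced from C++ as well (a sketch assuming OpenCV is available; this program is not part of the commit):

    // Hypothetical C++ twin of the workflow's image-generation step.
    // np.arange(..., dtype=np.uint8) wraps at 256 *before* the % 255,
    // so the per-byte value is (i mod 256) mod 255.
    #include <opencv2/opencv.hpp>
    #include <cstddef>

    int main() {
        const int res = 448;
        cv::Mat lines(res, res, CV_8UC3);  // H x W x 3, 8-bit, row-major like the NumPy array
        for (std::size_t i = 0; i < std::size_t(res) * res * 3; ++i) {
            lines.data[i] = static_cast<unsigned char>(i) % 255;  // uint8 wrap, then % 255
        }
        cv::imwrite("lines.png", lines);  // PNG is lossless, so the bytes survive the round trip
        return 0;
    }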

src/cpp/src/visual_language/clip.hpp (+2 −18)
@@ -6,25 +6,9 @@
 #include <vector>
 #include <numeric>
 
-//#define CLIP_DEBUG_FUNCTIONS
-enum projector_type {
-    PROJECTOR_TYPE_RESAMPLER,
-    PROJECTOR_TYPE_UNKNOWN,
-};
-
 struct clip_ctx {
-    bool has_text_encoder = false;
-    bool has_vision_encoder = false;
-    bool has_minicpmv_projector = false;
-
-    float image_mean[3];
-    float image_std[3];
-    int32_t ftype = 1;
-
-    std::vector<uint8_t> buf_compute_meta;
-
-    projector_type proj_type = PROJECTOR_TYPE_RESAMPLER;
-    size_t patch_size = 0;
+    float image_mean[3] = {0.0f, 0.0f, 0.0f};
+    float image_std[3] = {1.0f, 1.0f, 1.0f};
     size_t image_size = 0;
 };
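
`clip_ctx` shrinks to the three fields this code base still reads: the normalization constants (now defaulting to the identity transform, mean 0 and std 1) and the target image size. The projector enum and encoder flags appear to be leftovers from the llama.cpp-derived CLIP code, and `patch_size` is threaded through as an explicit function argument instead (see vision_encoder.cpp below). A sketch of populating the trimmed struct, mirroring `encode_minicpm` at the end of this commit (numeric values are placeholders, not real ProcessorConfig contents):

    #include <algorithm>
    #include "clip.hpp"  // the trimmed struct from this commit

    clip_ctx make_minicpm_ctx() {
        clip_ctx ctx;                              // image_mean/image_std default to 0 / 1
        ctx.image_size = 448;                      // placeholder for config.image_size
        const float mean[3] = {0.5f, 0.5f, 0.5f};  // placeholder for norm_mean
        const float stdv[3] = {0.5f, 0.5f, 0.5f};  // placeholder for norm_std
        std::copy(mean, mean + 3, ctx.image_mean);
        std::copy(stdv, stdv + 3, ctx.image_std);
        return ctx;
    }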

src/cpp/src/visual_language/pipeline.cpp (+12 −4)
@@ -557,6 +557,13 @@ ov::Tensor pack_image_features_llava_next(
         return result;
     }
 }
+
+// It's not possible to pass a GPU tensor from one model to another GPU
+// model on a different ov::Core instance.
+ov::Core singleton_core() {
+    static ov::Core core;
+    return core;
+}
 }
 
 class ov::genai::VLMPipeline::VLMPipelineImpl {
@@ -604,21 +611,22 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
             )
         },
         m_tokenizer{Tokenizer(model_dir.string(), device_config)},
-        m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config, ov::Core{}),
+        m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config, singleton_core()),
         m_is_chat_conversation{false},
         m_image_id{0} {
+        ov::Core core = singleton_core();
         if (m_vlm_config.model_type == VLMModelType::MINICPM) {
-            m_resampler = ov::Core{}.compile_model(
+            m_resampler = core.compile_model(
                 model_dir / "openvino_resampler_model.xml", device, device_config
             ).create_infer_request();
 
             m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70});
         }
-        m_embedding = ov::Core{}.compile_model(
+        m_embedding = core.compile_model(
            model_dir / "openvino_text_embeddings_model.xml", device, device_config
         ).create_infer_request();
 
-        m_language = ov::Core{}.compile_model(
+        m_language = core.compile_model(
            model_dir / "openvino_language_model.xml", device, device_config
         ).create_infer_request();
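
`singleton_core()` is the classic function-local-static (Meyers) singleton: the static is initialized exactly once, thread-safely since C++11, and every call hands back the same underlying engine, so the vision encoder, resampler, embeddings, and language model are all compiled through one `ov::Core` and can exchange remote (e.g. GPU) tensors. Returning `ov::Core` by value still works because it is a reference-counted handle; copies share the same implementation. A minimal self-contained sketch of the pattern (the `Core` struct here is a stand-in, not the OpenVINO class):

    #include <cassert>

    struct Core {};  // stand-in for ov::Core, just to show the pattern

    Core& shared_core() {
        static Core core;  // constructed on first call only, even under concurrency
        return core;
    }

    int main() {
        assert(&shared_core() == &shared_core());  // every caller sees the same instance
        return 0;
    }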

src/cpp/src/visual_language/vision_encoder.cpp (+58 −42)
@@ -242,7 +242,6 @@ ov::Tensor prepare_vis_position_ids(
     });
     size_t position_ids_batch_elem = max_nb_patches_h * max_nb_patches_w;
     ov::Tensor position_ids{ov::element::i64, {batch_size, position_ids_batch_elem}};
-    // throw std::runtime_error("");
     int64_t* res_data = position_ids.data<int64_t>();
     std::fill_n(res_data, position_ids.get_size(), 0);

@@ -285,66 +284,84 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
     std::vector<std::vector<ov::Tensor>> results;
     std::vector<std::vector<ImageSize>> sizes;
 
-    // std::vector<clip_image_f32*> img_res_v; // format N x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
     std::vector<std::vector<clip_image_f32>> preprocessed{imgs.size()};
-    std::transform(imgs.begin(), imgs.end(), preprocessed.begin(), [&ctx_clip](const std::vector<clip_image_u8>& row) {
+    size_t max_h = 0, max_w = 0, n_images = 0;
+    std::transform(imgs.begin(), imgs.end(), preprocessed.begin(), [&ctx_clip, &max_h, &max_w, &n_images](const std::vector<clip_image_u8>& row) {
         std::vector<clip_image_f32> processed_row{row.size()};
-        std::transform(row.begin(), row.end(), processed_row.begin(), [&ctx_clip](const clip_image_u8& raw) {
-            return clip_image_preprocess(ctx_clip, raw);
+        std::transform(row.begin(), row.end(), processed_row.begin(), [&ctx_clip, &max_h, &max_w, &n_images](const clip_image_u8& raw) {
+            clip_image_f32 im = clip_image_preprocess(ctx_clip, raw);
+            max_h = std::max(size_t(im.ny), max_h);
+            max_w = std::max(size_t(im.nx), max_w);
+            ++n_images;
+            return im;
         });
         return processed_row;
     });
 
+    ov::Tensor batched_images{ov::element::f32, {n_images, 3, max_h, max_w}};
+    float* batched_data = batched_images.data<float>();
     const clip_image_f32& resized_preprocessed = preprocessed.at(0).at(0);
-    ImageSize resized_source_size{resized_preprocessed.ny / patch_size, resized_preprocessed.nx / patch_size};
-    ov::Tensor input_tensor{ov::element::f32, {1, 3, size_t(resized_preprocessed.ny), size_t(resized_preprocessed.nx)}, (void*)(resized_preprocessed.buf.data())};
-    ov::Tensor pixel_values = preprocess_for_encoder(input_tensor, patch_size);
+    std::copy(resized_preprocessed.buf.begin(), resized_preprocessed.buf.end(), batched_data);
+    if (1 < preprocessed.size()) {
+        for (size_t row = 1; row < preprocessed.size(); ++row) {
+            size_t n_slices = preprocessed.at(row).size();
+            for (size_t col = 0; col < n_slices; ++col) {
+                const clip_image_f32& elem = preprocessed.at(row).at(col);
+                std::copy(elem.buf.begin(), elem.buf.end(), batched_data + ((row - 1) * n_slices + col + 1) * 3 * max_h * max_w);
+            }
+        }
+    }
+    ov::Tensor pixel_values = preprocess_for_encoder(batched_images, patch_size);
     encoder.set_tensor("pixel_values", pixel_values);
-    ov::Tensor patch_attention_mask{ov::element::f32, {pixel_values.get_shape().at(0), 1, resized_source_size.height * resized_source_size.width}};
-    std::fill_n(patch_attention_mask.data<float>(), patch_attention_mask.get_size(), 1.0f);
+
+    ov::Tensor patch_attention_mask{ov::element::f32, {pixel_values.get_shape().at(0), 1, max_h / patch_size * max_w / patch_size}};
+    float* attention_data = patch_attention_mask.data<float>();
+    std::fill_n(attention_data, patch_attention_mask.get_size(), 0.0f);
+    std::fill_n(attention_data, resized_preprocessed.ny / patch_size * resized_preprocessed.nx / patch_size, 1.0f);
+    if (1 < preprocessed.size()) {
+        for (size_t row = 1; row < preprocessed.size(); ++row) {
+            size_t n_slices = preprocessed.at(row).size();
+            for (size_t col = 0; col < n_slices; ++col) {
+                const clip_image_f32& elem = preprocessed.at(row).at(col);
+                std::fill_n(attention_data + ((row - 1) * n_slices + col + 1) * max_h / patch_size * max_w / patch_size, elem.ny / patch_size * elem.nx / patch_size, 1.0f);
+            }
+        }
+    }
     encoder.set_tensor("patch_attention_mask", patch_attention_mask);
-    ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {resized_source_size}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size);
+
+    ImageSize resized_source_size{resized_preprocessed.ny / patch_size, resized_preprocessed.nx / patch_size};
+    std::vector<ImageSize> tgt_sizes{resized_source_size};
+    if (1 < preprocessed.size()) {
+        for (const std::vector<clip_image_f32>& row : preprocessed) {
+            for (const clip_image_f32& elem : row) {
+                tgt_sizes.push_back({elem.ny / patch_size, elem.nx / patch_size});
+            }
+        }
+    }
+    ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, tgt_sizes, patch_size, ctx_clip.image_size / patch_size);
     encoder.set_tensor("position_ids", position_ids);
     encoder.infer();
     const ov::Tensor& output_tensor = encoder.get_output_tensor();
-    ov::Tensor resized_source{ov::element::f32, output_tensor.get_shape()};
-    output_tensor.copy_to(resized_source);
 
     if (1 == preprocessed.size()) {
+        ov::Tensor resized_source{ov::element::f32, output_tensor.get_shape()};
+        output_tensor.copy_to(resized_source);
         return {std::move(resized_source), resized_source_size};
     }
 
-    ImageSize raw_size{
-        size_t(preprocessed.at(1).at(0).ny),
-        size_t(preprocessed.at(1).at(0).nx)
-    };
-    ImageSize slices_size{
-        raw_size.height / patch_size,
-        raw_size.width / patch_size
-    };
-    size_t n_patches = slices_size.height * slices_size.width,
-        old_hidden_size = resized_source.get_shape().at(2);
+    size_t old_hidden_size = output_tensor.get_shape().at(2);
+    const float* out = output_tensor.data<float>();
+    ov::Tensor resized_source{ov::element::f32, {1, resized_source_size.height * resized_source_size.width, old_hidden_size}};
+    std::copy_n(out, resized_source.get_size(), resized_source.data<float>());
+
+    size_t n_patches = tgt_sizes.at(1).height * tgt_sizes.at(1).width;
     ov::Tensor encoded_slices{ov::element::f32, {preprocessed.size() - 1, preprocessed.at(1).size(), n_patches, old_hidden_size}};
-    for (size_t row = 1; row < preprocessed.size(); ++row) {
-        for (size_t col = 0; col < preprocessed.at(row).size(); ++col) {
-            clip_image_f32& elem = preprocessed.at(row).at(col);
-            ov::Tensor pixel_values = preprocess_for_encoder(
-                {ov::element::f32, {1, 3, size_t(elem.ny), size_t(elem.nx)}, elem.buf.data()},
-                patch_size
-            );
-            encoder.set_tensor("pixel_values", pixel_values);
-            ov::Tensor patch_attention_mask{ov::element::f32, {1, 1, slices_size.height * slices_size.width}};
-            std::fill_n(patch_attention_mask.data<float>(), patch_attention_mask.get_size(), 1.0f);
-            encoder.set_tensor("patch_attention_mask", patch_attention_mask);
-            ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {slices_size}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size);
-            encoder.set_tensor("position_ids", position_ids);
-            const ov::Tensor& old = encoder.get_output_tensor();
-            encoder.set_output_tensor({ov::element::f32, {1, n_patches, old_hidden_size}, encoded_slices.data<float>() + ((row - 1) * preprocessed.at(row).size() + col) * n_patches * old_hidden_size});
-            encoder.infer();
-            encoder.set_output_tensor(old);
+    for (size_t col = 0; col < preprocessed.size() - 1; ++col) {
+        for (size_t row = 0; row < preprocessed.at(1).size(); ++row) {
+            std::copy_n(out + (col * preprocessed.at(1).size() + row + 1) * n_patches * old_hidden_size, n_patches * old_hidden_size, encoded_slices.data<float>() + (col * preprocessed.at(1).size() + row) * n_patches * old_hidden_size);
         }
     }
-    return {resized_source, resized_source_size, encoded_slices, slices_size};
+    return {resized_source, resized_source_size, encoded_slices, tgt_sizes.at(1)};
 }
 
 ProcessorConfig from_any_map(
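
This hunk is the heart of the change: instead of running the vision encoder once per slice and retargeting its output tensor each time, every preprocessed image is copied into a single zero-padded `{n_images, 3, max_h, max_w}` tensor (the resized full image in batch slot 0, the slices in row-major order after it), `patch_attention_mask` flags which patches are real versus padding, `tgt_sizes` carries each element's true patch grid, and one `encoder.infer()` produces all embeddings, which are then just `std::copy_n`'d back out. A small self-contained sketch of the slot arithmetic shared by both copy loops (sizes are illustrative; like the loops above, it assumes every slice row holds the same number of slices):

    #include <cstddef>
    #include <cstdio>

    int main() {
        // Slot 0 = resized full image; slices fill slots 1.. in row-major order.
        // Each slot spans 3 * max_h * max_w floats of the padded batch tensor.
        const std::size_t n_slices = 3, max_h = 476, max_w = 476;  // illustrative
        for (std::size_t row = 1; row <= 2; ++row) {               // slice rows
            for (std::size_t col = 0; col < n_slices; ++col) {
                const std::size_t slot = (row - 1) * n_slices + col + 1;  // +1 skips slot 0
                std::printf("slice (%zu, %zu) -> batch slot %zu, float offset %zu\n",
                            row, col, slot, slot * 3 * max_h * max_w);
            }
        }
        return 0;
    }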
@@ -504,7 +521,6 @@ EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ov::AnyMap& co
 
 EncodedImage VisionEncoder::encode_minicpm(const ov::Tensor& image, const ProcessorConfig& config) {
     clip_ctx ctx_clip;
-    ctx_clip.patch_size = m_processor_config.patch_size;
     ctx_clip.image_size = m_processor_config.image_size;
     std::copy(config.norm_mean.begin(), config.norm_mean.end(), ctx_clip.image_mean);
     std::copy(config.norm_std.begin(), config.norm_std.end(), ctx_clip.image_std);
