Commit 7f43828 ("temp")
1 parent 2a11316

File tree: 5 files changed, +225 -224 lines changed

samples/cpp/visual_language_chat/visual_language_chat.cpp (+1, -1)
@@ -17,7 +17,7 @@ int main(int argc, char* argv[]) try {
     std::string device = "CPU";  // GPU can be used as well
     ov::AnyMap enable_compile_cache;
     if ("GPU" == device) {
-        // Cache compile models on disks for GPU to save time on the
+        // Cache compiled models on disk for GPU to save time on the
         // next run. It's not beneficial for CPU.
         enable_compile_cache.insert({ov::cache_dir("vlm_cache")});
     }
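
For orientation, the AnyMap built above is meant to be forwarded to the pipeline constructor so that ov::cache_dir reaches model compilation. A minimal sketch of that hand-off (the VLMPipeline constructor signature is an assumption, not shown in this diff):

    // Properties flow through to ov::Core::compile_model. With
    // ov::cache_dir set, the GPU plugin serializes compiled blobs into
    // "vlm_cache" and reuses them on later runs instead of recompiling.
    ov::genai::VLMPipeline pipe(argv[1], device, enable_compile_cache);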

src/cpp/include/openvino/genai/processor_config.hpp (+3)
@@ -36,6 +36,9 @@ class OPENVINO_GENAI_EXPORTS ProcessorConfig {
     /// @brief the number of embedding vectors representing a single
     /// image slice.
     size_t image_feature_size = 64;
+    /// @brief The size of a single embedding returned by a resampler.
+    /// Used to initialize positional embeddings for resampler input.
+    size_t hidden_size = 2304;
     /// @brief Default constructor
     ProcessorConfig() = default;
     /// @brief Construct ProcessorConfig from values in json_path.
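
A small sanity check of what these defaults imply together (arithmetic only; both fields appear in this diff):

    ov::genai::ProcessorConfig config;
    // After resampling, one image slice is represented by
    // image_feature_size vectors of hidden_size floats each:
    size_t floats_per_slice = config.image_feature_size * config.hidden_size;  // 64 * 2304 = 147456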

src/cpp/include/openvino/genai/vision_encoder.hpp (+17, -26)
@@ -7,34 +7,21 @@
 #include <openvino/openvino.hpp>

 namespace ov::genai {
-/// @brief A pair describing image size.
-struct HeightWidth {
-    /// @brief Height of a corresponding image.
-    size_t height;
-    /// @brief Width of a corresponding image.
-    size_t width;
-};
-
 /// @brief Embeddings of a given image. The number of slices is no
 /// greater than ProcessorConfig's max_slice_nums.
-struct std::vector<EncodedImage> {
+struct EncodedImage {
     /// @brief Embeddings of a resized image based on ProcessorConfig's
     /// scale_resolution. The tensor's shape is
-    /// [N, H*W, hidden_size]. [N, 1014, 1152] is a possible example for
+    /// [image_feature_size, hidden_size]. [64, 2304] is a possible example for
     /// openbmb/MiniCPM-V-2.
     ov::Tensor resized_source;
-    /// @brief A size of an image used to compute embeddings for
-    /// divided by ProcessorConfig's patch_size.
-    HeightWidth resized_source_size;
     /// @brief Embeddings of images obtained from a source image by
     /// slicing at no more than max_slice_nums pieces and resizing.
     /// The tensor's shape is
-    /// [slice_y, slice_x, number_of_embeddings, embedding_size].
-    /// slices_sizes.size() == slice_y * slice_x.
+    /// [slice_y, slice_x, image_feature_size, embedding_size].
+    /// slices is empty if the image was small enough to skip slicing.
     ov::Tensor slices;
-    /// @brief Flattened sizes of images used to compute embeddings
-    /// stored in slices member divided by ProcessorConfig's patch_size.
-    std::vector<HeightWidth> slices_sizes;
 };

 /// @brief A class used to infer embeddings of an image using
@@ -49,14 +36,10 @@ class OPENVINO_GENAI_EXPORTS VisionEncoder {
     ov::InferRequest m_resampler;
     /// @brief A config to follow.
     ProcessorConfig m_processor_config;
-
-    /// @brief Construct from an already compiled model and a config.
-    /// @param encoder Compiled model.
-    /// @param processor_config Initial config.
-    explicit VisionEncoder(
-        const ov::InferRequest& encoder,
-        const ProcessorConfig& processor_config=ProcessorConfig{}
-    ) : m_vision_embedding{encoder}, m_processor_config{processor_config} {}
+    /// @brief Precomputed positional embeddings for the resampler.
+    /// [70, 70, hidden_size]. 70 is the initial guess of the image
+    /// height and width after dividing by patch_size.
+    ov::Tensor m_pos_embed_cache;

     /// @brief Construct the encoder from model_dir.
     /// @param model_dir A folder containing openvino_embedding.xml and
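
For scale: the default 70x70 cache at hidden_size = 2304 holds 70 * 70 * 2304 = 11,289,600 floats, about 43 MiB in f32; adjust_pos_cache in vision_encoder.cpp below regrows it lazily when a larger patch grid is requested.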
@@ -72,6 +55,14 @@ class OPENVINO_GENAI_EXPORTS VisionEncoder {
         ov::Core core=ov::Core{}
     );

+    /// @brief Construct from an already compiled model and a config.
+    /// @param encoder Compiled model.
+    /// @param processor_config Initial config.
+    explicit VisionEncoder(
+        const ov::InferRequest& encoder,
+        const ProcessorConfig& processor_config=ProcessorConfig{}
+    ) : m_vision_embedding{encoder}, m_processor_config{processor_config} {}
+
     /// @brief Compute embeddings of an image.
     /// @param image Images to infer embeddings for. Image shape must be
     /// [HWC].
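
Put together, a hedged sketch of the caller's view after this change (the encode method name and the image-loading helper are assumptions; the constructor and the EncodedImage fields are from this diff):

    ov::genai::VisionEncoder encoder{"./MiniCPM-V-2/"};  // compiles openvino_embedding.xml
    ov::Tensor image = load_hwc_image("cat.png");  // hypothetical helper returning an [HWC] tensor
    ov::genai::EncodedImage encoded = encoder.encode(image);  // method name assumed
    // encoded.resized_source: [image_feature_size, hidden_size], e.g. [64, 2304]
    // encoded.slices: [slice_y, slice_x, image_feature_size, embedding_size],
    // or an empty tensor when slicing was skipped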

src/cpp/src/vision_encoder.cpp (+204, -27)
@@ -59,9 +59,9 @@ std::vector<std::vector<clip_image_u8>> slice_image(const clip_image_u8& img, co
     const std::pair<int, int> original_size{img.nx, img.ny};
     const int original_width = img.nx;
     const int original_height = img.ny;
-    const float log_ratio = log(1.0 * original_width / original_height);
-    const float ratio = 1.0 * original_width * original_height / (scale_resolution * scale_resolution);
-    const int multiple = fmin(ceil(ratio), max_slice_nums);
+    const float log_ratio = std::log(1.0f * original_width / original_height);
+    const float ratio = 1.0f * original_width * original_height / (scale_resolution * scale_resolution);
+    const int multiple = std::min(int(std::ceil(ratio)), max_slice_nums);

     std::vector<std::vector<clip_image_u8>> images;
     images.push_back(std::vector<clip_image_u8>{});
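
A worked instance of the arithmetic above (max_slice_nums is a function parameter; 9 is only an assumed value): a 1344x896 input with scale_resolution = 448 gives ratio = 1344 * 896 / (448 * 448) = 6, so multiple = std::min(int(std::ceil(6.0f)), 9) = 6 slice candidates.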
@@ -140,7 +140,176 @@ std::vector<std::vector<clip_image_u8>> slice_image(const clip_image_u8& img, co
     return images;
 }

-EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const ov::Tensor& img, ov::InferRequest& encoder, int max_slice_nums, int scale_resolution, size_t patch_size, bool never_split) {
+/// Concatenate two [d0, d1, d2] float tensors along the last axis;
+/// both inputs must share the same shape.
+ov::Tensor concatenate(const ov::Tensor& first, const ov::Tensor& second) {
+    size_t res_d_0 = first.get_shape().at(0);
+    size_t res_d_1 = first.get_shape().at(1);
+    size_t res_d_2 = first.get_shape().at(2) * 2;
+    ov::Tensor res{first.get_element_type(), {res_d_0, res_d_1, res_d_2}};
+    float* first_data = first.data<float>();
+    float* second_data = second.data<float>();
+    float* res_data = res.data<float>();
+    for (size_t i = 0; i < res_d_0; ++i) {
+        for (size_t j = 0; j < res_d_1; ++j) {
+            size_t k = 0;
+            for (; k < first.get_shape().at(2); ++k) {
+                res_data[i * res_d_1 * res_d_2 + j * res_d_2 + k]
+                    = first_data[i * res_d_1 * first.get_shape().at(2) + j * first.get_shape().at(2) + k];
+            }
+            for (size_t l = 0; l < second.get_shape().at(2); ++l, ++k) {
+                res_data[i * res_d_1 * res_d_2 + j * res_d_2 + k]
+                    = second_data[i * res_d_1 * second.get_shape().at(2) + j * second.get_shape().at(2) + l];
+            }
+        }
+    }
+    return res;
+}
+
+/// embed_dim: output dimension for each position
+/// pos: a tensor of positions to be encoded, shape (1, H, W)
+/// out: (H, W, D)
+ov::Tensor get_1d_sincos_pos_embed_from_grid_new(size_t embed_dim, const ov::Tensor& pos) {
+    OPENVINO_ASSERT(embed_dim % 2 == 0);
+    OPENVINO_ASSERT(pos.get_shape().size() == 3);
+    OPENVINO_ASSERT(pos.get_shape().at(0) == 1);
+    size_t d0 = pos.get_shape().at(1);
+    size_t d1 = pos.get_shape().at(2);
+    size_t d2 = embed_dim / 2;
+    std::vector<float> omega(d2);
+    for (size_t idx = 0; idx < omega.size(); ++idx) {
+        omega.at(idx) = idx / (embed_dim / 2.0f);
+        omega.at(idx) = 1.0f / std::pow(10000.0f, omega.at(idx));  // (D/2,)
+    }
+    const float* const pos_data = pos.data<float>();
+    ov::Tensor out(ov::element::f32, {d0, d1, d2});  // (H, W, D/2), outer product
+    float* out_data = out.data<float>();
+    for (size_t i = 0; i < d0; ++i) {
+        for (size_t j = 0; j < d1; ++j) {
+            for (size_t k = 0; k < d2; ++k) {
+                out_data[i * d1 * d2 + j * d2 + k]
+                    = pos_data[i * d1 + j] * omega[k];
+            }
+        }
+    }
+
+    ov::Tensor emb_sin{out.get_element_type(), out.get_shape()};  // (H, W, D/2)
+    float* emb_sin_data = emb_sin.data<float>();
+    std::transform(out_data, out_data + out.get_size(), emb_sin_data, [](float arg) {
+        return std::sin(arg);
+    });
+    ov::Tensor emb_cos{out.get_element_type(), out.get_shape()};  // (H, W, D/2)
+    float* emb_cos_data = emb_cos.data<float>();
+    std::transform(out_data, out_data + out.get_size(), emb_cos_data, [](float arg) {
+        return std::cos(arg);
+    });
+    return concatenate(emb_sin, emb_cos);  // (H, W, D)
+}
+
+ov::Tensor get_2d_sincos_pos_embed_from_grid(size_t embed_dim, const ov::Tensor& grid) {
+    OPENVINO_ASSERT(embed_dim % 2 == 0);
+    // use half of dimensions to encode grid_h
+    ov::Coordinate begin_h{0, 0, 0};
+    ov::Coordinate end_h{grid.get_shape()};
+    end_h.at(0) = 1;
+    ov::Coordinate begin_w{1, 0, 0};
+    ov::Coordinate end_w{grid.get_shape()};
+    end_w.at(0) = 2;
+    ov::Tensor emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, ov::Tensor{grid, begin_h, end_h});  // (H, W, D/2)
+    ov::Tensor emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, ov::Tensor{grid, begin_w, end_w});  // (H, W, D/2)
+    return concatenate(emb_h, emb_w);
+}
+
+/// image_size: (image_height, image_width), measured in patches
+/// return:
+///     pos_embed: [image_height, image_width, embed_dim]
+ov::Tensor get_2d_sincos_pos_embed(size_t embed_dim, const HeightWidth& image_size) {
+    size_t grid_h_size = image_size.height, grid_w_size = image_size.width;
+    ov::Tensor grid(ov::element::f32, {2, grid_h_size, grid_w_size});
+    float* data = grid.data<float>();
+    // Plane grid[0]: column indices 0..W-1, repeated for every row.
+    for (size_t y = 0; y < grid_h_size; ++y) {
+        std::iota(data, data + grid_w_size, 0.0f);
+        data += grid_w_size;
+    }
+    // Plane grid[1]: the row index broadcast across each row.
+    for (float y = 0.0f; y < grid_h_size; ++y) {
+        std::fill(data, data + grid_w_size, y);
+        data += grid_w_size;
+    }
+    return get_2d_sincos_pos_embed_from_grid(embed_dim, grid);
+}
+
+void adjust_pos_cache(
+    const std::vector<HeightWidth>& target_sizes,
+    size_t hidden_size,
+    ov::Tensor& pos_embed_cache
+) {
+    size_t max_h = std::max_element(target_sizes.begin(), target_sizes.end(), [](const HeightWidth& left, const HeightWidth& right) {
+        return left.height < right.height;
+    })->height;
+    size_t max_w = std::max_element(target_sizes.begin(), target_sizes.end(), [](const HeightWidth& left, const HeightWidth& right) {
+        return left.width < right.width;
+    })->width;
+    size_t allocated_height, allocated_width;
+    if (pos_embed_cache) {
+        const ov::Shape& allocated_shape = pos_embed_cache.get_shape();
+        allocated_height = allocated_shape.at(0);
+        allocated_width = allocated_shape.at(1);
+    } else {
+        allocated_height = allocated_width = 70;
+    }
+    if (max_h > allocated_height || max_w > allocated_width) {
+        allocated_height = std::max(max_h, allocated_height);
+        allocated_width = std::max(max_w, allocated_width);
+        pos_embed_cache = get_2d_sincos_pos_embed(
+            hidden_size, {allocated_height, allocated_width}
+        );
+    }
+}
+
+ov::Tensor resample(VisionEncoder& vision, const ov::Tensor& encoded_image, const std::vector<HeightWidth>& target_sizes) {
+    size_t bs = encoded_image.get_shape().at(0);
+    std::vector<size_t> patch_len(target_sizes.size());
+    std::transform(target_sizes.begin(), target_sizes.end(), patch_len.begin(), [](const HeightWidth& height_width) {
+        return height_width.height * height_width.width;
+    });
+    adjust_pos_cache(
+        target_sizes,
+        vision.m_processor_config.hidden_size,
+        vision.m_pos_embed_cache
+    );
+    size_t max_patch_len = *std::max_element(patch_len.begin(), patch_len.end());
+    ov::Tensor key_padding_mask(ov::element::boolean, {bs, max_patch_len});
+    bool* mask_data = key_padding_mask.data<bool>();
+    size_t embed_len = vision.m_pos_embed_cache.get_shape().at(2);
+    ov::Tensor pos_embed(ov::element::f32, {max_patch_len, bs, embed_len});  // BLD => L * B * D
+    float* pos_embed_data = pos_embed.data<float>();
+    float* cache_data = vision.m_pos_embed_cache.data<float>();
+    size_t cache_w = vision.m_pos_embed_cache.get_shape().at(1);
+    for (size_t i = 0; i < bs; ++i) {
+        size_t target_h = target_sizes.at(i).height;
+        size_t target_w = target_sizes.at(i).width;
+        for (size_t h_idx = 0; h_idx < target_h; ++h_idx) {
+            for (size_t w_idx = 0; w_idx < target_w; ++w_idx) {
+                std::copy_n(
+                    cache_data + (h_idx * cache_w + w_idx) * embed_len,
+                    embed_len,
+                    pos_embed_data + (h_idx * target_w + w_idx) * bs * embed_len + i * embed_len
+                );
+            }
+        }
+        for (size_t flat = target_h * target_w; flat < max_patch_len; ++flat) {
+            std::fill_n(pos_embed_data + flat * bs * embed_len + i * embed_len, embed_len, 0.0f);
+        }
+        std::fill_n(mask_data + i * max_patch_len, patch_len[i], false);
+        std::fill_n(mask_data + i * max_patch_len + patch_len[i], max_patch_len - patch_len[i], true);
+    }
+    vision.m_resampler.set_tensor("x", encoded_image);  // [N, H*W, old_hidden_size]
+    vision.m_resampler.set_tensor("pos_embed", pos_embed);  // [H*W, N, new_hidden_size]
+    vision.m_resampler.set_tensor("key_padding_mask", key_padding_mask);  // [N, H*W]
+    vision.m_resampler.infer();
+    return vision.m_resampler.get_output_tensor();  // [N, query_num, new_hidden_size]
+}
+
+EncodedImage llava_image_embed_make_with_bytes_slice(VisionEncoder& vision, clip_ctx& ctx_clip, const ov::Tensor& img, ov::InferRequest& encoder, int max_slice_nums, int scale_resolution, size_t patch_size, bool never_split) {
     clip_image_u8 source{
         int(img.get_shape().at(3)),
         int(img.get_shape().at(2)),
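
In equation form, the helpers above implement the standard fixed sin-cos positional encoding (notation mine, with D = embed_dim and scalar position p):

    \omega_k = 10000^{-2k/D}, \qquad k = 0, \dots, D/2 - 1

    \mathrm{PE}_D(p) = \big[\sin(p\,\omega_0), \dots, \sin(p\,\omega_{D/2-1}),\ \cos(p\,\omega_0), \dots, \cos(p\,\omega_{D/2-1})\big]

get_2d_sincos_pos_embed_from_grid splits the budget between the two coordinate planes grid_0 (column indices) and grid_1 (row indices) built in get_2d_sincos_pos_embed:

    \mathrm{PE}^{2\mathrm{D}}_D = \big[\mathrm{PE}_{D/2}(\mathrm{grid}_0)\ \Vert\ \mathrm{PE}_{D/2}(\mathrm{grid}_1)\big]

applied elementwise over the (H, W) patch grid, yielding the [H, W, D] tensor that adjust_pos_cache stores in m_pos_embed_cache.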
@@ -168,35 +337,43 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
     ov::Tensor resized_source{output_tensor.get_element_type(), output_tensor.get_shape()};
     output_tensor.copy_to(resized_source);
     HeightWidth resized_source_size{resized_preprocessed.ny / patch_size, resized_preprocessed.nx / patch_size};
-
-    HeightWidth size{
-        size_t(preprocessed.at(1).at(0).ny),
-        size_t(preprocessed.at(1).at(0).nx)
-    };
-    ov::Tensor batched{ov::element::f32, {(preprocessed.size() - 1) * preprocessed.at(1).size(), 3, size.height, size.width}};
-    float* batched_data = batched.data<float>();
-    size_t batch_offset = 0;
-    size_t values_in_elem = 3 * size.height * size.width;
-    std::vector<HeightWidth> sliced_sizes;
-    for (size_t row = 1; row < preprocessed.size(); ++row) {
-        for (const clip_image_f32& elem : preprocessed.at(row)) {
-            std::copy_n(elem.buf.begin(), values_in_elem, batched_data + batch_offset);
-            sliced_sizes.push_back({elem.ny / patch_size, elem.nx / patch_size});
-            batch_offset += values_in_elem;
-        }
-    }
-    encoder.set_input_tensor(batched);
-    encoder.infer();
-    const ov::Tensor& encoded = encoder.get_output_tensor();
-    const ov::Shape& plain = encoded.get_shape();
     struct SharedTensorAllocator {
         const ov::Tensor tensor;
         void* allocate(size_t bytes, size_t) {return bytes <= tensor.get_byte_size() ? tensor.data() : nullptr;}
         void deallocate(void*, size_t, size_t) {}
         bool is_equal(const SharedTensorAllocator& other) const noexcept {return this == &other;}
     };
-    ov::Tensor reshaped{encoded.get_element_type(), {preprocessed.size() - 1, preprocessed.at(1).size(), plain.at(1), plain.at(2)}, SharedTensorAllocator{encoded}};
-    return {resized_source, resized_source_size, reshaped, sliced_sizes};
+    ov::Tensor resampled_resized = resample(vision, encoder.get_output_tensor(), {resized_source_size});
+    ov::Tensor owner{resampled_resized.get_element_type(), resampled_resized.get_shape()};
+    resampled_resized.copy_to(owner);
+    owner.set_shape({owner.get_shape().at(1), owner.get_shape().at(2)});
+
+    ov::Tensor resampled_slices;
+    if (1 < preprocessed.size()) {
+        HeightWidth size{
+            size_t(preprocessed.at(1).at(0).ny),
+            size_t(preprocessed.at(1).at(0).nx)
+        };
+        ov::Tensor batched{ov::element::f32, {(preprocessed.size() - 1) * preprocessed.at(1).size(), 3, size.height, size.width}};
+        float* batched_data = batched.data<float>();
+        size_t batch_offset = 0;
+        size_t values_in_elem = 3 * size.height * size.width;
+        std::vector<HeightWidth> sliced_sizes;
+        for (size_t row = 1; row < preprocessed.size(); ++row) {
+            for (const clip_image_f32& elem : preprocessed.at(row)) {
+                std::copy_n(elem.buf.begin(), values_in_elem, batched_data + batch_offset);
+                sliced_sizes.push_back({elem.ny / patch_size, elem.nx / patch_size});
+                batch_offset += values_in_elem;
+            }
+        }
+        encoder.set_input_tensor(batched);
+        encoder.infer();
+        ov::Tensor resampled_batched = resample(vision, encoder.get_output_tensor(), sliced_sizes);
+        const ov::Shape& plain = resampled_batched.get_shape();
+        resampled_slices = ov::Tensor{resampled_batched.get_element_type(), {preprocessed.size() - 1, preprocessed.at(1).size(), plain.at(1), plain.at(2)}, SharedTensorAllocator{resampled_batched}};
+    }
+    return {owner, resampled_slices};
 }
 }
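
As a dependency-free sanity check of the sin-cos math introduced in this file, a standalone sketch mirroring the omega/sin/cos computation for a single position (sincos_embed_1d is an ad-hoc name, not part of the codebase):

    // sincos_check.cpp; compile with: g++ -std=c++17 sincos_check.cpp
    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Mirrors get_1d_sincos_pos_embed_from_grid_new for one position:
    // the first half of the output holds sines, the second half cosines.
    std::vector<float> sincos_embed_1d(float pos, size_t embed_dim) {
        size_t half = embed_dim / 2;
        std::vector<float> emb(embed_dim);
        for (size_t k = 0; k < half; ++k) {
            float omega = 1.0f / std::pow(10000.0f, k / float(half));
            emb[k] = std::sin(pos * omega);
            emb[half + k] = std::cos(pos * omega);
        }
        return emb;
    }

    int main() {
        // For pos = 3 and embed_dim = 8, omega = {1, 0.1, 0.01, 0.001}, so
        // the output is [sin 3, sin 0.3, sin 0.03, sin 0.003, cos 3, ...].
        for (float v : sincos_embed_1d(3.0f, 8)) {
            std::printf("%f\n", v);
        }
    }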
