temp

Wovchena · Wovchena · commit 7f438288e8cf · 2024-09-05T14:48:08.000+04:00
diff --git a/samples/cpp/visual_language_chat/visual_language_chat.cpp b/samples/cpp/visual_language_chat/visual_language_chat.cpp
@@ -17,7 +17,7 @@ int main(int argc, char* argv[]) try {
     std::string device = "CPU";  // GPU can be used as well
     ov::AnyMap enable_compile_cache;
     if ("GPU" == device) {
-        // Cache compile models on disks for GPU to save time on the
+        // Cache compile models on disk for GPU to save time on the
         // next run. It's not beneficial for CPU.
         enable_compile_cache.insert({ov::cache_dir("vlm_cache")});
     }
diff --git a/src/cpp/include/openvino/genai/processor_config.hpp b/src/cpp/include/openvino/genai/processor_config.hpp
@@ -36,6 +36,9 @@ class OPENVINO_GENAI_EXPORTS ProcessorConfig {
     /// @brief the number of embedding vectors representing a single
     /// image slice.
     size_t image_feature_size = 64;
+    /// @brief The size of a single embedding returned by a resampler.
+    /// Used to initialize positional embeddings for resampler input.
+    size_t hidden_size = 2304;
     /// @brief Default constructor
     ProcessorConfig() = default;
     /// @brief Construct ProcessorConfig from values in json_path.
diff --git a/src/cpp/include/openvino/genai/vision_encoder.hpp b/src/cpp/include/openvino/genai/vision_encoder.hpp
@@ -7,34 +7,21 @@
 #include <openvino/openvino.hpp>
 
 namespace ov::genai {
-/// @brief A pair describing image size.
-struct HeightWidth {
-    /// @brief Height of a corresponding image.
-    size_t height;
-    /// @brief Width of a corresponding image.
-    size_t width;
-};
-
 /// @brief Embeddings of a given image. The number of slices is no
 /// greater than ProcessorConfig's max_slice_nums.
-struct std::vector<EncodedImage> {
+struct EncodedImage {
     /// @brief Embeddings of a resized image based on ProcessorConfig's
     /// scale_resolution. The tensor's shape is
-    /// [N, H*W, hidden_size]. [N, 1014, 1152] is a possible example for
+    /// [image_feature_size, hidden_size]. [1014, 1152] is a possible example for
     /// openbmb/MiniCPM-V-2.
     ov::Tensor resized_source;
-    /// @brief A size of an image used to compute embeddings for
-    /// divided by ProcessorConfig's patch_size.
-    HeightWidth resized_source_size;
     /// @brief Embeddings of images obtained from a source image by
     /// slicing at no more than max_slice_nums pieces and resizing.
     /// The tensor's shape is
-    /// [slice_y, slice_x, number_of_embeddings, embedding_size].
-    /// slices_sizes.size() == slice_y * slice_x.
+    /// [slice_y, slice_x, image_feature_size, embedding_size].
+    /// The tensor is empty if the image was small enough to skip
+    /// slicing.
     ov::Tensor slices;
-    /// @brief Flattened sizes of images used to compute embeddings
-    /// stored in slices member divided by ProcessorConfig's patch_size.
-    std::vector<HeightWidth> slices_sizes;
 };
 
 /// @brief A class used to infer embeddings of an image using
@@ -49,14 +36,10 @@ class OPENVINO_GENAI_EXPORTS VisionEncoder {
     ov::InferRequest m_resampler;
     /// @brief A config to follow.
     ProcessorConfig m_processor_config;
-
-    /// @brief Construct from an already compiled model and a config.
-    /// @param encoder Compiled model.
-    /// @param processor_config Initial config.
-    explicit VisionEncoder(
-        const ov::InferRequest& encoder,
-        const ProcessorConfig& processor_config=ProcessorConfig{}
-    ) : m_vision_embedding{encoder}, m_processor_config{processor_config} {}
+    // Precomputed positional embeddings for the resampler.
+    // [70, 70, hidden_size]. 70 is the initial guess of the image
+    // height and width after dividing by patch_size.
+    ov::Tensor m_pos_embed_cache;
 
     /// @brief Construct the encoder from model_dir.
     /// @param model_dir A folder containing openvino_embedding.xml and
@@ -72,6 +55,14 @@ class OPENVINO_GENAI_EXPORTS VisionEncoder {
         ov::Core core=ov::Core{}
     );
 
+    /// @brief Construct from an already compiled model and a config.
+    /// @param encoder Compiled model.
+    /// @param processor_config Initial config.
+    explicit VisionEncoder(
+        const ov::InferRequest& encoder,
+        const ProcessorConfig& processor_config=ProcessorConfig{}
+    ) : m_vision_embedding{encoder}, m_processor_config{processor_config} {}
+
     /// @brief Compute embeddings of an image.
     /// @param image Images to infer embeddings for. Image shape must be
     /// [HWC].
diff --git a/src/cpp/src/vision_encoder.cpp b/src/cpp/src/vision_encoder.cpp
@@ -59,9 +59,9 @@ std::vector<std::vector<clip_image_u8>> slice_image(const clip_image_u8& img, co
     const std::pair<int, int> original_size{img.nx, img.ny};
     const int original_width = img.nx;
     const int original_height = img.ny;
-    const float log_ratio = log(1.0 * original_width / original_height);
-    const float ratio = 1.0 * original_width * original_height / (scale_resolution * scale_resolution);
-    const int multiple = fmin(ceil(ratio), max_slice_nums);
+    const float log_ratio = log(1.0f * original_width / original_height);
+    const float ratio = 1.0f * original_width * original_height / (scale_resolution * scale_resolution);
+    const int multiple = std::min(ceil(ratio), max_slice_nums);
 
     std::vector<std::vector<clip_image_u8>> images;
     images.push_back(std::vector<clip_image_u8>{});
@@ -140,7 +140,176 @@ std::vector<std::vector<clip_image_u8>> slice_image(const clip_image_u8& img, co
     return images;
 }
 
-EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const ov::Tensor& img, ov::InferRequest& encoder, int max_slice_nums, int scale_resolution, size_t patch_size, bool never_split) {
+/// Concatenate two rank 3 f32 tensors along the last axis.
+/// first: (D0, D1, A), second: (D0, D1, B), out: (D0, D1, A + B)
+ov::Tensor concatenate(const ov::Tensor& first, const ov::Tensor& second) {
+    const ov::Shape& first_shape = first.get_shape();
+    const ov::Shape& second_shape = second.get_shape();
+    OPENVINO_ASSERT(first_shape.size() == 3 && second_shape.size() == 3);
+    // The leading dimensions must agree, otherwise the row copies below read out of bounds.
+    OPENVINO_ASSERT(first_shape.at(0) == second_shape.at(0) && first_shape.at(1) == second_shape.at(1));
+    const size_t rows = first_shape.at(0) * first_shape.at(1);
+    const size_t first_len = first_shape.at(2);
+    const size_t second_len = second_shape.at(2);
+    // The result is sized from both inputs instead of assuming that second is as wide as first.
+    ov::Tensor res{first.get_element_type(), {first_shape.at(0), first_shape.at(1), first_len + second_len}};
+    const float* first_data = first.data<float>();
+    const float* second_data = second.data<float>();
+    float* res_data = res.data<float>();
+    for (size_t row = 0; row < rows; ++row) {
+        float* res_row = res_data + row * (first_len + second_len);
+        std::copy_n(first_data + row * first_len, first_len, res_row);
+        std::copy_n(second_data + row * second_len, second_len, res_row + first_len);
+    }
+    return res;
+}
+
+/// Sine-cosine embeddings for one plane of positions.
+/// embed_dim: output dimension for each position, must be even.
+/// pos: positions to be encoded, shape (1, H, W)
+/// out: (H, W, D), D/2 sines followed by D/2 cosines per position
+ov::Tensor get_1d_sincos_pos_embed_from_grid_new(size_t embed_dim, const ov::Tensor& pos) {
+    OPENVINO_ASSERT(embed_dim % 2 == 0);
+    OPENVINO_ASSERT(pos.get_shape().size() == 3);
+    OPENVINO_ASSERT(pos.get_shape().at(0) == 1);
+
+    const size_t height = pos.get_shape().at(1);
+    const size_t width = pos.get_shape().at(2);
+    const size_t half_dim = embed_dim / 2;
+
+    // omega[k] = 1 / 10000^(k / (D/2))
+    std::vector<float> omega(half_dim);  // (D/2,)
+    for (size_t idx = 0; idx < half_dim; ++idx) {
+        const float exponent = idx / (embed_dim / 2.0f);
+        omega.at(idx) = 1.0f / std::pow(10000.0f, exponent);
+    }
+
+    const ov::Shape half_shape{height, width, half_dim};
+    ov::Tensor emb_sin{ov::element::f32, half_shape};  // (H, W, D/2)
+    ov::Tensor emb_cos{ov::element::f32, half_shape};  // (H, W, D/2)
+    const float* position = pos.data<float>();
+    float* sin_out = emb_sin.data<float>();
+    float* cos_out = emb_cos.data<float>();
+
+    // Positions are visited in row-major order. Each one is multiplied
+    // by every omega (the outer product) and the sine and the cosine
+    // of the product are stored.
+    for (size_t flat = 0; flat < height * width; ++flat, ++position) {
+        for (size_t k = 0; k < half_dim; ++k) {
+            const float angle = *position * omega[k];
+            *sin_out++ = std::sin(angle);
+            *cos_out++ = std::cos(angle);
+        }
+    }
+    return concatenate(emb_sin, emb_cos);  // (H, W, D)
+}
+
+/// Encode both planes of grid (2, H, W), half of embed_dim each. out: (H, W, embed_dim)
+ov::Tensor get_2d_sincos_pos_embed_from_grid(size_t embed_dim, const ov::Tensor& grid) {
+    OPENVINO_ASSERT(embed_dim % 2 == 0);
+    const auto embed_plane = [&](size_t plane) {
+        ov::Coordinate begin{plane, 0, 0};
+        ov::Coordinate end{grid.get_shape()};
+        end.at(0) = plane + 1;  // the region covers grid[plane] only
+        return get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, ov::Tensor{grid, begin, end});  // (H, W, D/2)
+    };
+    const ov::Tensor emb_h = embed_plane(0);
+    const ov::Tensor emb_w = embed_plane(1);
+    return concatenate(emb_h, emb_w);
+}
+
+/// Build sine-cosine positional embeddings for every cell of a grid.
+/// embed_dim: number of values per cell. Each half is split again into sines and cosines.
+/// image_size: (image_height, image_width) of the grid.
+/// return: pos_embed: [image_height, image_width, embed_dim]
+ov::Tensor get_2d_sincos_pos_embed(size_t embed_dim, const HeightWidth& image_size) {
+    const size_t grid_h_size = image_size.height;
+    const size_t grid_w_size = image_size.width;
+    ov::Tensor grid(ov::element::f32, {2, grid_h_size, grid_w_size});
+    float* column_plane = grid.data<float>();  // grid[0]: column index of each cell
+    float* row_plane = column_plane + grid_h_size * grid_w_size;  // grid[1]: row index of each cell
+    // Rows are counted with an integer: a float counter stops advancing once it reaches 2^24.
+    for (size_t y = 0; y < grid_h_size; ++y) {
+        std::iota(column_plane + y * grid_w_size, column_plane + (y + 1) * grid_w_size, 0.0f);
+        std::fill_n(row_plane + y * grid_w_size, grid_w_size, static_cast<float>(y));
+    }
+    return get_2d_sincos_pos_embed_from_grid(embed_dim, grid);
+}
+
+/// Make sure pos_embed_cache covers every size in target_sizes.
+/// target_sizes: grid sizes about to be looked up, must not be empty.
+/// hidden_size: embed_dim used to generate the cache.
+/// pos_embed_cache: [height, width, hidden_size]. Regenerated if it's empty or too small, it never shrinks.
+void adjust_pos_cache(
+    const std::vector<HeightWidth>& target_sizes,
+    size_t hidden_size,
+    ov::Tensor& pos_embed_cache
+) {
+    OPENVINO_ASSERT(!target_sizes.empty());
+    size_t max_h = 0;
+    size_t max_w = 0;
+    for (const HeightWidth& size : target_sizes) {
+        max_h = std::max(max_h, size.height);
+        max_w = std::max(max_w, size.width);
+    }
+    size_t allocated_height = 70;  // 70 is the initial guess used until the cache is allocated
+    size_t allocated_width = 70;
+    const bool is_allocated = static_cast<bool>(pos_embed_cache);
+    if (is_allocated) {
+        allocated_height = pos_embed_cache.get_shape().at(0);
+        allocated_width = pos_embed_cache.get_shape().at(1);
+    }
+    if (!is_allocated || max_h > allocated_height || max_w > allocated_width) {
+        pos_embed_cache = get_2d_sincos_pos_embed(hidden_size, {std::max(max_h, allocated_height), std::max(max_w, allocated_width)});
+    }
+}
+
+/// Run the resampler over a batch of encoded images.
+/// encoded_image: [N, H*W, old_hidden_size] embeddings to resample.
+/// target_sizes: patch grid (height, width) of every image in the batch, N items.
+/// return: the resampler's output tensor, [N, query_num, new_hidden_size]
+ov::Tensor resample(VisionEncoder& vision, const ov::Tensor& encoded_image, const std::vector<HeightWidth>& target_sizes) {
+    const size_t bs = encoded_image.get_shape().at(0);
+    OPENVINO_ASSERT(bs > 0 && bs == target_sizes.size());
+    // Parentheses are required: braces would create a vector holding a single element.
+    std::vector<size_t> patch_len(bs);
+    std::transform(target_sizes.begin(), target_sizes.end(), patch_len.begin(), [](const HeightWidth& height_width) {
+        return height_width.height * height_width.width;
+    });
+    adjust_pos_cache(target_sizes, vision.m_processor_config.hidden_size, vision.m_pos_embed_cache);
+    const size_t max_patch_len = *std::max_element(patch_len.begin(), patch_len.end());
+    ov::Tensor key_padding_mask(ov::element::boolean, {bs, max_patch_len});
+    bool* mask_data = key_padding_mask.data<bool>();
+    const size_t cache_width = vision.m_pos_embed_cache.get_shape().at(1);
+    const size_t embed_len = vision.m_pos_embed_cache.get_shape().at(2);
+    ov::Tensor pos_embed(ov::element::f32, {max_patch_len, bs, embed_len});  // BLD => L * B * D
+    float* pos_embed_data = pos_embed.data<float>();
+    const float* cache_data = vision.m_pos_embed_cache.data<float>();
+    for (size_t i = 0; i < bs; ++i) {
+        const size_t target_h = target_sizes.at(i).height;
+        const size_t target_w = target_sizes.at(i).width;
+        for (size_t h_idx = 0; h_idx < target_h; ++h_idx) {
+            for (size_t w_idx = 0; w_idx < target_w; ++w_idx) {
+                // The cache is [height, width, embed_len], so the cell index is scaled by embed_len.
+                const float* cell = cache_data + (h_idx * cache_width + w_idx) * embed_len;
+                std::copy_n(cell, embed_len, pos_embed_data + (h_idx * target_w + w_idx) * bs * embed_len + i * embed_len);
+            }
+        }
+        // Positions past this image's patches get zero embeddings and are masked out.
+        for (size_t flat = patch_len[i]; flat < max_patch_len; ++flat) {
+            std::fill_n(pos_embed_data + flat * bs * embed_len + i * embed_len, embed_len, 0.0f);
+        }
+        std::fill_n(mask_data + i * max_patch_len, patch_len[i], false);
+        std::fill_n(mask_data + i * max_patch_len + patch_len[i], max_patch_len - patch_len[i], true);
+    }
+    vision.m_resampler.set_tensor("x", encoded_image);  // [N, H*W, old_hidden_size]
+    vision.m_resampler.set_tensor("pos_embed", pos_embed);  // [H*W, N, new_hidden_size]
+    vision.m_resampler.set_tensor("key_padding_mask", key_padding_mask);  // [N, H*W]
+    vision.m_resampler.infer();
+    return vision.m_resampler.get_output_tensor();  // [N, query_num, new_hidden_size]
+}
+
+EncodedImage llava_image_embed_make_with_bytes_slice(VisionEncoder& vision, clip_ctx& ctx_clip, const ov::Tensor& img, ov::InferRequest& encoder, int max_slice_nums, int scale_resolution, size_t patch_size, bool never_split) {
     clip_image_u8 source{
         int(img.get_shape().at(3)),
         int(img.get_shape().at(2)),
@@ -168,35 +337,43 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
     ov::Tensor resized_source{output_tensor.get_element_type(), output_tensor.get_shape()};
     output_tensor.copy_to(resized_source);
     HeightWidth resized_source_size{resized_preprocessed.ny / patch_size, resized_preprocessed.nx / patch_size};
-
-    HeightWidth size{
-        size_t(preprocessed.at(1).at(0).ny),
-        size_t(preprocessed.at(1).at(0).nx)
-    };
-    ov::Tensor batched{ov::element::f32, {(preprocessed.size() - 1) * preprocessed.at(1).size(), 3, size.height, size.width}};
-    float* batched_data = batched.data<float>();
-    size_t batch_offset = 0;
-    size_t values_in_elem = 3 * size.height * size.width;
-    std::vector<HeightWidth> sliced_sizes;
-    for (size_t row = 1; row < preprocessed.size(); ++row) {
-        for (const clip_image_f32& elem : preprocessed.at(row)) {
-            std::copy_n(elem.buf.begin(), values_in_elem, batched_data + batch_offset);
-            sliced_sizes.push_back({elem.ny / patch_size, elem.nx / patch_size});
-            batch_offset += values_in_elem;
-        }
-    }
-    encoder.set_input_tensor(batched);
-    encoder.infer();
-    const ov::Tensor& encoded = encoder.get_output_tensor();
-    const ov::Shape& plain = encoded.get_shape();
     struct SharedTensorAllocator {
         const ov::Tensor tensor;
         void* allocate(size_t bytes, size_t) {return bytes <= tensor.get_byte_size() ? tensor.data() : nullptr;}
         void deallocate(void*, size_t, size_t) {}
         bool is_equal(const SharedTensorAllocator& other) const noexcept {return this == &other;}
     };
-    ov::Tensor reshaped{encoded.get_element_type(), {preprocessed.size() - 1, preprocessed.at(1).size(), plain.at(1), plain.at(2)}, SharedTensorAllocator{encoded}};
-    return {resized_source, resized_source_size, reshaped, sliced_sizes};
+    ov::Tensor resampled_resized = resample(vision, encoder.get_output_tensor(), {resized_preprocessed.ny, resize_preprocessed.nx});
+    ov::Tensor owner{resampled_resized.get_element_type(), resized_resampled.get_shape()};
+    resampled_resized.copy_to(owner);
+    owner.set_shape(owner.get_shape().at(1), owner.get_shape().at(2));
+
+    ov::Tensor resampled_slices;
+    if (1 < preprocessed.size()) {
+        HeightWidth size{
+            size_t(preprocessed.at(1).at(0).ny),
+            size_t(preprocessed.at(1).at(0).nx)
+        };
+        ov::Tensor batched{ov::element::f32, {(preprocessed.size() - 1) * preprocessed.at(1).size(), 3, size.height, size.width}};
+        float* batched_data = batched.data<float>();
+        size_t batch_offset = 0;
+        size_t values_in_elem = 3 * size.height * size.width;
+        std::vector<HeightWidth> sliced_sizes;
+        for (size_t row = 1; row < preprocessed.size(); ++row) {
+            for (const clip_image_f32& elem : preprocessed.at(row)) {
+                std::copy_n(elem.buf.begin(), values_in_elem, batched_data + batch_offset);
+                sliced_sizes.push_back({elem.ny / patch_size, elem.nx / patch_size});
+                batch_offset += values_in_elem;
+            }
+        }
+        encoder.set_input_tensor(batched);
+        encoder.infer();
+        ov::Tensor resampled_batched = resample(vision, encoder.get_output_tensor(), size);
+        const ov::Tensor& encoded = encoder.get_output_tensor();
+        const ov::Shape& plain = encoded.get_shape();
+        resampled_slices{encoded.get_element_type(), {preprocessed.size() - 1, preprocessed.at(1).size(), plain.at(1), plain.at(2)}, SharedTensorAllocator{encoded}};
+    }
+    return {owner, resampled_slices};
 }
 }
 
diff --git a/src/cpp/src/vlm_pipeline.cpp b/src/cpp/src/vlm_pipeline.cpp

Original file line number	Diff line number	Diff line change
`@@ -17,7 +17,7 @@ int main(int argc, char* argv[]) try {`
`17`	`17`	`std::string device = "CPU"; // GPU can be used as well`
`18`	`18`	`ov::AnyMap enable_compile_cache;`
`19`	`19`	`if ("GPU" == device) {`
`20`		`- // Cache compile models on disks for GPU to save time on the`
	`20`	`+ // Cache compile models on disk for GPU to save time on the`
`21`	`21`	`// next run. It's not beneficial for CPU.`
`22`	`22`	`enable_compile_cache.insert({ov::cache_dir("vlm_cache")});`
`23`	`23`	`}`