Skip to content

Commit 35b6d89

Browse files
committed
Clean up VLMPipeline
Address comments in openvinotoolkit#912
1 parent 871f334 commit 35b6d89

File tree

12 files changed

+61
-8473
lines changed

12 files changed

+61
-8473
lines changed

.github/workflows/causal_lm_cpp.yml

+2-2
Original file line numberDiff line numberDiff line change
@@ -711,10 +711,10 @@ jobs:
711711
- run: >
712712
source ./ov/setupvars.sh
713713
&& python ./samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py ./miniCPM-V-2_6/
714-
- run: wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11
714+
- run: wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --output-document cat.jpg
715715
- run: >
716716
source ./ov/setupvars.sh
717-
&& ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ d5fbbd1a-d484-415c-88cb-9986625b7b11
717+
&& ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ cat.jpg
718718
<<< $'What is on the image?\nWhat is special on the image?'
719719
timeout-minutes: 110
720720

samples/cpp/visual_language_chat/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ export_MiniCPM-V-2_6.py miniCPM-V-2_6
1515

1616
## Run
1717

18-
https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 can be used as a sample image.
18+
[This image](https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11) can be used as a sample image.
1919

2020
`visual_language_chat miniCPM-V-2_6 319483352-d5fbbd1a-d484-415c-88cb-9986625b7b11.jpg`
2121

samples/cpp/visual_language_chat/visual_language_chat.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ int main(int argc, char* argv[]) try {
3131
}
3232
pipe.generate(
3333
prompt,
34-
ov::genai::image(std::move(image)),
34+
ov::genai::image(image),
3535
ov::genai::streamer(print_subword)
3636
);
3737
std::cout << "\n----------\n"

src/cpp/CMakeLists.txt

+3-1
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,9 @@ file(GLOB_RECURSE SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp" "${CMAKE_
5151

5252
set(TARGET_NAME openvino_genai)
5353
add_library(${TARGET_NAME} SHARED ${SOURCE_FILES})
54-
add_dependencies(${TARGET_NAME} openvino_tokenizers)
54+
if(TARGET openvino_tokenizers)
55+
add_dependencies(${TARGET_NAME} openvino_tokenizers)
56+
endif()
5557
add_library(openvino::genai ALIAS ${TARGET_NAME})
5658

5759
target_include_directories(${TARGET_NAME}

src/cpp/include/openvino/genai/vision_encoder.hpp

+4-4
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
namespace ov::genai {
1010
/// @brief A pair describing image size.
11-
struct HeightWidth {
11+
struct ImageSize {
1212
/// @brief Height of a corresponding image.
1313
size_t height;
1414
/// @brief Width of a corresponding image.
@@ -25,16 +25,16 @@ struct EncodedImage {
2525
ov::Tensor resized_source;
2626
/// @brief A size of an image used to compute embeddings for
2727
/// divided by ProcessorConfig's patch_size.
28-
HeightWidth resized_source_size;
28+
ImageSize resized_source_size;
2929
/// @brief Embeddings of images obtained from a source image by
3030
/// slicing at no more than max_slice_nums pieces and resizing.
3131
/// The tensor's shape is
3232
/// [slice_y, slice_x, number_of_embeddings, embedding_size].
3333
/// slices_sizes.size() == slice_y * slice_x.
3434
ov::Tensor slices;
35-
/// @brief Flattened sizes of images used to compute embeddings
35+
/// @brief A size of images used to compute embeddings
3636
/// stored in slices member divided by ProcessorConfig's patch_size.
37-
std::vector<HeightWidth> slices_sizes;
37+
ImageSize slices_size;
3838
};
3939

4040
/// @brief A class used to infer embeddings of an image using

src/cpp/include/openvino/genai/vlm_pipeline.hpp

+4-27
Original file line numberDiff line numberDiff line change
@@ -65,37 +65,14 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
6565
explicit VLMPipeline(
6666
const std::filesystem::path& model_dir,
6767
const std::string& device="CPU",
68-
const ov::AnyMap device_config={},
69-
ov::Core core=ov::Core{}
70-
) : VLMPipeline{
71-
model_dir,
72-
Tokenizer(model_dir.string(), device_config),
73-
device,
74-
device_config,
75-
core
76-
} {}
77-
78-
/// @brief Construct a pipeline from a folder containing model IRs
79-
/// and from a Tokenizer instance.
80-
/// @param model_dir A folder to read model IRs.
81-
/// @param tokenizer An instance of Tokenizer to use.
82-
/// @param device Inference device.
83-
/// @param device_config A config to pass to ov::Core.set_property()
84-
/// and ov::Core::compile_model().
85-
/// @param core ov::Core instance to use.
86-
VLMPipeline(
87-
const std::filesystem::path& model_dir,
88-
const ov::genai::Tokenizer& tokenizer,
89-
const std::string& device="CPU",
90-
const ov::AnyMap device_config={},
91-
ov::Core core=ov::Core{}
68+
const ov::AnyMap device_config={}
9269
);
9370

9471
/// @brief Default destructor.
9572
~VLMPipeline();
9673

9774
/// @brief Generate a response given a prompt and any number of
98-
/// uint8 RGB images.
75+
/// uint8 RGB images with [HWC] layout.
9976
/// @param prompt A prompt to respond to.
10077
/// @param images Images to be prepended to a prompt.
10178
/// @param generation_config A config to follow for text generation.
@@ -120,7 +97,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
12097
/// @brief Generate a response given a prompt and arbitrary number
12198
/// of ov::Property instances.
12299
/// Example:
123-
/// generate("text", image(std::move(rgb)), do_sample(true));
100+
/// generate("text", image(rgb), do_sample(true));
124101
/// @param prompt A prompt to respond to.
125102
/// @param ...properties ov::Property instances to be combined into
126103
/// ov::AnyMap.
@@ -166,7 +143,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
166143

167144
/*
168145
* utils that allow to use generate() in the following way:
169-
* pipe.generate(prompt, ov::genai::image(std::move(image_tensor))).
146+
* pipe.generate(prompt, ov::genai::image(image_tensor)).
170147
*/
171148
static constexpr ov::Property<ov::Tensor> image{"image"};
172149
static constexpr ov::Property<std::vector<ov::Tensor>> images{"images"};

src/cpp/src/clip.cpp

-3
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,6 @@
66
// I'll gradually clean and extend it
77
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
88

9-
#define STB_IMAGE_IMPLEMENTATION
10-
#include "stb_image.hpp"
11-
129
#include <cassert>
1310
#include <cmath>
1411
#include <cstdlib>

src/cpp/src/clip.hpp

+1-3
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
// Copyright (C) 2023-2024 Intel Corporation
22
// SPDX-License-Identifier: Apache-2.0
33

4-
#ifndef CLIP_H
5-
#define CLIP_H
4+
#pragma once
65

76
#include <vector>
87
#include <numeric>
@@ -53,4 +52,3 @@ bool bicubic_resize(const clip_image_u8& img, clip_image_u8& dst, int target_wid
5352

5453
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
5554
clip_image_f32 clip_image_preprocess(struct clip_ctx& ctx, const clip_image_u8& img);
56-
#endif // CLIP_H

0 commit comments

Comments
 (0)