
Commit cbb1fa0

popovaan and Wovchena authored

VLM performance metrics. (openvinotoolkit#1263)

VLM performance metrics. Ticket: CVS-156661

Co-authored-by: Vladimir Zlobin <vladimir.zlobin@intel.com>

1 parent 6f160e0 commit cbb1fa0

File tree: 17 files changed, +589 -48 lines changed


.github/workflows/causal_lm_cpp.yml (+14 -1)

```diff
@@ -727,7 +727,7 @@ jobs:
           ov_link: ${{ env.l_u22_ov_link }}
       - uses: ./.github/actions/build_app
         with:
-          build_target: 'visual_language_chat py_openvino_genai'
+          build_target: 'visual_language_chat benchmark_vlm py_openvino_genai'
       - uses: ./.github/actions/install_python_deps
       - name: Download and convert tiny-random-minicpmv-2_6 model and an image
         run: |
@@ -754,6 +754,12 @@ jobs:
           && ./build/samples/cpp/visual_language_chat/visual_language_chat ./tiny-random-minicpmv-2_6/ ./images/
           <<< $'Describe the images?' | tee cpp.txt
         timeout-minutes: 2
+      - name: Run benchmark_vlm C++ sample - tiny-random-minicpmv-2_6
+        run: >
+          set -o pipefail
+          && source ./ov/setupvars.sh
+          && ./build/samples/cpp/visual_language_chat/benchmark_vlm -m ./tiny-random-minicpmv-2_6/ -i ./images/cat.png -n 3
+        timeout-minutes: 2
       - name: Run visual_language_chat Python sample - tiny-random-minicpmv-2_6
         run: >
           set -o pipefail
@@ -762,6 +768,13 @@ jobs:
           <<< $'Describe the images?' | tee py.txt
         env:
           PYTHONPATH: "./build/"
+      - name: Run benchmark_vlm Python sample - tiny-random-minicpmv-2_6
+        run: >
+          set -o pipefail
+          && source ./ov/setupvars.sh
+          && ./samples/python/visual_language_chat/benchmark_vlm.py -m ./tiny-random-minicpmv-2_6/ -i ./images/cat.png -n 3
+        env:
+          PYTHONPATH: "./build/"
       - name: Encode cpp.txt with Python encoding instead of terminal one
         shell: python
         run: |
```

samples/cpp/visual_language_chat/CMakeLists.txt (+17)

```diff
@@ -13,6 +13,8 @@ file(DOWNLOAD
     ${CMAKE_BINARY_DIR}/stb_image.h
     EXPECTED_HASH MD5=27932e6fb3a2f26aee2fc33f2cb4e696)
 
+# create main sample executable
+
 add_executable(visual_language_chat visual_language_chat.cpp load_image.cpp)
 target_include_directories(visual_language_chat PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${CMAKE_BINARY_DIR}")
 target_link_libraries(visual_language_chat PRIVATE openvino::genai)
@@ -26,3 +28,18 @@ install(TARGETS visual_language_chat
     RUNTIME DESTINATION samples_bin/
     COMPONENT samples_bin
     EXCLUDE_FROM_ALL)
+
+# create benchmark executable
+
+add_executable(benchmark_vlm benchmark_vlm.cpp load_image.cpp)
+target_include_directories(benchmark_vlm PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${CMAKE_BINARY_DIR}")
+target_link_libraries(benchmark_vlm PRIVATE openvino::genai cxxopts::cxxopts)
+set_target_properties(benchmark_vlm PROPERTIES
+    COMPILE_PDB_NAME benchmark_vlm
+    # Ensure out of box LC_RPATH on macOS with SIP
+    INSTALL_RPATH_USE_LINK_PATH ON)
+
+install(TARGETS benchmark_vlm
+    RUNTIME DESTINATION samples_bin/
+    COMPONENT samples_bin
+    EXCLUDE_FROM_ALL)
```

samples/cpp/visual_language_chat/README.md (+41)

````diff
@@ -2,6 +2,12 @@
 
 This example showcases inference of Visual language models (VLMs): [`openbmb/MiniCPM-V-2_6`](https://huggingface.co/openbmb/MiniCPM-V-2_6). The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `ov::genai::VLMPipeline` and runs the simplest deterministic greedy sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/minicpm-v-multimodal-chatbot) which provides an example of a visual-language assistant.
 
+There are two sample files:
+- [`visual_language_chat.cpp`](./visual_language_chat.cpp) demonstrates basic usage of the VLM pipeline.
+- [`benchmark_vlm.cpp`](./benchmark_vlm.cpp) shows how to benchmark a VLM in OpenVINO GenAI. The sample runs warm-up iterations, generates text, and calculates various performance metrics.
+
 ## Download and convert the model and tokenizers
 
 The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.
@@ -25,6 +31,41 @@ Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is
 
 See [SUPPORTED_MODELS.md](../../../src/docs/SUPPORTED_MODELS.md#visual-language-models) for the list of supported models.
 
+## Run benchmark
+
+```sh
+benchmark_vlm [OPTIONS]
+```
+
+### Options
+
+- `-m, --model` (default: `.`): Path to the model and tokenizers base directory.
+- `-p, --prompt` (default: `What is on the image?`): The prompt used for generation.
+- `-i, --image` (default: `image.jpg`): Path to the image.
+- `-nw, --num_warmup` (default: `1`): Number of warmup iterations.
+- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens to generate.
+- `-n, --num_iter` (default: `3`): Number of iterations.
+- `-d, --device` (default: `CPU`): Device to run the model on.
+
+### Output
+
+```
+benchmark_vlm -m miniCPM-V-2_6 -i 319483352-d5fbbd1a-d484-415c-88cb-9986625b7b11.jpg -n 3
+```
+
+```
+Load time: 1982.00 ms
+Generate time: 13820.99 ± 64.62 ms
+Tokenization time: 1.26 ± 0.09 ms
+Detokenization time: 0.33 ± 0.05 ms
+Embeddings preparation time: 5733.85 ± 26.34 ms
+TTFT: 11246.98 ± 80.55 ms
+TPOT: 135.45 ± 4.73 ms/token
+Throughput: 7.38 ± 0.26 tokens/s
+```
+
+For more information on how the performance metrics are calculated, see the [performance-metrics tutorial](../../../src/README.md#performance-metrics).
+
 ### Troubleshooting
 
 #### Unicode characters encoding error on Windows
````
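As a quick consistency check on the example output above (editor's note, not part of the commit; the precise accounting is defined in the linked performance-metrics tutorial, so treat these as approximations):

$$\text{Generate time} \approx \text{TTFT} + (N_{\text{new}} - 1)\cdot\text{TPOT}, \qquad \text{Throughput} \approx \frac{1000\ \text{ms/s}}{\text{TPOT}}$$

With the default $N_{\text{new}} = 20$ max new tokens: $11246.98 + 19 \times 135.45 \approx 13820.5$ ms, matching the reported generate time, and $1000 / 135.45 \approx 7.38$ tokens/s, matching the reported throughput.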
samples/cpp/visual_language_chat/benchmark_vlm.cpp (new file, +82)

```cpp
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include <cxxopts.hpp>
#include <filesystem>
#include <iomanip>
#include <iostream>

#include "load_image.hpp"
#include <openvino/genai/visual_language/pipeline.hpp>

int main(int argc, char* argv[]) try {
    cxxopts::Options options("benchmark_vlm", "Help command");

    options.add_options()
    ("m,model", "Path to model and tokenizers base directory", cxxopts::value<std::string>()->default_value("."))
    ("p,prompt", "Prompt", cxxopts::value<std::string>()->default_value("What is on the image?"))
    ("i,image", "Image", cxxopts::value<std::string>()->default_value("image.jpg"))
    ("nw,num_warmup", "Number of warmup iterations", cxxopts::value<size_t>()->default_value(std::to_string(1)))
    ("n,num_iter", "Number of iterations", cxxopts::value<size_t>()->default_value(std::to_string(3)))
    ("mt,max_new_tokens", "Maximal number of new tokens", cxxopts::value<size_t>()->default_value(std::to_string(20)))
    ("d,device", "Device to run the model on", cxxopts::value<std::string>()->default_value("CPU"))
    ("h,help", "Print usage");

    cxxopts::ParseResult result;
    try {
        result = options.parse(argc, argv);
    } catch (const cxxopts::exceptions::exception& e) {
        std::cout << e.what() << "\n\n";
        std::cout << options.help() << std::endl;
        return EXIT_FAILURE;
    }

    if (result.count("help")) {
        std::cout << options.help() << std::endl;
        return EXIT_SUCCESS;
    }

    std::string prompt = result["prompt"].as<std::string>();
    const std::string models_path = result["model"].as<std::string>();
    const std::string image_path = result["image"].as<std::string>();
    std::string device = result["device"].as<std::string>();
    size_t num_warmup = result["num_warmup"].as<size_t>();
    size_t num_iter = result["num_iter"].as<size_t>();
    ov::Tensor image = utils::load_image(image_path);

    ov::genai::GenerationConfig config;
    config.max_new_tokens = result["max_new_tokens"].as<size_t>();

    ov::genai::VLMPipeline pipe(models_path, device);

    // Warm-up runs are excluded from the reported statistics.
    for (size_t i = 0; i < num_warmup; i++)
        pipe.generate(prompt, ov::genai::image(image), ov::genai::generation_config(config));

    // Accumulate metrics over num_iter measured runs; operator+ aggregates the per-run samples.
    auto res = pipe.generate(prompt, ov::genai::image(image), ov::genai::generation_config(config));
    auto metrics = res.perf_metrics;
    for (size_t i = 0; i < num_iter - 1; i++) {
        res = pipe.generate(prompt, ov::genai::image(image), ov::genai::generation_config(config));
        metrics = metrics + res.perf_metrics;
    }

    std::cout << std::fixed << std::setprecision(2);
    std::cout << "Load time: " << metrics.get_load_time() << " ms" << std::endl;
    std::cout << "Generate time: " << metrics.get_generate_duration().mean << " ± " << metrics.get_generate_duration().std << " ms" << std::endl;
    std::cout << "Tokenization time: " << metrics.get_tokenization_duration().mean << " ± " << metrics.get_tokenization_duration().std << " ms" << std::endl;
    std::cout << "Detokenization time: " << metrics.get_detokenization_duration().mean << " ± " << metrics.get_detokenization_duration().std << " ms" << std::endl;
    std::cout << "Embeddings preparation time: " << metrics.get_prepare_embeddings_duration().mean << " ± " << metrics.get_prepare_embeddings_duration().std << " ms" << std::endl;
    std::cout << "TTFT: " << metrics.get_ttft().mean << " ± " << metrics.get_ttft().std << " ms" << std::endl;
    std::cout << "TPOT: " << metrics.get_tpot().mean << " ± " << metrics.get_tpot().std << " ms/token" << std::endl;
    std::cout << "Throughput: " << metrics.get_throughput().mean << " ± " << metrics.get_throughput().std << " tokens/s" << std::endl;

    return 0;
} catch (const std::exception& error) {
    try {
        std::cerr << error.what() << '\n';
    } catch (const std::ios_base::failure&) {}
    return EXIT_FAILURE;
} catch (...) {
    try {
        std::cerr << "Non-exception object thrown\n";
    } catch (const std::ios_base::failure&) {}
    return EXIT_FAILURE;
}
```
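The benchmark reuses `utils::load_image` from `load_image.cpp`, which is not part of this diff. Given the `stb_image.h` download in the sample's CMakeLists.txt, a loader of this kind plausibly looks like the following sketch (a hypothetical reconstruction for illustration, not the repository's actual code):

```cpp
// Hypothetical sketch of an stb_image-based loader producing the
// {1, height, width, 3} u8 tensor that VLMPipeline consumes.
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"

#include <openvino/runtime/tensor.hpp>

#include <cstring>
#include <stdexcept>
#include <string>

namespace utils {
ov::Tensor load_image(const std::string& path) {
    int width = 0, height = 0, channels = 0;
    // Request 3 channels so grayscale/RGBA inputs are converted to RGB.
    unsigned char* data = stbi_load(path.c_str(), &width, &height, &channels, 3);
    if (!data) {
        throw std::runtime_error("Failed to load the image: " + path);
    }
    ov::Tensor tensor(ov::element::u8, {1, size_t(height), size_t(width), 3});
    std::memcpy(tensor.data<unsigned char>(), data, size_t(width) * size_t(height) * 3);
    stbi_image_free(data);
    return tensor;
}
}  // namespace utils
```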

samples/python/visual_language_chat/README.md (+39)

````diff
@@ -2,6 +2,10 @@
 
 This example showcases inference of text-generation Vision Language Models (VLMs): `miniCPM-V-2_6` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `openvino_genai.VLMPipeline` and configures it for the chat scenario. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/minicpm-v-multimodal-chatbot) which provides an example of a visual-language assistant.
 
+There are two sample files:
+- [`visual_language_chat.py`](./visual_language_chat.py) demonstrates basic usage of the VLM pipeline.
+- [`benchmark_vlm.py`](./benchmark_vlm.py) shows how to benchmark a VLM in OpenVINO GenAI. The script runs warm-up iterations, generates text, and calculates various performance metrics.
+
 ## Download and convert the model and tokenizers
 
 The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.
@@ -27,6 +31,41 @@ Modify the source code to change the device for inference to the GPU.
 
 See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models.
 
+## Run benchmark
+
+```sh
+python benchmark_vlm.py [OPTIONS]
+```
+
+### Options
+
+- `-m, --model` (default: `.`): Path to the model and tokenizers base directory.
+- `-p, --prompt` (default: `What is on the image?`): The prompt used for generation.
+- `-i, --image` (default: `image.jpg`): Path to the image.
+- `-nw, --num_warmup` (default: `1`): Number of warmup iterations.
+- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens to generate.
+- `-n, --num_iter` (default: `3`): Number of iterations.
+- `-d, --device` (default: `CPU`): Device to run the model on.
+
+### Output
+
+```
+python benchmark_vlm.py -m miniCPM-V-2_6 -i 319483352-d5fbbd1a-d484-415c-88cb-9986625b7b11.jpg -n 3
+```
+
+```
+Load time: 1982.00 ms
+Generate time: 13820.99 ± 64.62 ms
+Tokenization time: 1.26 ± 0.09 ms
+Detokenization time: 0.33 ± 0.05 ms
+Embeddings preparation time: 5733.85 ± 26.34 ms
+TTFT: 11246.98 ± 80.55 ms
+TPOT: 135.45 ± 4.73 ms/token
+Throughput: 7.38 ± 0.26 tokens/s
+```
+
+For more information on how the performance metrics are calculated, see the [performance-metrics tutorial](../../../src/README.md#performance-metrics).
+
 ### Troubleshooting
 
 #### Unicode characters encoding error on Windows
````
samples/python/visual_language_chat/benchmark_vlm.py (new file, +76; defaults below are aligned with the README and the C++ sample)

```python
#!/usr/bin/env python3
# Copyright (C) 2023-2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import argparse

import numpy as np
import openvino_genai as ov_genai
from openvino import Tensor
from PIL import Image


def read_image(path: str) -> Tensor:
    '''
    Args:
        path: The path to the image.

    Returns: the ov.Tensor containing the image.
    '''
    pic = Image.open(path).convert("RGB")
    image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.uint8)
    return Tensor(image_data)


def main():
    parser = argparse.ArgumentParser(description="Help command")
    parser.add_argument("-m", "--model", type=str, default=".", help="Path to model and tokenizers base directory")
    parser.add_argument("-p", "--prompt", type=str, default="What is on the image?", help="Prompt")
    parser.add_argument("-i", "--image", type=str, default="image.jpg", help="Image")
    parser.add_argument("-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations")
    parser.add_argument("-n", "--num_iter", type=int, default=3, help="Number of iterations")
    parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens")
    parser.add_argument("-d", "--device", type=str, default="CPU", help="Device")

    args = parser.parse_args()

    # Perf metrics are stored in VLMDecodedResults.
    # To get VLMDecodedResults instead of a string, the input should be a list.
    prompt = args.prompt
    models_path = args.model
    image = read_image(args.image)
    device = args.device
    num_warmup = args.num_warmup
    num_iter = args.num_iter

    config = ov_genai.GenerationConfig()
    config.max_new_tokens = args.max_new_tokens

    pipe = ov_genai.VLMPipeline(models_path, device)

    # Warm-up runs are excluded from the reported statistics.
    for _ in range(num_warmup):
        pipe.generate(prompt, images=image, generation_config=config)

    # Accumulate metrics over num_iter measured runs; += aggregates the per-run samples.
    res = pipe.generate(prompt, images=image, generation_config=config)
    perf_metrics = res.perf_metrics
    for _ in range(num_iter - 1):
        res = pipe.generate(prompt, images=image, generation_config=config)
        perf_metrics += res.perf_metrics

    print(f"Load time: {perf_metrics.get_load_time():.2f} ms")
    print(
        f"Generate time: {perf_metrics.get_generate_duration().mean:.2f} ± {perf_metrics.get_generate_duration().std:.2f} ms")
    print(
        f"Tokenization time: {perf_metrics.get_tokenization_duration().mean:.2f} ± {perf_metrics.get_tokenization_duration().std:.2f} ms")
    print(
        f"Detokenization time: {perf_metrics.get_detokenization_duration().mean:.2f} ± {perf_metrics.get_detokenization_duration().std:.2f} ms")
    print(
        f"Embeddings preparation time: {perf_metrics.get_prepare_embeddings_duration().mean:.2f} ± {perf_metrics.get_prepare_embeddings_duration().std:.2f} ms")
    print(f"TTFT: {perf_metrics.get_ttft().mean:.2f} ± {perf_metrics.get_ttft().std:.2f} ms")
    print(f"TPOT: {perf_metrics.get_tpot().mean:.2f} ± {perf_metrics.get_tpot().std:.2f} ms/token")
    print(f"Throughput: {perf_metrics.get_throughput().mean:.2f} ± {perf_metrics.get_throughput().std:.2f} tokens/s")


if __name__ == "__main__":
    main()
```
src/cpp/include/openvino/genai/visual_language/perf_metrics.hpp (new file, +34)

```cpp
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <optional>
#include <vector>

#include "openvino/genai/perf_metrics.hpp"
#include "openvino/genai/visibility.hpp"

namespace ov::genai {

struct OPENVINO_GENAI_EXPORTS VLMRawPerfMetrics {
    /** @brief Durations of embeddings preparation, one sample per generate() call */
    std::vector<MicroSeconds> prepare_embeddings_durations;
};

struct OPENVINO_GENAI_EXPORTS VLMPerfMetrics : public PerfMetrics {
    /** @brief Mean and standard deviation of embeddings preparation time, in milliseconds */
    MeanStdPair prepare_embeddings_duration;

    MeanStdPair get_prepare_embeddings_duration();

    VLMPerfMetrics() = default;

    VLMPerfMetrics(PerfMetrics& perf_metrics) : PerfMetrics(perf_metrics) {}

    void evaluate_statistics(std::optional<TimePoint> start_time = std::nullopt) override;

    VLMPerfMetrics operator+(const VLMPerfMetrics& metrics) const;

    VLMRawPerfMetrics vlm_raw_metrics;
};

}  // namespace ov::genai
```
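`evaluate_statistics` is implemented in a source file that is not part of this excerpt; conceptually it has to reduce `prepare_embeddings_durations` (raw microsecond samples) to the millisecond `MeanStdPair` that the benchmarks print. A rough sketch of that reduction, as an illustration only (the actual implementation may differ):

```cpp
// Illustration only: reduce raw microsecond duration samples to a
// mean/standard-deviation pair in milliseconds, the way the reported
// prepare_embeddings_duration pair is presumably derived.
// Assumes at least one sample is present.
#include <cmath>
#include <vector>

struct MeanStd { float mean; float std; };

MeanStd reduce_durations_us(const std::vector<float>& samples_us) {
    float mean_ms = 0.0f;
    for (float us : samples_us)
        mean_ms += us / 1000.0f;  // microseconds -> milliseconds
    mean_ms /= samples_us.size();

    float var_ms2 = 0.0f;
    for (float us : samples_us) {
        const float diff = us / 1000.0f - mean_ms;
        var_ms2 += diff * diff;
    }
    var_ms2 /= samples_us.size();
    return {mean_ms, std::sqrt(var_ms2)};
}
```

This is also why `operator+` matters: concatenating the raw samples from several runs lets the mean and deviation be computed over all iterations rather than averaged pairwise.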
