Commit dc5c9a9 (parent 1949366)

benchmark_image_gen: Add --reshape option, and ability to specify multiple devices (#1878)

8 files changed (+110 / -17 lines)

samples/cpp/image_generation/README.md (+2 -1)

@@ -164,7 +164,7 @@ Options:
 - `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text.
 - `--nw, --num_warmup` (default: `1`): Number of warmup iterations.
 - `-n, --num_iter` (default: `3`): Number of iterations.
-- `-d, --device` (default: `"CPU"`): Device to run the model on.
+- `-d, --device` (default: `"CPU"`): Device(s) to run the pipeline with.
 - `-w, --width` (default: `512`): The width of the output image.
 - `--ht, --height` (default: `512`): The height of the output image.
 - `--is, --num_inference_steps` (default: `20`): The number of inference steps.
@@ -173,6 +173,7 @@ Options:
 - `-i, --image`: Path to input image.
 - `-s, --strength`: Indicates extent to transform the reference `image`. Must be between 0 and 1.
 - `--mi, --mask_image`: Path to mask image.
+- `-r, --reshape': Reshape pipeline before compilation. This can improve image generation performance.

 For example:
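
For illustration, a minimal Python sketch of what these two options map to at the pipeline API level, mirroring the sample changes below; the model path, device names and prompt are placeholders, and the `ov_genai` import alias follows the Python sample:

import openvino_genai as ov_genai

# Build the pipeline without compiling it yet (placeholder model path).
pipe = ov_genai.Text2ImagePipeline("./stable-diffusion-ov")

# --reshape: fix image count, height, width and guidance scale before compilation.
pipe.reshape(1, 512, 512, pipe.get_generation_config().guidance_scale)

# -d "CPU,NPU,GPU": compile the text encoder, denoiser and VAE on separate devices.
pipe.compile("CPU", "NPU", "GPU")

image_tensor = pipe.generate("a house on a hill at sunset")

The `--reshape` flag added by this commit performs exactly this reshape-before-compile step in the benchmark samples.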

samples/cpp/image_generation/benchmark_image_gen.cpp (+54 -6)

@@ -106,15 +106,50 @@ inline void print_statistic(std::vector<ov::genai::ImageGenerationPerfMetrics>&
               << " ms, vae decoder infer avg time:" << vae_decoder_mean << " ms" << std::endl;
 }

+inline std::vector<std::string> device_string_to_triplet(const std::string& device_input) {
+    std::vector<std::string> devices;
+    std::istringstream stream(device_input);
+    std::string device;
+
+    // Split the device input string by commas
+    while (std::getline(stream, device, ',')) {
+        devices.push_back(device);
+    }
+
+    // Trim whitespace from each device name
+    for (auto& dev : devices) {
+        dev.erase(0, dev.find_first_not_of(" \t"));
+        dev.erase(dev.find_last_not_of(" \t") + 1);
+    }
+
+    // Ensure exactly three devices
+    if (devices.size() == 1) {
+        return {devices[0], devices[0], devices[0]};
+    } else if (devices.size() == 3) {
+        return devices;
+    } else {
+        throw std::invalid_argument("The device specified by -d/--device must be a single device (e.g. -d \"GPU\"), "
+                                    "or exactly 3 comma separated device names (e.g. -d \"CPU,NPU,GPU\")");
+    }
+}
+
 void text2image(cxxopts::ParseResult& result) {
     std::string prompt = result["prompt"].as<std::string>();
     const std::string models_path = result["model"].as<std::string>();
-    std::string device = result["device"].as<std::string>();
+    auto devices = device_string_to_triplet(result["device"].as<std::string>());
     size_t num_warmup = result["num_warmup"].as<size_t>();
     size_t num_iter = result["num_iter"].as<size_t>();
     const std::string output_dir = result["output_dir"].as<std::string>();

-    ov::genai::Text2ImagePipeline pipe(models_path, device);
+    ov::genai::Text2ImagePipeline pipe(models_path);
+    if (result["reshape"].as<bool>()) {
+        pipe.reshape(result["num_images_per_prompt"].as<size_t>(),
+                     result["height"].as<size_t>(),
+                     result["width"].as<size_t>(),
+                     pipe.get_generation_config().guidance_scale);
+    }
+    pipe.compile(devices[0], devices[1], devices[2]);
+
     ov::genai::ImageGenerationConfig config = pipe.get_generation_config();
     config.width = result["width"].as<size_t>();
     config.height = result["height"].as<size_t>();
@@ -148,15 +183,21 @@ void image2image(cxxopts::ParseResult& result) {
     std::string prompt = result["prompt"].as<std::string>();
     const std::string models_path = result["model"].as<std::string>();
     std::string image_path = result["image"].as<std::string>();
-    std::string device = result["device"].as<std::string>();
+    auto devices = device_string_to_triplet(result["device"].as<std::string>());
     size_t num_warmup = result["num_warmup"].as<size_t>();
     size_t num_iter = result["num_iter"].as<size_t>();
     const std::string output_dir = result["output_dir"].as<std::string>();
     float strength = result["strength"].as<float>();

     ov::Tensor image_input = utils::load_image(image_path);

-    ov::genai::Image2ImagePipeline pipe(models_path, device);
+    ov::genai::Image2ImagePipeline pipe(models_path);
+    if (result["reshape"].as<bool>()) {
+        auto height = image_input.get_shape()[1];
+        auto width = image_input.get_shape()[2];
+        pipe.reshape(1, height, width, pipe.get_generation_config().guidance_scale);
+    }
+    pipe.compile(devices[0], devices[1], devices[2]);

     std::vector<ov::genai::ImageGenerationPerfMetrics> warmup_metrics;
     std::cout << std::fixed << std::setprecision(2);
@@ -185,15 +226,21 @@ void inpainting(cxxopts::ParseResult& result) {
     const std::string models_path = result["model"].as<std::string>();
     std::string image_path = result["image"].as<std::string>();
     std::string mask_image_path = result["mask_image"].as<std::string>();
-    std::string device = result["device"].as<std::string>();
+    auto devices = device_string_to_triplet(result["device"].as<std::string>());
     size_t num_warmup = result["num_warmup"].as<size_t>();
     size_t num_iter = result["num_iter"].as<size_t>();
     const std::string output_dir = result["output_dir"].as<std::string>();

     ov::Tensor image_input = utils::load_image(image_path);
     ov::Tensor mask_image = utils::load_image(mask_image_path);

-    ov::genai::InpaintingPipeline pipe(models_path, device);
+    ov::genai::InpaintingPipeline pipe(models_path);
+    if (result["reshape"].as<bool>()) {
+        auto height = image_input.get_shape()[1];
+        auto width = image_input.get_shape()[2];
+        pipe.reshape(1, height, width, pipe.get_generation_config().guidance_scale);
+    }
+    pipe.compile(devices[0], devices[1], devices[2]);

     std::cout << std::fixed << std::setprecision(2);
     std::vector<ov::genai::ImageGenerationPerfMetrics> warmup_metrics;
@@ -239,6 +286,7 @@ int main(int argc, char* argv[]) try {
     ("s,strength", "Indicates extent to transform the reference `image`. Must be between 0 and 1", cxxopts::value<float>()->default_value(std::to_string(0.8)))
     //special parameters of inpainting pipeline
     ("mi,mask_image", "Mask image path", cxxopts::value<std::string>())
+    ("r,reshape", "Reshape pipeline before compilation", cxxopts::value<bool>()->default_value("false"))
     ("h,help", "Print usage");

     cxxopts::ParseResult result;
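
The `-d/--device` handling above expands a single device name to all three pipeline stages (text encoder, denoiser, VAE) and otherwise requires exactly three comma separated names. A standalone Python sketch of the same rule, for illustration only (the Python sample below adds an equivalent `device_string_to_triplet`; the error message here is shortened):

def device_string_to_triplet(device_input):
    # Split on commas and strip surrounding whitespace, as the C++ helper above does.
    devices = [device.strip() for device in device_input.split(",")]
    if len(devices) == 1:
        # A single device name drives the text encoder, denoiser and VAE alike.
        return devices * 3
    if len(devices) == 3:
        return devices
    raise ValueError("expected one device or exactly 3 comma separated device names")

assert device_string_to_triplet("GPU") == ["GPU", "GPU", "GPU"]
assert device_string_to_triplet("CPU, NPU, GPU") == ["CPU", "NPU", "GPU"]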

samples/python/image_generation/README.md (+2 -1)

@@ -163,7 +163,7 @@ Options:
 - `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text.
 - `-nw, --num_warmup` (default: `1`): Number of warmup iterations.
 - `-n, --num_iter` (default: `3`): Number of iterations.
-- `-d, --device` (default: `"CPU"`): Device to run the model on.
+- `-d, --device` (default: `"CPU"`): Device(s) to run the pipeline with.
 - `-w, --width` (default: `512`): The width of the output image.
 - `-ht, --height` (default: `512`): The height of the output image.
 - `-is, --num_inference_steps` (default: `20`): The number of inference steps.
@@ -172,6 +172,7 @@ Options:
 - `-i, --image`: Path to input image.
 - `-mi, --mask_image`: Path to the mask image.
 - `-s, --strength`: Indicates extent to transform the reference `image`. Must be between 0 and 1.
+- `-r, --reshape': Reshape pipeline before compilation. This can improve image generation performance.

 For example:

samples/python/image_generation/benchmark_image_gen.py (+33 -8)

@@ -81,15 +81,29 @@ def print_statistic(warmup_metrics, iter_metrics):
           f"infer avg time: {inference_mean:.2f} ms, all text encoder infer avg time: {text_encoder_mean:.2f} ms, "
           f"vae encoder infer avg time: {vae_encoder_mean:.2f} ms, vae decoder infer avg time: {vae_decoder_mean:.2f} ms")

+def device_string_to_triplet(device_input):
+    devices = [device.strip() for device in device_input.split(",")]
+    if len(devices) == 1:
+        return [devices[0]] * 3
+    elif len(devices) == 3:
+        return devices
+    else:
+        raise ValueError("The device specified by -d/--device must be a single device (e.g. -d \"GPU\"), " +
+                         "or exactly 3 comma separated device names (e.g. -d \"CPU,NPU,GPU\")")
+
 def text2image(args):
     prompt = args.prompt
     models_path = args.model
-    device = args.device
+    devices = device_string_to_triplet(args.device)
     num_warmup = args.num_warmup
     num_iter = args.num_iter
     output_dir = args.output_dir

-    pipe = ov_genai.Text2ImagePipeline(models_path, device)
+    pipe = ov_genai.Text2ImagePipeline(models_path)
+    if args.reshape:
+        pipe.reshape(args.num_images_per_prompt, args.height, args.width, pipe.get_generation_config().guidance_scale)
+    pipe.compile(devices[0], devices[1], devices[2])
+
     config = pipe.get_generation_config()
     config.width = args.width
     config.height = args.height
@@ -124,17 +138,22 @@ def read_image(path: str) -> openvino.Tensor:
 def image2image(args):
     prompt = args.prompt
     models_path = args.model
-    device = args.device
+    devices = device_string_to_triplet(args.device)
     num_warmup = args.num_warmup
     num_iter = args.num_iter
     output_dir = args.output_dir
     image_path = args.image
     strength = args.strength

-    pipe = ov_genai.Image2ImagePipeline(models_path, device)
-
     image_input = read_image(image_path)

+    pipe = ov_genai.Image2ImagePipeline(models_path)
+    if args.reshape:
+        height = image_input.get_shape()[1]
+        width = image_input.get_shape()[2]
+        pipe.reshape(1, height, width, pipe.get_generation_config().guidance_scale)
+    pipe.compile(devices[0], devices[1], devices[2])
+
     warmup_metrics = []
     for i in range(num_warmup):
         pipe.generate(prompt, image_input, strength=strength)
@@ -157,19 +176,24 @@ def image2image(args):
 def inpainting(args):
     prompt = args.prompt
     models_path = args.model
-    device = args.device
+    devices = device_string_to_triplet(args.device)
     num_warmup = args.num_warmup
     num_iter = args.num_iter
     output_dir = args.output_dir
     image_path = args.image
     strength = args.strength
     mask_image_path = args.mask_image

-    pipe = ov_genai.InpaintingPipeline(models_path, device)
-
     image_input = read_image(image_path)
     mask_image = read_image(mask_image_path)

+    pipe = ov_genai.InpaintingPipeline(models_path)
+    if args.reshape:
+        height = image_input.get_shape()[1]
+        width = image_input.get_shape()[2]
+        pipe.reshape(1, height, width, pipe.get_generation_config().guidance_scale)
+    pipe.compile(devices[0], devices[1], devices[2])
+
     warmup_metrics = []
     for i in range(num_warmup):
         pipe.generate(prompt, image_input, mask_image)
@@ -202,6 +226,7 @@ def main():
     parser.add_argument("-is", "--num_inference_steps", type=int, default=20, help="The number of inference steps used to denoise initial noised latent to final image")
     parser.add_argument("-ni", "--num_images_per_prompt", type=int, default=1, help="The number of images to generate per generate() call")
     parser.add_argument("-i", "--image", type=str, help="Image path")
+    parser.add_argument("-r", "--reshape", action="store_true", help="Reshape pipeline before compilation")
     # special parameters of text2image pipeline
     parser.add_argument("-w", "--width", type=int, default=512, help="The width of the resulting image")
     parser.add_argument("-ht", "--height", type=int, default=512, help="The height of the resulting image")

src/cpp/src/image_generation/diffusion_pipeline.hpp (+1 -1)

@@ -124,7 +124,7 @@ class DiffusionPipeline {

     void save_load_time(std::chrono::steady_clock::time_point start_time) {
         auto stop_time = std::chrono::steady_clock::now();
-        m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
+        m_load_time_ms += std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
     }

     virtual ~DiffusionPipeline() = default;

src/cpp/src/image_generation/image2image_pipeline.cpp (+6)

@@ -170,18 +170,24 @@ void Image2ImagePipeline::set_scheduler(std::shared_ptr<Scheduler> scheduler) {
 }

 void Image2ImagePipeline::reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale) {
+    auto start_time = std::chrono::steady_clock::now();
     m_impl->reshape(num_images_per_prompt, height, width, guidance_scale);
+    m_impl->save_load_time(start_time);
 }

 void Image2ImagePipeline::compile(const std::string& device, const ov::AnyMap& properties) {
+    auto start_time = std::chrono::steady_clock::now();
     m_impl->compile(device, properties);
+    m_impl->save_load_time(start_time);
 }

 void Image2ImagePipeline::compile(const std::string& text_encode_device,
                                   const std::string& denoise_device,
                                   const std::string& vae_device,
                                   const ov::AnyMap& properties) {
+    auto start_time = std::chrono::steady_clock::now();
     m_impl->compile(text_encode_device, denoise_device, vae_device, properties);
+    m_impl->save_load_time(start_time);
 }

 ov::Tensor Image2ImagePipeline::generate(const std::string& positive_prompt, ov::Tensor initial_image, const ov::AnyMap& properties) {

src/cpp/src/image_generation/inpainting_pipeline.cpp (+6)

@@ -192,18 +192,24 @@ void InpaintingPipeline::set_scheduler(std::shared_ptr<Scheduler> scheduler) {
 }

 void InpaintingPipeline::reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale) {
+    auto start_time = std::chrono::steady_clock::now();
     m_impl->reshape(num_images_per_prompt, height, width, guidance_scale);
+    m_impl->save_load_time(start_time);
 }

 void InpaintingPipeline::compile(const std::string& device, const ov::AnyMap& properties) {
+    auto start_time = std::chrono::steady_clock::now();
     m_impl->compile(device, properties);
+    m_impl->save_load_time(start_time);
 }

 void InpaintingPipeline::compile(const std::string& text_encode_device,
                                  const std::string& denoise_device,
                                  const std::string& vae_device,
                                  const ov::AnyMap& properties) {
+    auto start_time = std::chrono::steady_clock::now();
     m_impl->compile(text_encode_device, denoise_device, vae_device, properties);
+    m_impl->save_load_time(start_time);
 }

 ov::Tensor InpaintingPipeline::generate(const std::string& positive_prompt, ov::Tensor initial_image, ov::Tensor mask, const ov::AnyMap& properties) {

src/cpp/src/image_generation/text2image_pipeline.cpp (+6)

@@ -186,7 +186,9 @@ void Text2ImagePipeline::set_scheduler(std::shared_ptr<Scheduler> scheduler) {
 }

 void Text2ImagePipeline::reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale) {
+    auto start_time = std::chrono::steady_clock::now();
     m_impl->reshape(num_images_per_prompt, height, width, guidance_scale);
+    m_impl->save_load_time(start_time);

     // update config with the specified parameters, so that the user doesn't need to explicitly pass these as properties
     // to generate()
@@ -199,14 +201,18 @@ void Text2ImagePipeline::reshape(const int num_images_per_prompt, const int heig
 }

 void Text2ImagePipeline::compile(const std::string& device, const ov::AnyMap& properties) {
+    auto start_time = std::chrono::steady_clock::now();
     m_impl->compile(device, properties);
+    m_impl->save_load_time(start_time);
 }

 void Text2ImagePipeline::compile(const std::string& text_encode_device,
                                  const std::string& denoise_device,
                                  const std::string& vae_device,
                                  const ov::AnyMap& properties) {
+    auto start_time = std::chrono::steady_clock::now();
     m_impl->compile(text_encode_device, denoise_device, vae_device, properties);
+    m_impl->save_load_time(start_time);
 }

 ov::Tensor Text2ImagePipeline::generate(const std::string& positive_prompt, const ov::AnyMap& properties) {
