Commit dc5c9a9 (parent 1949366)

benchmark_image_gen: Add --reshape option, and ability to specify multiple devices (#1878)

8 files changed (+110 / -17 lines)

samples/cpp/image_generation/README.md (+2 -1)

@@ -164,7 +164,7 @@ Options:
 - `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text.
 - `--nw, --num_warmup` (default: `1`): Number of warmup iterations.
 - `-n, --num_iter` (default: `3`): Number of iterations.
-- `-d, --device` (default: `"CPU"`): Device to run the model on.
+- `-d, --device` (default: `"CPU"`): Device(s) to run the pipeline with.
 - `-w, --width` (default: `512`): The width of the output image.
 - `--ht, --height` (default: `512`): The height of the output image.
 - `--is, --num_inference_steps` (default: `20`): The number of inference steps.
@@ -173,6 +173,7 @@ Options:
 - `-i, --image`: Path to input image.
 - `-s, --strength`: Indicates extent to transform the reference `image`. Must be between 0 and 1.
 - `--mi, --mask_image`: Path to mask image.
+- `-r, --reshape': Reshape pipeline before compilation. This can improve image generation performance.

 For example:
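
For illustration, a minimal Python sketch of what these two options map to at the pipeline API level, mirroring the sample changes below; the model path, device names and prompt are placeholders, and the `ov_genai` import alias follows the Python sample:

import openvino_genai as ov_genai

# Build the pipeline without compiling it yet (placeholder model path).
pipe = ov_genai.Text2ImagePipeline("./stable-diffusion-ov")

# --reshape: fix image count, height, width and guidance scale before compilation.
pipe.reshape(1, 512, 512, pipe.get_generation_config().guidance_scale)

# -d "CPU,NPU,GPU": compile the text encoder, denoiser and VAE on separate devices.
pipe.compile("CPU", "NPU", "GPU")

image_tensor = pipe.generate("a house on a hill at sunset")

The `--reshape` flag added by this commit performs exactly this reshape-before-compile step in the benchmark samples.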

samples/cpp/image_generation/benchmark_image_gen.cpp (+54 -6)

@@ -106,15 +106,50 @@ inline void print_statistic(std::vector<ov::genai::ImageGenerationPerfMetrics>&
               << " ms, vae decoder infer avg time:" << vae_decoder_mean << " ms" << std::endl;
 }

+inline std::vector<std::string> device_string_to_triplet(const std::string& device_input) {
+    std::vector<std::string> devices;
+    std::istringstream stream(device_input);
+    std::string device;
+
+    // Split the device input string by commas
+    while (std::getline(stream, device, ',')) {
+        devices.push_back(device);
+    }
+
+    // Trim whitespace from each device name
+    for (auto& dev : devices) {
+        dev.erase(0, dev.find_first_not_of(" \t"));
+        dev.erase(dev.find_last_not_of(" \t") + 1);
+    }
+
+    // Ensure exactly three devices
+    if (devices.size() == 1) {
+        return {devices[0], devices[0], devices[0]};
+    } else if (devices.size() == 3) {
+        return devices;
+    } else {
+        throw std::invalid_argument("The device specified by -d/--device must be a single device (e.g. -d \"GPU\"), "
+                                    "or exactly 3 comma separated device names (e.g. -d \"CPU,NPU,GPU\")");
+    }
+}
+
 void text2image(cxxopts::ParseResult& result) {
     std::string prompt = result["prompt"].as<std::string>();
     const std::string models_path = result["model"].as<std::string>();
-    std::string device = result["device"].as<std::string>();
+    auto devices = device_string_to_triplet(result["device"].as<std::string>());
     size_t num_warmup = result["num_warmup"].as<size_t>();
     size_t num_iter = result["num_iter"].as<size_t>();
     const std::string output_dir = result["output_dir"].as<std::string>();

-    ov::genai::Text2ImagePipeline pipe(models_path, device);
+    ov::genai::Text2ImagePipeline pipe(models_path);
+    if (result["reshape"].as<bool>()) {
+        pipe.reshape(result["num_images_per_prompt"].as<size_t>(),
+                     result["height"].as<size_t>(),
+                     result["width"].as<size_t>(),
+                     pipe.get_generation_config().guidance_scale);
+    }
+    pipe.compile(devices[0], devices[1], devices[2]);
+
     ov::genai::ImageGenerationConfig config = pipe.get_generation_config();
     config.width = result["width"].as<size_t>();
     config.height = result["height"].as<size_t>();
@@ -148,15 +183,21 @@ void image2image(cxxopts::ParseResult& result) {
     std::string prompt = result["prompt"].as<std::string>();
     const std::string models_path = result["model"].as<std::string>();
     std::string image_path = result["image"].as<std::string>();
-    std::string device = result["device"].as<std::string>();
+    auto devices = device_string_to_triplet(result["device"].as<std::string>());
     size_t num_warmup = result["num_warmup"].as<size_t>();
     size_t num_iter = result["num_iter"].as<size_t>();
     const std::string output_dir = result["output_dir"].as<std::string>();
     float strength = result["strength"].as<float>();

     ov::Tensor image_input = utils::load_image(image_path);

-    ov::genai::Image2ImagePipeline pipe(models_path, device);
+    ov::genai::Image2ImagePipeline pipe(models_path);
+    if (result["reshape"].as<bool>()) {
+        auto height = image_input.get_shape()[1];
+        auto width = image_input.get_shape()[2];
+        pipe.reshape(1, height, width, pipe.get_generation_config().guidance_scale);
+    }
+    pipe.compile(devices[0], devices[1], devices[2]);

     std::vector<ov::genai::ImageGenerationPerfMetrics> warmup_metrics;
     std::cout << std::fixed << std::setprecision(2);
@@ -185,15 +226,21 @@ void inpainting(cxxopts::ParseResult& result) {
     const std::string models_path = result["model"].as<std::string>();
     std::string image_path = result["image"].as<std::string>();
     std::string mask_image_path = result["mask_image"].as<std::string>();
-    std::string device = result["device"].as<std::string>();
+    auto devices = device_string_to_triplet(result["device"].as<std::string>());
     size_t num_warmup = result["num_warmup"].as<size_t>();
     size_t num_iter = result["num_iter"].as<size_t>();
     const std::string output_dir = result["output_dir"].as<std::string>();

     ov::Tensor image_input = utils::load_image(image_path);
     ov::Tensor mask_image = utils::load_image(mask_image_path);

-    ov::genai::InpaintingPipeline pipe(models_path, device);
+    ov::genai::InpaintingPipeline pipe(models_path);
+    if (result["reshape"].as<bool>()) {
+        auto height = image_input.get_shape()[1];
+        auto width = image_input.get_shape()[2];
+        pipe.reshape(1, height, width, pipe.get_generation_config().guidance_scale);
+    }
+    pipe.compile(devices[0], devices[1], devices[2]);

     std::cout << std::fixed << std::setprecision(2);
     std::vector<ov::genai::ImageGenerationPerfMetrics> warmup_metrics;
@@ -239,6 +286,7 @@ int main(int argc, char* argv[]) try {
     ("s,strength", "Indicates extent to transform the reference `image`. Must be between 0 and 1", cxxopts::value<float>()->default_value(std::to_string(0.8)))
     //special parameters of inpainting pipeline
     ("mi,mask_image", "Mask image path", cxxopts::value<std::string>())
+    ("r,reshape", "Reshape pipeline before compilation", cxxopts::value<bool>()->default_value("false"))
     ("h,help", "Print usage");

     cxxopts::ParseResult result;
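
The `-d/--device` handling above expands a single device name to all three pipeline stages (text encoder, denoiser, VAE) and otherwise requires exactly three comma separated names. A standalone Python sketch of the same rule, for illustration only (the Python sample below adds an equivalent `device_string_to_triplet`; the error message here is shortened):

def device_string_to_triplet(device_input):
    # Split on commas and strip surrounding whitespace, as the C++ helper above does.
    devices = [device.strip() for device in device_input.split(",")]
    if len(devices) == 1:
        # A single device name drives the text encoder, denoiser and VAE alike.
        return devices * 3
    if len(devices) == 3:
        return devices
    raise ValueError("expected one device or exactly 3 comma separated device names")

assert device_string_to_triplet("GPU") == ["GPU", "GPU", "GPU"]
assert device_string_to_triplet("CPU, NPU, GPU") == ["CPU", "NPU", "GPU"]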

samples/python/image_generation/README.md (+2 -1)

@@ -163,7 +163,7 @@ Options:
 - `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text.
 - `-nw, --num_warmup` (default: `1`): Number of warmup iterations.
 - `-n, --num_iter` (default: `3`): Number of iterations.
-- `-d, --device` (default: `"CPU"`): Device to run the model on.
+- `-d, --device` (default: `"CPU"`): Device(s) to run the pipeline with.
 - `-w, --width` (default: `512`): The width of the output image.
 - `-ht, --height` (default: `512`): The height of the output image.
 - `-is, --num_inference_steps` (default: `20`): The number of inference steps.
@@ -172,6 +172,7 @@ Options:
 - `-i, --image`: Path to input image.
 - `-mi, --mask_image`: Path to the mask image.
 - `-s, --strength`: Indicates extent to transform the reference `image`. Must be between 0 and 1.
+- `-r, --reshape': Reshape pipeline before compilation. This can improve image generation performance.

 For example:

samples/python/image_generation/benchmark_image_gen.py (+33 -8)

@@ -81,15 +81,29 @@ def print_statistic(warmup_metrics, iter_metrics):
           f"infer avg time: {inference_mean:.2f} ms, all text encoder infer avg time: {text_encoder_mean:.2f} ms, "
           f"vae encoder infer avg time: {vae_encoder_mean:.2f} ms, vae decoder infer avg time: {vae_decoder_mean:.2f} ms")

+def device_string_to_triplet(device_input):
+    devices = [device.strip() for device in device_input.split(",")]
+    if len(devices) == 1:
+        return [devices[0]] * 3
+    elif len(devices) == 3:
+        return devices
+    else:
+        raise ValueError("The device specified by -d/--device must be a single device (e.g. -d \"GPU\"), " +
+                         "or exactly 3 comma separated device names (e.g. -d \"CPU,NPU,GPU\")")
+
 def text2image(args):
     prompt = args.prompt
     models_path = args.model
-    device = args.device
+    devices = device_string_to_triplet(args.device)
     num_warmup = args.num_warmup
     num_iter = args.num_iter
     output_dir = args.output_dir

-    pipe = ov_genai.Text2ImagePipeline(models_path, device)
+    pipe = ov_genai.Text2ImagePipeline(models_path)
+    if args.reshape:
+        pipe.reshape(args.num_images_per_prompt, args.height, args.width, pipe.get_generation_config().guidance_scale)
+    pipe.compile(devices[0], devices[1], devices[2])
+
     config = pipe.get_generation_config()
     config.width = args.width
     config.height = args.height
@@ -124,17 +138,22 @@ def read_image(path: str) -> openvino.Tensor:
 def image2image(args):
     prompt = args.prompt
     models_path = args.model
-    device = args.device
+    devices = device_string_to_triplet(args.device)
     num_warmup = args.num_warmup
     num_iter = args.num_iter
     output_dir = args.output_dir
     image_path = args.image
     strength = args.strength

-    pipe = ov_genai.Image2ImagePipeline(models_path, device)
-
     image_input = read_image(image_path)

+    pipe = ov_genai.Image2ImagePipeline(models_path)
+    if args.reshape:
+        height = image_input.get_shape()[1]
+        width = image_input.get_shape()[2]
+        pipe.reshape(1, height, width, pipe.get_generation_config().guidance_scale)
+    pipe.compile(devices[0], devices[1], devices[2])
+
     warmup_metrics = []
     for i in range(num_warmup):
         pipe.generate(prompt, image_input, strength=strength)
@@ -157,19 +176,24 @@ def image2image(args):
 def inpainting(args):
     prompt = args.prompt
     models_path = args.model
-    device = args.device
+    devices = device_string_to_triplet(args.device)
     num_warmup = args.num_warmup
     num_iter = args.num_iter
     output_dir = args.output_dir
     image_path = args.image
     strength = args.strength
     mask_image_path = args.mask_image

-    pipe = ov_genai.InpaintingPipeline(models_path, device)
-
     image_input = read_image(image_path)
     mask_image = read_image(mask_image_path)

+    pipe = ov_genai.InpaintingPipeline(models_path)
+    if args.reshape:
+        height = image_input.get_shape()[1]
+        width = image_input.get_shape()[2]
+        pipe.reshape(1, height, width, pipe.get_generation_config().guidance_scale)
+    pipe.compile(devices[0], devices[1], devices[2])
+
     warmup_metrics = []
     for i in range(num_warmup):
         pipe.generate(prompt, image_input, mask_image)
@@ -202,6 +226,7 @@ def main():
     parser.add_argument("-is", "--num_inference_steps", type=int, default=20, help="The number of inference steps used to denoise initial noised latent to final image")
     parser.add_argument("-ni", "--num_images_per_prompt", type=int, default=1, help="The number of images to generate per generate() call")
     parser.add_argument("-i", "--image", type=str, help="Image path")
+    parser.add_argument("-r", "--reshape", action="store_true", help="Reshape pipeline before compilation")
     # special parameters of text2image pipeline
     parser.add_argument("-w", "--width", type=int, default=512, help="The width of the resulting image")
     parser.add_argument("-ht", "--height", type=int, default=512, help="The height of the resulting image")

src/cpp/src/image_generation/diffusion_pipeline.hpp (+1 -1)

@@ -124,7 +124,7 @@ class DiffusionPipeline {

     void save_load_time(std::chrono::steady_clock::time_point start_time) {
         auto stop_time = std::chrono::steady_clock::now();
-        m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
+        m_load_time_ms += std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
     }

     virtual ~DiffusionPipeline() = default;

src/cpp/src/image_generation/image2image_pipeline.cpp (+6)

@@ -170,18 +170,24 @@ void Image2ImagePipeline::set_scheduler(std::shared_ptr<Scheduler> scheduler) {
 }

 void Image2ImagePipeline::reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale) {
+    auto start_time = std::chrono::steady_clock::now();
     m_impl->reshape(num_images_per_prompt, height, width, guidance_scale);
+    m_impl->save_load_time(start_time);
 }

 void Image2ImagePipeline::compile(const std::string& device, const ov::AnyMap& properties) {
+    auto start_time = std::chrono::steady_clock::now();
     m_impl->compile(device, properties);
+    m_impl->save_load_time(start_time);
 }

 void Image2ImagePipeline::compile(const std::string& text_encode_device,
                                   const std::string& denoise_device,
                                   const std::string& vae_device,
                                   const ov::AnyMap& properties) {
+    auto start_time = std::chrono::steady_clock::now();
     m_impl->compile(text_encode_device, denoise_device, vae_device, properties);
+    m_impl->save_load_time(start_time);
 }

 ov::Tensor Image2ImagePipeline::generate(const std::string& positive_prompt, ov::Tensor initial_image, const ov::AnyMap& properties) {

src/cpp/src/image_generation/inpainting_pipeline.cpp (+6)

@@ -192,18 +192,24 @@ void InpaintingPipeline::set_scheduler(std::shared_ptr<Scheduler> scheduler) {
 }

 void InpaintingPipeline::reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale) {
+    auto start_time = std::chrono::steady_clock::now();
     m_impl->reshape(num_images_per_prompt, height, width, guidance_scale);
+    m_impl->save_load_time(start_time);
 }

 void InpaintingPipeline::compile(const std::string& device, const ov::AnyMap& properties) {
+    auto start_time = std::chrono::steady_clock::now();
     m_impl->compile(device, properties);
+    m_impl->save_load_time(start_time);
 }

 void InpaintingPipeline::compile(const std::string& text_encode_device,
                                  const std::string& denoise_device,
                                  const std::string& vae_device,
                                  const ov::AnyMap& properties) {
+    auto start_time = std::chrono::steady_clock::now();
     m_impl->compile(text_encode_device, denoise_device, vae_device, properties);
+    m_impl->save_load_time(start_time);
 }

 ov::Tensor InpaintingPipeline::generate(const std::string& positive_prompt, ov::Tensor initial_image, ov::Tensor mask, const ov::AnyMap& properties) {

src/cpp/src/image_generation/text2image_pipeline.cpp (+6)

@@ -186,7 +186,9 @@ void Text2ImagePipeline::set_scheduler(std::shared_ptr<Scheduler> scheduler) {
 }

 void Text2ImagePipeline::reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale) {
+    auto start_time = std::chrono::steady_clock::now();
     m_impl->reshape(num_images_per_prompt, height, width, guidance_scale);
+    m_impl->save_load_time(start_time);

     // update config with the specified parameters, so that the user doesn't need to explicitly pass these as properties
     // to generate()
@@ -199,14 +201,18 @@ void Text2ImagePipeline::reshape(const int num_images_per_prompt, const int heig
 }

 void Text2ImagePipeline::compile(const std::string& device, const ov::AnyMap& properties) {
+    auto start_time = std::chrono::steady_clock::now();
     m_impl->compile(device, properties);
+    m_impl->save_load_time(start_time);
 }

 void Text2ImagePipeline::compile(const std::string& text_encode_device,
                                  const std::string& denoise_device,
                                  const std::string& vae_device,
                                  const ov::AnyMap& properties) {
+    auto start_time = std::chrono::steady_clock::now();
     m_impl->compile(text_encode_device, denoise_device, vae_device, properties);
+    m_impl->save_load_time(start_time);
 }

 ov::Tensor Text2ImagePipeline::generate(const std::string& positive_prompt, const ov::AnyMap& properties) {
