Skip to content

Commit a8145bc

Browse files
[Image Generation] Supported force_zeros_for_empty_prompt in SDXL (openvinotoolkit#1115)
- Supported force_zeros_for_empty_prompt. - Supported prompt_2 and negative_prompt_2. - Fixed the indexes of the text encoders' hidden states which are used to create UNet's encoder_hidden_state. The implementation is now fully aligned with the vanilla HF implementation. CVS-156383
1 parent 15fe46e commit a8145bc

10 files changed

+112
-52
lines changed

src/cpp/include/openvino/genai/image_generation/clip_text_model.hpp

+1-2
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,7 @@ class OPENVINO_GENAI_EXPORTS CLIPTextModel {
2222
public:
2323
struct OPENVINO_GENAI_EXPORTS Config {
2424
size_t max_position_embeddings = 77;
25-
size_t hidden_size = 512;
26-
size_t num_hidden_layers = 13;
25+
size_t num_hidden_layers = 12;
2726

2827
explicit Config(const std::filesystem::path& config_path);
2928
};

src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp

+1-2
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,7 @@ class OPENVINO_GENAI_EXPORTS CLIPTextModelWithProjection {
2222
public:
2323
struct OPENVINO_GENAI_EXPORTS Config {
2424
size_t max_position_embeddings = 77;
25-
size_t hidden_size = 512;
26-
size_t num_hidden_layers = 33;
25+
size_t num_hidden_layers = 32;
2726

2827
explicit Config(const std::filesystem::path& config_path);
2928
};

src/cpp/include/openvino/genai/image_generation/generation_config.hpp

+1-2
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,8 @@ struct OPENVINO_GENAI_EXPORTS ImageGenerationConfig {
4343
// SD XL: prompt2 and negative_prompt2
4444
// FLUX: prompt2 (prompt if prompt2 is not defined explicitly)
4545
// SD 3: prompt2, prompt3 (with fallback to prompt) and negative_prompt2, negative_prompt3
46-
std::string negative_prompt;
4746
std::optional<std::string> prompt_2 = std::nullopt, prompt_3 = std::nullopt;
48-
std::optional<std::string> negative_prompt_2 = std::nullopt, negative_prompt_3 = std::nullopt;
47+
std::optional<std::string> negative_prompt = std::nullopt, negative_prompt_2 = std::nullopt, negative_prompt_3 = std::nullopt;
4948

5049
size_t num_images_per_prompt = 1;
5150

src/cpp/src/image_generation/generation_config.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ void ImageGenerationConfig::update_generation_config(const ov::AnyMap& propertie
6868
}
6969

7070
void ImageGenerationConfig::validate() const {
71-
OPENVINO_ASSERT(guidance_scale > 1.0f || negative_prompt.empty(), "Guidance scale <= 1.0 ignores negative prompt");
71+
OPENVINO_ASSERT(guidance_scale > 1.0f || negative_prompt == std::nullopt, "Guidance scale <= 1.0 ignores negative prompt");
7272
OPENVINO_ASSERT(guidance_scale > 1.0f || negative_prompt_2 == std::nullopt, "Guidance scale <= 1.0 ignores negative prompt 2");
7373
OPENVINO_ASSERT(guidance_scale > 1.0f || negative_prompt_3 == std::nullopt, "Guidance scale <= 1.0 ignores negative prompt 3");
7474
}

src/cpp/src/image_generation/models/clip_text_model.cpp

-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ CLIPTextModel::Config::Config(const std::filesystem::path& config_path) {
2020
using utils::read_json_param;
2121

2222
read_json_param(data, "max_position_embeddings", max_position_embeddings);
23-
read_json_param(data, "hidden_size", hidden_size);
2423
read_json_param(data, "num_hidden_layers", num_hidden_layers);
2524
}
2625

src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp

-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ CLIPTextModelWithProjection::Config::Config(const std::filesystem::path& config_
2020
using utils::read_json_param;
2121

2222
read_json_param(data, "max_position_embeddings", max_position_embeddings);
23-
read_json_param(data, "hidden_size", hidden_size);
2423
read_json_param(data, "num_hidden_layers", num_hidden_layers);
2524
}
2625

src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp

+6-3
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,9 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
289289
std::string prompt_3_str =
290290
generation_config.prompt_3 != std::nullopt ? *generation_config.prompt_3 : positive_prompt;
291291

292-
std::string negative_prompt_1_str = generation_config.negative_prompt;
292+
std::string negative_prompt_1_str = generation_config.negative_prompt != std::nullopt
293+
? *generation_config.negative_prompt
294+
: std::string{};
293295
std::string negative_prompt_2_str = generation_config.negative_prompt_2 != std::nullopt
294296
? *generation_config.negative_prompt_2
295297
: negative_prompt_1_str;
@@ -582,9 +584,10 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
582584

583585
ov::Shape noise_pred_shape = noise_pred_tensor.get_shape();
584586
noise_pred_shape[0] /= batch_size_multiplier;
585-
noisy_residual_tensor.set_shape(noise_pred_shape);
586587

587588
if (batch_size_multiplier > 1) {
589+
noisy_residual_tensor.set_shape(noise_pred_shape);
590+
588591
// perform guidance
589592
float* noisy_residual = noisy_residual_tensor.data<float>();
590593
const float* noise_pred_uncond = noise_pred_tensor.data<const float>();
@@ -657,7 +660,7 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
657660
generation_config.prompt_3 == std::nullopt || generation_config.negative_prompt_3 == std::nullopt,
658661
"T5Encoder is not currently supported, 'prompt_3' and 'negative_prompt_3' can't be used. Please, add "
659662
"support.");
660-
OPENVINO_ASSERT(is_classifier_free_guidance || generation_config.negative_prompt.empty(),
663+
OPENVINO_ASSERT(is_classifier_free_guidance || generation_config.negative_prompt == std::nullopt,
661664
"Negative prompt is not used when guidance scale < 1.0");
662665
OPENVINO_ASSERT(is_classifier_free_guidance || generation_config.negative_prompt_2 == std::nullopt,
663666
"Negative prompt 2 is not used when guidance scale < 1.0");

src/cpp/src/image_generation/stable_diffusion_pipeline.hpp

+7-8
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,8 @@ class StableDiffusionPipeline : public DiffusionPipeline {
220220
generation_config.generator = std::make_shared<CppStdGenerator>(seed);
221221
}
222222

223-
ov::Tensor encoder_hidden_states = m_clip_text_encoder->infer(positive_prompt, generation_config.negative_prompt,
223+
std::string negative_prompt = generation_config.negative_prompt != std::nullopt ? *generation_config.negative_prompt : std::string{};
224+
ov::Tensor encoder_hidden_states = m_clip_text_encoder->infer(positive_prompt, negative_prompt,
224225
batch_size_multiplier > 1);
225226

226227
// replicate encoder hidden state to UNet model
@@ -261,13 +262,10 @@ class StableDiffusionPipeline : public DiffusionPipeline {
261262

262263
ov::Tensor denoised, noisy_residual_tensor(ov::element::f32, {});
263264
for (size_t inference_step = 0; inference_step < generation_config.num_inference_steps; inference_step++) {
265+
batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt);
264266
// concat the same latent twice along a batch dimension in case of CFG
265267
if (batch_size_multiplier > 1) {
266-
batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt);
267268
batch_copy(latent, latent_cfg, 0, generation_config.num_images_per_prompt, generation_config.num_images_per_prompt);
268-
} else {
269-
// just assign to save memory copy
270-
latent_cfg = latent;
271269
}
272270

273271
m_scheduler->scale_model_input(latent_cfg, inference_step);
@@ -277,9 +275,10 @@ class StableDiffusionPipeline : public DiffusionPipeline {
277275

278276
ov::Shape noise_pred_shape = noise_pred_tensor.get_shape();
279277
noise_pred_shape[0] /= batch_size_multiplier;
280-
noisy_residual_tensor.set_shape(noise_pred_shape);
281278

282279
if (batch_size_multiplier > 1) {
280+
noisy_residual_tensor.set_shape(noise_pred_shape);
281+
283282
// perform guidance
284283
float* noisy_residual = noisy_residual_tensor.data<float>();
285284
const float* noise_pred_uncond = noise_pred_tensor.data<const float>();
@@ -349,9 +348,9 @@ class StableDiffusionPipeline : public DiffusionPipeline {
349348
OPENVINO_ASSERT(generation_config.prompt_2 == std::nullopt, "Prompt 2 is not used by ", pipeline_name);
350349
OPENVINO_ASSERT(generation_config.prompt_3 == std::nullopt, "Prompt 3 is not used by ", pipeline_name);
351350
if (is_lcm) {
352-
OPENVINO_ASSERT(generation_config.negative_prompt.empty(), "Negative prompt is not used by ", pipeline_name);
351+
OPENVINO_ASSERT(generation_config.negative_prompt == std::nullopt, "Negative prompt is not used by ", pipeline_name);
353352
} else if (!is_classifier_free_guidance) {
354-
OPENVINO_ASSERT(generation_config.negative_prompt.empty(), "Negative prompt is not used when guidance scale <= 1.0");
353+
OPENVINO_ASSERT(generation_config.negative_prompt == std::nullopt, "Negative prompt is not used when guidance scale <= 1.0");
355354
}
356355
OPENVINO_ASSERT(generation_config.negative_prompt_2 == std::nullopt, "Negative prompt 2 is not used by ", pipeline_name);
357356
OPENVINO_ASSERT(generation_config.negative_prompt_3 == std::nullopt, "Negative prompt 3 is not used by ", pipeline_name);

0 commit comments

Comments (0)