
Commit 0b4848a

[Image generation] Fixed SD3 accuracy issues (openvinotoolkit#1131)
- Fixed the VAE part for SD3: `scaling_factor` was applied twice, once in the pipeline itself and once as part of VAE decoder preprocessing.
- Fixed a float / double arithmetic mismatch in `FlowMatchEulerDiscreteScheduler`.

CVS-156384
1 parent 6165c47 commit 0b4848a

10 files changed, +30 −41 lines changed
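For context, the de-normalization the VAE decoder expects has to run exactly once. A minimal sketch of that transform (placeholder names, not code from this repository):

#include <vector>

// Sketch of the diffusers-style latent de-normalization
// `latent = latent / scaling_factor + shift_factor`. The bug was that the SD3
// pipeline performed this division itself and the VAE decoder's preprocessing
// performed it again, so latents were divided by scaling_factor twice.
std::vector<float> denormalize_latent(std::vector<float> latent,
                                      float scaling_factor,
                                      float shift_factor) {
    for (float& value : latent) {
        value = value / scaling_factor + shift_factor;  // applied exactly once
    }
    return latent;
}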

src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp

+2 −2
@@ -23,8 +23,8 @@ class OPENVINO_GENAI_EXPORTS AutoencoderKL {
         size_t in_channels = 3;
         size_t latent_channels = 4;
         size_t out_channels = 3;
-        float scaling_factor = 0.18215f;
-        float shift_factor = 0.0609f;
+        float scaling_factor = 1.0f;
+        float shift_factor = 0.0f;
         std::vector<size_t> block_out_channels = { 64 };

         explicit Config(const std::filesystem::path& config_path);

src/cpp/src/image_generation/models/autoencoder_kl.cpp

+4 −2
@@ -186,8 +186,10 @@ void AutoencoderKL::merge_vae_image_pre_processing() const {
 void AutoencoderKL::merge_vae_image_post_processing() const {
     ov::preprocess::PrePostProcessor ppp(m_decoder_model);

-    // scale input before VAE decoder
-    ppp.input().preprocess().scale(m_config.scaling_factor);
+    // scale and shift input before VAE decoder
+    ppp.input().preprocess()
+        .scale(m_config.scaling_factor)
+        .mean(-m_config.shift_factor);

     // apply VaeImageProcessor normalization steps
     // https://github.com/huggingface/diffusers/blob/v0.30.1/src/diffusers/image_processor.py#L159
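In the OpenVINO preprocessing API, `scale(s)` divides the input by `s` and `mean(m)` subtracts `m`, so the chained `scale(scaling_factor)` + `mean(-shift_factor)` folds `latent / scaling_factor + shift_factor` into the compiled decoder itself. A standalone sketch of the same idea, assuming a single-input decoder model loaded from a hypothetical path:

#include <memory>
#include <string>

#include <openvino/core/preprocess/pre_post_process.hpp>
#include <openvino/openvino.hpp>

// Builds a decoder whose compiled graph already de-normalizes its latent input.
// `decoder_xml` is a placeholder path, not something from this repository.
ov::CompiledModel compile_decoder_with_denorm(ov::Core& core,
                                              const std::string& decoder_xml,
                                              float scaling_factor,
                                              float shift_factor,
                                              const std::string& device) {
    std::shared_ptr<ov::Model> model = core.read_model(decoder_xml);

    ov::preprocess::PrePostProcessor ppp(model);
    ppp.input().preprocess()
        .scale(scaling_factor)   // latent /= scaling_factor
        .mean(-shift_factor);    // latent -= -shift_factor, i.e. latent += shift_factor
    model = ppp.build();

    return core.compile_model(model, device);
}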

src/cpp/src/image_generation/models/clip_text_model.cpp

+1 −1
@@ -86,7 +86,7 @@ CLIPTextModel& CLIPTextModel::compile(const std::string& device, const ov::AnyMa
 }

 void CLIPTextModel::set_adapters(const std::optional<AdapterConfig>& adapters) {
-    if(adapters) {
+    if (adapters) {
         m_adapter_controller.apply(m_request, *adapters);
     }
 }

src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp

+2 −2
@@ -62,7 +62,7 @@ CLIPTextModelWithProjection& CLIPTextModelWithProjection::compile(const std::str
     ov::Core core = utils::singleton_core();
     ov::CompiledModel compiled_model;
     std::optional<AdapterConfig> adapters;
-    if(auto filtered_properties = extract_adapters_from_properties(properties, &adapters)) {
+    if (auto filtered_properties = extract_adapters_from_properties(properties, &adapters)) {
         adapters->set_tensor_name_prefix(adapters->get_tensor_name_prefix().value_or("lora_te"));
         m_adapter_controller = AdapterController(m_model, *adapters, device);
         compiled_model = core.compile_model(m_model, device, *filtered_properties);
@@ -77,7 +77,7 @@ CLIPTextModelWithProjection& CLIPTextModelWithProjection::compile(const std::str
 }

 void CLIPTextModelWithProjection::set_adapters(const std::optional<AdapterConfig>& adapters) {
-    if(adapters) {
+    if (adapters) {
         m_adapter_controller.apply(m_request, *adapters);
     }
 }

src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp

+1 −1
@@ -65,7 +65,7 @@ SD3Transformer2DModel& SD3Transformer2DModel::reshape(int batch_size,
         std::string input_name = input.get_any_name();
         name_to_shape[input_name] = input.get_partial_shape();
         if (input_name == "timestep") {
-            name_to_shape[input_name][0] = batch_size;
+            name_to_shape[input_name][0] = 1;
         } else if (input_name == "hidden_states") {
             name_to_shape[input_name] = {batch_size, name_to_shape[input_name][1], height, width};
         } else if (input_name == "encoder_hidden_states") {
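With the transformer's "timestep" input now reshaped to a fixed batch of 1, a single scalar timestep tensor is enough even when classifier-free guidance doubles the latent batch, since every latent in the batch is denoised at the same timestep; the pipeline change further below relies on this. A small sketch, assuming a timesteps vector like the one the scheduler returns:

#include <cstddef>
#include <vector>

#include <openvino/openvino.hpp>

// Wraps one scheduler timestep in a {1}-shaped tensor without copying, which is
// all the reshaped "timestep" input needs regardless of the latent batch size.
ov::Tensor make_timestep_tensor(std::vector<float>& timesteps, size_t inference_step) {
    return ov::Tensor(ov::element::f32, ov::Shape{1}, &timesteps[inference_step]);
}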

src/cpp/src/image_generation/models/unet2d_condition_model.cpp

+1 −1
@@ -99,7 +99,7 @@ void UNet2DConditionModel::set_hidden_states(const std::string& tensor_name, ov:
 }

 void UNet2DConditionModel::set_adapters(const std::optional<AdapterConfig>& adapters) {
-    if(adapters) {
+    if (adapters) {
         m_adapter_controller.apply(m_request, *adapters);
     }
 }

src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.cpp

+15 −13
@@ -40,10 +40,8 @@ FlowMatchEulerDiscreteScheduler::FlowMatchEulerDiscreteScheduler(const Config& s
     int32_t num_train_timesteps = m_config.num_train_timesteps;
     float shift = m_config.shift;

-    auto linspaced = linspace<float>(1.0f, static_cast<float>(num_train_timesteps), num_train_timesteps, true);
-    for (auto it = linspaced.rbegin(); it != linspaced.rend(); ++it) {
-        m_timesteps.push_back(*it);
-    }
+    m_timesteps = linspace<float>(1.0f, static_cast<float>(num_train_timesteps), num_train_timesteps, true);
+    std::reverse(m_timesteps.begin(), m_timesteps.end());

     std::transform(m_timesteps.begin(),
                    m_timesteps.end(),
@@ -66,7 +64,7 @@ FlowMatchEulerDiscreteScheduler::FlowMatchEulerDiscreteScheduler(const Config& s
     m_sigma_max = m_sigmas[0], m_sigma_min = m_sigmas.back();
 }

-float FlowMatchEulerDiscreteScheduler::sigma_to_t(float sigma) {
+double FlowMatchEulerDiscreteScheduler::sigma_to_t(double sigma) {
     return sigma * m_config.num_train_timesteps;
 }

@@ -79,20 +77,24 @@ void FlowMatchEulerDiscreteScheduler::set_timesteps(size_t num_inference_steps,
     float shift = m_config.shift;

     using numpy_utils::linspace;
-    m_timesteps = linspace<float>(sigma_to_t(m_sigma_max), sigma_to_t(m_sigma_min), m_num_inference_steps, true);
+    std::vector<double> timesteps = linspace<double>(sigma_to_t(m_sigma_max), sigma_to_t(m_sigma_min), m_num_inference_steps, true);

-    for (const float& i : m_timesteps) {
-        m_sigmas.push_back(i / num_train_timesteps);
+    std::vector<double> sigmas(timesteps.size());
+    for (size_t i = 0; i < sigmas.size(); ++i) {
+        sigmas[i] = timesteps[i] / num_train_timesteps;
     }

     OPENVINO_ASSERT(!m_config.use_dynamic_shifting,
                     "Parameter 'use_dynamic_shifting' is not supported. Please, add support.");

-    for (size_t i = 0; i < m_sigmas.size(); ++i) {
-        m_sigmas[i] = shift * m_sigmas[i] / (1 + (shift - 1) * m_sigmas[i]);
+    m_sigmas.resize(sigmas.size());
+    m_timesteps.resize(sigmas.size());
+
+    for (size_t i = 0; i < sigmas.size(); ++i) {
+        m_sigmas[i] = shift * sigmas[i] / (1.0 + (shift - 1.0) * sigmas[i]);
         m_timesteps[i] = m_sigmas[i] * num_train_timesteps;
     }
-    m_sigmas.push_back(0);
+    m_sigmas.push_back(0.0f);

     m_step_index = -1, m_begin_index = -1;
 }
@@ -102,8 +104,8 @@ std::map<std::string, ov::Tensor> FlowMatchEulerDiscreteScheduler::step(ov::Tens
     // latents - sample
     // inference_step

-    float* model_output_data = noise_pred.data<float>();
-    float* sample_data = latents.data<float>();
+    const float* model_output_data = noise_pred.data<const float>();
+    const float* sample_data = latents.data<const float>();

     if (m_step_index == -1)
         init_step_index();
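The scheduler changes above keep the sigma/timestep arithmetic in double and only narrow to float at the end, so the C++ schedule tracks the Python reference more closely than the previous float-only version. A self-contained sketch of the shifted-sigma computation, assuming the same formula the scheduler uses:

#include <cstddef>
#include <vector>

// sigma' = shift * sigma / (1 + (shift - 1) * sigma), computed in double.
// Narrowing to float happens once per element, after the arithmetic, instead of
// accumulating single-precision rounding through every intermediate step.
std::vector<float> shifted_sigmas(const std::vector<double>& timesteps,
                                  double num_train_timesteps,
                                  double shift) {
    std::vector<float> sigmas(timesteps.size());
    for (size_t i = 0; i < timesteps.size(); ++i) {
        double sigma = timesteps[i] / num_train_timesteps;
        sigmas[i] = static_cast<float>(shift * sigma / (1.0 + (shift - 1.0) * sigma));
    }
    return sigmas;
}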

src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp

+1 −1
@@ -53,7 +53,7 @@ class FlowMatchEulerDiscreteScheduler : public IScheduler {
     size_t m_num_inference_steps;

     void init_step_index();
-    float sigma_to_t(float simga);
+    double sigma_to_t(double simga);
 };

 } // namespace genai

src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp

+2 −17
@@ -557,27 +557,18 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {

         // 6. Denoising loop
         ov::Tensor noisy_residual_tensor(ov::element::f32, {});
-        ov::Tensor timestep;

         for (size_t inference_step = 0; inference_step < generation_config.num_inference_steps; ++inference_step) {
             // concat the same latent twice along a batch dimension in case of CFG
             if (batch_size_multiplier > 1) {
                 batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt);
-                batch_copy(latent,
-                           latent_cfg,
-                           0,
-                           generation_config.num_images_per_prompt,
-                           generation_config.num_images_per_prompt);
-
-                size_t timestep_size = generation_config.num_images_per_prompt * batch_size_multiplier;
-                timestep = ov::Tensor(ov::element::f32, {timestep_size});
-                std::fill_n(timestep.data<float>(), timestep.get_size(), timesteps[inference_step]);
+                batch_copy(latent, latent_cfg, 0, generation_config.num_images_per_prompt, generation_config.num_images_per_prompt);
             } else {
                 // just assign to save memory copy
                 latent_cfg = latent;
-                timestep = ov::Tensor(ov::element::f32, {1}, &timesteps[inference_step]);
             }

+            ov::Tensor timestep(ov::element::f32, {1}, &timesteps[inference_step]);
             ov::Tensor noise_pred_tensor = m_transformer->infer(latent_cfg, timestep);

             ov::Shape noise_pred_shape = noise_pred_tensor.get_shape();
@@ -603,12 +594,6 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
             latent = scheduler_step_result["latent"];
         }

-        float* latent_data = latent.data<float>();
-        for (size_t i = 0; i < latent.get_size(); ++i) {
-            latent_data[i] = (latent_data[i] / m_vae->get_config().scaling_factor) +
-                             m_vae->get_config().shift_factor;
-        }
-
         return m_vae->decode(latent);
     }


src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp

+1 −1
@@ -492,7 +492,7 @@ class StableDiffusionXLPipeline : public DiffusionPipeline {

     ov::AnyMap properties_for_text_encoder(ov::AnyMap properties, const std::string& tensor_name_prefix) {
         std::optional<AdapterConfig> adapters;
-        if(update_adapters_from_properties(properties, adapters) && !adapters->get_tensor_name_prefix()) {
+        if (update_adapters_from_properties(properties, adapters) && !adapters->get_tensor_name_prefix()) {
             adapters->set_tensor_name_prefix(tensor_name_prefix);
             properties[ov::genai::adapters.name()] = *adapters;
         }
