Skip to content

Commit cf88a7e

Browse files
authored
DDIM scheduler (openvinotoolkit#897)
1. model: `stabilityai/stable-diffusion-2-1 FP16` prompt: `cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting` pipeline params: ```cpp ov::Tensor image = pipe.generate(prompt, ov::genai::width(768), ov::genai::height(768), ov::genai::num_inference_steps(50)); ``` output: ![image](https://github.com/user-attachments/assets/8567efda-d7be-47c3-af25-be441d0d9ec6) 2. model `stabilityai/stable-diffusion-2 FP16` prompt: `cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting` pipeline params: ```cpp ov::Tensor image = pipe.generate(prompt, ov::genai::width(768), ov::genai::height(768), ov::genai::num_inference_steps(50)); ``` output: ![image](https://github.com/user-attachments/assets/eabd67d5-1ef0-41d8-aee4-861cadde10f2) 3. model `dreamlike-art/dreamlike-anime-1.0 FP16` prompt: `cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting` pipeline params: ```cpp ov::Tensor image = pipe.generate(prompt, ov::genai::width(512), ov::genai::height(512), ov::genai::num_inference_steps(20)); ``` output: ![image](https://github.com/user-attachments/assets/72f75774-6025-4820-9d37-b2885c7c72c7) 4. model `bghira/pseudo-journey-v2 FP16` - requires **DDPMScheduler** by default prompt: `cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting` pipeline params: ```cpp ov::Tensor image = pipe.generate(prompt, ov::genai::width(512), ov::genai::height(512), ov::genai::num_inference_steps(20)); ``` output: ![image](https://github.com/user-attachments/assets/2b89c302-a5d6-4ade-bca0-84b2aa509b8a) Ticket: __CVS-152319__
1 parent 50bc8d4 commit cf88a7e

File tree

10 files changed

+313
-31
lines changed

10 files changed

+313
-31
lines changed

README.md

+1-2
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,7 @@ It includes the following pipelines:
3434
6. [multinomial_causal_lm](./samples/cpp/multinomial_causal_lm/README.md)
3535
7. [prompt_lookup_decoding_lm](./samples/cpp/prompt_lookup_decoding_lm/README.md)
3636
8. [speculative_decoding_lm](./samples/cpp/speculative_decoding_lm/README.md)
37-
3. [Stable Diffuison (with LoRA) C++ image generation pipeline](./image_generation/stable_diffusion_1_5/cpp/README.md)
38-
4. [Latent Consistency Model (with LoRA) C++ image generation pipeline](./image_generation/lcm_dreamshaper_v7/cpp/README.md)
37+
3. [Stable Diffusion and Latent Consistency Model (with LoRA) C++ image generation pipeline](./samples/cpp/stable_diffusion/README.md)
3938

4039
### Requirements
4140

0 Bytes
Binary file not shown.

samples/cpp/stable_diffusion/README.md

+2
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ Prompt: `cyberpunk cityscape like Tokyo New York with tall buildings at dusk gol
3636
Models can be downloaded from [OpenAI HuggingFace](https://huggingface.co/openai). This sample can run the following models, but is not limited to them:
3737

3838
- [botp/stable-diffusion-v1-5](https://huggingface.co/botp/stable-diffusion-v1-5)
39+
- [stabilityai/stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2)
40+
- [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1)
3941
- [dreamlike-art/dreamlike-anime-1.0](https://huggingface.co/dreamlike-art/dreamlike-anime-1.0)
4042
- [SimianLuo/LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7)
4143

src/cpp/include/openvino/genai/text2image/pipeline.hpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,8 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline {
5252
enum Type {
5353
AUTO,
5454
LCM,
55-
LMS_DISCRETE
55+
LMS_DISCRETE,
56+
DDIM
5657
};
5758

5859
static std::shared_ptr<Scheduler> from_config(const std::string& scheduler_config_path,
+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// Copyright (C) 2023-2024 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
#pragma once
5+
6+
#include <vector>
7+
8+
namespace ov {
namespace genai {
namespace numpy_utils {

/**
 * @brief Generates `num` evenly spaced values starting at `start`.
 *
 * Adapted from https://gist.github.com/lorenzoriano/5414671
 *
 * @tparam T element type of the returned vector (each value is static_cast to T)
 * @tparam U arithmetic type in which the spacing is computed
 * @param start first value of the sequence
 * @param end upper bound of the sequence; it is emitted as the last element only when `endpoint` is true
 * @param num number of values to generate
 * @param endpoint when true the range is divided into `num - 1` intervals and `end` is appended,
 *                 otherwise it is divided into `num` intervals and `end` is excluded
 * @return empty vector for `num == 0`, `{start}` for `num == 1`, otherwise `num` values
 */
template <typename T, typename U>
std::vector<T> linspace(U start, U end, size_t num, bool endpoint = false) {
    std::vector<T> indices;
    if (num == 0) {
        return indices;
    }

    indices.reserve(num);
    if (num == 1) {
        indices.push_back(static_cast<T>(start));
        return indices;
    }

    // With the endpoint included, the last value is pushed explicitly below so that
    // it equals `end` exactly instead of carrying accumulated rounding error.
    const size_t intervals = endpoint ? num - 1 : num;
    const U delta = (end - start) / static_cast<U>(intervals);
    for (size_t i = 0; i < intervals; i++) {
        indices.push_back(static_cast<T>(start + delta * i));
    }

    if (endpoint) {
        indices.push_back(static_cast<T>(end));
    }
    return indices;
}

}  // namespace numpy_utils
}  // namespace genai
}  // namespace ov
+201
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
// Copyright (C) 2023-2024 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <fstream>
#include <functional>
#include <iterator>
#include <numeric>
#include <random>

#include "text2image/schedulers/ddim.hpp"
#include "utils.hpp"
#include "text2image/numpy_utils.hpp"
12+
13+
namespace ov {
14+
namespace genai {
15+
16+
// Populates the DDIM scheduler configuration from the JSON file at `scheduler_config_path`.
// Fails via OPENVINO_ASSERT when the file cannot be opened.
// NOTE(review): presumably `read_json_param` keeps the in-class default when a key is
// missing - confirm against utils.hpp.
DDIMScheduler::Config::Config(const std::string& scheduler_config_path) {
    std::ifstream config_stream(scheduler_config_path);
    OPENVINO_ASSERT(config_stream.is_open(), "Failed to open ", scheduler_config_path);

    nlohmann::json config_json = nlohmann::json::parse(config_stream);
    using utils::read_json_param;

    // Training-time noise schedule.
    read_json_param(config_json, "num_train_timesteps", num_train_timesteps);
    read_json_param(config_json, "beta_start", beta_start);
    read_json_param(config_json, "beta_end", beta_end);
    read_json_param(config_json, "beta_schedule", beta_schedule);
    read_json_param(config_json, "trained_betas", trained_betas);

    // Sampling behaviour.
    read_json_param(config_json, "clip_sample", clip_sample);
    read_json_param(config_json, "set_alpha_to_one", set_alpha_to_one);
    read_json_param(config_json, "steps_offset", steps_offset);
    read_json_param(config_json, "prediction_type", prediction_type);

    // Thresholding / clipping knobs.
    read_json_param(config_json, "thresholding", thresholding);
    read_json_param(config_json, "dynamic_thresholding_ratio", dynamic_thresholding_ratio);
    read_json_param(config_json, "clip_sample_range", clip_sample_range);
    read_json_param(config_json, "sample_max_value", sample_max_value);

    // Inference-time timestep layout.
    read_json_param(config_json, "timestep_spacing", timestep_spacing);
    read_json_param(config_json, "rescale_betas_zero_snr", rescale_betas_zero_snr);
}
39+
40+
// Builds the scheduler from a JSON configuration file by parsing it into a Config
// and delegating to the Config-based constructor.
DDIMScheduler::DDIMScheduler(const std::string scheduler_config_path)
    : DDIMScheduler(Config(scheduler_config_path)) {
}

// Builds the scheduler from an in-memory configuration.
// Precomputes the cumulative product of alphas (alpha_t = 1 - beta_t) for every
// training timestep and the alpha used for the step "before" timestep 0.
// Throws (OPENVINO_ASSERT / OPENVINO_THROW) on a non-positive `num_train_timesteps`
// or an unsupported `beta_schedule`.
DDIMScheduler::DDIMScheduler(const Config& scheduler_config)
    : m_config(scheduler_config) {
    OPENVINO_ASSERT(m_config.num_train_timesteps > 0, "'num_train_timesteps' must be positive");

    using numpy_utils::linspace;

    std::vector<float> betas;
    if (!m_config.trained_betas.empty()) {
        betas = m_config.trained_betas;
    } else if (m_config.beta_schedule == BetaSchedule::LINEAR) {
        // The endpoint is included so that the last beta equals `beta_end`, matching
        // the LCM scheduler and the endpoint-inclusive linspace of the reference implementation.
        betas = linspace<float>(m_config.beta_start, m_config.beta_end, m_config.num_train_timesteps, true);
    } else if (m_config.beta_schedule == BetaSchedule::SCALED_LINEAR) {
        // Linear in sqrt(beta) space, then squared.
        float start = std::sqrt(m_config.beta_start);
        float end = std::sqrt(m_config.beta_end);
        betas = linspace<float>(start, end, m_config.num_train_timesteps, true);
        std::for_each(betas.begin(), betas.end(), [] (float & x) { x *= x; });
    } else {
        OPENVINO_THROW("'beta_schedule' must be one of 'LINEAR' or 'SCALED_LINEAR'. Please, add support of other types");
    }

    // TODO: Rescale for zero SNR
    // if (m_config.rescale_betas_zero_snr) {betas = rescale_zero_terminal_snr(betas)}

    std::vector<float> alphas;
    alphas.reserve(betas.size());
    std::transform(betas.begin(), betas.end(), std::back_inserter(alphas), [] (float b) { return 1.0f - b; });

    // Running product in a single pass: m_alphas_cumprod[i] = alphas[0] * ... * alphas[i].
    m_alphas_cumprod.reserve(alphas.size());
    std::partial_sum(alphas.begin(), alphas.end(), std::back_inserter(m_alphas_cumprod), std::multiplies<float>{});

    // `betas` is non-empty here (either user supplied or num_train_timesteps > 0 values),
    // so front() is valid.
    m_final_alpha_cumprod = m_config.set_alpha_to_one ? 1.0f : m_alphas_cumprod.front();
}
77+
78+
void DDIMScheduler::set_timesteps(size_t num_inference_steps) {
79+
m_timesteps.clear();
80+
81+
OPENVINO_ASSERT(num_inference_steps <= m_config.num_train_timesteps,
82+
"`num_inference_steps` cannot be larger than `m_config.num_train_timesteps`");
83+
84+
m_num_inference_steps = num_inference_steps;
85+
86+
switch (m_config.timestep_spacing) {
87+
case TimestepSpacing::LINSPACE:
88+
{
89+
using numpy_utils::linspace;
90+
float end = static_cast<float>(m_config.num_train_timesteps - 1);
91+
auto linspaced = linspace<float>(0.0f, end, num_inference_steps, true);
92+
for (auto it = linspaced.rbegin(); it != linspaced.rend(); ++it) {
93+
m_timesteps.push_back(static_cast<int64_t>(std::round(*it)));
94+
}
95+
break;
96+
}
97+
case TimestepSpacing::LEADING:
98+
{
99+
size_t step_ratio = m_config.num_train_timesteps / m_num_inference_steps;
100+
for (size_t i = num_inference_steps - 1; i != -1; --i) {
101+
m_timesteps.push_back(i * step_ratio + m_config.steps_offset);
102+
}
103+
break;
104+
}
105+
case TimestepSpacing::TRAILING:
106+
{
107+
float step_ratio = static_cast<float>(m_config.num_train_timesteps) / static_cast<float>(m_num_inference_steps);
108+
for (float i = m_config.num_train_timesteps; i > 0; i-=step_ratio){
109+
m_timesteps.push_back(static_cast<int64_t>(std::round(i)) - 1);
110+
}
111+
break;
112+
}
113+
default:
114+
OPENVINO_THROW("Unsupported value for 'timestep_spacing'");
115+
}
116+
}
117+
118+
// Performs one deterministic DDIM update (formula (12) of https://arxiv.org/pdf/2010.02502.pdf,
// without the random-noise term).
//
// @param noise_pred      model output for the current step; read as float data
// @param latents         current sample; read as float data, must have as many elements as `noise_pred`
// @param inference_step  index into the schedule produced by set_timesteps()
// @return map with a single entry "latent" holding the sample for the previous timestep
//         (same element type and shape as `latents`)
// Throws via OPENVINO_ASSERT / OPENVINO_THROW when set_timesteps() has not produced a
// schedule covering `inference_step`, when the tensors differ in size, or when the
// configured prediction type is unsupported.
std::map<std::string, ov::Tensor> DDIMScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step) {
    OPENVINO_ASSERT(inference_step < m_timesteps.size() && m_num_inference_steps > 0,
                    "`inference_step` is out of range; call set_timesteps() first");
    OPENVINO_ASSERT(noise_pred.get_size() == latents.get_size(),
                    "`noise_pred` and `latents` must have the same number of elements");

    // Read the schedule directly: get_timesteps() returns a copy of the whole vector.
    const int64_t timestep = m_timesteps[inference_step];
    OPENVINO_ASSERT(timestep >= 0 && static_cast<size_t>(timestep) < m_alphas_cumprod.size(),
                    "Timestep is outside of the precomputed alphas range");

    // get previous step value (=t-1); may be negative for the last inference step
    const int64_t stride = static_cast<int64_t>(static_cast<size_t>(m_config.num_train_timesteps) / m_num_inference_steps);
    const int64_t prev_timestep = timestep - stride;

    // compute alphas, betas
    const float alpha_prod_t = m_alphas_cumprod[timestep];
    const float alpha_prod_t_prev = (prev_timestep >= 0) ? m_alphas_cumprod[prev_timestep] : m_final_alpha_cumprod;
    const float beta_prod_t = 1 - alpha_prod_t;

    // Loop-invariant factors, hoisted out of the per-element loop.
    const float sqrt_alpha_prod_t = std::sqrt(alpha_prod_t);
    const float sqrt_beta_prod_t = std::sqrt(beta_prod_t);
    const float sqrt_alpha_prod_t_prev = std::sqrt(alpha_prod_t_prev);
    const float sqrt_one_minus_alpha_prod_t_prev = std::sqrt(1 - alpha_prod_t_prev);

    const float* noise_pred_data = noise_pred.data<float>();
    const float* latents_data = latents.data<float>();

    ov::Tensor prev_sample(latents.get_element_type(), latents.get_shape());
    float* prev_sample_data = prev_sample.data<float>();

    const size_t element_count = noise_pred.get_size();
    for (size_t j = 0; j < element_count; j++) {
        // "predicted x_0" and the matching epsilon of formula (12)
        float pos_val, pe_val;
        switch (m_config.prediction_type) {
            case PredictionType::EPSILON:
                pos_val = (latents_data[j] - sqrt_beta_prod_t * noise_pred_data[j]) / sqrt_alpha_prod_t;
                pe_val = noise_pred_data[j];
                break;
            case PredictionType::SAMPLE:
                pos_val = noise_pred_data[j];
                pe_val = (latents_data[j] - sqrt_alpha_prod_t * pos_val) / sqrt_beta_prod_t;
                break;
            case PredictionType::V_PREDICTION:
                pos_val = sqrt_alpha_prod_t * latents_data[j] - sqrt_beta_prod_t * noise_pred_data[j];
                pe_val = sqrt_alpha_prod_t * noise_pred_data[j] + sqrt_beta_prod_t * latents_data[j];
                break;
            default:
                OPENVINO_THROW("Unsupported value for 'PredictionType'");
        }

        // TODO: Clip or threshold "predicted x_0"
        // if m_config.thresholding:
        //     pred_original_sample = _threshold_sample(pred_original_sample)
        // elif m_config.clip_sample:
        //     pred_original_sample = pred_original_sample.clamp(
        //         -self.config.clip_sample_range, self.config.clip_sample_range
        //     )

        // "direction pointing to x_t" of formula (12)
        const float pred_sample_direction = sqrt_one_minus_alpha_prod_t_prev * pe_val;

        // x_{t-1} without "random noise" of formula (12)
        prev_sample_data[j] = sqrt_alpha_prod_t_prev * pos_val + pred_sample_direction;
    }

    std::map<std::string, ov::Tensor> result{{"latent", prev_sample}};

    return result;
}
187+
188+
std::vector<int64_t> DDIMScheduler::get_timesteps() const {
189+
return m_timesteps;
190+
}
191+
192+
// Reports the initial-noise sigma for this scheduler; DDIM always returns 1.0.
float DDIMScheduler::get_init_noise_sigma() const {
    constexpr float init_noise_sigma = 1.0f;
    return init_noise_sigma;
}
195+
196+
// Intentionally a no-op: this scheduler leaves `sample` untouched for every `inference_step`.
void DDIMScheduler::scale_model_input(ov::Tensor sample, size_t inference_step) {
}
199+
200+
} // namespace genai
201+
} // namespace ov
+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
// Copyright (C) 2023-2024 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
#pragma once
5+
6+
#include <list>
7+
#include <string>
8+
9+
#include "text2image/schedulers/types.hpp"
10+
#include "text2image/schedulers/ischeduler.hpp"
11+
12+
namespace ov {
13+
namespace genai {
14+
15+
class DDIMScheduler : public IScheduler {
16+
public:
17+
struct Config {
18+
int32_t num_train_timesteps = 1000;
19+
float beta_start = 0.0001f, beta_end = 0.02f;
20+
BetaSchedule beta_schedule = BetaSchedule::SCALED_LINEAR;
21+
std::vector<float> trained_betas = {};
22+
bool clip_sample = true, set_alpha_to_one = true;
23+
size_t steps_offset = 0;
24+
PredictionType prediction_type = PredictionType::EPSILON;
25+
bool thresholding = false;
26+
float dynamic_thresholding_ratio = 0.995f, clip_sample_range = 1.0f, sample_max_value = 1.0f;
27+
TimestepSpacing timestep_spacing = TimestepSpacing::LEADING;
28+
bool rescale_betas_zero_snr = false;
29+
30+
Config() = default;
31+
explicit Config(const std::string& scheduler_config_path);
32+
};
33+
34+
explicit DDIMScheduler(const std::string scheduler_config_path);
35+
explicit DDIMScheduler(const Config& scheduler_config);
36+
37+
void set_timesteps(size_t num_inference_steps) override;
38+
39+
std::vector<std::int64_t> get_timesteps() const override;
40+
41+
float get_init_noise_sigma() const override;
42+
43+
void scale_model_input(ov::Tensor sample, size_t inference_step) override;
44+
45+
std::map<std::string, ov::Tensor> step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step) override;
46+
47+
private:
48+
Config m_config;
49+
50+
std::vector<float> m_alphas_cumprod;
51+
float m_final_alpha_cumprod;
52+
53+
size_t m_num_inference_steps;
54+
std::vector<int64_t> m_timesteps;
55+
56+
};
57+
58+
} // namespace genai
59+
} // namespace ov

src/cpp/src/text2image/schedulers/lcm.cpp

+6-26
Original file line numberDiff line numberDiff line change
@@ -7,32 +7,9 @@
77
#include <iterator>
88

99
#include "text2image/schedulers/lcm.hpp"
10+
#include "utils.hpp"
11+
#include "text2image/numpy_utils.hpp"
1012

11-
namespace {
12-
13-
// https://gist.github.com/lorenzoriano/5414671
14-
template <typename T, typename U>
15-
std::vector<T> linspace(U start, U end, size_t num, bool endpoint = false) {
16-
std::vector<T> indices;
17-
if (num != 0) {
18-
if (num == 1)
19-
indices.push_back(static_cast<T>(start));
20-
else {
21-
if (endpoint)
22-
--num;
23-
24-
U delta = (end - start) / static_cast<U>(num);
25-
for (size_t i = 0; i < num; i++)
26-
indices.push_back(static_cast<T>(start + delta * i));
27-
28-
if (endpoint)
29-
indices.push_back(static_cast<T>(end));
30-
}
31-
}
32-
return indices;
33-
}
34-
35-
} // namespace
3613

3714
namespace ov {
3815
namespace genai {
@@ -86,12 +63,14 @@ LCMScheduler::LCMScheduler(const Config& scheduler_config)
8663
} else if (m_config.beta_schedule == BetaSchedule::SCALED_LINEAR) {
8764
float start = std::sqrt(m_config.beta_start);
8865
float end = std::sqrt(m_config.beta_end);
66+
67+
using numpy_utils::linspace;
8968
std::vector<float> temp = linspace<float, float>(start, end, m_config.num_train_timesteps, true);
9069
for (float b : temp) {
9170
betas.push_back(b * b);
9271
}
9372
} else {
94-
OPENVINO_THROW("'beta_schedule' must be one of 'EPSILON' or 'SCALED_LINEAR'");
73+
OPENVINO_THROW("'beta_schedule' must be one of 'LINEAR' or 'SCALED_LINEAR'");
9574
}
9675

9776
for (float b : betas) {
@@ -127,6 +106,7 @@ void LCMScheduler::set_timesteps(size_t num_inference_steps) {
127106
// LCM Inference Steps Schedule
128107
std::reverse(lcm_origin_timesteps.begin(),lcm_origin_timesteps.end());
129108

109+
using numpy_utils::linspace;
130110
// v1. based on https://github.com/huggingface/diffusers/blame/2a7f43a73bda387385a47a15d7b6fe9be9c65eb2/src/diffusers/schedulers/scheduling_lcm.py#L387
131111
std::vector<size_t> inference_indices = linspace<size_t, float>(0, origin_timesteps_size, m_num_inference_steps);
132112
for (size_t i : inference_indices){

0 commit comments

Comments
 (0)