|
| 1 | +// Copyright (C) 2023-2024 Intel Corporation |
| 2 | +// SPDX-License-Identifier: Apache-2.0 |
| 3 | + |
#include "text2image/schedulers/ddim.hpp"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <fstream>
#include <iterator>
#include <numeric>
#include <random>
#include <string>
#include <vector>

#include "text2image/numpy_utils.hpp"
#include "utils.hpp"
| 12 | + |
| 13 | +namespace ov { |
| 14 | +namespace genai { |
| 15 | + |
| 16 | +DDIMScheduler::Config::Config(const std::string& scheduler_config_path) { |
| 17 | + std::ifstream file(scheduler_config_path); |
| 18 | + OPENVINO_ASSERT(file.is_open(), "Failed to open ", scheduler_config_path); |
| 19 | + |
| 20 | + nlohmann::json data = nlohmann::json::parse(file); |
| 21 | + using utils::read_json_param; |
| 22 | + |
| 23 | + read_json_param(data, "num_train_timesteps", num_train_timesteps); |
| 24 | + read_json_param(data, "beta_start", beta_start); |
| 25 | + read_json_param(data, "beta_end", beta_end); |
| 26 | + read_json_param(data, "beta_schedule", beta_schedule); |
| 27 | + read_json_param(data, "trained_betas", trained_betas); |
| 28 | + read_json_param(data, "clip_sample", clip_sample); |
| 29 | + read_json_param(data, "set_alpha_to_one", set_alpha_to_one); |
| 30 | + read_json_param(data, "steps_offset", steps_offset); |
| 31 | + read_json_param(data, "prediction_type", prediction_type); |
| 32 | + read_json_param(data, "thresholding", thresholding); |
| 33 | + read_json_param(data, "dynamic_thresholding_ratio", dynamic_thresholding_ratio); |
| 34 | + read_json_param(data, "clip_sample_range", clip_sample_range); |
| 35 | + read_json_param(data, "sample_max_value", sample_max_value); |
| 36 | + read_json_param(data, "timestep_spacing", timestep_spacing); |
| 37 | + read_json_param(data, "rescale_betas_zero_snr", rescale_betas_zero_snr); |
| 38 | +} |
| 39 | + |
| 40 | +DDIMScheduler::DDIMScheduler(const std::string scheduler_config_path) |
| 41 | + : DDIMScheduler(Config(scheduler_config_path)) { |
| 42 | +} |
| 43 | + |
| 44 | +DDIMScheduler::DDIMScheduler(const Config& scheduler_config) |
| 45 | + : m_config(scheduler_config) { |
| 46 | + |
| 47 | + std::vector<float> alphas, betas; |
| 48 | + |
| 49 | + using numpy_utils::linspace; |
| 50 | + |
| 51 | + if (!m_config.trained_betas.empty()) { |
| 52 | + betas = m_config.trained_betas; |
| 53 | + } else if (m_config.beta_schedule == BetaSchedule::LINEAR) { |
| 54 | + betas = linspace<float>(m_config.beta_start, m_config.beta_end, m_config.num_train_timesteps); |
| 55 | + } else if (m_config.beta_schedule == BetaSchedule::SCALED_LINEAR) { |
| 56 | + float start = std::sqrt(m_config.beta_start); |
| 57 | + float end = std::sqrt(m_config.beta_end); |
| 58 | + betas = linspace<float>(start, end, m_config.num_train_timesteps); |
| 59 | + std::for_each(betas.begin(), betas.end(), [] (float & x) { x *= x; }); |
| 60 | + } else { |
| 61 | + OPENVINO_THROW("'beta_schedule' must be one of 'LINEAR' or 'SCALED_LINEAR'. Please, add support of other types"); |
| 62 | + } |
| 63 | + |
| 64 | + // TODO: Rescale for zero SNR |
| 65 | + // if (m_config.rescale_betas_zero_snr) {betas = rescale_zero_terminal_snr(betas)} |
| 66 | + |
| 67 | + std::transform(betas.begin(), betas.end(), std::back_inserter(alphas), [] (float b) { return 1.0f - b; }); |
| 68 | + |
| 69 | + for (size_t i = 1; i <= alphas.size(); i++) { |
| 70 | + float alpha_cumprod = |
| 71 | + std::accumulate(std::begin(alphas), std::begin(alphas) + i, 1.0, std::multiplies<float>{}); |
| 72 | + m_alphas_cumprod.push_back(alpha_cumprod); |
| 73 | + } |
| 74 | + |
| 75 | + m_final_alpha_cumprod = m_config.set_alpha_to_one ? 1 : m_alphas_cumprod[0]; |
| 76 | +} |
| 77 | + |
| 78 | +void DDIMScheduler::set_timesteps(size_t num_inference_steps) { |
| 79 | + m_timesteps.clear(); |
| 80 | + |
| 81 | + OPENVINO_ASSERT(num_inference_steps <= m_config.num_train_timesteps, |
| 82 | + "`num_inference_steps` cannot be larger than `m_config.num_train_timesteps`"); |
| 83 | + |
| 84 | + m_num_inference_steps = num_inference_steps; |
| 85 | + |
| 86 | + switch (m_config.timestep_spacing) { |
| 87 | + case TimestepSpacing::LINSPACE: |
| 88 | + { |
| 89 | + using numpy_utils::linspace; |
| 90 | + float end = static_cast<float>(m_config.num_train_timesteps - 1); |
| 91 | + auto linspaced = linspace<float>(0.0f, end, num_inference_steps, true); |
| 92 | + for (auto it = linspaced.rbegin(); it != linspaced.rend(); ++it) { |
| 93 | + m_timesteps.push_back(static_cast<int64_t>(std::round(*it))); |
| 94 | + } |
| 95 | + break; |
| 96 | + } |
| 97 | + case TimestepSpacing::LEADING: |
| 98 | + { |
| 99 | + size_t step_ratio = m_config.num_train_timesteps / m_num_inference_steps; |
| 100 | + for (size_t i = num_inference_steps - 1; i != -1; --i) { |
| 101 | + m_timesteps.push_back(i * step_ratio + m_config.steps_offset); |
| 102 | + } |
| 103 | + break; |
| 104 | + } |
| 105 | + case TimestepSpacing::TRAILING: |
| 106 | + { |
| 107 | + float step_ratio = static_cast<float>(m_config.num_train_timesteps) / static_cast<float>(m_num_inference_steps); |
| 108 | + for (float i = m_config.num_train_timesteps; i > 0; i-=step_ratio){ |
| 109 | + m_timesteps.push_back(static_cast<int64_t>(std::round(i)) - 1); |
| 110 | + } |
| 111 | + break; |
| 112 | + } |
| 113 | + default: |
| 114 | + OPENVINO_THROW("Unsupported value for 'timestep_spacing'"); |
| 115 | + } |
| 116 | +} |
| 117 | + |
| 118 | +std::map<std::string, ov::Tensor> DDIMScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step) { |
| 119 | + // noise_pred - model_output |
| 120 | + // latents - sample |
| 121 | + // inference_step |
| 122 | + |
| 123 | + size_t timestep = get_timesteps()[inference_step]; |
| 124 | + |
| 125 | + // get previous step value (=t-1) |
| 126 | + int prev_timestep = timestep - m_config.num_train_timesteps / m_num_inference_steps; |
| 127 | + |
| 128 | + // compute alphas, betas |
| 129 | + float alpha_prod_t = m_alphas_cumprod[timestep]; |
| 130 | + float alpha_prod_t_prev = (prev_timestep >= 0) ? m_alphas_cumprod[prev_timestep] : m_final_alpha_cumprod; |
| 131 | + float beta_prod_t = 1 - alpha_prod_t; |
| 132 | + |
| 133 | + // compute predicted original sample from predicted noise also called |
| 134 | + // "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf |
| 135 | + std::vector<float> pred_original_sample, pred_epsilon; |
| 136 | + float pos_val, pe_val; |
| 137 | + for (size_t j = 0; j < noise_pred.get_size(); j++) { |
| 138 | + switch (m_config.prediction_type) { |
| 139 | + case PredictionType::EPSILON: |
| 140 | + pos_val = (latents.data<float>()[j] - std::sqrt(beta_prod_t) * noise_pred.data<float>()[j]) / std::sqrt(alpha_prod_t); |
| 141 | + pe_val = noise_pred.data<float>()[j]; |
| 142 | + pred_original_sample.push_back(pos_val); |
| 143 | + pred_epsilon.push_back(pe_val); |
| 144 | + break; |
| 145 | + case PredictionType::SAMPLE: |
| 146 | + pos_val = noise_pred.data<float>()[j]; |
| 147 | + pe_val = (latents.data<float>()[j] - std::sqrt(alpha_prod_t) * pos_val) / std::sqrt(beta_prod_t); |
| 148 | + pred_original_sample.push_back(pos_val); |
| 149 | + pred_epsilon.push_back(pe_val); |
| 150 | + break; |
| 151 | + case PredictionType::V_PREDICTION: |
| 152 | + pos_val = std::sqrt(alpha_prod_t) * latents.data<float>()[j] - std::sqrt(beta_prod_t) * noise_pred.data<float>()[j]; |
| 153 | + pe_val = std::sqrt(alpha_prod_t) * noise_pred.data<float>()[j] + std::sqrt(beta_prod_t) * latents.data<float>()[j]; |
| 154 | + pred_original_sample.push_back(pos_val); |
| 155 | + pred_epsilon.push_back(pe_val); |
| 156 | + break; |
| 157 | + default: |
| 158 | + OPENVINO_THROW("Unsupported value for 'PredictionType'"); |
| 159 | + } |
| 160 | + } |
| 161 | + |
| 162 | + // TODO: Clip or threshold "predicted x_0" |
| 163 | + // if m_config.thresholding: |
| 164 | + // pred_original_sample = _threshold_sample(pred_original_sample) |
| 165 | + // elif m_config.clip_sample: |
| 166 | + // pred_original_sample = pred_original_sample.clamp( |
| 167 | + // -self.config.clip_sample_range, self.config.clip_sample_range |
| 168 | + // ) |
| 169 | + |
| 170 | + // compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf |
| 171 | + std::vector<float> pred_sample_direction(pred_epsilon.size()); |
| 172 | + std::transform(pred_epsilon.begin(), pred_epsilon.end(), pred_sample_direction.begin(), [alpha_prod_t_prev](auto x) { |
| 173 | + return std::sqrt(1 - alpha_prod_t_prev) * x; |
| 174 | + }); |
| 175 | + |
| 176 | + // compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf |
| 177 | + ov::Tensor prev_sample(latents.get_element_type(), latents.get_shape()); |
| 178 | + float* prev_sample_data = prev_sample.data<float>(); |
| 179 | + for (size_t i = 0; i < prev_sample.get_size(); ++i) { |
| 180 | + prev_sample_data[i] = std::sqrt(alpha_prod_t_prev) * pred_original_sample[i] + pred_sample_direction[i]; |
| 181 | + } |
| 182 | + |
| 183 | + std::map<std::string, ov::Tensor> result{{"latent", prev_sample}}; |
| 184 | + |
| 185 | + return result; |
| 186 | +} |
| 187 | + |
| 188 | +std::vector<int64_t> DDIMScheduler::get_timesteps() const { |
| 189 | + return m_timesteps; |
| 190 | +} |
| 191 | + |
| 192 | +float DDIMScheduler::get_init_noise_sigma() const { |
| 193 | + return 1.0f; |
| 194 | +} |
| 195 | + |
| 196 | +void DDIMScheduler::scale_model_input(ov::Tensor sample, size_t inference_step) { |
| 197 | + return; |
| 198 | +} |
| 199 | + |
| 200 | +} // namespace genai |
| 201 | +} // namespace ov |
0 commit comments