apaniukov
diff --git a/‎.github/labeler.yml
+4-4 b/‎.github/labeler.yml
+4-4
diff --git a/‎.github/workflows/stable_diffusion_1_5_cpp.yml
+4-4 b/‎.github/workflows/stable_diffusion_1_5_cpp.yml
+4-4
diff --git a/‎README.md
+1-1 b/‎README.md
+1-1
diff --git a/‎samples/cpp/text2image/lora.cpp
+1-1 b/‎samples/cpp/text2image/lora.cpp
+1-1
diff --git a/‎samples/cpp/text2image/main.cpp
+1-1 b/‎samples/cpp/text2image/main.cpp
+1-1
diff --git a/‎samples/cpp/visual_language_chat/load_image.cpp
+1-1 b/‎samples/cpp/visual_language_chat/load_image.cpp
+1-1
diff --git a/‎src/cpp/include/openvino/genai/text2image/autoencoder_kl.hpp ‎src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp
+31-10 b/‎src/cpp/include/openvino/genai/text2image/autoencoder_kl.hpp ‎src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp
+31-10
diff --git a/‎src/cpp/include/openvino/genai/text2image/clip_text_model.hpp ‎src/cpp/include/openvino/genai/image_generation/clip_text_model.hpp b/‎src/cpp/include/openvino/genai/text2image/clip_text_model.hpp ‎src/cpp/include/openvino/genai/image_generation/clip_text_model.hpp
diff --git a/‎src/cpp/include/openvino/genai/text2image/clip_text_model_with_projection.hpp ‎src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp b/‎src/cpp/include/openvino/genai/text2image/clip_text_model_with_projection.hpp ‎src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp
diff --git a/‎src/cpp/include/openvino/genai/image_generation/generation_config.hpp
+103 b/‎src/cpp/include/openvino/genai/image_generation/generation_config.hpp
+103
diff --git a/‎src/cpp/include/openvino/genai/image_generation/scheduler.hpp
+31 b/‎src/cpp/include/openvino/genai/image_generation/scheduler.hpp
+31
diff --git a/‎src/cpp/include/openvino/genai/text2image/sd3_transformer_2d_model.hpp ‎src/cpp/include/openvino/genai/image_generation/sd3_transformer_2d_model.hpp
+1-10 b/‎src/cpp/include/openvino/genai/text2image/sd3_transformer_2d_model.hpp ‎src/cpp/include/openvino/genai/image_generation/sd3_transformer_2d_model.hpp
+1-10
@@ -117,10 +117,10 @@
 - 'pyproject.toml'
 
 'category: text to image':
-- 'src/include/openvino/genai/text2image/**/*'
-- 'src/cpp/src/text2image/**/*'
-- 'src/python/py_text2image_models.cpp'
-- 'src/python/py_text2image_pipeline.cpp'
+- 'src/cpp/include/openvino/genai/image_generation/**/*'  # public headers live under src/cpp/include (same root as the 'GenAI C++ API' glob below)
+- 'src/cpp/src/image_generation/**/*'
+- 'src/python/py_image_generation_models.cpp'
+- 'src/python/py_image_generation_pipelines.cpp'
 
 'category: GenAI C++ API':
 - 'src/cpp/include/openvino/genai/**/*'
 
@@ -77,15 +77,15 @@ jobs:
         run: |
           source ${{ env.OV_INSTALL_DIR }}/setupvars.sh
           ${{ env.build_dir }}/samples/cpp/text2image/lora_stable_diffusion ./models/dreamlike-art-dreamlike-anime-1.0/FP16 "curly-haired unicorn in the forest, anime, line" ./models/soulcard.safetensors 0.7
-      
+
       - name: Run Python main app
         run: |
           source openvino_sd_cpp/bin/activate
           source ./ov/setupvars.sh
           python ./samples/python/text2image/main.py ./models/dreamlike-art-dreamlike-anime-1.0/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting"
         env:
           PYTHONPATH: ${{ env.build_dir }}
-      
+
       - name: Run Python LoRA app
         run: |
           source openvino_sd_cpp/bin/activate
@@ -120,7 +120,7 @@ jobs:
         with:
           python-version: ${{ env.PYTHON_VERSION }}
           cache: 'pip'
-  
+
       - name: Build app
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
@@ -148,7 +148,7 @@ jobs:
           "${{ env.build_dir }}/samples/cpp/text2image/Release/stable_diffusion.exe ./models/dreamlike-art-dreamlike-anime-1.0/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting'"
         env:
           PATH: ${{ env.build_dir }}\openvino_genai
-  
+
       - name: Run LoRA app
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
 
@@ -192,7 +192,7 @@ def main():
 Code below requires installation of C++ compatible package (see [here](https://docs.openvino.ai/2024/get-started/install-openvino/install-openvino-genai.html#archive-installation) for additional setup details, or this blog for full instruction [How to Build OpenVINO™ GenAI APP in C++](https://medium.com/openvino-toolkit/how-to-build-openvino-genai-app-in-c-32dcbe42fa67))
 
 ```cpp
-#include "openvino/genai/text2image/pipeline.hpp"
+#include "openvino/genai/image_generation/text2image_pipeline.hpp"
 #include "imwrite.hpp"
 int main(int argc, char* argv[]) {
 
 
@@ -1,7 +1,7 @@
 // Copyright (C) 2023-2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
-#include "openvino/genai/text2image/pipeline.hpp"
+#include "openvino/genai/image_generation/text2image_pipeline.hpp"
 
 #include "imwrite.hpp"
 
 
@@ -1,7 +1,7 @@
 // Copyright (C) 2023-2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
-#include "openvino/genai/text2image/pipeline.hpp"
+#include "openvino/genai/image_generation/text2image_pipeline.hpp"
 
 #include "imwrite.hpp"
 
 
@@ -45,7 +45,7 @@ ov::Tensor utils::load_image(const std::filesystem::path& image_path) {
             if (channels * height * width != bytes) {
                 throw std::runtime_error{"Unexpected number of bytes was requested to deallocate."};
             }
-            std::free(image);
+            stbi_image_free(image);
             image = nullptr;
         }
         bool is_equal(const SharedImageAllocator& other) const noexcept {return this == &other;}
 
@@ -30,42 +30,63 @@ class OPENVINO_GENAI_EXPORTS AutoencoderKL {
         explicit Config(const std::filesystem::path& config_path);
     };
 
-    explicit AutoencoderKL(const std::filesystem::path& root_dir);
+    explicit AutoencoderKL(const std::filesystem::path& vae_decoder_path);
 
-    AutoencoderKL(const std::filesystem::path& root_dir,
+    AutoencoderKL(const std::filesystem::path& vae_encoder_path,
+                  const std::filesystem::path& vae_decoder_path);
+
+    AutoencoderKL(const std::filesystem::path& vae_decoder_path,
+                  const std::string& device,
+                  const ov::AnyMap& properties = {});
+
+    AutoencoderKL(const std::filesystem::path& vae_encoder_path,
+                  const std::filesystem::path& vae_decoder_path,
                   const std::string& device,
                   const ov::AnyMap& properties = {});
 
     template <typename... Properties,
               typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
-    AutoencoderKL(const std::filesystem::path& root_dir,
+    AutoencoderKL(const std::filesystem::path& vae_decoder_path,
+                  const std::string& device,
+                  Properties&&... properties)
+        : AutoencoderKL(vae_decoder_path, device, ov::AnyMap{std::forward<Properties>(properties)...}) { }
+        
+    template <typename... Properties,
+              typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
+    AutoencoderKL(const std::filesystem::path& vae_encoder_path,
+                  const std::filesystem::path& vae_decoder_path,
                   const std::string& device,
                   Properties&&... properties)
-        : AutoencoderKL(root_dir, device, ov::AnyMap{std::forward<Properties>(properties)...}) { }
+        : AutoencoderKL(vae_encoder_path, vae_decoder_path, device, ov::AnyMap{std::forward<Properties>(properties)...}) { }
 
     AutoencoderKL(const AutoencoderKL&);
 
     AutoencoderKL& reshape(int batch_size, int height, int width);
 
     AutoencoderKL& compile(const std::string& device, const ov::AnyMap& properties = {});
 
-    const Config& get_config() const;
-
     template <typename... Properties>
     ov::util::EnableIfAllStringAny<AutoencoderKL&, Properties...> compile(
             const std::string& device,
             Properties&&... properties) {
         return compile(device, ov::AnyMap{std::forward<Properties>(properties)...});
     }
 
-    ov::Tensor infer(ov::Tensor latent);
+    ov::Tensor decode(ov::Tensor latent);
+
+    ov::Tensor encode(ov::Tensor image);
+
+    const Config& get_config() const;
+
+    size_t get_vae_scale_factor() const;
 
 private:
-    void merge_vae_image_processor() const;
+    void merge_vae_image_pre_processing() const;
+    void merge_vae_image_post_processing() const;
 
     Config m_config;
-    ov::InferRequest m_request;
-    std::shared_ptr<ov::Model> m_model;
+    ov::InferRequest m_encoder_request, m_decoder_request;
+    std::shared_ptr<ov::Model> m_encoder_model = nullptr, m_decoder_model = nullptr;
 };
 
 } // namespace genai
 
@@ -0,0 +1,103 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <string>
+#include <random>
+#include <optional>
+
+#include "openvino/runtime/tensor.hpp"
+#include "openvino/runtime/properties.hpp"
+
+#include "openvino/genai/lora_adapter.hpp"
+#include "openvino/genai/visibility.hpp"
+
+namespace ov {
+namespace genai {
+
+//
+// Random generators
+//
+
+class OPENVINO_GENAI_EXPORTS Generator {  // abstract interface for the random generators declared in this header
+public:
+    virtual float next() = 0;  // pure virtual: each concrete generator supplies the next float value
+    virtual ~Generator();      // virtual destructor; only declared here, so its definition lives outside this header
+};
+
+class OPENVINO_GENAI_EXPORTS CppStdGenerator : public Generator {  // Generator implementation that owns a std::mt19937 engine
+public:
+    // creates 'std::mt19937' with initial 'seed'; NOTE(review): the 'normal' member is a std::normal_distribution, whose samples are not confined to [0.0f, 1.0f] - presumably mean 0 / stddev 1 is meant, confirm against next()
+    explicit CppStdGenerator(uint32_t seed);
+
+    virtual float next() override;  // overrides Generator::next(); the body is not part of this header
+private:
+    std::mt19937 gen;  // random engine; per the constructor comment it is initialised from 'seed'
+    std::normal_distribution<float> normal;  // distribution object; presumably sampled with 'gen' inside next() - TODO confirm
+};
+
+struct OPENVINO_GENAI_EXPORTS ImageGenerationConfig {  // plain aggregate of image generation parameters, each with an in-class default
+    // LCM: prompt only w/o negative prompt
+    // SD XL: prompt2 and negative_prompt2
+    // FLUX: prompt2 (prompt if prompt2 is not defined explicitly)
+    // SD 3: prompt2, prompt3 (with fallback to prompt) and negative_prompt2, negative_prompt3
+    std::string negative_prompt;
+    std::optional<std::string> prompt_2 = std::nullopt, prompt_3 = std::nullopt;
+    std::optional<std::string> negative_prompt_2 = std::nullopt, negative_prompt_3 = std::nullopt;
+
+    size_t num_images_per_prompt = 1;
+
+    // random generator to have deterministic results
+    std::shared_ptr<Generator> random_generator = std::make_shared<CppStdGenerator>(42);  // default: CppStdGenerator seeded with 42; NOTE(review): <memory> is not among this header's direct includes
+
+    // the following values depend on HF diffusers class used to perform generation
+    float guidance_scale = 7.5f;
+    int64_t height = -1;  // -1 presumably means "not set explicitly" - TODO confirm
+    int64_t width = -1;   // -1 presumably means "not set explicitly" - TODO confirm
+    size_t num_inference_steps = 50;
+
+    // used by some image to image pipelines to balance between noise and initial image
+    // higher 'strength' value means more noise is added to initial latent image
+    // for text to image pipeline it must be set to 1.0f
+    float strength = 1.0f;
+
+    std::optional<AdapterConfig> adapters;  // empty by default; AdapterConfig presumably comes from lora_adapter.hpp - TODO confirm
+
+    void update_generation_config(const ov::AnyMap& config_map);  // presumably overwrites the fields named in 'config_map' - confirm in the implementation
+
+    // checks whether the config is valid
+    void validate() const;  // declared only; how an invalid config is reported is not visible in this header
+
+    template <typename... Properties>
+    ov::util::EnableIfAllStringAny<void, Properties...> update_generation_config(Properties&&... properties) {  // convenience overload: packs 'properties' into an ov::AnyMap and forwards to the overload above
+        return update_generation_config(ov::AnyMap{std::forward<Properties>(properties)...});
+    }
+};
+
+//
+// Generation config properties
+//
+
+static constexpr ov::Property<std::string> prompt_2{"prompt_2"};  // each property below is named after the ImageGenerationConfig field with the same identifier
+static constexpr ov::Property<std::string> prompt_3{"prompt_3"};
+
+static constexpr ov::Property<std::string> negative_prompt{"negative_prompt"};
+static constexpr ov::Property<std::string> negative_prompt_2{"negative_prompt_2"};
+static constexpr ov::Property<std::string> negative_prompt_3{"negative_prompt_3"};
+
+static constexpr ov::Property<size_t> num_images_per_prompt{"num_images_per_prompt"};
+static constexpr ov::Property<float> guidance_scale{"guidance_scale"};
+static constexpr ov::Property<int64_t> height{"height"};
+static constexpr ov::Property<int64_t> width{"width"};
+static constexpr ov::Property<size_t> num_inference_steps{"num_inference_steps"};
+
+static constexpr ov::Property<float> strength{"strength"};
+
+static constexpr ov::Property<std::shared_ptr<Generator>> random_generator{"random_generator"};  // note: no property for the 'adapters' field is declared in this block
+
+OPENVINO_GENAI_EXPORTS  // exported declaration; the definition is not part of this header
+std::pair<std::string, ov::Any> generation_config(const ImageGenerationConfig& generation_config);  // returns a (name, value) pair for the given config - presumably so a whole config can be passed wherever properties are accepted; TODO confirm
+
+} // namespace genai
+} // namespace ov
@@ -0,0 +1,31 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <filesystem>
+
+#include "openvino/genai/visibility.hpp"
+
+namespace ov {
+namespace genai {
+
+class OPENVINO_GENAI_EXPORTS Scheduler {  // polymorphic scheduler handle; this header exposes only a factory, the type enum and a virtual destructor
+public:
+    enum Type {  // scheduler kinds selectable through from_config()
+        AUTO,  // default argument of from_config(); presumably "pick the type from the config file" - TODO confirm
+        LCM,
+        LMS_DISCRETE,
+        DDIM,
+        EULER_DISCRETE,
+        FLOW_MATCH_EULER_DISCRETE
+    };
+
+    static std::shared_ptr<Scheduler> from_config(const std::filesystem::path& scheduler_config_path,  // factory: takes a config file path and returns a shared Scheduler; NOTE(review): <memory> is not among this header's direct includes
+                                                  Type scheduler_type = AUTO);
+
+    virtual ~Scheduler();  // virtual destructor; only declared here, defined outside this header
+};
+
+} // namespace genai
+} // namespace ov
@@ -5,6 +5,7 @@
 
 #include <filesystem>
 #include <string>
+#include <vector>
 
 #include "openvino/core/any.hpp"
 #include "openvino/runtime/infer_request.hpp"
@@ -22,15 +23,7 @@ class OPENVINO_GENAI_EXPORTS SD3Transformer2DModel {
         size_t sample_size = 128;
         size_t patch_size = 2;
         size_t in_channels = 16;
-        size_t num_layers = 18;
-        size_t attention_head_dim = 64;
-        size_t num_attention_heads = 18;
         size_t joint_attention_dim = 4096;
-        size_t caption_projection_dim = 1152;
-        size_t pooled_projection_dim = 2048;
-        size_t out_channels = 16;
-        size_t pos_embed_max_size = 96;
-        std::vector<size_t> block_out_channels = { 128, 256, 512, 512 };
 
         explicit Config(const std::filesystem::path& config_path);
     };
@@ -64,8 +57,6 @@ class OPENVINO_GENAI_EXPORTS SD3Transformer2DModel {
 
     ov::Tensor infer(const ov::Tensor latent, const ov::Tensor timestep);
 
-    size_t get_vae_scale_factor() const;
-
 private:
     Config m_config;
     ov::InferRequest m_request;
Original file line number	Diff line number	Diff line change
`@@ -45,7 +45,7 @@ ov::Tensor utils::load_image(const std::filesystem::path& image_path) {`
`45`	`45`	`if (channels * height * width != bytes) {`
`46`	`46`	`throw std::runtime_error{"Unexpected number of bytes was requested to deallocate."};`
`47`	`47`	`}`
`48`		`- std::free(image);`
	`48`	`+ stbi_image_free(image);`
`49`	`49`	`image = nullptr;`
`50`	`50`	`}`
`51`	`51`	`bool is_equal(const SharedImageAllocator& other) const noexcept {return this == &other;}`