[Image Generation] Image2Image for FLUX (openvinotoolkit#1621)

likholat · ilya-lavrenov · web-flow · commit 6bdc704323a3 · 2025-01-24T06:18:06.000Z
![img2img_flux_lite](https://github.com/user-attachments/assets/00d860c9-5e1a-46c3-8403-12e47e20d6b3) ![img2img_flux_dev](https://github.com/user-attachments/assets/13b966f4-6753-45b9-9a3f-2d5f6928f895) ![img2img_flux_schnell](https://github.com/user-attachments/assets/b6b675f5-1e37-4390-a678-b69c77bedc61) --------- Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
diff --git a/SUPPORTED_MODELS.md b/SUPPORTED_MODELS.md
@@ -242,7 +242,7 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
       <tr>
       <td><code>Flux</code></td>
       <td>Supported</td>
-      <td>Not supported</td>
+      <td>Supported</td>
       <td>Not supported</td>
       <td>
         <ul>
diff --git a/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp b/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp
@@ -49,6 +49,14 @@ class OPENVINO_GENAI_EXPORTS Image2ImagePipeline {
         const UNet2DConditionModel& unet,
         const AutoencoderKL& vae);
 
+    // Creates a Flux image-to-image pipeline from separately constructed building blocks:
+    // a scheduler, a CLIP text encoder, a T5 encoder, a Flux transformer and a VAE.
+    // NOTE(review): `t5_encoder_model` is taken by value while the other model
+    // parameters are const references - confirm whether `const T5EncoderModel&`
+    // was intended (the definition must be changed together with this declaration).
+    static Image2ImagePipeline flux(
+        const std::shared_ptr<Scheduler>& scheduler,
+        const CLIPTextModel& clip_text_model,
+        const T5EncoderModel t5_encoder_model,
+        const FluxTransformer2DModel& transformer,
+        const AutoencoderKL& vae);
+
     ImageGenerationConfig get_generation_config() const;
     void set_generation_config(const ImageGenerationConfig& generation_config);
 
diff --git a/src/cpp/src/image_generation/flux_pipeline.hpp b/src/cpp/src/image_generation/flux_pipeline.hpp
diff --git a/src/cpp/src/image_generation/image2image_pipeline.cpp b/src/cpp/src/image_generation/image2image_pipeline.cpp
@@ -9,6 +9,7 @@
 
 #include "image_generation/stable_diffusion_pipeline.hpp"
 #include "image_generation/stable_diffusion_xl_pipeline.hpp"
+#include "image_generation/flux_pipeline.hpp"
 
 #include "utils.hpp"
 
@@ -22,6 +23,8 @@ Image2ImagePipeline::Image2ImagePipeline(const std::filesystem::path& root_dir)
         m_impl = std::make_shared<StableDiffusionPipeline>(PipelineType::IMAGE_2_IMAGE, root_dir);
     } else if (class_name == "StableDiffusionXLPipeline") {
         m_impl = std::make_shared<StableDiffusionXLPipeline>(PipelineType::IMAGE_2_IMAGE, root_dir);
+    } else if (class_name == "FluxPipeline") {
+        m_impl = std::make_shared<FluxPipeline>(PipelineType::IMAGE_2_IMAGE, root_dir);
     } else {
         OPENVINO_THROW("Unsupported image to image generation pipeline '", class_name, "'");
     }
@@ -34,6 +37,8 @@ Image2ImagePipeline::Image2ImagePipeline(const std::filesystem::path& root_dir,
         m_impl = std::make_shared<StableDiffusionPipeline>(PipelineType::IMAGE_2_IMAGE, root_dir, device, properties);
     } else if (class_name == "StableDiffusionXLPipeline") {
         m_impl = std::make_shared<StableDiffusionXLPipeline>(PipelineType::IMAGE_2_IMAGE, root_dir, device, properties);
+    } else if (class_name == "FluxPipeline") {
+        m_impl = std::make_shared<FluxPipeline>(PipelineType::IMAGE_2_IMAGE, root_dir, device, properties);
     } else {
         OPENVINO_THROW("Unsupported image to image generation pipeline '", class_name, "'");
     }
@@ -44,6 +49,8 @@ Image2ImagePipeline::Image2ImagePipeline(const InpaintingPipeline& pipe) {
         m_impl = std::make_shared<StableDiffusionXLPipeline>(PipelineType::IMAGE_2_IMAGE, *stable_diffusion_xl);
     } else if (auto stable_diffusion = std::dynamic_pointer_cast<StableDiffusionPipeline>(pipe.m_impl); stable_diffusion != nullptr) {
         m_impl = std::make_shared<StableDiffusionPipeline>(PipelineType::IMAGE_2_IMAGE, *stable_diffusion);
+    } else if (auto flux = std::dynamic_pointer_cast<FluxPipeline>(pipe.m_impl); flux != nullptr) {
+        m_impl = std::make_shared<FluxPipeline>(PipelineType::IMAGE_2_IMAGE, *flux);
     } else {
         OPENVINO_ASSERT("Cannot convert specified InpaintingPipeline to Image2ImagePipeline");
     }
@@ -94,6 +101,20 @@ Image2ImagePipeline Image2ImagePipeline::stable_diffusion_xl(
     return Image2ImagePipeline(impl);
 }
 
+// Builds a Flux image-to-image pipeline from already constructed components.
+// The components are forwarded to a FluxPipeline created with
+// PipelineType::IMAGE_2_IMAGE and the given scheduler is installed on it.
+// Throws (via OPENVINO_ASSERT) when `scheduler` is null.
+Image2ImagePipeline Image2ImagePipeline::flux(
+    const std::shared_ptr<Scheduler>& scheduler,
+    const CLIPTextModel& clip_text_model,
+    const T5EncoderModel t5_encoder_model,
+    const FluxTransformer2DModel& transformer,
+    const AutoencoderKL& vae) {
+    // A plain assert() is compiled out when NDEBUG is defined, so the check is
+    // done with OPENVINO_ASSERT to reject a null scheduler in every build type.
+    OPENVINO_ASSERT(scheduler != nullptr, "Scheduler must not be null");
+
+    auto impl = std::make_shared<FluxPipeline>(PipelineType::IMAGE_2_IMAGE, clip_text_model, t5_encoder_model, transformer, vae);
+    impl->set_scheduler(scheduler);
+
+    return Image2ImagePipeline(impl);
+}
+
 ImageGenerationConfig Image2ImagePipeline::get_generation_config() const {
     return m_impl->get_generation_config();
 }
diff --git a/src/cpp/src/image_generation/image_processor.cpp b/src/cpp/src/image_generation/image_processor.cpp
@@ -32,6 +32,7 @@ IImageProcessor::IImageProcessor(const std::string& device) :
 }
 
 ov::Tensor IImageProcessor::execute(ov::Tensor image) {
+    OPENVINO_ASSERT(m_request, "ImageProcessor model must be compiled first. Cannot infer non-compiled model");
     m_request.set_input_tensor(image);
     m_request.infer();
     return m_request.get_output_tensor();
@@ -124,6 +125,7 @@ ImageResizer::ImageResizer(const std::string& device, ov::element::Type type, ov
 }
 
 ov::Tensor ImageResizer::execute(ov::Tensor image, int64_t dst_height, int64_t dst_width) {
+    OPENVINO_ASSERT(m_request, "ImageResizer model must be compiled first. Cannot infer non-compiled model");
     ov::Tensor target_spatial_tensor(ov::element::i64, ov::Shape{2});
     target_spatial_tensor.data<int64_t>()[0] = dst_height;
     target_spatial_tensor.data<int64_t>()[1] = dst_width;
diff --git a/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp b/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp
@@ -208,7 +208,7 @@ std::map<std::string, ov::Tensor> EulerAncestralDiscreteScheduler::step(ov::Tens
     return {{"latent", prev_sample}, {"denoised", pred_original_sample}};
 }
 
-size_t EulerAncestralDiscreteScheduler::_index_for_timestep(int64_t timestep) const{
+size_t EulerAncestralDiscreteScheduler::_index_for_timestep(int64_t timestep) const {
     for (size_t i = 0; i < m_schedule_timesteps.size(); ++i) {
         if (timestep == m_schedule_timesteps[i]) {
             return i;
diff --git a/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.cpp b/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.cpp
@@ -146,6 +146,43 @@ void FlowMatchEulerDiscreteScheduler::add_noise(ov::Tensor init_latent, ov::Tens
     OPENVINO_THROW("Not implemented");
 }
 
+// Returns the position of `timestep` inside m_schedule_timesteps.
+// The lookup table is copied from m_timesteps the first time it is found empty.
+// The match is an exact float comparison; throws when no entry is equal.
+size_t FlowMatchEulerDiscreteScheduler::_index_for_timestep(float timestep) {
+    if (m_schedule_timesteps.empty()) {
+        m_schedule_timesteps = m_timesteps;
+    }
+
+    size_t position = 0;
+    for (const float candidate : m_schedule_timesteps) {
+        if (candidate == timestep) {
+            return position;
+        }
+        ++position;
+    }
+
+    OPENVINO_THROW("Failed to find index for timestep ", timestep);
+}
+
+// Blends `noise` into `sample` in place, element by element:
+//     sample = sigma * noise + (1 - sigma) * sample
+// sigma is read from m_sigmas at an index chosen from the begin index, the
+// current step index, or a lookup of `timestep` (see the branches below).
+// Both tensors are accessed as float data and must have the same element count.
+// Throws when the preconditions checked below are violated.
+void FlowMatchEulerDiscreteScheduler::scale_noise(ov::Tensor sample, float timestep, ov::Tensor noise) {
+    // NOTE(review): this requires timestep to equal -1, yet the first branch below
+    // passes that same value to _index_for_timestep(), which throws when the value
+    // is not part of the schedule - confirm whether `!=` was intended.
+    OPENVINO_ASSERT(timestep == -1, "Timestep is not computed yet");
+
+    // m_begin_index / m_step_index are size_t, so -1 converts to the maximum
+    // size_t value; it is apparently used as a "not set" marker here.
+    size_t index_for_timestep;
+    if (m_begin_index == -1) {
+        index_for_timestep = _index_for_timestep(timestep);
+    } else if (m_step_index != -1) {
+        index_for_timestep = m_step_index;
+    } else {
+        index_for_timestep = m_begin_index;
+    }
+
+    // Guard the sigma lookup: an index past the end would read out of bounds.
+    OPENVINO_ASSERT(index_for_timestep < m_sigmas.size(),
+                    "Sigma index ", index_for_timestep, " is out of range, number of sigmas is ", m_sigmas.size());
+    const float sigma = m_sigmas[index_for_timestep];
+
+    // The loop below reads one noise element per sample element, so a shorter
+    // noise tensor would be read past its end.
+    const size_t element_count = sample.get_size();
+    OPENVINO_ASSERT(noise.get_size() == element_count,
+                    "Noise and sample must contain the same number of elements");
+
+    float* sample_data = sample.data<float>();
+    const float* noise_data = noise.data<float>();
+
+    for (size_t i = 0; i < element_count; ++i) {
+        sample_data[i] = sigma * noise_data[i] + (1.0f - sigma) * sample_data[i];
+    }
+}
+
 void FlowMatchEulerDiscreteScheduler::set_timesteps_with_sigma(std::vector<float> sigma, float mu) {
     m_timesteps.clear();
     m_sigmas.clear();
@@ -184,5 +221,13 @@ float FlowMatchEulerDiscreteScheduler::calculate_shift(size_t image_seq_len) {
     return mu;
 }
 
+// Stores `begin_index` in m_begin_index; scale_noise() reads this member when
+// choosing which sigma to use.
+void FlowMatchEulerDiscreteScheduler::set_begin_index(size_t begin_index) {
+    this->m_begin_index = begin_index;
+}
+
+// Returns the value currently held in m_begin_index.
+size_t FlowMatchEulerDiscreteScheduler::get_begin_index() {
+    return this->m_begin_index;
+}
+
 }  // namespace genai
 }  // namespace ov
diff --git a/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp b/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp
@@ -42,20 +42,27 @@ class FlowMatchEulerDiscreteScheduler : public IScheduler {
 
     void add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t latent_timestep) const override;
 
+    void scale_noise(ov::Tensor sample, float timestep, ov::Tensor noise) override;
+
     float calculate_shift(size_t image_seq_len) override;
 
+    void set_begin_index(size_t begin_index) override;
+
+    size_t get_begin_index() override;
+
 private:
     Config m_config;
 
     std::vector<float> m_sigmas;
-    std::vector<float> m_timesteps;
+    std::vector<float> m_timesteps, m_schedule_timesteps;
 
     float m_sigma_min, m_sigma_max;
     size_t m_step_index, m_begin_index;
     size_t m_num_inference_steps;
 
     void init_step_index();
     double sigma_to_t(double simga);
+    size_t _index_for_timestep(float timestep);
 };
 
 } // namespace genai
diff --git a/src/cpp/src/image_generation/schedulers/ischeduler.hpp b/src/cpp/src/image_generation/schedulers/ischeduler.hpp
@@ -43,6 +43,16 @@ class IScheduler : public Scheduler {
     virtual std::vector<float> get_float_timesteps() const {
         OPENVINO_THROW("Scheduler doesn't support float timesteps");
     }
+
+    // Default implementation: always throws, so only schedulers that override
+    // this method can be used with `scale_noise`.
+    virtual void scale_noise(ov::Tensor sample, float timestep, ov::Tensor noise) {
+        OPENVINO_THROW("Scheduler doesn't support `scale_noise` method");
+    }
+
+    // Default implementation: does nothing and ignores `begin_index`.
+    // Note that, unlike get_begin_index() below, it does not throw.
+    virtual void set_begin_index(size_t begin_index) {};
+
+    // Default implementation: always throws, so only schedulers that override
+    // this method can report a begin index.
+    virtual size_t get_begin_index() {
+        OPENVINO_THROW("Scheduler doesn't support `get_begin_index` method");
+    }
 };
 
 } // namespace genai
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -774,6 +774,9 @@ class Image2ImagePipeline:
     This class is used for generation with image-to-image models.
     """
     @staticmethod
+    def flux(scheduler: Scheduler, clip_text_model: CLIPTextModel, t5_encoder_model: T5EncoderModel, transformer: FluxTransformer2DModel, vae: AutoencoderKL) -> Image2ImagePipeline:
+        ...
+    @staticmethod
     def latent_consistency_model(scheduler: Scheduler, clip_text_model: CLIPTextModel, unet: UNet2DConditionModel, vae: AutoencoderKL) -> Image2ImagePipeline:
         ...
     @staticmethod
diff --git a/src/python/py_image_generation_pipelines.cpp b/src/python/py_image_generation_pipelines.cpp
@@ -330,6 +330,7 @@ void init_image_generation_pipelines(py::module_& m) {
         .def_static("stable_diffusion", &ov::genai::Image2ImagePipeline::stable_diffusion, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae"))
         .def_static("latent_consistency_model", &ov::genai::Image2ImagePipeline::latent_consistency_model, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae"))
         .def_static("stable_diffusion_xl", &ov::genai::Image2ImagePipeline::stable_diffusion_xl, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("clip_text_model_with_projection"), py::arg("unet"), py::arg("vae"))
+        .def_static("flux", &ov::genai::Image2ImagePipeline::flux, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("t5_encoder_model"), py::arg("transformer"), py::arg("vae"))
         .def(
             "compile",
             [](ov::genai::Image2ImagePipeline& pipe,
diff --git a/tools/who_what_benchmark/tests/test_cli_image.py b/tools/who_what_benchmark/tests/test_cli_image.py
@@ -103,8 +103,11 @@ def test_image_model_types(model_id, model_type, backend):
                             ])),
 )
 def test_image_model_genai(model_id, model_type):
-    if ("flux" in model_id or "stable-diffusion-3" in model_id) and model_type != "text-to-image":
-        pytest.skip(reason="FLUX or SD3 are supported as text to image only")
+    if ("stable-diffusion-3" in model_id) and model_type != "text-to-image":
+        pytest.skip(reason="SD3 is supported as text to image only")
+
+    if ("flux" in model_id) and model_type == "image-inpainting":
+        pytest.skip(reason="FLUX is not yet supported as image inpainting")
 
     with tempfile.TemporaryDirectory() as temp_dir:
         GT_FILE = os.path.join(temp_dir, "gt.csv")

Original file line number	Diff line number	Diff line change
`@@ -32,6 +32,7 @@ IImageProcessor::IImageProcessor(const std::string& device) :`
`32`	`32`	`}`
`33`	`33`
`34`	`34`	`ov::Tensor IImageProcessor::execute(ov::Tensor image) {`
	`35`	`+ OPENVINO_ASSERT(m_request, "ImageProcessor model must be compiled first. Cannot infer non-compiled model");`
`35`	`36`	`m_request.set_input_tensor(image);`
`36`	`37`	`m_request.infer();`
`37`	`38`	`return m_request.get_output_tensor();`
`@@ -124,6 +125,7 @@ ImageResizer::ImageResizer(const std::string& device, ov::element::Type type, ov`
`124`	`125`	`}`
`125`	`126`
`126`	`127`	`ov::Tensor ImageResizer::execute(ov::Tensor image, int64_t dst_height, int64_t dst_width) {`
	`128`	`+ OPENVINO_ASSERT(m_request, "ImageResizer model must be compiled first. Cannot infer non-compiled model");`
`127`	`129`	`ov::Tensor target_spatial_tensor(ov::element::i64, ov::Shape{2});`
`128`	`130`	`target_spatial_tensor.data<int64_t>()[0] = dst_height;`
`129`	`131`	`target_spatial_tensor.data<int64_t>()[1] = dst_width;`
Original file line number	Diff line number	Diff line change
`@@ -208,7 +208,7 @@ std::map<std::string, ov::Tensor> EulerAncestralDiscreteScheduler::step(ov::Tens`
`208`	`208`	`return {{"latent", prev_sample}, {"denoised", pred_original_sample}};`
`209`	`209`	`}`
`210`	`210`
`211`		`-size_t EulerAncestralDiscreteScheduler::_index_for_timestep(int64_t timestep) const{`
	`211`	`+size_t EulerAncestralDiscreteScheduler::_index_for_timestep(int64_t timestep) const {`
`212`	`212`	`for (size_t i = 0; i < m_schedule_timesteps.size(); ++i) {`
`213`	`213`	`if (timestep == m_schedule_timesteps[i]) {`
`214`	`214`	`return i;`