Commit fae9029
Text2ImagePipeline heterogeneous compile (openvinotoolkit#1768)

To simplify creation of a heterogeneous stable diffusion txt2image pipeline, this adds a new API to the `Text2ImagePipeline` class:

```cpp
/**
 * Compiles image generation pipeline for given devices for text encoding, denoising, and vae decoding.
 * @param text_encode_device A device to compile text encoder(s) with
 * @param denoise_device A device to compile denoiser (e.g. UNet, SD3 Transformer, etc.) with
 * @param vae_decode_device A device to compile VAE decoder(s) with
 * @param properties A map of properties which affect models compilation
 * @note If pipeline was compiled before, an exception is thrown.
 */
void compile(const std::string& text_encode_device,
             const std::string& denoise_device,
             const std::string& vae_decode_device,
             const ov::AnyMap& properties = {});
```

(Feedback welcome here, especially on whether we technically need three sets of properties, one per device.)

This API greatly simplifies heterogeneous pipeline setup to this:

```cpp
ov::genai::Text2ImagePipeline pipe(models_path);
pipe.reshape(1, width, height, pipe.get_generation_config().guidance_scale);
pipe.compile(text_encoder_device, unet_device, vae_decoder_device);
```

With these changes, the heterogeneous stable diffusion sample can support all variants of stable diffusion (SD1.5, LCM, XL, SD3, etc.) with the same code. With the old method (creating sub-components and assembling the pipeline object), this would have been difficult to achieve.

This PR is tested and working with the following pipelines (with NPU running denoise):

* SD1.5 / LCM
* SDXL

TODO:

* ~~Add Python bindings for the new API~~
* ~~Update Python heterogeneous sample~~

**FUTURE WORK** (outside the scope of this PR):

* Add support for SD3 (this will be a separate PR)
  * In general, this requires fixes for openvinotoolkit/openvino#29113
  * There is also some weirdness in the current reshape() path that I need to figure out.
  * For NPU, this requires a 'batch 1' implementation for Transformer2D, similar to what we did for UNet.
* Add support for FLUX (this will be a separate PR)
* Add an equivalent API for IMAGE2IMAGE / INPAINTING (separate PRs)

---------

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
1 parent 10bfc72 · commit fae9029

15 files changed: +255 −130 lines
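Since the commit message shows only the C++ flow, here is a minimal sketch of the same flow through the Python bindings added by this PR; the model path and device names are illustrative placeholders, not part of the change:

```python
import openvino_genai

# Minimal sketch of the new heterogeneous compile flow via the Python bindings.
# The model path and device names below are illustrative placeholders.
pipe = openvino_genai.Text2ImagePipeline("./stable-diffusion-v1-5")
pipe.reshape(1, 512, 512, pipe.get_generation_config().guidance_scale)
pipe.compile("CPU", "NPU", "GPU", config={"CACHE_DIR": "./cache"})
```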

samples/cpp/image_generation/heterogeneous_stable_diffusion.cpp (+17 −55)

```diff
@@ -18,7 +18,6 @@ int32_t main(int32_t argc, char* argv[]) try {
 
     const int width = 512;
     const int height = 512;
-    const float guidance_scale = 7.5f;
     const int number_of_images_to_generate = 1;
     const int number_of_inference_steps_per_image = 20;
 
@@ -37,73 +36,36 @@ int32_t main(int32_t argc, char* argv[]) try {
     std::string ov_cache_dir = "./cache";
 
     //
-    // Step 1: Prepare each Text2Image subcomponent (scheduler, text encoder, unet, vae) separately.
+    // Step 1: Create the initial Text2ImagePipeline, given the model path
     //
+    ov::genai::Text2ImagePipeline pipe(models_path);
 
-    // Create the scheduler from the details listed in the json.
-    auto scheduler = ov::genai::Scheduler::from_config(root_dir / "scheduler/scheduler_config.json");
-
-    // Note that we could have created the scheduler by specifying specific type (for example EULER_DISCRETE), like
-    // this: auto scheduler = ov::genai::Scheduler::from_config(root_dir / "scheduler/scheduler_config.json",
-    //                                                          ov::genai::Scheduler::Type::EULER_DISCRETE);
-    // This can be useful when a particular type of Scheduler is not yet supported natively by OpenVINO GenAI.
-    // (even though we are actively working to support most commonly used ones)
-
-    // Create unet object
-    auto unet = ov::genai::UNet2DConditionModel(root_dir / "unet");
-
-    // Set batch size based on classifier free guidance condition.
-    int unet_batch_size = unet.do_classifier_free_guidance(guidance_scale) ? 2 : 1;
-
-    // Create the text encoder.
-    auto text_encoder = ov::genai::CLIPTextModel(root_dir / "text_encoder");
-
-    // In case of NPU, we need to reshape the model to have static shapes
-    if (text_encoder_device == "NPU") {
-        text_encoder.reshape(unet_batch_size);
-    }
-
-    // Compile text encoder for the specified device
-    text_encoder.compile(text_encoder_device, ov::cache_dir(ov_cache_dir));
-
-    // In case of NPU, we need to reshape the model to have static shapes
-    if (unet_device == "NPU") {
-        // The max_postiion_embeddings config from text encoder will be used as a parameter to unet reshape.
-        int max_position_embeddings = text_encoder.get_config().max_position_embeddings;
-
-        unet.reshape(unet_batch_size, height, width, max_position_embeddings);
-    }
-
-    // Compile unet for specified device
-    unet.compile(unet_device, ov::cache_dir(ov_cache_dir));
+    //
+    // Step 2: Reshape the pipeline given number of images, width, height, and guidance scale.
+    //
+    pipe.reshape(1, width, height, pipe.get_generation_config().guidance_scale);
 
-    // Create the vae decoder.
-    auto vae = ov::genai::AutoencoderKL(root_dir / "vae_decoder");
+    //
+    // Step 3: Compile the pipeline with the specified devices, and properties (like cache dir)
+    //
+    ov::AnyMap properties = {ov::cache_dir(ov_cache_dir)};
 
-    // In case of NPU, we need to reshape the model to have static shapes
-    if (vae_decoder_device == "NPU") {
-        // We set batch-size to '1' here, as we're configuring our pipeline to return 1 image per 'generate' call.
-        vae.reshape(1, height, width);
-    }
+    // Note that if there are device-specific properties that are needed, they can
+    // be added using ov::device::properties groups, like this:
+    //ov::AnyMap properties = {ov::device::properties("CPU", ov::cache_dir("cpu_cache")),
+    //                         ov::device::properties("GPU", ov::cache_dir("gpu_cache")),
+    //                         ov::device::properties("NPU", ov::cache_dir("npu_cache"))};
 
-    // Compile vae decoder for the specified device
-    vae.compile(vae_decoder_device, ov::cache_dir(ov_cache_dir));
+    pipe.compile(text_encoder_device, unet_device, vae_decoder_device, properties);
 
-    //
-    // Step 2: Create a Text2ImagePipeline from the individual subcomponents
-    //
-    auto pipe = ov::genai::Text2ImagePipeline::stable_diffusion(scheduler, text_encoder, unet, vae);
 
     //
-    // Step 3: Use the Text2ImagePipeline to generate 'number_of_images_to_generate' images.
+    // Step 4: Use the Text2ImagePipeline to generate 'number_of_images_to_generate' images.
     //
     for (int imagei = 0; imagei < number_of_images_to_generate; imagei++) {
         std::cout << "Generating image " << imagei << std::endl;
 
         ov::Tensor image = pipe.generate(prompt,
-                                         ov::genai::width(width),
-                                         ov::genai::height(height),
-                                         ov::genai::guidance_scale(guidance_scale),
                                          ov::genai::num_inference_steps(number_of_inference_steps_per_image),
                                          ov::genai::callback(progress_bar));
 
```
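For context, the per-device properties variant that the updated sample leaves commented out would look like this when enabled; a minimal sketch, assuming placeholder device names and cache paths:

```cpp
// Minimal sketch of per-device property groups with the new compile() API.
// Device names and cache directories are illustrative placeholders.
ov::AnyMap properties = {ov::device::properties("CPU", ov::cache_dir("cpu_cache")),
                         ov::device::properties("GPU", ov::cache_dir("gpu_cache")),
                         ov::device::properties("NPU", ov::cache_dir("npu_cache"))};

// Each stage picks up the property group matching the device it compiles on.
pipe.compile("CPU", "NPU", "GPU", properties);
```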

samples/python/image_generation/heterogeneous_stable_diffusion.py (+21 −53)

```diff
@@ -23,7 +23,6 @@ def main():
 
     width = 512
     height = 512
-    guidance_scale = 7.5
     number_of_images_to_generate = 1
     number_of_inference_steps_per_image = 20
 
@@ -36,72 +35,41 @@ def main():
     ov_cache_dir = "./cache"
 
     #
-    # Step 1: Prepare each Text2Image subcomponent (scheduler, text encoder, unet, vae) separately.
+    # Step 1: Create the initial Text2ImagePipeline, given the model path
     #
+    pipe = openvino_genai.Text2ImagePipeline(args.model_dir)
 
-    # Create the scheduler from the details listed in the json.
-    scheduler = openvino_genai.Scheduler.from_config(args.model_dir + "/scheduler/scheduler_config.json")
-
-    # Note that we can also create the scheduler by specifying specific type (for example EULER_DISCRETE), like this:
-    # scheduler = openvino_genai.Scheduler.from_config(args.model_dir + "/scheduler/scheduler_config.json",
-    #                                                  openvino_genai.Scheduler.Type.EULER_DISCRETE)
-    # This can be useful when a particular type of Scheduler is not yet supported natively by OpenVINO GenAI.
-    # (even though we are actively working to support most commonly used ones)
-
-    # Create unet object
-    unet = openvino_genai.UNet2DConditionModel(args.model_dir + "/unet")
-
-    # Set batch size based on classifier free guidance condition.
-    unet_batch_size = 2 if unet.do_classifier_free_guidance(guidance_scale) else 1
-
-    # Create the text encoder
-    text_encoder = openvino_genai.CLIPTextModel(args.model_dir + "/text_encoder")
-
-    # In case of NPU, we need to reshape the model to have static shapes
-    if args.text_encoder_device == "NPU":
-        text_encoder.reshape(unet_batch_size)
-
-    # Compile text encoder for the specified device
-    text_encoder.compile(args.text_encoder_device, CACHE_DIR=ov_cache_dir)
-
-    # In case of NPU, we need to reshape the unet model to have static shapes
-    if args.unet_device == "NPU":
-        # The max_postion_embeddings config from text encoder will be used as a parameter to unet reshape.
-        max_position_embeddings = text_encoder.get_config().max_position_embeddings
-
-        unet.reshape(unet_batch_size, height, width, max_position_embeddings)
-
-    # Compile unet for specified device
-    unet.compile(args.unet_device, CACHE_DIR=ov_cache_dir)
-
-    # Create the decoder
-    vae = openvino_genai.AutoencoderKL(args.model_dir + "/vae_decoder")
-
-    # In case of NPU, we need to reshape the vae model to have static shapes
-    if args.vae_decoder_device == "NPU":
-        vae.reshape(1, height, width)
-
-    # Compile vae decoder for the specified device
-    vae.compile(args.vae_decoder_device, CACHE_DIR=ov_cache_dir)
+    #
+    # Step 2: Reshape the pipeline given number of images, width, height, and guidance scale.
+    #
+    pipe.reshape(1, width, height, pipe.get_generation_config().guidance_scale)
 
     #
-    # Step 2: Create a Text2ImagePipeline from the individual subcomponents
+    # Step 3: Compile the pipeline given the specified devices, and properties (like cache dir)
     #
+    properties = {"CACHE_DIR": "cache"}
+
+    # Note that if there are device-specific properties that are needed, they can
+    # be added using a "DEVICE_PROPERTIES" entry, like this:
+    #properties = {
+    #    "DEVICE_PROPERTIES":
+    #    {
+    #        "CPU": {"CACHE_DIR": "cpu_cache"},
+    #        "GPU": {"CACHE_DIR": "gpu_cache"},
+    #        "NPU": {"CACHE_DIR": "npu_cache"}
+    #    }
+    #}
 
-    pipe = openvino_genai.Text2ImagePipeline.stable_diffusion(scheduler, text_encoder, unet, vae)
+    pipe.compile(args.text_encoder_device, args.unet_device, args.vae_decoder_device, config=properties)
 
     #
-    # Step 3: Use the Text2ImagePipeline to generate 'number_of_images_to_generate' images.
+    # Step 4: Use the Text2ImagePipeline to generate 'number_of_images_to_generate' images.
     #
 
     for imagei in range(0, number_of_images_to_generate):
         image_tensor = pipe.generate(
             args.prompt,
-            width=width,
-            height=height,
-            guidance_scale=guidance_scale,
             num_inference_steps=number_of_inference_steps_per_image,
-            num_images_per_prompt=1
         )
 
         image = Image.fromarray(image_tensor.data[0])
```
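Similarly, the Python sample's commented-out per-device alternative, written out in active form (cache paths are placeholders):

```python
# Per-device property groups via the "DEVICE_PROPERTIES" entry described in
# the sample's comments. Cache directories here are illustrative placeholders.
properties = {
    "DEVICE_PROPERTIES": {
        "CPU": {"CACHE_DIR": "cpu_cache"},
        "GPU": {"CACHE_DIR": "gpu_cache"},
        "NPU": {"CACHE_DIR": "npu_cache"},
    }
}

pipe.compile(args.text_encoder_device, args.unet_device, args.vae_decoder_device, config=properties)
```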

src/cpp/include/openvino/genai/image_generation/clip_text_model.hpp (+2)

```diff
@@ -95,6 +95,8 @@ class OPENVINO_GENAI_EXPORTS CLIPTextModel {
     std::shared_ptr<ov::Model> m_model;
 
     Tokenizer m_clip_tokenizer;
+
+    bool m_slice_batch1_output = false;
 };
 
 }  // namespace genai
```

src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp (+2)

```diff
@@ -95,6 +95,8 @@ class OPENVINO_GENAI_EXPORTS CLIPTextModelWithProjection {
     std::shared_ptr<ov::Model> m_model;
 
     Tokenizer m_clip_tokenizer;
+
+    bool m_slice_batch1_output = false;
 };
 
 }  // namespace genai
```

src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp (+23)

```diff
@@ -192,13 +192,36 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline {
      */
     void compile(const std::string& device, const ov::AnyMap& properties = {});
 
+    /**
+     * Compiles image generation pipeline for given devices for text encoding, denoising, and vae decoding.
+     * @param text_encode_device A device to compile text encoder(s) with
+     * @param denoise_device A device to compile denoiser (e.g. UNet, SD3 Transformer, etc.) with
+     * @param vae_device A device to compile VAE decoder(s) with
+     * @param properties A map of properties which affect models compilation
+     * @note If pipeline was compiled before, an exception is thrown.
+     */
+    void compile(const std::string& text_encode_device,
+                 const std::string& denoise_device,
+                 const std::string& vae_device,
+                 const ov::AnyMap& properties = {});
+
     template <typename... Properties>
     ov::util::EnableIfAllStringAny<void, Properties...> compile(
             const std::string& device,
             Properties&&... properties) {
         return compile(device, ov::AnyMap{std::forward<Properties>(properties)...});
     }
 
+    template <typename... Properties>
+    ov::util::EnableIfAllStringAny<void, Properties...> compile(const std::string& text_encode_device,
+                                                                const std::string& denoise_device,
+                                                                const std::string& vae_device,
+                                                                Properties&&... properties) {
+        return compile(text_encode_device,
+                       denoise_device,
+                       vae_device, ov::AnyMap{std::forward<Properties>(properties)...});
+    }
+
     /**
      * Generates image(s) based on prompt and other image generation parameters
      * @param positive_prompt Prompt to generate image(s) from
```
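The new variadic template overload mirrors the existing single-device one, so properties can be passed inline without building an ov::AnyMap first; a minimal usage sketch, with placeholder devices and cache path:

```cpp
// Inline properties via the new variadic overload (devices are placeholders).
ov::genai::Text2ImagePipeline pipe(models_path);
pipe.reshape(1, 512, 512, pipe.get_generation_config().guidance_scale);
pipe.compile("CPU", "NPU", "GPU", ov::cache_dir("./cache"));
```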

src/cpp/src/image_generation/diffusion_pipeline.hpp (+9 −1)

```diff
@@ -100,7 +100,15 @@ class DiffusionPipeline {
 
     virtual void reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale) = 0;
 
-    virtual void compile(const std::string& device, const ov::AnyMap& properties) = 0;
+    virtual void compile(const std::string& device, const ov::AnyMap& properties)
+    {
+        compile(device, device, device, properties);
+    }
+
+    virtual void compile(const std::string& text_encode_device,
+                         const std::string& denoise_device,
+                         const std::string& vae_device,
+                         const ov::AnyMap& properties) = 0;
 
     virtual std::tuple<ov::Tensor, ov::Tensor, ov::Tensor, ov::Tensor> prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) = 0;
```
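With this default, the existing single-device compile() funnels into the new three-device pure virtual, so each concrete pipeline only has to implement the latter. A hypothetical sketch of such an override follows; the member names (m_clip_text_encoder, m_unet, m_vae) are assumptions for illustration, not the exact fields used by the pipelines changed in this PR:

```cpp
// Hypothetical derived-pipeline override (member names are assumed here):
// each stage is compiled on its own device with the shared property map.
void compile(const std::string& text_encode_device,
             const std::string& denoise_device,
             const std::string& vae_device,
             const ov::AnyMap& properties) override {
    m_clip_text_encoder->compile(text_encode_device, properties);  // text encoding stage
    m_unet->compile(denoise_device, properties);                   // denoising stage
    m_vae->compile(vae_device, properties);                        // VAE decoding stage
}
```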

src/cpp/src/image_generation/flux_pipeline.hpp (+7)

```diff
@@ -256,6 +256,13 @@ class FluxPipeline : public DiffusionPipeline {
         m_transformer->compile(device, *updated_properties);
     }
 
+    void compile(const std::string& text_encode_device,
+                 const std::string& denoise_device,
+                 const std::string& vae_device,
+                 const ov::AnyMap& properties) override {
+        OPENVINO_THROW("not supported yet.");
+    }
+
     void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config) override {
         // encode_prompt
         std::string prompt_2_str = generation_config.prompt_2 != std::nullopt ? *generation_config.prompt_2 : positive_prompt;
```

src/cpp/src/image_generation/models/clip_text_model.cpp (+32 −4)

```diff
@@ -124,12 +124,26 @@ ov::Tensor CLIPTextModel::infer(const std::string& pos_prompt, const std::string
         }
     };
 
+    ov::PartialShape compiled_input_partial_shape = m_request.get_compiled_model().inputs()[0].get_partial_shape();
+
     ov::Tensor input_ids = m_request.get_input_tensor();
-    input_ids.set_shape({text_embedding_batch_size, m_config.max_position_embeddings});
+
+    if (compiled_input_partial_shape.is_dynamic()) {
+        input_ids.set_shape({text_embedding_batch_size, m_config.max_position_embeddings});
+    } else {
+        auto compiled_input_shape = input_ids.get_shape();
+        OPENVINO_ASSERT(compiled_input_shape.size() == 2, "CLIP text encoder model input must have rank of 2");
+        OPENVINO_ASSERT(text_embedding_batch_size <= compiled_input_shape[0],
+                        "text_embedding_batch_size (", text_embedding_batch_size,
+                        ") > CLIP text encoder model batch size (", compiled_input_shape[0], ").");
+        OPENVINO_ASSERT(m_config.max_position_embeddings == compiled_input_shape[1],
+                        "max_position_embeddings (", m_config.max_position_embeddings,
+                        ") != what CLIP text encoder model was compiled for (", compiled_input_shape[1], ").");
+    }
 
     size_t current_batch_idx = 0;
 
-    if (do_classifier_free_guidance) {
+    if (input_ids.get_shape()[0] == 2) {
         perform_tokenization(neg_prompt,
                              ov::Tensor(input_ids, {current_batch_idx, 0},
                                         {current_batch_idx + 1, m_config.max_position_embeddings}));
@@ -145,11 +159,25 @@ ov::Tensor CLIPTextModel::infer(const std::string& pos_prompt, const std::string
     // text embeddings
     m_request.infer();
 
-    return m_request.get_output_tensor(0);
+    // This is true when text_embedding_batch_size is 1, but model was reshaped / compiled as batch size 2.
+    m_slice_batch1_output = (text_embedding_batch_size != input_ids.get_shape()[0]);
+
+    return get_output_tensor(0);
 }
 
 ov::Tensor CLIPTextModel::get_output_tensor(const size_t idx) {
-    return m_request.get_output_tensor(idx);
+    auto infer_out_tensor = m_request.get_output_tensor(idx);
+    if (m_slice_batch1_output) {
+        // Slice and return batch index 1 output.
+        auto out_shape = infer_out_tensor.get_shape();
+        auto begin_coord = ov::Coordinate(out_shape.size(), 0);
+        begin_coord[0] = 1;
+        auto end_coord = ov::Coordinate(out_shape);
+        auto sliced_out_tensor = ov::Tensor(infer_out_tensor, begin_coord, end_coord);
+        return sliced_out_tensor;
+    } else {
+        return infer_out_tensor;
+    }
 }
 
 }  // namespace genai
```
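The batch-1 slice in get_output_tensor() relies on OpenVINO's zero-copy ROI tensor constructor. A standalone sketch of that mechanism, with illustrative shapes:

```cpp
#include <openvino/runtime/tensor.hpp>

// Given a batch-2 embedding tensor (e.g. shape {2, 77, 768}), return a view
// of batch index 1 only (shape {1, 77, 768}) without copying data. This
// mirrors the slicing done above when m_slice_batch1_output is set.
ov::Tensor slice_batch1(const ov::Tensor& out) {
    ov::Shape shape = out.get_shape();
    ov::Coordinate begin(shape.size(), 0);
    begin[0] = 1;                        // start at batch index 1
    ov::Coordinate end(shape);           // end at the full extents
    return ov::Tensor(out, begin, end);  // ROI view over the same memory
}
```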
