
Commit ed2baf4

text2image: Add pimpl approach for unet_2d_condition & batch-size 1 implementation of unet for initial NPU support (openvinotoolkit#1101)
For now, NPU can be used like this. The unet model must be reshaped to a static shape before compile is invoked.

```cpp
std::filesystem::path root_dir = models_path;

auto pipe = ov::genai::Text2ImagePipeline::stable_diffusion(
    ov::genai::Text2ImagePipeline::Scheduler::from_config(root_dir / "scheduler/scheduler_config.json"),
    ov::genai::CLIPTextModel(root_dir / "text_encoder", "CPU"),
    ov::genai::UNet2DConditionModel(root_dir / "unet")
        .reshape(2, 512, 512, 77)
        .compile("NPU", ov::cache_dir("./cache")),
    ov::genai::AutoencoderKL(root_dir / "vae_decoder", "GPU", ov::cache_dir("./cache")));

ov::Tensor image = pipe.generate(prompt);
```
2 parents e1430a5 + 91988ce commit ed2baf4

File tree

8 files changed: +439 −37 lines


samples/cpp/text2image/CMakeLists.txt (+19)

```diff
@@ -45,3 +45,22 @@ install(TARGETS lora_stable_diffusion
     RUNTIME DESTINATION samples_bin/
     COMPONENT samples_bin
     EXCLUDE_FROM_ALL)
+
+# create txt2image_from_subcomponent sample executable
+
+add_executable(txt2image_from_subcomponent
+    ${CMAKE_CURRENT_SOURCE_DIR}/txt2image_from_subcomponent.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/imwrite.cpp)
+
+target_include_directories(txt2image_from_subcomponent PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
+target_link_libraries(txt2image_from_subcomponent PRIVATE openvino::genai)
+
+set_target_properties(txt2image_from_subcomponent PROPERTIES
+    COMPILE_PDB_NAME txt2image_from_subcomponent
+    # Ensure out of box LC_RPATH on macOS with SIP
+    INSTALL_RPATH_USE_LINK_PATH ON)
+
+install(TARGETS txt2image_from_subcomponent
+    RUNTIME DESTINATION samples_bin/
+    COMPONENT samples_bin
+    EXCLUDE_FROM_ALL)
```

samples/cpp/text2image/README.md (+16 −1)

```diff
@@ -2,9 +2,10 @@
 
 Examples in this folder showcase inference of text to image models like Stable Diffusion 1.5, 2.1, LCM. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `ov::genai::Text2ImagePipeline` and uses a text prompt as input source.
 
-There are two sample files:
+There are three sample files:
 - [`main.cpp`](./main.cpp) demonstrates basic usage of the text to image pipeline
 - [`lora.cpp`](./lora.cpp) shows how to apply LoRA adapters to the pipeline
+- [`txt2image_from_subcomponent.cpp`](./txt2image_from_subcomponent.cpp) shows how to assemble a txt2image pipeline from individual subcomponents (scheduler, text encoder, unet, vae decoder)
 
 Users can change the sample code and play with the following generation parameters:
 
@@ -67,3 +68,17 @@ With adapter | Without adapter
 - Image generated with HuggingFace / Optimum Intel is not the same generated by this C++ sample:
 
 C++ random generation with MT19937 results differ from `numpy.random.randn()` and `diffusers.utils.randn_tensor`. So, it's expected that image generated by Python and C++ versions provide different images, because latent images are initialize differently. Users can implement their own random generator derived from `ov::genai::Generator` and pass it to `Text2ImagePipeline::generate` method.
+
+## Run with multiple devices
+
+The `txt2image_from_subcomponent` sample demonstrates how a Text2ImagePipeline object can be created from individual subcomponents - scheduler, text encoder, unet, & vae decoder. This approach gives fine-grained control over the devices used to execute each stage of the stable diffusion pipeline.
+
+The usage of this sample is:
+
+`txt2image_from_subcomponent <MODEL_DIR> '<PROMPT>' [ <TXT_ENCODE_DEVICE> <UNET_DEVICE> <VAE_DEVICE> ]`
+
+For example:
+
+`txt2image_from_subcomponent ./dreamlike_anime_1_0_ov/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting' CPU NPU GPU`
+
+The sample will create a stable diffusion pipeline such that the text encoder is executed on the CPU, UNet on the NPU, and VAE decoder on the GPU.
```
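For reference, the condensed C++ below sketches how those three device arguments map onto the subcomponent `reshape()`/`compile()` calls, using the devices from the README example above (CPU / NPU / GPU). It is a fragment distilled from the full sample that follows, not additional API: `root_dir`, `prompt`, and the `./cache` directory come from that sample.

```cpp
std::filesystem::path root_dir = "./dreamlike_anime_1_0_ov/FP16";

// Text encoder on CPU; batch 2 because classifier-free guidance runs a
// conditional and an unconditional pass per step.
auto text_encoder = ov::genai::CLIPTextModel(root_dir / "text_encoder");
text_encoder.reshape(2);
text_encoder.compile("CPU", ov::cache_dir("./cache"));

// UNet on NPU; it must be reshaped to a static shape before compile
// (batch, height, width, max tokens).
auto unet = ov::genai::UNet2DConditionModel(root_dir / "unet");
unet.reshape(2, 512, 512, 77);
unet.compile("NPU", ov::cache_dir("./cache"));

// VAE decoder on GPU; batch 1, since one image is returned per generate() call.
auto vae = ov::genai::AutoencoderKL(root_dir / "vae_decoder");
vae.reshape(1, 512, 512);
vae.compile("GPU", ov::cache_dir("./cache"));

auto pipe = ov::genai::Text2ImagePipeline::stable_diffusion(
    ov::genai::Scheduler::from_config(root_dir / "scheduler/scheduler_config.json"),
    text_encoder, unet, vae);

ov::Tensor image = pipe.generate(prompt);
```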
samples/cpp/text2image/txt2image_from_subcomponent.cpp (new file, +108)

```cpp
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "imwrite.hpp"
#include "openvino/genai/image_generation/text2image_pipeline.hpp"

int32_t main(int32_t argc, char* argv[]) try {
    OPENVINO_ASSERT(argc >= 3 && argc <= 6,
                    "Usage: ",
                    argv[0],
                    " <MODEL_DIR> '<PROMPT>' [ <TXT_ENCODE_DEVICE> <UNET_DEVICE> <VAE_DEVICE> ]");

    const std::string models_path = argv[1], prompt = argv[2];

    std::filesystem::path root_dir = models_path;

    const int width = 512;
    const int height = 512;
    const float guidance_scale = 7.5f;
    const int number_of_images_to_generate = 1;
    const int number_of_inference_steps_per_image = 20;

    // Set devices to command-line args if specified, otherwise default to CPU.
    // Note that these can be set to CPU, GPU, or NPU.
    const std::string text_encoder_device = (argc > 3) ? argv[3] : "CPU";
    const std::string unet_device = (argc > 4) ? argv[4] : "CPU";
    const std::string vae_decoder_device = (argc > 5) ? argv[5] : "CPU";

    std::cout << "text_encoder_device: " << text_encoder_device << std::endl;
    std::cout << "unet_device: " << unet_device << std::endl;
    std::cout << "vae_decoder_device: " << vae_decoder_device << std::endl;

    // This is the path to where compiled models will get cached
    // (so that the 'compile' method runs much faster the 2nd+ time).
    std::string ov_cache_dir = "./cache";

    //
    // Step 1: Prepare each Text2Image subcomponent (scheduler, text encoder, unet, vae) separately.
    //

    // Create the scheduler from the details listed in the json.
    auto scheduler = ov::genai::Scheduler::from_config(root_dir / "scheduler/scheduler_config.json");

    // Note that we could have created the scheduler by specifying a specific type (for example EULER_DISCRETE), like
    // this: auto scheduler = ov::genai::Scheduler::from_config(root_dir / "scheduler/scheduler_config.json",
    //                                                          ov::genai::Scheduler::Type::EULER_DISCRETE);

    // Create unet object
    auto unet = ov::genai::UNet2DConditionModel(root_dir / "unet");

    // Given the guidance scale, etc., calculate the batch size.
    int unet_batch_size = 1;
    if (guidance_scale > 1.0f && unet.get_config().time_cond_proj_dim < 0) {
        unet_batch_size = 2;
    }

    // Create, reshape, and compile the text encoder.
    auto text_encoder = ov::genai::CLIPTextModel(root_dir / "text_encoder");
    text_encoder.reshape(unet_batch_size);
    text_encoder.compile(text_encoder_device, ov::cache_dir(ov_cache_dir));

    // The max_position_embeddings config from text encoder will be used as a parameter to unet reshape.
    int max_position_embeddings = text_encoder.get_config().max_position_embeddings;

    // Reshape unet to a static shape, and compile it.
    unet.reshape(unet_batch_size, height, width, max_position_embeddings);
    unet.compile(unet_device, ov::cache_dir(ov_cache_dir));

    // Create, reshape, and compile the vae decoder.
    auto vae = ov::genai::AutoencoderKL(root_dir / "vae_decoder");
    vae.reshape(1, height, width);  // We set batch size to '1' here, as we're configuring our pipeline to return
                                    // 1 image per 'generate' call.
    vae.compile(vae_decoder_device, ov::cache_dir(ov_cache_dir));

    //
    // Step 2: Create a Text2ImagePipeline from the individual subcomponents.
    //
    auto pipe = ov::genai::Text2ImagePipeline::stable_diffusion(scheduler, text_encoder, unet, vae);

    //
    // Step 3: Use the Text2ImagePipeline to generate 'number_of_images_to_generate' images.
    //
    for (int imagei = 0; imagei < number_of_images_to_generate; imagei++) {
        std::cout << "Generating image " << imagei << std::endl;

        ov::Tensor image = pipe.generate(prompt,
                                         ov::genai::width(width),
                                         ov::genai::height(height),
                                         ov::genai::guidance_scale(guidance_scale),
                                         ov::genai::num_inference_steps(number_of_inference_steps_per_image));

        imwrite("image_" + std::to_string(imagei) + ".bmp", image, true);
    }

    return EXIT_SUCCESS;
} catch (const std::exception& error) {
    try {
        std::cerr << error.what() << '\n';
    } catch (const std::ios_base::failure&) {
    }
    return EXIT_FAILURE;
} catch (...) {
    try {
        std::cerr << "Non-exception object thrown\n";
    } catch (const std::ios_base::failure&) {
    }
    return EXIT_FAILURE;
}
```

src/cpp/include/openvino/genai/image_generation/unet2d_condition_model.hpp (+6 −1)

```diff
@@ -65,11 +65,16 @@ class OPENVINO_GENAI_EXPORTS UNet2DConditionModel {
     ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep);
 
 private:
+    class UNetInference;
+    std::shared_ptr<UNetInference> m_impl;
+
     Config m_config;
     AdapterController m_adapter_controller;
     std::shared_ptr<ov::Model> m_model;
-    ov::InferRequest m_request;
     size_t m_vae_scale_factor;
+
+    class UNetInferenceDynamic;
+    class UNetInferenceStaticBS1;
 };
 
 } // namespace genai
```

src/cpp/src/image_generation/models/unet2d_condition_model.cpp (+21 −35)

```diff
@@ -2,6 +2,8 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #include "openvino/genai/image_generation/unet2d_condition_model.hpp"
+#include "image_generation/models/unet_inference_dynamic.hpp"
+#include "image_generation/models/unet_inference_static_bs1.hpp"
 
 #include <fstream>
 
@@ -52,67 +54,51 @@ UNet2DConditionModel& UNet2DConditionModel::reshape(int batch_size, int height,
     height /= m_vae_scale_factor;
     width /= m_vae_scale_factor;
 
-    std::map<std::string, ov::PartialShape> name_to_shape;
-
-    for (auto && input : m_model->inputs()) {
-        std::string input_name = input.get_any_name();
-        name_to_shape[input_name] = input.get_partial_shape();
-        if (input_name == "timestep") {
-            name_to_shape[input_name][0] = 1;
-        } else if (input_name == "sample") {
-            name_to_shape[input_name] = {batch_size, name_to_shape[input_name][1], height, width};
-        } else if (input_name == "time_ids" || input_name == "text_embeds") {
-            name_to_shape[input_name][0] = batch_size;
-        } else if (input_name == "encoder_hidden_states") {
-            name_to_shape[input_name][0] = batch_size;
-            name_to_shape[input_name][1] = tokenizer_model_max_length;
-        }
-    }
-
-    m_model->reshape(name_to_shape);
+    UNetInference::reshape(m_model, batch_size, height, width, tokenizer_model_max_length);
 
     return *this;
 }
 
 UNet2DConditionModel& UNet2DConditionModel::compile(const std::string& device, const ov::AnyMap& properties) {
     OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot re-compile already compiled model");
-    ov::Core core = utils::singleton_core();
-    ov::CompiledModel compiled_model;
+
+    if (device == "NPU") {
+        m_impl = std::make_shared<UNet2DConditionModel::UNetInferenceStaticBS1>();
+    } else {
+        m_impl = std::make_shared<UNet2DConditionModel::UNetInferenceDynamic>();
+    }
+
     std::optional<AdapterConfig> adapters;
     if (auto filtered_properties = extract_adapters_from_properties(properties, &adapters)) {
         adapters->set_tensor_name_prefix(adapters->get_tensor_name_prefix().value_or("lora_unet"));
         m_adapter_controller = AdapterController(m_model, *adapters, device);
-        compiled_model = core.compile_model(m_model, device, *filtered_properties);
+        m_impl->compile(m_model, device, *filtered_properties);
     } else {
-        compiled_model = core.compile_model(m_model, device, properties);
+        m_impl->compile(m_model, device, properties);
     }
-    m_request = compiled_model.create_infer_request();
+
     // release the original model
     m_model.reset();
 
     return *this;
 }
 
 void UNet2DConditionModel::set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states) {
-    OPENVINO_ASSERT(m_request, "UNet model must be compiled first");
-    m_request.set_tensor(tensor_name, encoder_hidden_states);
+    OPENVINO_ASSERT(m_impl, "UNet model must be compiled first");
+    m_impl->set_hidden_states(tensor_name, encoder_hidden_states);
 }
 
 void UNet2DConditionModel::set_adapters(const std::optional<AdapterConfig>& adapters) {
-    if (adapters) {
-        m_adapter_controller.apply(m_request, *adapters);
+    OPENVINO_ASSERT(m_impl, "UNet model must be compiled first");
+    if(adapters) {
+        OPENVINO_ASSERT(m_impl, "UNet model must be compiled first");
+        m_impl->set_adapters(m_adapter_controller, *adapters);
     }
 }
 
 ov::Tensor UNet2DConditionModel::infer(ov::Tensor sample, ov::Tensor timestep) {
-    OPENVINO_ASSERT(m_request, "UNet model must be compiled first. Cannot infer non-compiled model");
-
-    m_request.set_tensor("sample", sample);
-    m_request.set_tensor("timestep", timestep);
-
-    m_request.infer();
-
-    return m_request.get_output_tensor();
+    OPENVINO_ASSERT(m_impl, "UNet model must be compiled first. Cannot infer non-compiled model");
+    return m_impl->infer(sample, timestep);
 }
 
 } // namespace genai
```
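For non-NPU devices, `compile()` now picks `UNetInferenceDynamic`, which essentially wraps the single-infer-request logic removed above. Its actual definition lives in `unet_inference_dynamic.hpp` (added by this commit but not shown on this page); the following is only a reconstruction from the removed code, so the exact file layout and use of a plain `ov::Core` here are assumptions.

```cpp
// unet_inference_dynamic.hpp -- illustrative sketch only, reconstructed from the removed code above.
#pragma once

#include "image_generation/models/unet_inference.hpp"
#include "openvino/runtime/core.hpp"

namespace ov {
namespace genai {

class UNet2DConditionModel::UNetInferenceDynamic : public UNet2DConditionModel::UNetInference {
public:
    void compile(std::shared_ptr<ov::Model> model, const std::string& device, const ov::AnyMap& properties) override {
        // Compile the (possibly dynamically shaped) model as-is and keep a single infer request.
        ov::Core core;  // the real code presumably reuses utils::singleton_core()
        ov::CompiledModel compiled_model = core.compile_model(model, device, properties);
        m_request = compiled_model.create_infer_request();
    }

    void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states) override {
        m_request.set_tensor(tensor_name, encoder_hidden_states);
    }

    void set_adapters(AdapterController& adapter_controller, const AdapterConfig& adapters) override {
        adapter_controller.apply(m_request, adapters);
    }

    ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep) override {
        m_request.set_tensor("sample", sample);
        m_request.set_tensor("timestep", timestep);
        m_request.infer();
        return m_request.get_output_tensor();
    }

private:
    ov::InferRequest m_request;
};

}  // namespace genai
}  // namespace ov
```

On NPU, `UNetInferenceStaticBS1` is chosen instead; a sketch of that idea follows the `UNetInference` base class below.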
src/cpp/src/image_generation/models/unet_inference.hpp (new file, +68)

```cpp
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "openvino/genai/image_generation/unet2d_condition_model.hpp"

namespace ov {
namespace genai {

class UNet2DConditionModel::UNetInference {

public:
    virtual void compile(std::shared_ptr<ov::Model> model, const std::string& device, const ov::AnyMap& properties) = 0;
    virtual void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states) = 0;
    virtual void set_adapters(AdapterController& adapter_controller, const AdapterConfig& adapters) = 0;
    virtual ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep) = 0;

    // utility function to resize model given optional dimensions.
    static void reshape(std::shared_ptr<ov::Model> model,
                        std::optional<int> batch_size = {},
                        std::optional<int> height = {},
                        std::optional<int> width = {},
                        std::optional<int> tokenizer_model_max_length = {})
    {
        std::map<std::string, ov::PartialShape> name_to_shape;
        for (auto&& input : model->inputs()) {
            std::string input_name = input.get_any_name();
            name_to_shape[input_name] = input.get_partial_shape();
            if (input_name == "timestep") {
                name_to_shape[input_name][0] = 1;
            } else if (input_name == "sample") {
                if (batch_size) {
                    name_to_shape[input_name][0] = *batch_size;
                }

                if (height) {
                    name_to_shape[input_name][2] = *height;
                }

                if (width) {
                    name_to_shape[input_name][3] = *width;
                }
            } else if (input_name == "time_ids" || input_name == "text_embeds") {
                if (batch_size) {
                    name_to_shape[input_name][0] = *batch_size;
                }
            } else if (input_name == "encoder_hidden_states") {
                if (batch_size) {
                    name_to_shape[input_name][0] = *batch_size;
                }

                if (tokenizer_model_max_length) {
                    name_to_shape[input_name][1] = *tokenizer_model_max_length;
                }
            } else if (input_name == "timestep_cond") {
                if (batch_size) {
                    name_to_shape[input_name][0] = *batch_size;
                }
            }
        }

        model->reshape(name_to_shape);
    }
};

} // namespace genai
} // namespace ov
```
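The NPU path selects `UNetInferenceStaticBS1`, whose definition (`unet_inference_static_bs1.hpp`) is also not part of this page. Below is a rough sketch of the idea behind a batch-size-1 implementation, assuming it recompiles the UNet at batch 1 and splits the batched inputs into per-sample inferences; the member names, the slicing helper, and the use of a plain `ov::Core` are guesses, not the commit's code.

```cpp
// unet_inference_static_bs1.hpp -- illustrative sketch only, not the commit's actual implementation.
#pragma once

#include <cstdint>
#include <cstring>
#include <vector>

#include "image_generation/models/unet_inference.hpp"
#include "openvino/runtime/core.hpp"

namespace ov {
namespace genai {

class UNet2DConditionModel::UNetInferenceStaticBS1 : public UNet2DConditionModel::UNetInference {
public:
    void compile(std::shared_ptr<ov::Model> model, const std::string& device, const ov::AnyMap& properties) override {
        // The caller has already reshaped the model to a static batch (e.g. 2 for classifier-free
        // guidance). Remember that batch size, then force the compiled model down to batch 1.
        size_t native_batch = model->input("sample").get_partial_shape()[0].get_length();
        UNetInference::reshape(model, 1);

        ov::Core core;  // the real code presumably reuses utils::singleton_core()
        ov::CompiledModel compiled_model = core.compile_model(model, device, properties);
        for (size_t i = 0; i < native_batch; ++i) {
            m_requests.push_back(compiled_model.create_infer_request());
        }
    }

    void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states) override {
        // Give each batch-1 request its own slice of the batched hidden states.
        for (size_t i = 0; i < m_requests.size(); ++i) {
            m_requests[i].set_tensor(tensor_name, batch_slice(encoder_hidden_states, i));
        }
    }

    void set_adapters(AdapterController& adapter_controller, const AdapterConfig& adapters) override {
        for (auto& request : m_requests) {
            adapter_controller.apply(request, adapters);
        }
    }

    ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep) override {
        ov::Shape out_shape;
        ov::Tensor output;

        // Run one batch-1 inference per batch element and copy each result back into a batched output.
        for (size_t i = 0; i < m_requests.size(); ++i) {
            m_requests[i].set_tensor("sample", batch_slice(sample, i));
            m_requests[i].set_tensor("timestep", timestep);
            m_requests[i].infer();

            ov::Tensor out_i = m_requests[i].get_output_tensor();
            if (i == 0) {
                out_shape = out_i.get_shape();
                out_shape[0] = m_requests.size();
                output = ov::Tensor(out_i.get_element_type(), out_shape);
            }
            std::memcpy(static_cast<uint8_t*>(output.data()) + i * out_i.get_byte_size(),
                        out_i.data(),
                        out_i.get_byte_size());
        }
        return output;
    }

private:
    // View over batch element 'i' of a batched tensor; slices along dim 0 are contiguous in memory.
    static ov::Tensor batch_slice(ov::Tensor& batched, size_t i) {
        ov::Shape shape = batched.get_shape();
        size_t slice_bytes = batched.get_byte_size() / shape[0];
        shape[0] = 1;
        return ov::Tensor(batched.get_element_type(), shape,
                          static_cast<uint8_t*>(batched.data()) + i * slice_bytes);
    }

    std::vector<ov::InferRequest> m_requests;
};

}  // namespace genai
}  // namespace ov
```

Running at batch 1 keeps the compiled shapes NPU-friendly while the rest of the pipeline continues to see the batch-2 interface required for classifier-free guidance.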
