openvinotoolkit
diff --git a/‎.github/workflows/genai-tools.yml
+1-1 b/‎.github/workflows/genai-tools.yml
+1-1
diff --git a/‎.github/workflows/lcm_dreamshaper_cpp.yml
+1-1 b/‎.github/workflows/lcm_dreamshaper_cpp.yml
+1-1
diff --git a/‎.github/workflows/linux.yml
+1-1 b/‎.github/workflows/linux.yml
+1-1
diff --git a/‎.github/workflows/stable_diffusion_1_5_cpp.yml
+1-1 b/‎.github/workflows/stable_diffusion_1_5_cpp.yml
+1-1
diff --git a/‎.github/workflows/windows.yml
+13-7 b/‎.github/workflows/windows.yml
+13-7
diff --git a/‎CMakeLists.txt
+8-6 b/‎CMakeLists.txt
+8-6
diff --git a/‎src/cpp/include/openvino/genai/image_generation/sd3_transformer_2d_model.hpp
+6 b/‎src/cpp/include/openvino/genai/image_generation/sd3_transformer_2d_model.hpp
+6
diff --git a/‎src/cpp/src/debug_utils.hpp
+81 b/‎src/cpp/src/debug_utils.hpp
+81
diff --git a/‎src/cpp/src/image_generation/flux_pipeline.hpp
+7-11 b/‎src/cpp/src/image_generation/flux_pipeline.hpp
+7-11
diff --git a/‎src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp
+17-32 b/‎src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp
+17-32
@@ -44,7 +44,7 @@ jobs:
       with:
         platform: ubuntu22
         commit_packages_to_provide: wheels
-        revision: latest_available_commit
+        revision: a8aba4e104f027c2ba8a21fd6c4c861110c57ed9
 
   llm_bench:
     name: 'LLM bench tests'
 
@@ -108,7 +108,7 @@ jobs:
           ${{ env.build_dir }}/samples/cpp/image_generation/benchmark_image_gen -t inpainting -m ./models/lcm_dreamshaper_v7 -p "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" -i ./image.png --mi ./mask_image.png
 
   lcm_dreamshaper_v7_cpp-windows:
-    runs-on: windows-2022
+    runs-on: aks-win-4-cores-8gb-staging
     defaults:
       run:
         shell: pwsh
 
@@ -52,7 +52,7 @@ jobs:
       with:
         platform: ubuntu22
         commit_packages_to_provide: wheels,openvino_js_package.tar.gz
-        revision: latest_available_commit
+        revision: a8aba4e104f027c2ba8a21fd6c4c861110c57ed9
 
     - name: Clone docker tag from OpenVINO repo
       uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
 
@@ -153,7 +153,7 @@ jobs:
 
   stable_diffusion_1_5_cpp-windows:
     needs: [ openvino_download_windows ]
-    runs-on: windows-2022
+    runs-on: aks-win-4-cores-8gb-staging
     defaults:
       run:
         shell: pwsh
 
@@ -49,7 +49,7 @@ jobs:
       with:
         platform: 'windows'
         commit_packages_to_provide: wheels,openvino_js_package.zip
-        revision: 'latest_available_commit'
+        revision: 'a8aba4e104f027c2ba8a21fd6c4c861110c57ed9'
 
   genai_build_cpack:
     name: genai cpack (${{ matrix.build_type }})
@@ -61,7 +61,7 @@ jobs:
     defaults:
       run:
         shell: pwsh
-    runs-on: windows-2022
+    runs-on: aks-win-8-cores-16gb-staging
     env:
       OV_INSTALL_DIR: ${{ github.workspace }}\install\ov
       GENAI_INSTALL_DIR: ${{ github.workspace }}\install\genai
@@ -162,7 +162,7 @@ jobs:
     defaults:
       run:
         shell: pwsh
-    runs-on: windows-2022
+    runs-on: aks-win-8-cores-16gb-staging
     env:
       OV_INSTALL_DIR: ${{ github.workspace }}\install\ov
       SRC_DIR: ${{ github.workspace }}\src\genai
@@ -216,7 +216,7 @@ jobs:
       - name: Configure Developer Command Prompt for Microsoft Visual C++
         uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0
         with:
-          toolset: 14.40 # v2022  
+          toolset: 14.40 # v2022
 
       - name: Build Tokenizers Wheel
         run: |
@@ -271,7 +271,7 @@ jobs:
     defaults:
       run:
         shell: pwsh
-    runs-on: windows-2022
+    runs-on: aks-win-4-cores-8gb-staging
     env:
       OV_INSTALL_DIR: ${{ github.workspace }}/install/ov
       SRC_DIR: ${{ github.workspace }}/src
@@ -319,7 +319,7 @@ jobs:
     defaults:
       run:
         shell: pwsh
-    runs-on: windows-2022
+    runs-on: aks-win-4-cores-8gb-staging
 
     env:
       OV_INSTALL_DIR: ${{ github.workspace }}/ov
@@ -340,6 +340,12 @@ jobs:
           name: ${{ needs.openvino_download.outputs.ov_artifact_name }}
           path: ${{ env.OV_INSTALL_DIR }}
           merge-multiple: true
+          
+      - name: Setup Python ${{ env.PYTHON_VERSION }}
+        uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+          cache: 'pip'
 
       - name: Build GenAI Node.js bindings
         run: |
@@ -483,7 +489,7 @@ jobs:
     defaults:
       run:
         shell: pwsh
-    runs-on: windows-latest
+    runs-on: windows-2022
 
     env:
       SRC_DIR: ${{ github.workspace }}/openvino.genai
 
@@ -41,12 +41,14 @@ endif()
 
 # Looking for OpenVINO in the python distribution. It doesn't work for cross-compiling build
 if(NOT CMAKE_CROSSCOMPILING)
-    find_package(Python3 REQUIRED)
-    execute_process(
-        COMMAND ${Python3_EXECUTABLE} -c "from openvino.utils import get_cmake_path; print(get_cmake_path(), end='')"
-        OUTPUT_VARIABLE OpenVINO_DIR_PY
-        ERROR_QUIET
-    )
+    find_package(Python3 QUIET COMPONENTS Interpreter)  # optional lookup: QUIET and no REQUIRED, so a missing Python does not stop configuration
+    if(Python3_Interpreter_FOUND)  # skip the probe entirely when no interpreter was found
+        execute_process(
+            COMMAND ${Python3_EXECUTABLE} -c "from openvino.utils import get_cmake_path; print(get_cmake_path(), end='')"
+            OUTPUT_VARIABLE OpenVINO_DIR_PY  # receives whatever the Python snippet prints to stdout
+            ERROR_QUIET  # stderr of the probe is discarded
+        )
+    endif()
 endif()
 
 # Find OpenVINODeveloperPackage first to compile with SDL flags
 
@@ -80,10 +80,16 @@ class OPENVINO_GENAI_EXPORTS SD3Transformer2DModel {
     ov::Tensor infer(const ov::Tensor latent, const ov::Tensor timestep);
 
 private:
+    class Inference;
+    std::shared_ptr<Inference> m_impl;
+
     Config m_config;
     ov::InferRequest m_request;
     std::shared_ptr<ov::Model> m_model;
     size_t m_vae_scale_factor;
+
+    class InferenceDynamic;
+    class InferenceStaticBS1;
 };
 
 }  // namespace genai
 
@@ -72,3 +72,84 @@ inline void read_tensor(const std::string& file_name, ov::Tensor tensor, bool as
 
     std::cout << "Closing " << file_name << std::endl;
 }
+
+/// @brief Read an npy file created in Python:
+/// with open('ndarray.npy', 'wb') as file:
+///     np.save(file, ndarray)
+/// Accepts format versions 1.x-3.x holding C-ordered "<f4" or "|u1" data; anything else throws.
+/// @param npy Path to the .npy file.
+/// @return Tensor that owns a copy of the array data, shaped as recorded in the file header.
+inline ov::Tensor from_npy(const std::filesystem::path& npy) {
+    std::ifstream fstream{npy, std::ios::binary};
+    fstream.seekg(0, std::ios_base::end);
+    OPENVINO_ASSERT(fstream.good(), "Failed to open or seek in ", npy.string());
+    const auto full_file_size = static_cast<std::size_t>(fstream.tellg());
+    fstream.seekg(0, std::ios_base::beg);
+
+    std::string magic_string(6, ' ');
+    fstream.read(&magic_string[0], magic_string.size());
+    OPENVINO_ASSERT(magic_string == "\x93NUMPY");
+
+    // Version 1.x stores the header length in 2 bytes, 2.x/3.x in 4; both are little-endian.
+    unsigned char preamble[6] = {};  // major, minor, length bytes (unused high bytes stay zero)
+    fstream.read(reinterpret_cast<char*>(preamble), 2);
+    OPENVINO_ASSERT(fstream.good() && preamble[0] >= 1 && preamble[0] <= 3, "Unsupported npy version");
+    fstream.read(reinterpret_cast<char*>(preamble + 2), preamble[0] == 1 ? 2 : 4);
+    const std::size_t header_size = preamble[2] | (preamble[3] << 8) |
+                                    (std::size_t{preamble[4]} << 16) | (std::size_t{preamble[5]} << 24);
+    OPENVINO_ASSERT(header_size <= full_file_size, "Corrupted npy header length");
+
+    std::string header(header_size, ' ');
+    fstream.read(&header[0], header.size());
+    OPENVINO_ASSERT(fstream.good(), "Truncated npy header");
+
+    // Verify fortran order is false
+    const std::string fortran_key = "'fortran_order':";
+    auto idx = header.find(fortran_key);
+    OPENVINO_ASSERT(idx != std::string::npos);
+    auto from = header.find_first_not_of(' ', idx + fortran_key.size());
+    auto to = header.find(',', from);
+    OPENVINO_ASSERT(from != std::string::npos && to != std::string::npos);
+    OPENVINO_ASSERT(header.substr(from, to - from) == "False", "Fortran-ordered arrays are not supported");
+
+    // Parse the array shape, e.g. "(2, 3)", "(5,)" or "()" for a scalar
+    const std::string shape_key = "'shape':";
+    idx = header.find(shape_key);
+    OPENVINO_ASSERT(idx != std::string::npos);
+    from = header.find('(', idx + shape_key.size());
+    to = header.find(')', from);
+    OPENVINO_ASSERT(from != std::string::npos && to != std::string::npos);
+    std::string shape_data = header.substr(from + 1, to - from - 1);
+    // Separators become whitespace so adjacent dimensions can never fuse into one number
+    std::replace(shape_data.begin(), shape_data.end(), ',', ' ');
+
+    ov::Shape _shape;
+    std::istringstream shape_data_stream(shape_data);
+    for (size_t value; shape_data_stream >> value;) {
+        _shape.push_back(value);
+    }
+
+    // Parse the element type descriptor
+    const std::string dataTypeKey = "'descr':";
+    idx = header.find(dataTypeKey);
+    OPENVINO_ASSERT(idx != std::string::npos);
+    from = header.find('\'', idx + dataTypeKey.size());
+    to = from == std::string::npos ? from : header.find('\'', from + 1);
+    OPENVINO_ASSERT(to != std::string::npos);
+    const std::string type = header.substr(from + 1, to - from - 1);
+
+    ov::element::Type tensor_type;
+    if ("<f4" == type) {
+        tensor_type = ov::element::f32;
+    } else if ("|u1" == type) {
+        tensor_type = ov::element::u8;
+    } else {
+        OPENVINO_THROW("Not implemented dtype");
+    }
+    const std::size_t _size = full_file_size - static_cast<std::size_t>(fstream.tellg());
+    OPENVINO_ASSERT(_size == ov::shape_size(_shape) * tensor_type.size());
+    ov::Tensor tensor{tensor_type, _shape};
+    fstream.read(static_cast<char*>(tensor.data()), _size);
+    OPENVINO_ASSERT(static_cast<std::size_t>(fstream.gcount()) == _size);
+    return tensor;
+}
@@ -247,20 +247,16 @@ class FluxPipeline : public DiffusionPipeline {
         m_vae->reshape(num_images_per_prompt, height, width);
     }
 
-    void compile(const std::string& device, const ov::AnyMap& properties) override {
-        update_adapters_from_properties(properties, m_generation_config.adapters);
-        auto updated_properties = update_adapters_in_properties(properties, &FluxPipeline::derived_adapters);
-        m_clip_text_encoder->compile(device, *updated_properties);
-        m_t5_text_encoder->compile(device, *updated_properties);
-        m_vae->compile(device, *updated_properties);
-        m_transformer->compile(device, *updated_properties);
-    }
-
     void compile(const std::string& text_encode_device,
                  const std::string& denoise_device,
                  const std::string& vae_device,
                  const ov::AnyMap& properties) override {
-        OPENVINO_THROW("not supported yet.");
+        update_adapters_from_properties(properties, m_generation_config.adapters);  // NOTE(review): presumably syncs adapter settings from properties into the generation config — confirm in the helper
+        auto updated_properties = update_adapters_in_properties(properties, &FluxPipeline::derived_adapters);  // dereferenced below, so this is a pointer-like handle to the property map handed to every sub-model
+        m_clip_text_encoder->compile(text_encode_device, *updated_properties);  // both text encoders are compiled for text_encode_device
+        m_t5_text_encoder->compile(text_encode_device, *updated_properties);
+        m_vae->compile(vae_device, *updated_properties);  // the VAE gets its own device
+        m_transformer->compile(denoise_device, *updated_properties);  // denoise_device is the one used for the transformer
     }
 
     void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config) override {
@@ -618,7 +614,7 @@ class FluxPipeline : public DiffusionPipeline {
                        size_t inference_step) override {
         OPENVINO_ASSERT(m_pipeline_type == PipelineType::INPAINTING, "'blend_latents' can be called for inpainting pipeline only");
         OPENVINO_ASSERT(image_latent.get_shape() == latents.get_shape(),
-                        "Shapes for current", latents.get_shape(), "and initial image latents ", image_latent.get_shape(), " must match");
+                        "Shapes for current ", latents.get_shape(), " and initial image latents ", image_latent.get_shape(), " must match");
 
         ov::Tensor init_latents_proper(image_latent.get_element_type(), image_latent.get_shape());
         image_latent.copy_to(init_latents_proper);
 
@@ -2,6 +2,8 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #include "openvino/genai/image_generation/sd3_transformer_2d_model.hpp"
+#include "image_generation/models/sd3transformer_2d_inference_dynamic.hpp"
+#include "image_generation/models/sd3transformer_2d_inference_static_bs1.hpp"
 
 #include <fstream>
 
@@ -77,54 +79,37 @@ SD3Transformer2DModel& SD3Transformer2DModel::reshape(int batch_size,
     height /= m_vae_scale_factor;
     width /= m_vae_scale_factor;
 
-    std::map<std::string, ov::PartialShape> name_to_shape;
-
-    for (auto&& input : m_model->inputs()) {
-        std::string input_name = input.get_any_name();
-        name_to_shape[input_name] = input.get_partial_shape();
-        if (input_name == "timestep") {
-            name_to_shape[input_name][0] = 1;
-        } else if (input_name == "hidden_states") {
-            name_to_shape[input_name] = {batch_size, name_to_shape[input_name][1], height, width};
-        } else if (input_name == "encoder_hidden_states") {
-            name_to_shape[input_name][0] = batch_size;
-            name_to_shape[input_name][1] =
-                tokenizer_model_max_length *
-                2;  // x2 is necessary because of the concatenation of prompt_embeds and t5_prompt_embeds
-        } else if (input_name == "pooled_projections") {
-            name_to_shape[input_name][0] = batch_size;
-        }
-    }
-
-    m_model->reshape(name_to_shape);
+    SD3Transformer2DModel::Inference::reshape(m_model, batch_size, height, width, tokenizer_model_max_length);
 
     return *this;
 }
 
 SD3Transformer2DModel& SD3Transformer2DModel::compile(const std::string& device, const ov::AnyMap& properties) {
     OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot re-compile already compiled model");
-    ov::CompiledModel compiled_model = utils::singleton_core().compile_model(m_model, device, properties);
-    ov::genai::utils::print_compiled_model_properties(compiled_model, "SD3 Transformer 2D model");
-    m_request = compiled_model.create_infer_request();
+    // Pick the backend from the device string: anything mentioning "NPU" uses
+    // InferenceStaticBS1, every other device uses InferenceDynamic.
+    std::shared_ptr<SD3Transformer2DModel::Inference> impl;
+    if (device.find("NPU") != std::string::npos) {
+        impl = std::make_shared<SD3Transformer2DModel::InferenceStaticBS1>();
+    } else {
+        impl = std::make_shared<SD3Transformer2DModel::InferenceDynamic>();
+    }
+    impl->compile(m_model, device, properties);
+    m_impl = impl;  // published only after compile succeeded, so a throw leaves m_impl empty and the "compiled first" asserts valid
     // release the original model
     m_model.reset();
 
     return *this;
 }
 
 void SD3Transformer2DModel::set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states) {
-    OPENVINO_ASSERT(m_request, "Transformer model must be compiled first");
-    m_request.set_tensor(tensor_name, encoder_hidden_states);
+    OPENVINO_ASSERT(m_impl, "Transformer model must be compiled first");  // m_impl is the backend object that compile() creates
+    m_impl->set_hidden_states(tensor_name, encoder_hidden_states);  // forwarded unchanged to the selected backend
 }
 
 ov::Tensor SD3Transformer2DModel::infer(const ov::Tensor latent_model_input, const ov::Tensor timestep) {
-    OPENVINO_ASSERT(m_request, "Transformer model must be compiled first. Cannot infer non-compiled model");
-
-    m_request.set_tensor("hidden_states", latent_model_input);
-    m_request.set_tensor("timestep", timestep);
-    m_request.infer();
-
-    return m_request.get_output_tensor();
+    OPENVINO_ASSERT(m_impl, "Transformer model must be compiled first. Cannot infer non-compiled model");  // m_impl is the backend object that compile() creates
+    return m_impl->infer(latent_model_input, timestep);  // tensor binding and execution now live in the backend; its result is returned as-is
 }
 
 }  // namespace genai