Commit 84cb99b
Some updates to Text 2 image pipeline (openvinotoolkit#944)
**TODO:**
- [ ] Python API and sample
- [ ] Update doc strings
- [x] Update main README.md (PR openvinotoolkit#930)
- [ ] Add sample with custom device mapping
- [ ] Experiment with reshape + compile as part of Ctor
- [x] Add LoRA (PR openvinotoolkit#911)
- [x] Use std::optional for prompt2, prompt3 and maybe negative prompts as well
- [x] Update https://github.com/openvinotoolkit/openvino.genai/blob/master/src/docs/SUPPORTED_MODELS.md with text 2 image generation models
1 parent 67bcef1 commit 84cb99b

16 files changed: +182 −123 lines changed

.github/workflows/causal_lm_cpp.yml (+2 −2)

@@ -254,7 +254,7 @@ jobs:
       && python samples\python\greedy_causal_lm\greedy_causal_lm.py .\TinyLlama-1.1B-Chat-v1.0\ 69 > .\py.txt
     - run: fc .\cpp.txt .\py.txt
 
-  cpp-beam_search_causal_lm-Qwen-7B-Chat:
+  cpp-greedy_causal_lm-Qwen-7B-Chat:
     runs-on: ubuntu-20.04-16-cores
     defaults:
       run:
@@ -866,7 +866,7 @@ jobs:
   Overall_Status:
     name: ci/gha_overall_status_causal_lm
     needs: [cpp-multinomial-greedy_causal_lm-ubuntu, cpp-beam_search_causal_lm-ubuntu, cpp-greedy_causal_lm-windows,
-      cpp-beam_search_causal_lm-Qwen-7B-Chat, cpp-beam_search_causal_lm-Qwen1_5-7B-Chat, cpp-beam_search_causal_lm-Phi-2,
+      cpp-greedy_causal_lm-Qwen-7B-Chat, cpp-beam_search_causal_lm-Qwen1_5-7B-Chat, cpp-beam_search_causal_lm-Phi-2,
       cpp-beam_search_causal_lm-notus-7b-v1, cpp-speculative_decoding_lm-ubuntu, cpp-prompt_lookup_decoding_lm-ubuntu,
       cpp-Phi-1_5, cpp-greedy_causal_lm-redpajama-3b-chat, cpp-chat_sample-ubuntu, cpp-continuous-batching-ubuntu,
       visual_language_chat_sample-ubuntu,

samples/cpp/text2image/README.md (−11)

@@ -36,17 +36,6 @@ Prompt: `cyberpunk cityscape like Tokyo New York with tall buildings at dusk gol
 
 ![](./512x512.bmp)
 
-## Supported models
-
-Models can be downloaded from [HuggingFace](https://huggingface.co/models). This sample can run the following list of models, but not limited to:
-
-- [botp/stable-diffusion-v1-5](https://huggingface.co/botp/stable-diffusion-v1-5)
-- [stabilityai/stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2)
-- [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1)
-- [dreamlike-art/dreamlike-anime-1.0](https://huggingface.co/dreamlike-art/dreamlike-anime-1.0)
-- [SimianLuo/LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7)
-- [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
-- [stabilityai/stable-diffusion-xl-base-0.9](https://huggingface.co/stabilityai/stable-diffusion-xl-base-0.9)
 
 ## Run with optional LoRA adapters
 

samples/cpp/text2image/imwrite.cpp (+64 −49)

@@ -30,60 +30,59 @@ unsigned char file[14] = {
 };
 
 unsigned char info[40] = {
-    40,
-    0,
-    0,
-    0,  // info hd size
-    0,
-    0,
-    0,
-    0,  // width
-    0,
-    0,
-    0,
-    0,  // height
-    1,
-    0,  // number color planes
-    24,
-    0,  // bits per pixel
-    0,
-    0,
-    0,
-    0,  // compression is none
-    0,
-    0,
-    0,
-    0,  // image bits size
-    0x13,
-    0x0B,
-    0,
-    0,  // horz resolution in pixel / m
-    0x13,
-    0x0B,
-    0,
-    0,  // vert resolution (0x03C3 = 96 dpi, 0x0B13 = 72
-        // dpi)
-    0,
-    0,
-    0,
-    0,  // #colors in palette
-    0,
-    0,
-    0,
-    0,  // #important colors
-};
-
-}
-
-void imwrite(const std::string& name, ov::Tensor image, bool convert_bgr2rgb) {
-    std::ofstream output_file(name, std::ofstream::binary);
-    OPENVINO_ASSERT(output_file.is_open(), "Failed to open the output BMP image path");
+    40,
+    0,
+    0,
+    0,  // info hd size
+    0,
+    0,
+    0,
+    0,  // width
+    0,
+    0,
+    0,
+    0,  // height
+    1,
+    0,  // number color planes
+    24,
+    0,  // bits per pixel
+    0,
+    0,
+    0,
+    0,  // compression is none
+    0,
+    0,
+    0,
+    0,  // image bits size
+    0x13,
+    0x0B,
+    0,
+    0,  // horz resolution in pixel / m
+    0x13,
+    0x0B,
+    0,
+    0,  // vert resolution (0x03C3 = 96 dpi, 0x0B13 = 72
+        // dpi)
+    0,
+    0,
+    0,
+    0,  // #colors in palette
+    0,
+    0,
+    0,
+    0,  // #important colors
+};
 
+void imwrite_single_image(const std::string& name, ov::Tensor image, bool convert_bgr2rgb) {
     const ov::Shape shape = image.get_shape();
     const size_t width = shape[2], height = shape[1], channels = shape[3];
     OPENVINO_ASSERT(image.get_element_type() == ov::element::u8 &&
                     shape.size() == 4 && shape[0] == 1 && channels == 3,
-                    "Image of u8 type and [1, H, W, 3] shape is expected");
+                    "Image of u8 type and [1, H, W, 3] shape is expected.",
+                    "Given image has shape ", shape, " and element type ", image.get_element_type());
+
+    std::ofstream output_file(name, std::ofstream::binary);
+    OPENVINO_ASSERT(output_file.is_open(), "Failed to open the output BMP image path");
 
     int padSize = static_cast<int>(4 - (width * channels) % 4) % 4;
     int sizeData = static_cast<int>(width * height * channels + height * padSize);
@@ -131,3 +130,19 @@ void imwrite(const std::string& name, ov::Tensor image, bool convert_bgr2rgb) {
         output_file.write(reinterpret_cast<const char*>(pad), padSize);
     }
 }
+
+} // namespace
+
+
+void imwrite(const std::string& name, ov::Tensor images, bool convert_bgr2rgb) {
+    const ov::Shape shape = images.get_shape(), img_shape = {1, shape[1], shape[2], shape[3]};
+    uint8_t* img_data = images.data<uint8_t>();
+
+    for (int img_num = 0, num_images = shape[0], img_size = ov::shape_size(img_shape); img_num < num_images; ++img_num, img_data += img_size) {
+        char img_name[25];
+        sprintf(img_name, name.c_str(), img_num);
+
+        ov::Tensor image(images.get_element_type(), img_shape, img_data);
+        imwrite_single_image(img_name, image, true);
+    }
+}
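For orientation, here is a minimal sketch (not part of the commit) of how the reworked batched `imwrite` can be driven: a u8 tensor of shape `[num_images, H, W, 3]` is written out as one BMP per batch element, with `%d` in the name pattern replaced by the batch index. The tensor shape, fill values and file names below are illustrative assumptions.

```cpp
// Minimal sketch, assuming the sample's imwrite.hpp/.cpp are built into the same target.
#include <algorithm>
#include <cstddef>
#include <cstdint>

#include "imwrite.hpp"
#include "openvino/runtime/tensor.hpp"

int main() {
    // Batched image tensor: [num_images, height, width, channels] in u8.
    ov::Tensor images(ov::element::u8, {2, 64, 64, 3});

    uint8_t* data = images.data<uint8_t>();
    const size_t img_size = 64 * 64 * 3;
    std::fill_n(data, img_size, 128);             // first image: mid grey
    std::fill_n(data + img_size, img_size, 255);  // second image: white

    // "%d" in the pattern is substituted with the batch index via sprintf,
    // producing image_0.bmp and image_1.bmp.
    imwrite("image_%d.bmp", images, true);
    return 0;
}
```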

samples/cpp/text2image/imwrite.hpp (+4 −4)

@@ -8,9 +8,9 @@
 #include "openvino/runtime/tensor.hpp"
 
 /**
- * @brief Writes image to file
- * @param name File name
- * @param image Image tensor
+ * @brief Writes multiple images (depending on `image` tensor batch size) to BMP file(s)
+ * @param name File name or pattern to use to write images
+ * @param image Image(s) tensor
  * @param convert_bgr2rgb Convert BGR to RGB
  */
-void imwrite(const std::string& name, ov::Tensor image, bool convert_bgr2rgb);
+void imwrite(const std::string& name, ov::Tensor images, bool convert_bgr2rgb);

samples/cpp/text2image/main.cpp (+2 −30)

@@ -5,35 +5,6 @@
 
 #include "imwrite.hpp"
 
-namespace {
-
-void imwrite_output_imgs(const ov::Tensor& output) {
-    ov::Shape out_shape = output.get_shape();
-
-    if (out_shape[0] == 1) {
-        imwrite("image.bmp", output, true);
-        return;
-    }
-
-    ov::Shape img_shape = {1, out_shape[1], out_shape[2], out_shape[3]};
-    size_t img_size = output.get_size() / out_shape[0];
-
-    ov::Tensor image(output.get_element_type(), img_shape);
-    uint8_t* out_data = output.data<uint8_t>();
-    uint8_t* img_data = image.data<uint8_t>();
-
-    for (int img_num = 0; img_num < out_shape[0]; ++img_num) {
-        std::memcpy(img_data, out_data + img_size * img_num, img_size * sizeof(uint8_t));
-
-        char img_name[25];
-        sprintf(img_name, "image_%d.bmp", img_num);
-
-        imwrite(img_name, image, true);
-    }
-}
-
-} //namespace
-
 int32_t main(int32_t argc, char* argv[]) try {
     OPENVINO_ASSERT(argc == 3, "Usage: ", argv[0], " <MODEL_DIR> '<PROMPT>'");
 
@@ -47,7 +18,8 @@ int32_t main(int32_t argc, char* argv[]) try {
         ov::genai::num_inference_steps(20),
         ov::genai::num_images_per_prompt(1));
 
-    imwrite_output_imgs(image);
+    // writes `num_images_per_prompt` images by pattern name
+    imwrite("image_%d.bmp", image, true);
 
     return EXIT_SUCCESS;
 } catch (const std::exception& error) {
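Putting the two hunks together, the updated sample's `main()` now reduces to roughly the following. This is an approximate sketch: the pipeline construction line, the device name, and the `catch` body are assumptions, since only the `generate()` properties and the `imwrite()` call appear in the diff above.

```cpp
// Approximate sketch of the updated sample (pipeline construction and device are assumed).
#include <cstdint>
#include <cstdlib>
#include <iostream>

#include "imwrite.hpp"
#include "openvino/genai/text2image/pipeline.hpp"

int32_t main(int32_t argc, char* argv[]) try {
    OPENVINO_ASSERT(argc == 3, "Usage: ", argv[0], " <MODEL_DIR> '<PROMPT>'");

    const std::string models_path = argv[1], prompt = argv[2];

    ov::genai::Text2ImagePipeline pipe(models_path, "CPU");  // assumed device
    ov::Tensor image = pipe.generate(prompt,
        ov::genai::num_inference_steps(20),
        ov::genai::num_images_per_prompt(1));

    // writes `num_images_per_prompt` images by pattern name
    imwrite("image_%d.bmp", image, true);

    return EXIT_SUCCESS;
} catch (const std::exception& error) {
    std::cerr << error.what() << '\n';
    return EXIT_FAILURE;
}
```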

src/cpp/CMakeLists.txt (+7)

@@ -35,6 +35,13 @@ function(ov_genai_build_jinja2cpp)
         option(RAPIDJSON_BUILD_DOC "Build rapidjson documentation." OFF)
 
         add_subdirectory("${jinja2cpp_SOURCE_DIR}" "${jinja2cpp_BINARY_DIR}" EXCLUDE_FROM_ALL)
+
+        if(CMAKE_COMPILER_IS_GNUCXX OR OV_COMPILER_IS_CLANG OR (OV_COMPILER_IS_INTEL_LLVM AND UNIX))
+            target_compile_options(jinja2cpp PRIVATE -Wno-undef)
+        endif()
+        if(SUGGEST_OVERRIDE_SUPPORTED)
+            target_compile_options(jinja2cpp PRIVATE -Wno-suggest-override)
+        endif()
     endif()
 endfunction()
 

src/cpp/include/openvino/genai/text2image/pipeline.hpp (+6 −6)

@@ -70,8 +70,8 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline {
         // SD XL: prompt2 and negative_prompt2
         // FLUX: prompt2 (prompt if prompt2 is not defined explicitly)
        // SD 3: prompt2, prompt3 (with fallback to prompt) and negative_prompt2, negative_prompt3
-        std::string prompt2, prompt3;
-        std::string negative_prompt, negative_prompt2, negative_prompt3;
+        std::optional<std::string> prompt_2 = std::nullopt, prompt_3 = std::nullopt;
+        std::string negative_prompt, negative_prompt_2, negative_prompt_3;
 
         size_t num_images_per_prompt = 1;
 
@@ -165,12 +165,12 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline {
 // Generation config properties
 //
 
-static constexpr ov::Property<std::string> prompt2{"prompt2"};
-static constexpr ov::Property<std::string> prompt3{"prompt3"};
+static constexpr ov::Property<std::string> prompt_2{"prompt_2"};
+static constexpr ov::Property<std::string> prompt_3{"prompt_3"};
 
 static constexpr ov::Property<std::string> negative_prompt{"negative_prompt"};
-static constexpr ov::Property<std::string> negative_prompt2{"negative_prompt2"};
-static constexpr ov::Property<std::string> negative_prompt3{"negative_prompt3"};
+static constexpr ov::Property<std::string> negative_prompt_2{"negative_prompt_2"};
+static constexpr ov::Property<std::string> negative_prompt_3{"negative_prompt_3"};
 
 static constexpr ov::Property<size_t> num_images_per_prompt{"num_images_per_prompt"};
 static constexpr ov::Property<float> guidance_scale{"guidance_scale"};
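A short sketch of how the renamed snake_case properties might be passed to `generate()`, following the same `ov::genai::...` pattern the sample already uses for `num_inference_steps`. The model path, device, and prompt strings are placeholders; `prompt_2`/`prompt_3` are only consumed by the model families listed in the comment above and fall back to `prompt` when left unset, now that the config members are `std::optional`.

```cpp
// Illustrative only: placeholder model path, device and prompts.
#include "openvino/genai/text2image/pipeline.hpp"

int main() {
    ov::genai::Text2ImagePipeline pipe("./stable-diffusion-3", "CPU");

    ov::Tensor image = pipe.generate(
        "an astronaut riding a horse on the moon",
        ov::genai::prompt_2("highly detailed, cinematic lighting"),  // SD XL / SD 3 / FLUX only
        ov::genai::negative_prompt("blurry"),
        ov::genai::negative_prompt_2("low quality"),
        ov::genai::num_images_per_prompt(1));

    (void)image;  // e.g. hand off to the sample's imwrite()
    return 0;
}
```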

src/cpp/src/text2image/diffusion_pipeline.hpp (+3 −1)

@@ -71,7 +71,9 @@ class Text2ImagePipeline::DiffusionPipeline {
 protected:
     virtual void initialize_generation_config(const std::string& class_name) = 0;
 
-    virtual void check_inputs(const int height, const int width) const = 0;
+    virtual void check_image_size(const int height, const int width) const = 0;
+
+    virtual void check_inputs(const GenerationConfig& generation_config) const = 0;
 
     std::shared_ptr<IScheduler> m_scheduler;
     GenerationConfig m_generation_config;

src/cpp/src/text2image/models/clip_text_model.cpp (+4 −4)

@@ -94,16 +94,16 @@ ov::Tensor CLIPTextModel::infer(const std::string& pos_prompt, const std::string
 
     if (do_classifier_free_guidance) {
         perform_tokenization(neg_prompt,
-            ov::Tensor(input_ids, {current_batch_idx , 0},
-            {current_batch_idx + 1, m_config.max_position_embeddings}));
+                             ov::Tensor(input_ids, {current_batch_idx , 0},
+                                        {current_batch_idx + 1, m_config.max_position_embeddings}));
         ++current_batch_idx;
     } else {
         // Negative prompt is ignored when --guidanceScale < 1.0
     }
 
     perform_tokenization(pos_prompt,
-        ov::Tensor(input_ids, {current_batch_idx , 0},
-        {current_batch_idx + 1, m_config.max_position_embeddings}));
+                         ov::Tensor(input_ids, {current_batch_idx , 0},
+                                    {current_batch_idx + 1, m_config.max_position_embeddings}));
 
     // text embeddings
     m_request.set_tensor("input_ids", input_ids);
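As context for the `ov::Tensor(input_ids, {begin}, {end})` calls above: this is OpenVINO's region-of-interest constructor, which creates a view sharing memory with the parent tensor, so tokenization results land directly in one batch row of `input_ids` without a copy. A standalone illustration follows; the shapes and the i64 element type are arbitrary choices for the example.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

#include "openvino/runtime/tensor.hpp"

int main() {
    // Parent tensor: [batch, max_position_embeddings]-shaped token buffer.
    ov::Tensor input_ids(ov::element::i64, {2, 8});
    std::fill_n(input_ids.data<int64_t>(), input_ids.get_size(), 0);

    // ROI view of batch row 1: begin = {1, 0}, end = {2, 8}. The view shares
    // memory with `input_ids`, so writes go straight into the parent buffer.
    ov::Tensor row1(input_ids, {1, 0}, {2, 8});
    row1.data<int64_t>()[0] = 42;

    std::cout << input_ids.data<int64_t>()[8] << "\n";  // prints 42
    return 0;
}
```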

src/cpp/src/text2image/models/clip_text_model_with_projection.cpp (+4 −4)

@@ -83,16 +83,16 @@ ov::Tensor CLIPTextModelWithProjection::infer(const std::string& pos_prompt, con
 
     if (do_classifier_free_guidance) {
         perform_tokenization(neg_prompt,
-            ov::Tensor(input_ids, {current_batch_idx , 0},
-            {current_batch_idx + 1, m_config.max_position_embeddings}));
+                             ov::Tensor(input_ids, {current_batch_idx , 0},
+                                        {current_batch_idx + 1, m_config.max_position_embeddings}));
         ++current_batch_idx;
     } else {
         // Negative prompt is ignored when --guidanceScale < 1.0
     }
 
     perform_tokenization(pos_prompt,
-        ov::Tensor(input_ids, {current_batch_idx , 0},
-        {current_batch_idx + 1, m_config.max_position_embeddings}));
+                         ov::Tensor(input_ids, {current_batch_idx , 0},
+                                    {current_batch_idx + 1, m_config.max_position_embeddings}));
 
     // text embeddings
     m_request.set_tensor("input_ids", input_ids);

src/cpp/src/text2image/schedulers/lcm.cpp (+2 −2)

@@ -47,7 +47,7 @@ LCMScheduler::LCMScheduler(const std::string scheduler_config_path) :
 LCMScheduler::LCMScheduler(const Config& scheduler_config)
     : m_config(scheduler_config),
       m_seed(42),
-      m_gen(100, std::mt19937(m_seed)),
+      m_gen(m_seed),
       m_normal(0.0f, 1.0f) {
 
     m_sigma_data = 0.5f; // Default: 0.5
@@ -191,7 +191,7 @@ std::map<std::string, ov::Tensor> LCMScheduler::step(ov::Tensor noise_pred, ov::
 
     if (inference_step != m_num_inference_steps - 1) {
         for (std::size_t i = 0; i < batch_size * latent_size; ++i) {
-            float gen_noise = m_normal(m_gen[i / latent_size]);
+            float gen_noise = m_normal(m_gen);
             prev_sample_data[i] = alpha_prod_t_prev_sqrt * denoised_data[i] + beta_prod_t_prev_sqrt * gen_noise;
         }
     } else {
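What the scheduler change amounts to: a single seeded `std::mt19937` feeding one `std::normal_distribution` is enough to draw noise for every element of every batch item, replacing the previous vector of per-batch generators. A minimal standalone sketch (sizes are arbitrary, not taken from the scheduler):

```cpp
#include <cstddef>
#include <cstdint>
#include <random>
#include <vector>

int main() {
    const uint32_t seed = 42;
    std::mt19937 gen(seed);                         // one generator for all draws
    std::normal_distribution<float> normal(0.0f, 1.0f);

    const std::size_t batch_size = 2, latent_size = 4;
    std::vector<float> noise(batch_size * latent_size);
    for (std::size_t i = 0; i < noise.size(); ++i)
        noise[i] = normal(gen);                     // reproducible stream for a fixed seed
    return 0;
}
```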

src/cpp/src/text2image/schedulers/lcm.hpp (+1 −1)

@@ -62,7 +62,7 @@ class LCMScheduler : public IScheduler {
     std::vector<int64_t> m_timesteps;
 
     uint32_t m_seed;
-    std::vector<std::mt19937> m_gen;
+    std::mt19937 m_gen;
     std::normal_distribution<float> m_normal;
 
     std::vector<float> threshold_sample(const std::vector<float>& flat_sample);
