Skip to content

Commit b9408e6

Browse files
authored
phi3_v: add universal tag (#1921)
Replace unify_prompt() with normalize_prompt(). The new function inserts or replaces each image tag only once, and it never searches the same part of the prompt more than once for the same tag. phi3_v is able to reuse its parts. Ticket 164259.
1 parent 9b2e4d5 commit b9408e6

File tree

16 files changed

+268
-174
lines changed

16 files changed

+268
-174
lines changed

src/cpp/include/openvino/genai/visual_language/pipeline.hpp

+3
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
110110
/// A model's native image tag can be used instead of
111111
/// <ov_genai_image_i>. These tags are:
112112
/// MiniCPM-V-2_6: (<image>./</image>)\n
113+
/// Phi-3-vision: <|image_i|>\n - the index starts at one
113114
/// Qwen2-VL: <|vision_start|><|image_pad|><|vision_end|>
114115
/// If the prompt doesn't contain image tags, but images are
115116
/// provided, the tags are prepended to the prompt.
@@ -134,6 +135,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
134135
/// A model's native image tag can be used instead of
135136
/// <ov_genai_image_i>. These tags are:
136137
/// MiniCPM-V-2_6: (<image>./</image>)\n
138+
/// Phi-3-vision: <|image_i|>\n - the index starts at one
137139
/// Qwen2-VL: <|vision_start|><|image_pad|><|vision_end|>
138140
/// If the prompt doesn't contain image tags, but images are
139141
/// provided, the tags are prepended to the prompt.
@@ -159,6 +161,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
159161
/// A model's native image tag can be used instead of
160162
/// <ov_genai_image_i>. These tags are:
161163
/// MiniCPM-V-2_6: (<image>./</image>)\n
164+
/// Phi-3-vision: <|image_i|>\n - the index starts at one
162165
/// Qwen2-VL: <|vision_start|><|image_pad|><|vision_end|>
163166
/// If the prompt doesn't contain image tags, but images are
164167
/// provided, the tags are prepended to the prompt.

src/cpp/src/llm_pipeline_static.cpp

-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
#include "utils.hpp"
88

99
#include <fstream>
10-
#include <regex>
1110

1211
#include "openvino/runtime/core.hpp"
1312
#include "openvino/core/parallel.hpp"

src/cpp/src/lm_encoding.cpp

-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
#include <iostream>
77
#include <numeric>
88
#include <random>
9-
#include <regex>
109
#include <vector>
1110

1211
#include "utils.hpp"

src/cpp/src/lora_adapter.cpp

-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
#include <string>
88
#include <vector>
99
#include <fstream>
10-
#include <regex>
1110
#include <optional>
1211
#include <numeric>
1312
#include <iostream>

src/cpp/src/lora_common.hpp

-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
#include <memory>
88
#include <string>
99
#include <optional>
10-
#include <regex>
1110
#include <vector>
1211

1312
#include "openvino/op/constant.hpp"

src/cpp/src/lora_names_mapping.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include <string>
44
#include <unordered_map>
55
#include <set>
6+
#include <regex>
67

78
#include <openvino/genai/lora_adapter.hpp>
89

src/cpp/src/visual_language/inputs_embedder.cpp

+35-45
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,6 @@
1616
#include "visual_language/internvl_chat/classes.hpp"
1717

1818
#include "utils.hpp"
19-
#include <regex>
20-
21-
namespace {
22-
23-
std::regex UNIVERSAL_PATTERN{R"(<ov_genai_image_(\d+)>)"};
24-
25-
}
2619

2720
namespace ov::genai {
2821

@@ -296,49 +289,46 @@ bool InputsEmbedder::prompt_has_image_tag(const std::string& prompt) const {
296289
return m_impl->prompt_has_image_tag(prompt);
297290
}
298291

299-
std::pair<std::string, std::vector<size_t>> unify_prompt(
292+
void verify_ids(const std::vector<size_t>& image_ids, size_t base_id, size_t n_images) {
293+
for (size_t idx : image_ids) {
294+
OPENVINO_ASSERT(base_id <= idx, "Referring to older images isn't implemented");
295+
OPENVINO_ASSERT(idx < base_id + n_images, "Missing image ", idx);
296+
}
297+
}
298+
299+
std::pair<std::string, std::vector<size_t>> normalize_prompt(
300300
const std::string& prompt,
301301
const std::string& native_tag,
302-
const std::string& unified_tag_to_native_tag,
303-
size_t n_new_images,
304-
size_t first_new_image_id
302+
const std::string& automatic_tag,
303+
size_t base_id,
304+
size_t n_images
305305
) {
306-
bool found_universal_tag = std::regex_search(prompt, UNIVERSAL_PATTERN);
307-
bool found_native_tag = prompt.find(native_tag) != std::string::npos;
308-
OPENVINO_ASSERT(!(found_universal_tag && found_native_tag), "Prompt can contain only one type of image tags.");
309-
std::stringstream images_prompt;
310-
if (!found_universal_tag && ! found_native_tag) {
311-
for (size_t i = first_new_image_id; i < n_new_images + first_new_image_id; ++i) {
312-
images_prompt << "<ov_genai_image_" << i << ">";
313-
}
306+
size_t pos = prompt.find(native_tag);
307+
auto [image_prompt, image_sequence] = universal_to_native(prompt, [&](std::ostream& os, size_t) {
308+
os << automatic_tag;
309+
});
310+
if (!image_sequence.empty()) {
311+
OPENVINO_ASSERT(pos == std::string::npos, "Prompt can contain only one type of image tags.");
312+
verify_ids(image_sequence, base_id, n_images);
313+
return {std::move(image_prompt), std::move(image_sequence)};
314314
}
315-
images_prompt << prompt;
316-
317-
std::vector<size_t> images_sequence;
318-
std::string unified_prompt = images_prompt.str();
319-
std::sregex_iterator end_it;
320-
if (found_native_tag) {
321-
size_t pos = 0;
322-
while ((pos = unified_prompt.find(native_tag, pos)) != std::string::npos) {
323-
images_sequence.push_back(first_new_image_id + images_sequence.size());
324-
pos += native_tag.length();
325-
}
326-
OPENVINO_ASSERT(images_sequence.size() == n_new_images);
327-
} else {
328-
bool found = true;
329-
while (found) {
330-
found = false;
331-
for (std::sregex_iterator it(unified_prompt.begin(), unified_prompt.end(), UNIVERSAL_PATTERN); it != end_it; ++it) {
332-
images_sequence.push_back(std::stoi((*it)[1].str()));
333-
OPENVINO_ASSERT(images_sequence.back() < n_new_images + first_new_image_id, "Missing image ", images_sequence.back());
334-
OPENVINO_ASSERT(first_new_image_id <= images_sequence.back(), "Referring to older images isn't implemented");
335-
unified_prompt.replace(it->position(), it->length(), unified_tag_to_native_tag);
336-
found = true;
337-
break;
338-
}
339-
}
315+
// Restore ids from native tags
316+
while (pos != std::string::npos) {
317+
image_sequence.push_back(base_id + image_sequence.size());
318+
pos = prompt.find(native_tag, pos + native_tag.length());
319+
}
320+
if (!image_sequence.empty()) {
321+
OPENVINO_ASSERT(image_sequence.size() == n_images, "The number of native image tags and provided images must match because it's ambiguous which image should be ignored.");
322+
return {std::move(image_prompt), std::move(image_sequence)};
323+
}
324+
// Prepend automatic tags
325+
std::stringstream stream;
326+
for (size_t relative_id = 0; relative_id < n_images; relative_id++) {
327+
image_sequence.push_back(base_id + relative_id);
328+
stream << automatic_tag;
340329
}
341-
return {std::move(unified_prompt), std::move(images_sequence)};
330+
stream << prompt;
331+
return {stream.str(), std::move(image_sequence)};
342332
}
343333

344334
} // namespace ov::genai

src/cpp/src/visual_language/inputs_embedder.hpp

+36-12
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include <string>
77
#include <vector>
88
#include <filesystem>
9+
#include <regex>
910

1011
#include "utils.hpp"
1112
#include "lm_encoding.hpp"
@@ -19,6 +20,7 @@
1920

2021
namespace ov::genai {
2122
struct VLMPerfMetrics;
23+
const static std::regex UNIVERSAL_PATTERN{R"(<ov_genai_image_(\d+)>)"};
2224

2325
class InputsEmbedder {
2426
public:
@@ -168,23 +170,45 @@ class InputsEmbedder {
168170
friend class InputsEmbedderQwen2VL;
169171
};
170172

171-
/// @brief Check if universal tag is given.
172-
/// Check if native tag is given.
173-
/// Assert different tag aren't mixed.
174-
/// If no any tag, prepend universal image tag.
175-
/// If native tag, assume incremental image order.
176-
/// Else replace universal tags with native tags and save image order.
177-
/// @param unified_tag_to_native_tag MiniCPM-V-2_6 inserts
173+
template <typename Func>
174+
std::pair<std::string, std::vector<size_t>> universal_to_native(
175+
const std::string& prompt,
176+
const Func& write_native
177+
) {
178+
std::stringstream stream;
179+
std::vector<size_t> image_sequence;
180+
std::smatch match;
181+
std::regex_search(prompt, match, UNIVERSAL_PATTERN);
182+
auto search_begin = prompt.begin();
183+
while (!match.empty()) {
184+
stream.write(&*search_begin, match.position());
185+
image_sequence.push_back(std::stoul(match.str(1)));
186+
write_native(stream, image_sequence.back());
187+
search_begin = match.suffix().first;
188+
std::regex_search(search_begin, prompt.end(), match, UNIVERSAL_PATTERN);
189+
}
190+
stream.write(&*search_begin, prompt.end() - search_begin);
191+
return {stream.str(), std::move(image_sequence)};
192+
}
193+
194+
void verify_ids(const std::vector<size_t>& image_ids, size_t base_id, size_t n_images);
195+
196+
/// @brief 1. Verify native and universal tags aren't mixed.
197+
/// 2. Replace universal tags with native and save image order.
198+
/// 3. If there were no universal tags, restore image order from native.
199+
/// 4. If no tags were found, prepend native tags and assume incremental
200+
/// ordering.
201+
/// @param automatic_tag MiniCPM-V-2_6 inserts
178202
/// (<image>./</image>)\n per image but it only replaces
179203
/// <image>./</image> leaving ()\n untouched.
180-
/// unified_tag_to_native_tag allows to handle this by being separated
204+
/// Keeping automatic_tag separate
181205
/// from the native_tag param makes it possible to handle this.
182-
std::pair<std::string, std::vector<size_t>> unify_prompt(
206+
std::pair<std::string, std::vector<size_t>> normalize_prompt(
183207
const std::string& prompt,
184208
const std::string& native_tag,
185-
const std::string& unified_tag_to_native_tag,
186-
size_t n_new_images,
187-
size_t first_new_image_id
209+
const std::string& automatic_tag,
210+
size_t base_id,
211+
size_t n_images
188212
);
189213

190214
} // namespace ov::genai

src/cpp/src/visual_language/minicpm/classes.cpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -581,12 +581,12 @@ InputsEmbedderMiniCPM::InputsEmbedderMiniCPM(
581581
}
582582

583583
ov::Tensor InputsEmbedderMiniCPM::get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) {
584-
auto [unified_prompt, images_sequence] = unify_prompt(
584+
auto [unified_prompt, images_sequence] = normalize_prompt(
585585
prompt,
586586
NATIVE_TAG,
587587
'(' + NATIVE_TAG + ")\n",
588-
images.size(),
589-
m_image_id
588+
m_image_id,
589+
images.size()
590590
);
591591

592592
std::string unk64;

0 commit comments

Comments
 (0)