openvinotoolkit
diff --git a/‎src/cpp/include/openvino/genai/visual_language/pipeline.hpp
+3 b/‎src/cpp/include/openvino/genai/visual_language/pipeline.hpp
+3
diff --git a/‎src/cpp/src/llm_pipeline_static.cpp
-1 b/‎src/cpp/src/llm_pipeline_static.cpp
-1
diff --git a/‎src/cpp/src/lm_encoding.cpp
-1 b/‎src/cpp/src/lm_encoding.cpp
-1
diff --git a/‎src/cpp/src/lora_adapter.cpp
-1 b/‎src/cpp/src/lora_adapter.cpp
-1
diff --git a/‎src/cpp/src/lora_common.hpp
-1 b/‎src/cpp/src/lora_common.hpp
-1
diff --git a/‎src/cpp/src/lora_names_mapping.hpp
+1 b/‎src/cpp/src/lora_names_mapping.hpp
+1
diff --git a/‎src/cpp/src/visual_language/inputs_embedder.cpp
+35-45 b/‎src/cpp/src/visual_language/inputs_embedder.cpp
+35-45
diff --git a/‎src/cpp/src/visual_language/inputs_embedder.hpp
+36-12 b/‎src/cpp/src/visual_language/inputs_embedder.hpp
+36-12
diff --git a/‎src/cpp/src/visual_language/minicpm/classes.cpp
+3-3 b/‎src/cpp/src/visual_language/minicpm/classes.cpp
+3-3
@@ -110,6 +110,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
     /// A model's native image tag can be used instead of
     /// <ov_genai_image_i>. These tags are:
     /// MiniCPM-V-2_6: (<image>./</image>)\n
+    /// Phi-3-vision: <|image_i|>\n - the index starts at one
     /// Qwen2-VL: <|vision_start|><|image_pad|><|vision_end|>
     /// If the prompt doesn't contain image tags, but images are
     /// provided, the tags are prepended to the prompt.
@@ -134,6 +135,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
     /// A model's native image tag can be used instead of
     /// <ov_genai_image_i>. These tags are:
     /// MiniCPM-V-2_6: (<image>./</image>)\n
+    /// Phi-3-vision: <|image_i|>\n - the index starts at one
     /// Qwen2-VL: <|vision_start|><|image_pad|><|vision_end|>
     /// If the prompt doesn't contain image tags, but images are
     /// provided, the tags are prepended to the prompt.
@@ -159,6 +161,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
     /// A model's native image tag can be used instead of
     /// <ov_genai_image_i>. These tags are:
     /// MiniCPM-V-2_6: (<image>./</image>)\n
+    /// Phi-3-vision: <|image_i|>\n - the index starts at one
     /// Qwen2-VL: <|vision_start|><|image_pad|><|vision_end|>
     /// If the prompt doesn't contain image tags, but images are
     /// provided, the tags are prepended to the prompt.
 
@@ -7,7 +7,6 @@
 #include "utils.hpp"
 
 #include <fstream>
-#include <regex>
 
 #include "openvino/runtime/core.hpp"
 #include "openvino/core/parallel.hpp"
 
@@ -6,7 +6,6 @@
 #include <iostream>
 #include <numeric>
 #include <random>
-#include <regex>
 #include <vector>
 
 #include "utils.hpp"
 
@@ -7,7 +7,6 @@
 #include <string>
 #include <vector>
 #include <fstream>
-#include <regex>
 #include <optional>
 #include <numeric>
 #include <iostream>
 
@@ -7,7 +7,6 @@
 #include <memory>
 #include <string>
 #include <optional>
-#include <regex>
 #include <vector>
 
 #include "openvino/op/constant.hpp"
 
@@ -3,6 +3,7 @@
 #include <string>
 #include <unordered_map>
 #include <set>
+#include <regex>
 
 #include <openvino/genai/lora_adapter.hpp>
 
 
@@ -16,13 +16,6 @@
 #include "visual_language/internvl_chat/classes.hpp"
 
 #include "utils.hpp"
-#include <regex>
-
-namespace {
-
-std::regex UNIVERSAL_PATTERN{R"(<ov_genai_image_(\d+)>)"};
-
-}
 
 namespace ov::genai {
 
@@ -296,49 +289,46 @@ bool InputsEmbedder::prompt_has_image_tag(const std::string& prompt) const {
     return m_impl->prompt_has_image_tag(prompt);
 }
 
-std::pair<std::string, std::vector<size_t>> unify_prompt(
+void verify_ids(const std::vector<size_t>& image_ids, size_t base_id, size_t n_images) {
+    // Each id has to fall into [base_id, base_id + n_images): smaller ids
+    // refer to images from earlier calls, bigger ones to images that weren't
+    // provided. The checked expressions are left untouched on purpose -
+    // presumably OPENVINO_ASSERT echoes them in its message (TODO confirm).
+    for (size_t i = 0; i < image_ids.size(); ++i) {
+        const size_t idx = image_ids[i];
+        OPENVINO_ASSERT(base_id <= idx, "Referring to older images isn't implemented");
+        OPENVINO_ASSERT(idx < base_id + n_images, "Missing image ", idx);
+    }
+}
+
+std::pair<std::string, std::vector<size_t>> normalize_prompt(
     const std::string& prompt,
     const std::string& native_tag,
-    const std::string& unified_tag_to_native_tag,
-    size_t n_new_images,
-    size_t first_new_image_id
+    const std::string& automatic_tag,
+    size_t base_id,
+    size_t n_images
 ) {
-    bool found_universal_tag = std::regex_search(prompt, UNIVERSAL_PATTERN);
-    bool found_native_tag = prompt.find(native_tag) != std::string::npos;
-    OPENVINO_ASSERT(!(found_universal_tag && found_native_tag), "Prompt can contain only one type of image tags.");
-    std::stringstream images_prompt;
-    if (!found_universal_tag && ! found_native_tag) {
-        for (size_t i = first_new_image_id; i < n_new_images + first_new_image_id; ++i) {
-            images_prompt << "<ov_genai_image_" << i << ">";
-        }
+    size_t pos = prompt.find(native_tag);
+    auto [image_prompt, image_sequence] = universal_to_native(prompt, [&](std::ostream& os, size_t) {
+        os << automatic_tag;
+    });
+    if (!image_sequence.empty()) {
+        OPENVINO_ASSERT(pos == std::string::npos, "Prompt can contain only one type of image tags.");
+        verify_ids(image_sequence, base_id, n_images);
+        return {std::move(image_prompt), std::move(image_sequence)};
     }
-    images_prompt << prompt;
-
-    std::vector<size_t> images_sequence;
-    std::string unified_prompt = images_prompt.str();
-    std::sregex_iterator end_it;
-    if (found_native_tag) {
-        size_t pos = 0;
-        while ((pos = unified_prompt.find(native_tag, pos)) != std::string::npos) {
-            images_sequence.push_back(first_new_image_id + images_sequence.size());
-            pos += native_tag.length();
-        }
-        OPENVINO_ASSERT(images_sequence.size() == n_new_images);
-    } else {
-        bool found = true;
-        while (found) {
-            found = false;
-            for (std::sregex_iterator it(unified_prompt.begin(), unified_prompt.end(), UNIVERSAL_PATTERN); it != end_it; ++it) {
-                images_sequence.push_back(std::stoi((*it)[1].str()));
-                OPENVINO_ASSERT(images_sequence.back() < n_new_images + first_new_image_id, "Missing image ", images_sequence.back());
-                OPENVINO_ASSERT(first_new_image_id <= images_sequence.back(), "Referring to older images isn't implemented");
-                unified_prompt.replace(it->position(), it->length(), unified_tag_to_native_tag);
-                found = true;
-                break;
-            }
-        }
+    // No universal tags were found: every native_tag occurrence takes the next id counting up from base_id; the prompt text itself is returned unchanged
+    while (pos != std::string::npos) {
+        image_sequence.push_back(base_id + image_sequence.size());
+        pos = prompt.find(native_tag, pos + native_tag.length());
+    }
+    if (!image_sequence.empty()) {
+        OPENVINO_ASSERT(image_sequence.size() == n_images, "The number of native image tags and provided images must match because it's ambiguous which image should be ignored.");
+        return {std::move(image_prompt), std::move(image_sequence)};
+    }
+    // No tags of either kind: put one automatic_tag per image in front of the prompt and number the images base_id, base_id + 1, ...
+    std::stringstream stream;
+    for (size_t relative_id = 0; relative_id < n_images; relative_id++) {
+        image_sequence.push_back(base_id + relative_id);
+        stream << automatic_tag;
     }
-    return {std::move(unified_prompt), std::move(images_sequence)};
+    stream << prompt;
+    return {stream.str(), std::move(image_sequence)};
 }
 
 } // namespace ov::genai
@@ -6,6 +6,7 @@
 #include <string>
 #include <vector>
 #include <filesystem>
+#include <regex>
 
 #include "utils.hpp"
 #include "lm_encoding.hpp"
@@ -19,6 +20,7 @@
 
 namespace ov::genai {
 struct VLMPerfMetrics;
+/// Matches the universal image tag <ov_genai_image_N> and captures the
+/// decimal index N. Declared inline so that every translation unit including
+/// this header shares a single object: header-defined templates refer to
+/// it, and a per-TU internal-linkage copy would make them violate the ODR.
+inline const std::regex UNIVERSAL_PATTERN{R"(<ov_genai_image_(\d+)>)"};
 
 class InputsEmbedder {
 public:
@@ -168,23 +170,45 @@ class InputsEmbedder {
     friend class InputsEmbedderQwen2VL;
 };
 
-/// @brief Check if universal tag is given.
-/// Check if native tag is given.
-/// Assert different tag aren't mixed.
-/// If no any tag, prepend universal image tag.
-/// If native tag, assume incremental image order.
-/// Else replace universal tags with native tags and save image order.
-/// @param unified_tag_to_native_tag MiniCPM-V-2_6 inserts
+/// @brief Replace every universal tag (UNIVERSAL_PATTERN) in the prompt with
+/// text produced by a callback and collect the ids the tags refer to.
+/// @param prompt Text which may contain universal image tags.
+/// @param write_native Callable invoked as write_native(stream, id) for each
+/// tag in order of appearance. Whatever it writes to the stream takes
+/// the place of the tag.
+/// @return The rewritten prompt and the ids in order of appearance. The ids
+/// aren't range checked here.
+/// @throw std::out_of_range Propagated from std::stoul if an id doesn't fit
+/// into unsigned long.
+template <typename Func>
+std::pair<std::string, std::vector<size_t>> universal_to_native(
+    const std::string& prompt,
+    const Func& write_native
+) {
+    std::stringstream stream;
+    std::vector<size_t> image_sequence;
+    std::smatch match;
+    auto search_begin = prompt.begin();
+    while (std::regex_search(search_begin, prompt.end(), match, UNIVERSAL_PATTERN)) {
+        // Copy the text preceding the tag, then let the callback emit the replacement.
+        stream.write(prompt.data() + (search_begin - prompt.begin()), match.position());
+        image_sequence.push_back(std::stoul(match.str(1)));
+        write_native(stream, image_sequence.back());
+        search_begin = match.suffix().first;
+    }
+    // search_begin equals prompt.end() when the prompt is empty or ends with
+    // a tag. Dereferencing the end iterator is undefined, so the tail is
+    // addressed through data() plus an offset instead of &*search_begin.
+    stream.write(prompt.data() + (search_begin - prompt.begin()), prompt.end() - search_begin);
+    return {stream.str(), std::move(image_sequence)};
+}
+
+void verify_ids(const std::vector<size_t>& image_ids, size_t base_id, size_t n_images);
+
+/// @brief 1. Verify native and universal tags aren't mixed.
+/// 2. Replace universal tags with automatic_tag and save image order.
+/// 3. If there were no universal tags, restore image order from native.
+/// 4. If no tags were found, prepend automatic_tag per image and assume
+/// incremental ordering.
+/// @param automatic_tag MiniCPM-V-2_6 inserts
 /// (<image>./</image>)\n per image but it only replaces
 /// <image>./</image> leaving ()\n untouched.
-/// unified_tag_to_native_tag allows to handle this by being separated
+/// automatic_tag allows handling this by being separated
 /// from native_tag param.
-std::pair<std::string, std::vector<size_t>> unify_prompt(
+std::pair<std::string, std::vector<size_t>> normalize_prompt(
     const std::string& prompt,
     const std::string& native_tag,
-    const std::string& unified_tag_to_native_tag,
-    size_t n_new_images,
-    size_t first_new_image_id
+    const std::string& automatic_tag,
+    size_t base_id,
+    size_t n_images
 );
 
 } // namespace ov::genai
@@ -581,12 +581,12 @@ InputsEmbedderMiniCPM::InputsEmbedderMiniCPM(
 }
 
 ov::Tensor InputsEmbedderMiniCPM::get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) {
-    auto [unified_prompt, images_sequence] = unify_prompt(
+    auto [unified_prompt, images_sequence] = normalize_prompt(
         prompt,
         NATIVE_TAG,
         '(' + NATIVE_TAG + ")\n",
-        images.size(),
-        m_image_id
+        m_image_id,
+        images.size()
     );
 
     std::string unk64;