@@ -71,15 +71,21 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::p
     auto tokenizer = ov::genai::Tokenizer(directory, tokenizer_properties);
     auto generation_config = utils::from_config_json_if_exists(directory);
 
+    std::shared_ptr<InputsEmbedder> embedder;
+    if (std::filesystem::exists(directory / "openvino_text_embeddings_model.xml")) {
+        embedder = std::make_shared<InputsEmbedder>(directory, device, properties);
+    }
+
     if (is_prompt_lookup_enabled) {
         OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
+        OPENVINO_ASSERT(embedder == nullptr, "Prompt lookup decoding is not supported for models with embeddings");
         m_impl = std::make_shared<PromptLookupImpl>(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config);
     } else if (draft_model_desr.model != nullptr) {
+        OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings");
         auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
         m_impl = std::make_shared<SpeculativeDecodingImpl>(main_model_descr, draft_model_desr);
-    } else if (std::filesystem::exists(directory / "openvino_text_embeddings_model.xml")) {
-        auto inputs_embedder = std::make_shared<InputsEmbedder>(directory, device, properties);
-        m_impl = std::make_shared<ContinuousBatchingImpl>(model, inputs_embedder, tokenizer, scheduler_config, device, properties, generation_config);
+    } else if (embedder) {
+        m_impl = std::make_shared<ContinuousBatchingImpl>(model, embedder, tokenizer, scheduler_config, device, properties, generation_config);
     }
     else {
         m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
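The detect-then-dispatch sequence above is repeated in all three constructors touched by this diff. A minimal sketch of how the shared detection step could be factored out — `try_make_embedder` is a hypothetical helper for illustration, not part of this diff:

```cpp
// Hypothetical helper (illustrative only, not in this diff): returns an
// InputsEmbedder when the optional text-embeddings model is present in the
// export directory, and nullptr for plain text-only models.
static std::shared_ptr<InputsEmbedder> try_make_embedder(
    const std::filesystem::path& directory,
    const std::string& device,
    const ov::AnyMap& properties) {
    if (std::filesystem::exists(directory / "openvino_text_embeddings_model.xml")) {
        return std::make_shared<InputsEmbedder>(directory, device, properties);
    }
    return nullptr;
}
```

Each constructor could then reduce its embedder setup to a single call while keeping the mutual-exclusion asserts at the dispatch sites.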
@@ -112,16 +118,21 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
     }
     auto model = utils::singleton_core().read_model(model_path, {}, properties_without_draft_model);
     auto generation_config = utils::from_config_json_if_exists(directory);
+    std::shared_ptr<InputsEmbedder> embedder;
+    if (std::filesystem::exists(directory / "openvino_text_embeddings_model.xml")) {
+        embedder = std::make_shared<InputsEmbedder>(directory, device, properties);
+    }
 
     if (is_prompt_lookup_enabled) {
         OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
+        OPENVINO_ASSERT(embedder == nullptr, "Prompt lookup decoding is not supported for models with embeddings");
         m_impl = std::make_shared<PromptLookupImpl>(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config);
     } else if (draft_model_desr.model != nullptr) {
+        OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings");
         auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
         m_impl = std::make_shared<SpeculativeDecodingImpl>(main_model_descr, draft_model_desr);
-    } else if (std::filesystem::exists(directory / "openvino_text_embeddings_model.xml")) {
-        auto inputs_embedder = std::make_shared<InputsEmbedder>(directory, device, properties);
-        m_impl = std::make_shared<ContinuousBatchingImpl>(model, inputs_embedder, tokenizer, scheduler_config, device, properties, generation_config);
+    } else if (embedder) {
+        m_impl = std::make_shared<ContinuousBatchingImpl>(model, embedder, tokenizer, scheduler_config, device, properties, generation_config);
     } else {
         m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
     }
@@ -144,20 +155,71 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
     auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
     auto model = utils::singleton_core().read_model(model_str, weights_tensor);
     auto rt_info = model->get_rt_info();
+    std::shared_ptr<InputsEmbedder> embedder = nullptr;
     std::filesystem::path directory = "";
     if (rt_info.find("__weights_path") != rt_info.end()) {
         std::string weights_path = rt_info.at("__weights_path").as<std::string>();
         directory = std::filesystem::path(weights_path).parent_path();
+        if (std::filesystem::exists(directory / "openvino_text_embeddings_model.xml")) {
+            embedder = std::make_shared<InputsEmbedder>(directory, device, properties);
+        }
+    }
+    if (is_prompt_lookup_enabled) {
+        OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
+        OPENVINO_ASSERT(embedder == nullptr, "Prompt lookup decoding is not supported for models with embeddings");
+        m_impl = std::make_shared<PromptLookupImpl>(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config);
+    } else if (draft_model_desr.model != nullptr) {
+        OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings");
+        auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
+        m_impl = std::make_shared<SpeculativeDecodingImpl>(main_model_descr, draft_model_desr);
+    } else if (embedder) {
+        m_impl = std::make_shared<ContinuousBatchingImpl>(model, embedder, tokenizer, scheduler_config, device, properties, generation_config);
+    } else {
+        m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
+    }
+
+    m_impl->m_load_time_ms = get_load_time(start_time);
+}
+
+ContinuousBatchingPipeline::ContinuousBatchingPipeline(
+    const ModelsMap& models_map,
+    const ov::genai::Tokenizer& tokenizer,
+    const SchedulerConfig& scheduler_config,
+    const std::string& device,
+    std::optional<std::filesystem::path> embedder_config_dir_path,
+    const ov::AnyMap& properties,
+    const ov::genai::GenerationConfig& generation_config) {
+    auto start_time = std::chrono::steady_clock::now();
+
+    auto properties_without_draft_model = properties;
+    auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);
+    auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
+    auto model_pair = utils::get_model_weights_pair(models_map, "language");
+    auto model = utils::singleton_core().read_model(model_pair.first, model_pair.second);
+    auto rt_info = model->get_rt_info();
+    std::filesystem::path directory = "";
+    std::shared_ptr<InputsEmbedder> embedder = nullptr;
+    if (embedder_config_dir_path.has_value()) {
+        auto path = *embedder_config_dir_path;
+        embedder = std::make_shared<InputsEmbedder>(models_map, tokenizer, path, device, properties);
+    }
+    else if (rt_info.find("__weights_path") != rt_info.end()) {
+        std::string weights_path = rt_info.at("__weights_path").as<std::string>();
+        directory = std::filesystem::path(weights_path).parent_path();
+        if (std::filesystem::exists(directory / "openvino_text_embeddings_model.xml")) {
+            embedder = std::make_shared<InputsEmbedder>(directory, device, properties);
+        }
     }
     if (is_prompt_lookup_enabled) {
         OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
+        OPENVINO_ASSERT(embedder == nullptr, "Prompt lookup decoding is not supported for models with embeddings");
         m_impl = std::make_shared<PromptLookupImpl>(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config);
     } else if (draft_model_desr.model != nullptr) {
+        OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings");
         auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
         m_impl = std::make_shared<SpeculativeDecodingImpl>(main_model_descr, draft_model_desr);
-    } else if (std::filesystem::exists(directory / "openvino_text_embeddings_model.xml")) {
-        auto inputs_embedder = std::make_shared<InputsEmbedder>(directory, device, properties);
-        m_impl = std::make_shared<ContinuousBatchingImpl>(model, inputs_embedder, tokenizer, scheduler_config, device, properties, generation_config);
+    } else if (embedder) {
+        m_impl = std::make_shared<ContinuousBatchingImpl>(model, embedder, tokenizer, scheduler_config, device, properties, generation_config);
     } else {
         m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
     }
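For reference, a sketch of how the new `ModelsMap` overload might be invoked by a caller that already holds pre-read model sources. The variable names and property values here are illustrative assumptions, not taken from this diff:

```cpp
// Illustrative caller (assumed setup, not from this diff): constructs the
// pipeline from in-memory model sources. Passing embedder_config_dir_path
// forces InputsEmbedder creation from the ModelsMap; passing std::nullopt
// falls back to the __weights_path rt_info probe shown above.
ov::genai::Tokenizer tokenizer(models_dir);
ContinuousBatchingPipeline pipeline(
    models_map,           // assumed to contain a "language" model/weights pair
    tokenizer,
    scheduler_config,
    "CPU",
    models_dir,           // embedder_config_dir_path, for the embeddings case
    properties,
    ov::genai::GenerationConfig());
```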