@@ -71,15 +71,21 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::p
     auto tokenizer = ov::genai::Tokenizer(directory, tokenizer_properties);
     auto generation_config = utils::from_config_json_if_exists(directory);
 
+    std::shared_ptr<InputsEmbedder> embedder;
+    if (std::filesystem::exists(directory / "openvino_text_embeddings_model.xml")) {
+        embedder = std::make_shared<InputsEmbedder>(directory, device, properties);
+    }
+
     if (is_prompt_lookup_enabled) {
         OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
+        OPENVINO_ASSERT(embedder == nullptr, "Prompt lookup decoding is not supported for models with embeddings");
         m_impl = std::make_shared<PromptLookupImpl>(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config);
     } else if (draft_model_desr.model != nullptr) {
+        OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings");
         auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
         m_impl = std::make_shared<SpeculativeDecodingImpl>(main_model_descr, draft_model_desr);
-    } else if (std::filesystem::exists(directory / "openvino_text_embeddings_model.xml")) {
-        auto inputs_embedder = std::make_shared<InputsEmbedder>(directory, device, properties);
-        m_impl = std::make_shared<ContinuousBatchingImpl>(model, inputs_embedder, tokenizer, scheduler_config, device, properties, generation_config);
+    } else if (embedder) {
+        m_impl = std::make_shared<ContinuousBatchingImpl>(model, embedder, tokenizer, scheduler_config, device, properties, generation_config);
     }
     else {
         m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
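The detect-then-dispatch sequence above is repeated in all three constructors touched by this diff. A minimal sketch of how the shared detection step could be factored out — `try_make_embedder` is a hypothetical helper for illustration, not part of this diff:

```cpp
// Hypothetical helper (illustrative only, not in this diff): returns an
// InputsEmbedder when the optional text-embeddings model is present in the
// export directory, and nullptr for plain text-only models.
static std::shared_ptr<InputsEmbedder> try_make_embedder(
    const std::filesystem::path& directory,
    const std::string& device,
    const ov::AnyMap& properties) {
    if (std::filesystem::exists(directory / "openvino_text_embeddings_model.xml")) {
        return std::make_shared<InputsEmbedder>(directory, device, properties);
    }
    return nullptr;
}
```

Each constructor could then reduce its embedder setup to a single call while keeping the mutual-exclusion asserts at the dispatch sites.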
@@ -112,16 +118,21 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
     }
     auto model = utils::singleton_core().read_model(model_path, {}, properties_without_draft_model);
     auto generation_config = utils::from_config_json_if_exists(directory);
+    std::shared_ptr<InputsEmbedder> embedder;
+    if (std::filesystem::exists(directory / "openvino_text_embeddings_model.xml")) {
+        embedder = std::make_shared<InputsEmbedder>(directory, device, properties);
+    }
 
     if (is_prompt_lookup_enabled) {
         OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
+        OPENVINO_ASSERT(embedder == nullptr, "Prompt lookup decoding is not supported for models with embeddings");
         m_impl = std::make_shared<PromptLookupImpl>(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config);
     } else if (draft_model_desr.model != nullptr) {
+        OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings");
         auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
         m_impl = std::make_shared<SpeculativeDecodingImpl>(main_model_descr, draft_model_desr);
-    } else if (std::filesystem::exists(directory / "openvino_text_embeddings_model.xml")) {
-        auto inputs_embedder = std::make_shared<InputsEmbedder>(directory, device, properties);
-        m_impl = std::make_shared<ContinuousBatchingImpl>(model, inputs_embedder, tokenizer, scheduler_config, device, properties, generation_config);
+    } else if (embedder) {
+        m_impl = std::make_shared<ContinuousBatchingImpl>(model, embedder, tokenizer, scheduler_config, device, properties, generation_config);
     } else {
         m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
     }
@@ -144,20 +155,71 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
     auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
     auto model = utils::singleton_core().read_model(model_str, weights_tensor);
     auto rt_info = model->get_rt_info();
+    std::shared_ptr<InputsEmbedder> embedder = nullptr;
     std::filesystem::path directory = "";
     if (rt_info.find("__weights_path") != rt_info.end()) {
         std::string weights_path = rt_info.at("__weights_path").as<std::string>();
         directory = std::filesystem::path(weights_path).parent_path();
+        if (std::filesystem::exists(directory / "openvino_text_embeddings_model.xml")) {
+            embedder = std::make_shared<InputsEmbedder>(directory, device, properties);
+        }
+    }
+    if (is_prompt_lookup_enabled) {
+        OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
+        OPENVINO_ASSERT(embedder == nullptr, "Prompt lookup decoding is not supported for models with embeddings");
+        m_impl = std::make_shared<PromptLookupImpl>(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config);
+    } else if (draft_model_desr.model != nullptr) {
+        OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings");
+        auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
+        m_impl = std::make_shared<SpeculativeDecodingImpl>(main_model_descr, draft_model_desr);
+    } else if (embedder) {
+        m_impl = std::make_shared<ContinuousBatchingImpl>(model, embedder, tokenizer, scheduler_config, device, properties, generation_config);
+    } else {
+        m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
+    }
+
+    m_impl->m_load_time_ms = get_load_time(start_time);
+}
+
+ContinuousBatchingPipeline::ContinuousBatchingPipeline(
+    const ModelsMap& models_map,
+    const ov::genai::Tokenizer& tokenizer,
+    const SchedulerConfig& scheduler_config,
+    const std::string& device,
+    std::optional<std::filesystem::path> embedder_config_dir_path,
+    const ov::AnyMap& properties,
+    const ov::genai::GenerationConfig& generation_config) {
+    auto start_time = std::chrono::steady_clock::now();
+
+    auto properties_without_draft_model = properties;
+    auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);
+    auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
+    auto model_pair = utils::get_model_weights_pair(models_map, "language");
+    auto model = utils::singleton_core().read_model(model_pair.first, model_pair.second);
+    auto rt_info = model->get_rt_info();
+    std::filesystem::path directory = "";
+    std::shared_ptr<InputsEmbedder> embedder = nullptr;
+    if (embedder_config_dir_path.has_value()) {
+        auto path = *embedder_config_dir_path;
+        embedder = std::make_shared<InputsEmbedder>(models_map, tokenizer, path, device, properties);
+    }
+    else if (rt_info.find("__weights_path") != rt_info.end()) {
+        std::string weights_path = rt_info.at("__weights_path").as<std::string>();
+        directory = std::filesystem::path(weights_path).parent_path();
+        if (std::filesystem::exists(directory / "openvino_text_embeddings_model.xml")) {
+            embedder = std::make_shared<InputsEmbedder>(directory, device, properties);
+        }
     }
     if (is_prompt_lookup_enabled) {
         OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
+        OPENVINO_ASSERT(embedder == nullptr, "Prompt lookup decoding is not supported for models with embeddings");
         m_impl = std::make_shared<PromptLookupImpl>(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config);
     } else if (draft_model_desr.model != nullptr) {
+        OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings");
         auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
         m_impl = std::make_shared<SpeculativeDecodingImpl>(main_model_descr, draft_model_desr);
-    } else if (std::filesystem::exists(directory / "openvino_text_embeddings_model.xml")) {
-        auto inputs_embedder = std::make_shared<InputsEmbedder>(directory, device, properties);
-        m_impl = std::make_shared<ContinuousBatchingImpl>(model, inputs_embedder, tokenizer, scheduler_config, device, properties, generation_config);
+    } else if (embedder) {
+        m_impl = std::make_shared<ContinuousBatchingImpl>(model, embedder, tokenizer, scheduler_config, device, properties, generation_config);
     } else {
         m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
     }
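For reference, a sketch of how the new `ModelsMap` overload might be invoked by a caller that already holds pre-read model sources. The variable names and property values here are illustrative assumptions, not taken from this diff:

```cpp
// Illustrative caller (assumed setup, not from this diff): constructs the
// pipeline from in-memory model sources. Passing embedder_config_dir_path
// forces InputsEmbedder creation from the ModelsMap; passing std::nullopt
// falls back to the __weights_path rt_info probe shown above.
ov::genai::Tokenizer tokenizer(models_dir);
ContinuousBatchingPipeline pipeline(
    models_map,           // assumed to contain a "language" model/weights pair
    tokenizer,
    scheduler_config,
    "CPU",
    models_dir,           // embedder_config_dir_path, for the embeddings case
    properties,
    ov::genai::GenerationConfig());
```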