@@ -57,7 +57,16 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
         // TODO: remove this code and within model runner add check: if sequence group type is tokens,
         // but embedding model is available => compute embeddings first, then pass to LLM
         std::vector<std::vector<ov::Tensor>> images(prompts.size());
-        return generate(prompts, images, sampling_params, streamer);
+        auto results_vlm = generate(prompts, images, sampling_params, streamer);
+        std::vector<GenerationResult> results;
+        for (auto& vlm_result : results_vlm) {
+            GenerationResult result;
+            result.m_generation_ids = std::move(vlm_result.texts);
+            result.m_scores = std::move(vlm_result.scores);
+            result.perf_metrics = std::move(vlm_result.perf_metrics);
+            results.push_back(result);
+        }
+        return results;
     }
     std::vector<ov::Tensor> input_ids;
     auto start_time = std::chrono::steady_clock::now();
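(Side note on the types involved: the conversion above, and the hunks below, only read and write a handful of members of `VLMDecodedResults` and `VLMPerfMetrics`. A condensed sketch of the shapes assumed here, simplified rather than copied from the library headers, member names as used in this diff:)

```cpp
// Condensed sketch, not the real headers: only the members this diff touches.
#include <chrono>
#include <string>
#include <vector>

using MicroSeconds = std::chrono::duration<float, std::ratio<1, 1000000>>;

struct VLMRawPerfMetrics {
    // One entry per get_inputs_embeds() call (filled by the hunk below).
    std::vector<MicroSeconds> prepare_embeddings_durations;
};

struct VLMPerfMetrics /* : PerfMetrics */ {
    VLMRawPerfMetrics vlm_raw_metrics;
    // raw_metrics, m_evaluated, evaluate_statistics(), ... come from the PerfMetrics base.
};

struct VLMDecodedResults /* : DecodedResults */ {
    std::vector<std::string> texts;   // one decoded string per generated sequence
    std::vector<float> scores;        // matching per-sequence scores
    VLMPerfMetrics perf_metrics;
};
```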
@@ -142,20 +151,20 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
     return decoded;
 }

-std::vector<GenerationResult>
+std::vector<VLMDecodedResults>
 ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
     const std::vector<std::string>& prompts,
     const std::vector<std::vector<ov::Tensor>>& rgbs_vector,
     const std::vector<GenerationConfig>& sampling_params,
     const StreamerVariant& streamer) {
-    // TODO: Add performance metrics
     auto generate_start_time = std::chrono::steady_clock::now();
     OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS);

     OPENVINO_ASSERT(prompts.size() == sampling_params.size(), "Number of prompts should be equal to the number of generation configs.");
     OPENVINO_ASSERT(prompts.size() == rgbs_vector.size(), "Number of prompts should be equal to the number of images vectors.");

     std::vector<ov::Tensor> input_embeds_list;
+    std::vector<VLMPerfMetrics> vlm_perf_metrics(prompts.size());

     if (m_is_chat_conversation) {
         OPENVINO_ASSERT(1 == prompts.size(), "Can't chat with multiple prompts");
@@ -171,37 +180,49 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(

         m_inputs_embedder->set_apply_chat_template_status(false);

-        VLMPerfMetrics perf_metrics;
-        input_embeds_list.push_back(m_inputs_embedder->get_inputs_embeds(templated_history, m_history_images, perf_metrics));
+        input_embeds_list.push_back(m_inputs_embedder->get_inputs_embeds(templated_history, m_history_images, vlm_perf_metrics[0]));
     } else {
         for (size_t i = 0; i < prompts.size(); i++) {
             const auto& prompt = prompts[i];
             const auto& rgbs = rgbs_vector[i];

+            auto start_get_inputs_embeds = std::chrono::steady_clock::now();
             m_inputs_embedder->set_apply_chat_template_status(sampling_params[i].apply_chat_template);
-
-            VLMPerfMetrics perf_metrics;
-            input_embeds_list.emplace_back(m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics));
+            input_embeds_list.emplace_back(m_inputs_embedder->get_inputs_embeds(prompt, rgbs, vlm_perf_metrics[i]));
+            auto end_get_inputs_embeds = std::chrono::steady_clock::now();
+            vlm_perf_metrics[i].vlm_raw_metrics.prepare_embeddings_durations.emplace_back(PerfMetrics::get_microsec(end_get_inputs_embeds - start_get_inputs_embeds));
         }
     }
-
-    std::vector<GenerationResult> results;
+    std::vector<VLMDecodedResults> results;
     auto encoded_results = generate(input_embeds_list, sampling_params, streamer);
-    for (const auto& result : encoded_results) {
-        GenerationResult gen_result;
+    for (size_t i = 0; i < prompts.size(); i++) {
+        auto result = encoded_results[i];
+        VLMDecodedResults gen_result;
+        gen_result.perf_metrics = result.perf_metrics;
+
+        gen_result.perf_metrics.vlm_raw_metrics = vlm_perf_metrics[i].vlm_raw_metrics;
+        gen_result.perf_metrics.raw_metrics.tokenization_durations = vlm_perf_metrics[i].raw_metrics.tokenization_durations;
+        gen_result.perf_metrics.raw_metrics.detokenization_durations = vlm_perf_metrics[i].raw_metrics.detokenization_durations;
+
+        auto decode_start_time = std::chrono::steady_clock::now();
         for (size_t idx = 0; idx < result.m_generation_ids.size(); ++idx) {
-            gen_result.m_generation_ids.push_back(m_tokenizer.decode(result.m_generation_ids.at(idx)));
-            gen_result.m_scores.push_back(result.m_scores.at(idx));
-            gen_result.m_status = result.m_status;
+            gen_result.texts.push_back(m_tokenizer.decode(result.m_generation_ids.at(idx)));
+            gen_result.scores.push_back(result.m_scores.at(idx));
         }
+        auto decode_end_time = std::chrono::steady_clock::now();
+        gen_result.perf_metrics.raw_metrics.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_end_time - decode_start_time));
+
+        gen_result.perf_metrics.m_evaluated = false;
+        gen_result.perf_metrics.evaluate_statistics();
+
         results.emplace_back(gen_result);
     }
     if (m_is_chat_conversation) {
-        if (results[0].m_status == ov::genai::GenerationStatus::CANCEL) {
+        if (encoded_results[0].m_status == ov::genai::GenerationStatus::CANCEL) {
             m_history.pop_back();
         }
         else {
-            m_history.push_back({{"role", "assistant"}, {"content", results[0].m_generation_ids[0]}});
+            m_history.push_back({{"role", "assistant"}, {"content", results[0].texts[0]}});
         }
     }
     return results;
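For reference, a minimal caller-side sketch of the new return type. It assumes the `std::vector<VLMDecodedResults>` overload is exposed on the public `ContinuousBatchingPipeline` with the same prompt/image/config parameters as the interface method above, and that `VLMPerfMetrics::get_prepare_embeddings_duration()` reflects the `prepare_embeddings_durations` entries recorded in this change; the model path, device, and empty image lists are placeholders.

```cpp
// Sketch only: assumes the public ContinuousBatchingPipeline exposes the
// VLMDecodedResults overload added above; path and device are placeholders.
#include "openvino/genai/continuous_batching_pipeline.hpp"

#include <iostream>
#include <string>
#include <vector>

int main() {
    ov::genai::SchedulerConfig scheduler_config;  // defaults are enough for a smoke test
    ov::genai::ContinuousBatchingPipeline pipe("vlm_model_dir", scheduler_config, "CPU");

    std::vector<std::string> prompts = {"Describe the image."};
    std::vector<std::vector<ov::Tensor>> images(prompts.size());  // fill with RGB ov::Tensor(s) per prompt
    std::vector<ov::genai::GenerationConfig> configs = {ov::genai::greedy()};

    // Returns std::vector<VLMDecodedResults> after this change (was std::vector<GenerationResult>).
    auto results = pipe.generate(prompts, images, configs);

    for (const auto& res : results) {
        for (size_t i = 0; i < res.texts.size(); ++i) {
            std::cout << res.texts[i] << " (score " << res.scores[i] << ")\n";
        }
        // Per-request metrics populated by this change.
        std::cout << "prepare embeddings, ms: "
                  << res.perf_metrics.get_prepare_embeddings_duration().mean << "\n"
                  << "generate, ms: "
                  << res.perf_metrics.get_generate_duration().mean << "\n";
    }
    return 0;
}
```

Timing each `get_inputs_embeds()` call at the call site, rather than inside the embedder, keeps the per-request attribution straightforward when several prompts are processed in one batch.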