
Commit 69720e5

Add perf metrics for CB VLM (openvinotoolkit#1897)
- Added passing of CB perf metrics for VLM cases as well.
- Numbers are similar to the vanilla pipeline.

![image](https://github.com/user-attachments/assets/e2fbdfe0-cd0f-411a-90e4-f33ca1b06bbf)

Ticket: CVS-163346

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
1 parent f274cb3 commit 69720e5

8 files changed (+80, -30 lines)
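For orientation, here is a minimal usage sketch (not part of the commit) of how the propagated metrics surface to a Python caller after this change; the model path and image inputs are placeholders, and the perf_metrics getters shown are the ones exercised by the updated test at the bottom of this diff.

```python
# Minimal sketch, assuming an exported VLM at a placeholder path; the perf_metrics
# getters below are the ones asserted on in tests/python_tests/test_vlm_pipeline.py.
import openvino_genai as ov_genai

pipe = ov_genai.VLMPipeline("path/to/exported-vlm", "CPU")  # placeholder model path
result = pipe.generate(
    "Describe the image.",
    images=[],  # placeholder: pass a list of ov.Tensor images here
    generation_config=ov_genai.GenerationConfig(max_new_tokens=30),
)

pm = result.perf_metrics
print("TTFT (ms):", pm.get_ttft().mean)
print("TPOT (ms):", pm.get_tpot().mean)
print("inference duration (ms):", pm.get_inference_duration().mean)
print("generate duration (ms):", pm.get_generate_duration().mean)
```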

src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp (+8 -1)

@@ -15,6 +15,8 @@
 #include "openvino/genai/llm_pipeline.hpp"
 #include "openvino/genai/streamer_base.hpp"
 #include "openvino/genai/visibility.hpp"
+#include "openvino/genai/visual_language/pipeline.hpp"
+
 #include "cache_eviction.hpp"
 
 namespace ov::genai {
@@ -48,6 +50,11 @@ struct PipelineMetrics {
      * Running average of the KV cache usage during the lifetime of the pipeline, with max window size of 1000 steps
      */
     float avg_cache_usage = 0.0;
+
+    /**
+     * Duration of the last generation step in microseconds.
+     */
+    float inference_duration = 0.0;
 };
 
 class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
@@ -164,7 +171,7 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
     // more high level interface, which can process multiple prompts in continuous batching manner
     std::vector<EncodedGenerationResult> generate(const std::vector<ov::Tensor>& input_ids, const std::vector<ov::genai::GenerationConfig>& sampling_params, const ov::genai::StreamerVariant& streamer=std::monostate{});
     std::vector<GenerationResult> generate(const std::vector<std::string>& prompts, const std::vector<ov::genai::GenerationConfig>& sampling_params, const ov::genai::StreamerVariant& streamer=std::monostate{});
-    std::vector<GenerationResult> generate(
+    std::vector<VLMDecodedResults> generate(
         const std::vector<std::string>& prompts,
         const std::vector<std::vector<ov::Tensor>>& images,
         const std::vector<GenerationConfig>& sampling_params,
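A hedged sketch of reading the extended PipelineMetrics from Python; it assumes ContinuousBatchingPipeline.get_metrics() and the new inference_duration field are exposed by the binding the same way as the existing fields, and the model path is a placeholder.

```python
# Hedged sketch: assumes ContinuousBatchingPipeline.get_metrics() and the new
# inference_duration field are exposed to Python like the existing fields.
import openvino_genai as ov_genai

pipe = ov_genai.ContinuousBatchingPipeline("path/to/model", ov_genai.SchedulerConfig(), "CPU")  # placeholder path
# ... add requests and drive pipe.step() as usual ...
metrics = pipe.get_metrics()
print("avg KV cache usage:", metrics.avg_cache_usage)
print("last step inference duration (us):", metrics.inference_duration)
```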

src/cpp/src/continuous_batching_impl.cpp (+7 -2)

@@ -278,8 +278,11 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
 
     {
         static ManualTimer timer("forward");
+        const auto infer_start = std::chrono::steady_clock::now();
         timer.start();
         logits = m_model_runner->forward(m_requests, scheduler_output);
+        const auto infer_end = std::chrono::steady_clock::now();
+        m_pipeline_metrics.inference_duration = PerfMetrics::get_microsec(infer_end - infer_start);
         timer.end();
     }
 
@@ -403,11 +406,13 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o
         try {
             const auto infer_start = std::chrono::steady_clock::now();
             step();
+            // During prefill step (or steps if max_batch_size < prompt_len) we don't generate new tokens,
+            // but still inference took place, so we need to add this time to the total inference duration.
+            raw_perf_counters.m_inference_durations[0] += MicroSeconds(m_pipeline_metrics.inference_duration);
             if (m_batch_size > 0) {
                 const auto infer_end = std::chrono::steady_clock::now();
-                const auto infer_ms = PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
+                const auto infer_ms = PerfMetrics::get_microsec(infer_end - infer_start);
                 raw_perf_counters.m_token_infer_durations.emplace_back(infer_ms);
-                raw_perf_counters.m_inference_durations[0] += MicroSeconds(infer_ms);
                 raw_perf_counters.m_new_token_times.emplace_back(infer_end);
                 raw_perf_counters.m_batch_sizes.emplace_back(m_batch_size);
             }
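The comments in the hunk above describe the intent; the sketch below (plain Python, not the library's API) restates the accumulation rule this hunk implements: every step's forward time is added to the total inference duration, even for prefill steps that produce no tokens, while per-token step timings are recorded only when the step generated tokens.

```python
# Conceptual sketch of the accumulation logic (not library API); names mirror the
# raw perf counters in the diff but are placeholders.
import time

total_inference_us = 0.0          # mirrors raw_perf_counters.m_inference_durations[0]
token_step_durations_us = []      # mirrors raw_perf_counters.m_token_infer_durations

def timed_step(forward_fn, batch_size):
    global total_inference_us
    step_start = time.perf_counter()

    fwd_start = time.perf_counter()
    forward_fn()                                  # model forward pass
    fwd_us = (time.perf_counter() - fwd_start) * 1e6
    total_inference_us += fwd_us                  # counted for prefill and decode alike

    if batch_size > 0:                            # new tokens were produced this step
        step_us = (time.perf_counter() - step_start) * 1e6
        token_step_durations_us.append(step_us)
```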

src/cpp/src/continuous_batching_pipeline.cpp (+1 -1)

@@ -284,7 +284,7 @@ std::vector<GenerationResult> ContinuousBatchingPipeline::generate(const std::ve
     return decoded_results;
 }
 
-std::vector<GenerationResult> ContinuousBatchingPipeline::generate(
+std::vector<VLMDecodedResults> ContinuousBatchingPipeline::generate(
     const std::vector<std::string>& prompts,
     const std::vector<std::vector<ov::Tensor>>& images,
     const std::vector<GenerationConfig>& sampling_params,

src/cpp/src/icontinuous_batching.cpp (+38 -17)

@@ -57,7 +57,16 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
         // TODO: remove this code and within model runner add check: if sequence group type is tokens,
         // but embedding model is available => compute embeddings first, then pass to LLM
         std::vector<std::vector<ov::Tensor>> images(prompts.size());
-        return generate(prompts, images, sampling_params, streamer);
+        auto results_vlm = generate(prompts, images, sampling_params, streamer);
+        std::vector<GenerationResult> resutls;
+        for (auto& vlm_result : results_vlm) {
+            GenerationResult result;
+            result.m_generation_ids = std::move(vlm_result.texts);
+            result.m_scores = std::move(vlm_result.scores);
+            result.perf_metrics = std::move(vlm_result.perf_metrics);
+            resutls.push_back(result);
+        }
+        return resutls;
     }
     std::vector<ov::Tensor> input_ids;
     auto start_time = std::chrono::steady_clock::now();
@@ -142,20 +151,20 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
     return decoded;
 }
 
-std::vector<GenerationResult>
+std::vector<VLMDecodedResults>
 ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
     const std::vector<std::string>& prompts,
     const std::vector<std::vector<ov::Tensor>>& rgbs_vector,
     const std::vector<GenerationConfig>& sampling_params,
     const StreamerVariant& streamer) {
-    // TODO: Add performance metrics
     auto generate_start_time = std::chrono::steady_clock::now();
     OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS);
 
     OPENVINO_ASSERT(prompts.size() == sampling_params.size(), "Number of prompts should be equal to the number of generation configs.");
     OPENVINO_ASSERT(prompts.size() == rgbs_vector.size(), "Number of prompts should be equal to the number of images vectors.");
 
     std::vector<ov::Tensor> input_embeds_list;
+    std::vector<VLMPerfMetrics> vlm_perf_metrics(prompts.size());
 
     if (m_is_chat_conversation) {
         OPENVINO_ASSERT(1 == prompts.size(), "Can't chat with multiple prompts");
@@ -171,37 +180,49 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
 
         m_inputs_embedder->set_apply_chat_template_status(false);
 
-        VLMPerfMetrics perf_metrics;
-        input_embeds_list.push_back(m_inputs_embedder->get_inputs_embeds(templated_history, m_history_images, perf_metrics));
+        input_embeds_list.push_back(m_inputs_embedder->get_inputs_embeds(templated_history, m_history_images, vlm_perf_metrics[0]));
     } else {
         for (size_t i = 0; i < prompts.size(); i++) {
             const auto& prompt = prompts[i];
             const auto& rgbs = rgbs_vector[i];
 
+            auto start_get_inputs_embeds = std::chrono::steady_clock::now();
             m_inputs_embedder->set_apply_chat_template_status(sampling_params[i].apply_chat_template);
-
-            VLMPerfMetrics perf_metrics;
-            input_embeds_list.emplace_back(m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics));
+            input_embeds_list.emplace_back(m_inputs_embedder->get_inputs_embeds(prompt, rgbs, vlm_perf_metrics[i]));
+            auto end_get_inputs_embeds = std::chrono::steady_clock::now();
+            vlm_perf_metrics[i].vlm_raw_metrics.prepare_embeddings_durations.emplace_back(PerfMetrics::get_microsec(end_get_inputs_embeds - start_get_inputs_embeds));
         }
     }
-
-    std::vector<GenerationResult> results;
+    std::vector<VLMDecodedResults> results;
     auto encoded_results = generate(input_embeds_list, sampling_params, streamer);
-    for (const auto& result: encoded_results) {
-        GenerationResult gen_result;
+    for (size_t i = 0; i < prompts.size(); i++) {
+        auto result = encoded_results[i];
+        VLMDecodedResults gen_result;
+        gen_result.perf_metrics = result.perf_metrics;
+
+        gen_result.perf_metrics.vlm_raw_metrics = vlm_perf_metrics[i].vlm_raw_metrics;
+        gen_result.perf_metrics.raw_metrics.tokenization_durations = vlm_perf_metrics[i].raw_metrics.tokenization_durations;
+        gen_result.perf_metrics.raw_metrics.detokenization_durations = vlm_perf_metrics[i].raw_metrics.detokenization_durations;
+
+        auto decode_start_time = std::chrono::steady_clock::now();
         for (size_t idx = 0; idx < result.m_generation_ids.size(); ++idx) {
-            gen_result.m_generation_ids.push_back(m_tokenizer.decode(result.m_generation_ids.at(idx)));
-            gen_result.m_scores.push_back(result.m_scores.at(idx));
-            gen_result.m_status = result.m_status;
+            gen_result.texts.push_back(m_tokenizer.decode(result.m_generation_ids.at(idx)));
+            gen_result.scores.push_back(result.m_scores.at(idx));
         }
+        auto decode_end_time = std::chrono::steady_clock::now();
+        gen_result.perf_metrics.raw_metrics.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_end_time - decode_start_time));
+
+        gen_result.perf_metrics.m_evaluated = false;
+        gen_result.perf_metrics.evaluate_statistics();
+
         results.emplace_back(gen_result);
     }
     if (m_is_chat_conversation) {
-        if (results[0].m_status == ov::genai::GenerationStatus::CANCEL) {
+        if (encoded_results[0].m_status == ov::genai::GenerationStatus::CANCEL) {
             m_history.pop_back();
         }
         else {
-            m_history.push_back({{"role", "assistant"}, {"content", results[0].m_generation_ids[0]}});
+            m_history.push_back({{"role", "assistant"}, {"content", results[0].texts[0]}});
         }
     }
     return results;

src/cpp/src/icontinuous_batching.hpp (+1 -1)

@@ -115,7 +115,7 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline {
         std::vector<GenerationConfig> sampling_params,
         const StreamerVariant& streamer);
 
-    virtual std::vector<GenerationResult>
+    virtual std::vector<VLMDecodedResults>
     generate(
         const std::vector<std::string>& prompts,
         const std::vector<std::vector<ov::Tensor>>& rgbs,

src/cpp/src/visual_language/continuous_batching_adapter.hpp (+14 -3)

@@ -47,11 +47,22 @@ class ov::genai::VLMPipeline::VLMContinuousBatchingAdapter : public ov::genai::V
         GenerationConfig generation_config,
         const StreamerVariant& streamer
     ) override {
+        auto start_time = std::chrono::steady_clock::now();
         auto result = m_impl.generate({prompt}, {rgbs}, {generation_config}, streamer)[0];
+        auto stop_time = std::chrono::steady_clock::now();
+
         VLMDecodedResults decoded;
-        for (size_t idx = 0; idx < result.m_generation_ids.size(); ++idx) {
-            decoded.texts.push_back(result.m_generation_ids.at(idx));
-            decoded.scores.push_back(result.m_scores.at(idx));
+        decoded.perf_metrics = result.perf_metrics;
+        decoded.perf_metrics.load_time = get_load_time();
+
+        decoded.perf_metrics.raw_metrics.generate_durations.clear();
+        decoded.perf_metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time));
+        decoded.perf_metrics.m_evaluated = false;
+        decoded.perf_metrics.evaluate_statistics(start_time);
+
+        for (size_t idx = 0; idx < result.texts.size(); ++idx) {
+            decoded.texts.push_back(result.texts.at(idx));
+            decoded.scores.push_back(result.scores.at(idx));
         }
         return decoded;
     }

src/python/py_continuous_batching_pipeline.cpp (+1 -1)

@@ -341,7 +341,7 @@ void init_continuous_batching_pipeline(py::module_& m) {
             const pyutils::PyBindStreamerVariant& py_streamer
         ) -> py::typing::Union<std::vector<ov::genai::GenerationResult>> {
             ov::genai::StreamerVariant streamer = pyutils::pystreamer_to_streamer(py_streamer);
-            std::vector<ov::genai::GenerationResult> generated_results;
+            std::vector<ov::genai::VLMDecodedResults> generated_results;
             {
                 py::gil_scoped_release rel;
                 generated_results = pipe.generate(prompts, images, generation_config, streamer);

tests/python_tests/test_vlm_pipeline.py (+10 -4)

@@ -178,8 +178,8 @@ def test_vlm_continuous_batching_vs_stateful(config, cache):
 
     res_stateful = stateful_pipe.generate(prompts[0], images=images, generation_config=generation_config)
     for out_idx, text in enumerate(res_stateful.texts):
-        assert text == res_cb[idx][0].m_generation_ids[out_idx]
-        assert abs(res_stateful.scores[out_idx] - res_cb[idx][0].m_scores[out_idx]) < eps
+        assert text == res_cb[idx][0].texts[out_idx]
+        assert abs(res_stateful.scores[out_idx] - res_cb[idx][0].scores[out_idx]) < eps
 
 
 
@@ -287,7 +287,8 @@ def test_sampling(config, cache):
 
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_perf_metrics(cache):
+@pytest.mark.parametrize("scheduler_config", [SchedulerConfig(), None])
+def test_perf_metrics(cache, scheduler_config):
     import numpy as np
     from time import perf_counter_ns
     models_path = get_ov_model("katuni4ka/tiny-random-minicpmv-2_6", cache)
@@ -297,7 +298,11 @@ def test_perf_metrics(cache):
     max_new_tokens = 30
 
     start_time = perf_counter_ns()
-    pipe = VLMPipeline(models_path, "CPU")
+    if scheduler_config:
+        pipe = VLMPipeline(models_path, "CPU", scheduler_config=scheduler_config)
+    else:
+        pipe = VLMPipeline(models_path, "CPU")
+
     start_generate = perf_counter_ns()
     result = pipe.generate(prompts[0], images=images, generation_config=GenerationConfig(max_new_tokens=max_new_tokens))
     generate_time = (perf_counter_ns() - start_generate) / 1_000_000.0
@@ -315,6 +320,7 @@ def test_perf_metrics(cache):
     assert 0 < perf_metrics.get_tpot().mean < generate_time / num_tokens
     assert 0 < perf_metrics.get_ipot().mean < generate_time / num_tokens
    assert num_tokens / (generate_time / 1000.0) < perf_metrics.get_throughput().mean < num_tokens / ((generate_time - perf_metrics.get_ttft().mean) / 1000.0)
+
     assert 0 < perf_metrics.get_inference_duration().mean < generate_time
     assert 0 < perf_metrics.get_generate_duration().mean < generate_time
     assert 0 < perf_metrics.get_tokenization_duration().mean < generate_time
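To make the parametrization concrete, here is a small usage sketch (placeholder model path and inputs) of the two backends this test now covers: passing a SchedulerConfig routes VLMPipeline through the continuous batching backend, which after this commit reports the same perf metrics as the default pipeline.

```python
# Sketch of the two paths the parametrized test covers; model path and inputs
# are placeholders, not values from the repository.
import openvino_genai as ov_genai

models_path = "path/to/exported-vlm"  # placeholder
cb_pipe = ov_genai.VLMPipeline(models_path, "CPU", scheduler_config=ov_genai.SchedulerConfig())
default_pipe = ov_genai.VLMPipeline(models_path, "CPU")

for pipe in (cb_pipe, default_pipe):
    res = pipe.generate("Describe the image.", images=[],  # placeholder inputs
                        generation_config=ov_genai.GenerationConfig(max_new_tokens=30))
    assert res.perf_metrics.get_inference_duration().mean > 0
```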
