Skip to content

Commit 499096a

Browse files
authored
[llm bench]: add infer latency for genai (openvinotoolkit#1397)
CVS-158466 port from 2024.6 to master openvinotoolkit#1391
1 parent 9bcadf7 commit 499096a

File tree

3 files changed

+5
-3
lines changed

3 files changed

+5
-3
lines changed

tools/llm_bench/task/speech_to_text_generation.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def run_speech_2_txt_generation(input_param, args, md5_list, iter_data_list):
                 - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1])
             ).tolist()
             tm_list = (np.array([first_token_time] + second_tokens_durations) / 1000).tolist()
-            tm_infer_list = None
+            tm_infer_list = (np.array(perf_metrics.raw_metrics.token_infer_durations) / 1000 / 1000).tolist()
             result_text = result_text.texts[0]
         else:
             start = time.perf_counter()

tools/llm_bench/task/text_generation.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,7 @@ def token_printer():
             ).tolist()

         tm_list = np.array([first_token_time] + second_tokens_durations) / 1000
+        inference_durations = (np.array(perf_metrics.raw_metrics.token_infer_durations) / 1000 / 1000).tolist()
         log.debug('latency of all tokens:')
         [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)]
         iter_data = gen_output_data.gen_iterate_data(
@@ -323,7 +324,7 @@ def token_printer():
             num,
             iter_data,
             tm_list.tolist(),
-            None,
+            inference_durations.tolist(),
             warm_up=(num == 0),
             max_rss_mem=max_rss_mem_consumption,
             max_shared_mem=max_shared_mem_consumption,

tools/llm_bench/task/visual_language_generation.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -268,11 +268,12 @@ def run_visual_language_generation_genai(
                 mm_embeddings_preparation_time=perf_metrics.get_prepare_embeddings_duration().mean
             )
             iter_data_list.append(iter_data)
+            inference_durations = np.array(perf_metrics.raw_metrics.token_infer_durations) / 1000 / 1000
             metrics_print.print_metrics(
                 num,
                 iter_data,
                 tm_list.tolist(),
-                None,
+                inference_durations.tolist(),
                 warm_up=(num == 0),
                 max_rss_mem=max_rss_mem_consumption,
                 max_shared_mem=max_shared_mem_consumption,

0 commit comments

Comments
 (0)