Commit f053e5e
Authored Sep 27, 2024

Change -1 to N/A (openvinotoolkit#885)

When "first token latency" or "other tokens latency" has no output, report NA instead of -1. Examples: [phi-3-mini-4k-instruct.txt](https://github.com/user-attachments/files/17072285/phi-3-mini-4k-instruct.txt), [phi-3-mini-4k-instruct.csv](https://github.com/user-attachments/files/17072287/phi-3-mini-4k-instruct.csv)

1 parent 2ed9889 commit f053e5e
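The pattern the commit applies throughout: keep -1 as the in-memory sentinel for "no measurement" so numeric fields stay numeric, and substitute 'NA' only when formatting output. A minimal sketch of that convention; the `format_latency` helper is hypothetical, not part of the patch:

```python
# Minimal sketch of the reporting convention this commit adopts.
# format_latency is a hypothetical helper, not part of the patch:
# -1 stays the internal "no data" sentinel; it is rendered as 'NA'
# only when the human-readable string is produced.
def format_latency(value_ms, unit='token'):
    return 'NA' if value_ms == -1 else f'{value_ms:.2f} ms/{unit}'

print(format_latency(12.345))  # -> '12.35 ms/token'
print(format_latency(-1))      # -> 'NA'
```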

File tree: 2 files changed, +41 -26 lines changed
llm_bench/python/llm_bench_utils/metrics_print.py (+25 -22)

@@ -8,10 +8,6 @@ def print_metrics(
     iter_num, iter_data, tms=None, tms_infer=None, warm_up=False, max_rss_mem=-1, max_shared_mem=-1,
     max_uss_mem=-1, stable_diffusion=None, tokenization_time=None, batch_size=1
 ):
-    if tms is None:
-        tms = []
-    if tms_infer is None:
-        tms_infer = []
     iter_str = str(iter_num)
     if warm_up:
         iter_str = 'warm-up'
@@ -36,25 +32,27 @@ def print_metrics(
     if output_str != '':
         output_str = ' '.join(['[{}]'.format(iter_str), output_str])
         log.info(output_str)
-    if len(tms) > 0:
+    if tms is not None:
         iter_data['first_token_latency'] = tms[0] * 1000 if len(tms) > 0 else -1
         iter_data['other_tokens_avg_latency'] = sum(tms[1:]) / (len(tms) - 1) * 1000 if len(tms) > 1 else -1
+        first_token_latency = 'NA' if iter_data['first_token_latency'] == -1 else f"{iter_data['first_token_latency']:.2f} ms/{latency_unit}"
+        other_token_latency = 'NA' if iter_data['other_tokens_avg_latency'] == -1 else f"{iter_data['other_tokens_avg_latency']:.2f} ms/{latency_unit}"
         log.info(
-            f"[{iter_str}] First token latency: {iter_data['first_token_latency']:.2f} ms/{latency_unit}, "
-            f"other tokens latency: {iter_data['other_tokens_avg_latency']:.2f} ms/{latency_unit}, len of tokens: {len(tms)} * {batch_size}",
+            f"[{iter_str}] First token latency: {first_token_latency}, "
+            f"other tokens latency: {other_token_latency}, len of tokens: {len(tms)} * {batch_size}",
         )
-    else:
-        if tokenization_time:
+        if len(tms) == 0:
             log.warning(f'[{iter_str}] No hook data output for first token latency and other tokens latency')
-    if len(tms_infer) > 0:
+    if tms_infer is not None:
         iter_data['first_token_infer_latency'] = tms_infer[0] * 1000 if len(tms_infer) > 0 else -1
         iter_data['other_tokens_infer_avg_latency'] = sum(tms_infer[1:]) / (len(tms_infer) - 1) * 1000 if len(tms_infer) > 1 else -1
+        first_infer_latency = 'NA' if iter_data['first_token_infer_latency'] == -1 else f"{iter_data['first_token_infer_latency']:.2f} ms/infer"
+        other_infer_latency = 'NA' if iter_data['other_tokens_infer_avg_latency'] == -1 else f"{iter_data['other_tokens_infer_avg_latency']:.2f} ms/infer"
         log.info(
-            f"[{iter_str}] First infer latency: {iter_data['first_token_infer_latency']:.2f} ms/infer, "
-            f"other infers latency: {iter_data['other_tokens_infer_avg_latency']:.2f} ms/infer, inference count: {len(tms_infer)}",
+            f"[{iter_str}] First infer latency: {first_infer_latency}, "
+            f"other infers latency: {other_infer_latency}, inference count: {len(tms_infer)}",
         )
-    else:
-        if tokenization_time:
+        if len(tms_infer) == 0:
             log.warning(f'[{iter_str}] No hook data output for first infer latency and other infers latency')
     if stable_diffusion is not None:
         print_stable_diffusion_infer_latency(iter_str, iter_data, stable_diffusion)
@@ -112,8 +110,10 @@ def print_ldm_unet_vqvae_infer_latency(iter_num, iter_data, tms=None, warm_up=Fa
     iter_data['first_token_infer_latency'] = iter_data['first_token_latency']
     iter_data['other_tokens_infer_avg_latency'] = iter_data['other_tokens_avg_latency']

-    log.info(f"[{iter_str}] First step of unet latency: {iter_data['first_token_latency']:.2f} ms/step, "
-             f"other steps of unet latency: {iter_data['other_tokens_avg_latency']:.2f} ms/step",)
+    first_token_latency = 'NA' if iter_data['first_token_latency'] == -1 else f"{iter_data['first_token_latency']:.2f} ms/step"
+    other_token_latency = 'NA' if iter_data['other_tokens_avg_latency'] == -1 else f"{iter_data['other_tokens_avg_latency']:.2f} ms/step"
+    log.info(f"[{iter_str}] First step of unet latency: {first_token_latency}, "
+             f"other steps of unet latency: {other_token_latency}",)
     if len_tms > 1:
         log.info(f"[{iter_str}] Unet latency: {(sum(tms[0:(len_tms - 1)]) / (len_tms - 1)) * 1000:.2f} ms/step, "
                  f"vqvae decoder latency: {tms[len_tms - 1] * 1000:.2f} ms/step, "
@@ -149,14 +149,17 @@ def output_avg_statis_tokens(prompt_dict, prompt_idx_list, iter_data_list, batch
         latency_unit = '{}tokens'.format(batch_size)
     else:
         latency_unit = '{}steps'.format(batch_size)
+    avg_1st_token_latency = 'NA' if avg_1st_token_latency < 0 else f'{avg_1st_token_latency:.2f} ms/{latency_unit}'
+    avg_2nd_tokens_latency = 'NA' if avg_2nd_tokens_latency < 0 else f'{avg_2nd_tokens_latency:.2f} ms/{latency_unit}'
+    avg_2nd_token_tput = 'NA' if avg_2nd_tokens_latency == 'NA' else f'{avg_2nd_token_tput:.2f} {latency_unit}s/s'
     if is_text_gen is True:
-        prompt_dict[p_idx] = '\n[ INFO ] [Average] Prompt[{}] Input token size: {}, 1st token lantency: {:.2f} ms/{}, ' \
-            '2nd tokens latency: {:.2f} ms/{}, 2nd tokens throughput: {:.2f} tokens/s' \
-            .format(p_idx, avg_input_size, avg_1st_token_latency, latency_unit, avg_2nd_tokens_latency, latency_unit, avg_2nd_token_tput)
+        prompt_dict[p_idx] = '\n[ INFO ] [Average] Prompt[{}] Input token size: {}, 1st token lantency: {}, ' \
+            '2nd token lantency: {}, 2nd tokens throughput: {}' \
+            .format(p_idx, avg_input_size, avg_1st_token_latency, avg_2nd_tokens_latency, avg_2nd_token_tput)
     else:
-        prompt_dict[p_idx] = '\n[ INFO ] [Average] Prompt[{}] 1st step of unet latency {:.2f} ms/{}, ' \
-            '2nd steps of unet latency: {:.2f} ms/{}, 2nd steps throughput: {:.2f} steps/s' \
-            .format(p_idx, avg_1st_token_latency, latency_unit, avg_2nd_tokens_latency, latency_unit, avg_2nd_token_tput)
+        prompt_dict[p_idx] = '\n[ INFO ] [Average] Prompt[{}] 1st step of unet latency: {}, ' \
+            '2nd steps of unet latency: {}, 2nd steps throughput: {}' \
+            .format(p_idx, avg_1st_token_latency, avg_2nd_tokens_latency, avg_2nd_token_tput)


 def print_average(iter_data_list, prompt_idx_list, batch_size, is_text_gen=False):
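The behavioral change is easiest to see with the branch isolated. Below is a hedged, standalone reduction of the new `tms` branch of `print_metrics` (hypothetical simplifications: `print` instead of `log`, no `iter_data` bookkeeping, no `[iter_str]` prefix):

```python
# Standalone reduction of the new `tms` branch of print_metrics
# (hypothetical: print instead of log, iter_data bookkeeping dropped).
def report_token_latency(tms, latency_unit='token', batch_size=1):
    if tms is None:
        return  # hooks not installed at all: stay silent, as in the patch
    first = tms[0] * 1000 if len(tms) > 0 else -1
    others = sum(tms[1:]) / (len(tms) - 1) * 1000 if len(tms) > 1 else -1
    first_str = 'NA' if first == -1 else f'{first:.2f} ms/{latency_unit}'
    others_str = 'NA' if others == -1 else f'{others:.2f} ms/{latency_unit}'
    print(f'First token latency: {first_str}, '
          f'other tokens latency: {others_str}, len of tokens: {len(tms)} * {batch_size}')
    if len(tms) == 0:
        print('Warning: No hook data output for first token latency and other tokens latency')

report_token_latency([0.120, 0.031, 0.029])  # -> 120.00 ms/token, 30.00 ms/token
report_token_latency([])                     # -> NA, NA, plus the warning line
```

The key difference from the old code: an empty `tms` previously skipped the info line entirely (and the warning was gated on `tokenization_time`), whereas now the line is always emitted with NA placeholders, so the log keeps a consistent format that downstream parsers can rely on.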

llm_bench/python/llm_bench_utils/output_csv.py (+16 -4)

@@ -106,10 +106,22 @@ def gen_data_to_csv(result, iter_data, pretrain_time):
     result['output_size'] = iter_data['output_size']
     result['latency(ms)'] = round(latency, 5) if latency != '' else latency
     result['result_md5'] = iter_data['result_md5']
-    result['1st_latency(ms)'] = round(first_latency, 5) if first_latency != '' else first_latency
-    result['2nd_avg_latency(ms)'] = round(other_latency, 5) if other_latency != '' else other_latency
-    result['1st_infer_latency(ms)'] = round(first_token_infer_latency, 5) if first_token_infer_latency != '' else first_token_infer_latency
-    result['2nd_infer_avg_latency(ms)'] = round(other_token_infer_latency, 5) if other_token_infer_latency != '' else other_token_infer_latency
+    if first_latency < 0:
+        result['1st_latency(ms)'] = 'NA'
+    else:
+        result['1st_latency(ms)'] = round(first_latency, 5) if first_latency != '' else first_latency
+    if other_latency < 0:
+        result['2nd_avg_latency(ms)'] = 'NA'
+    else:
+        result['2nd_avg_latency(ms)'] = round(other_latency, 5) if other_latency != '' else other_latency
+    if first_token_infer_latency < 0:
+        result['1st_infer_latency(ms)'] = 'NA'
+    else:
+        result['1st_infer_latency(ms)'] = round(first_token_infer_latency, 5) if first_token_infer_latency != '' else first_token_infer_latency
+    if other_token_infer_latency < 0:
+        result['2nd_infer_avg_latency(ms)'] = 'NA'
+    else:
+        result['2nd_infer_avg_latency(ms)'] = round(other_token_infer_latency, 5) if other_token_infer_latency != '' else other_token_infer_latency
     result['max_rss_mem(MB)'] = round(rss_mem, 5) if rss_mem != '' else rss_mem
     result['max_uss_mem(MB)'] = round(uss_mem, 5) if uss_mem != '' else uss_mem
     result['max_shared_mem(MB)'] = round(shared_mem, 5) if shared_mem != '' else shared_mem
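The four new if/else blocks in `gen_data_to_csv` all apply the same rule. As a hedged sketch (the `csv_latency` helper is hypothetical, not part of the patch), the shared rule is:

```python
# Hypothetical consolidation of the four repeated blocks in
# gen_data_to_csv; not part of the patch. The rule: negative
# sentinel -> 'NA', empty-string placeholder passes through
# untouched, real numbers are rounded to 5 decimal places.
def csv_latency(value):
    if value != '' and value < 0:
        return 'NA'
    return round(value, 5) if value != '' else value

assert csv_latency(-1) == 'NA'
assert csv_latency('') == ''
assert csv_latency(123.456789) == 123.45679
```

One design note: the sketch tests for the empty-string placeholder before the numeric comparison, while the committed code evaluates `first_latency < 0` first, which assumes these values are already numeric (never `''`) by the time `gen_data_to_csv` runs.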
