Commit 4051f45

fix perf metrics calculation for genai llm pipeline (openvinotoolkit#1162)
Fixing the regression introduced in openvinotoolkit#1118 (CVS-156870).

Co-authored-by: Chen Peter <peter.chen@intel.com>
1 parent ed2baf4 commit 4051f45

2 files changed: +29 -18

tools/llm_bench/task/speech_to_text_generation.py (+7, -1)

@@ -50,7 +50,13 @@ def run_speech_2_txt_generation(input_param, args, md5_list, iter_data_list):
             return_timestamps=ret_timestamps
         )
         end = time.perf_counter()
-        tm_list = np.array(result_text.perf_metrics.raw_metrics.m_durations) / 1000 / 1000
+        perf_metrics = result_text.perf_metrics
+        first_token_time = perf_metrics.get_ttft().mean / args["batch_size"]
+        second_tokens_durations = (
+            np.array(perf_metrics.raw_metrics.m_new_token_times[1:])
+            - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1]) / args["batch_size"]
+        ).tolist()
+        tm_list = (np.array([first_token_time] + second_tokens_durations) / 1000).tolist()
         tm_infer_list = []
         result_text = result_text.texts[0]
     else:
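The replacement derives per-token latency from the pipeline's own performance metrics instead of raw_metrics.m_durations: the mean TTFT accounts for the first token, and the gaps between consecutive entries of raw_metrics.m_new_token_times account for every later token. A minimal sketch of that arithmetic with made-up numbers (not llm_bench code; it assumes the TTFT mean and the new-token timestamps are both in milliseconds, as the trailing / 1000 conversion to seconds suggests, and uses batch size 1 so the per-batch normalization drops out):

    import numpy as np

    # Illustrative stand-ins for result.perf_metrics (openvino_genai).
    ttft_ms = 120.0                                              # perf_metrics.get_ttft().mean
    new_token_times_ms = np.array([120.0, 145.0, 171.0, 198.0])  # raw_metrics.m_new_token_times
    batch_size = 1                                               # args["batch_size"]

    # First token: mean time-to-first-token, normalized per batch element.
    first_token_time = ttft_ms / batch_size
    # Later tokens: deltas between consecutive new-token timestamps.
    second_tokens_durations = (new_token_times_ms[1:] - new_token_times_ms[:-1]).tolist()

    # Per-token latencies in seconds, mirroring the final / 1000 in the diff.
    tm_list = (np.array([first_token_time] + second_tokens_durations) / 1000).tolist()
    print(tm_list)  # [0.12, 0.025, 0.026, 0.027]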

tools/llm_bench/task/text_generation.py (+22, -17)

@@ -178,22 +178,22 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
     if args["output_dir"] is not None and num == 0:
         for bs_index, in_text in enumerate(input_text_list):
             llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id)
-    pt_inputs = tokenizer(input_text_list, return_tensors="pt")
-    input_token_size = pt_inputs.input_ids.shape[1]
-    if args['batch_size'] > 1:
-        out_str = '[warm-up]' if num == 0 else '[{}]'.format(num)
-        out_str += " Batch_size={}, ".format(args['batch_size'])
-        out_str += 'all input token size after padding: {} * {}, '.format(input_token_size, args['batch_size'])
-        if args['infer_count'] is not None:
-            out_str += 'all max_output_token_size: {} * {}'.format(args['infer_count'], args['batch_size'])
-        log.info(out_str)
-
     max_rss_mem_consumption = ''
     max_uss_mem_consumption = ''
     max_shared_mem_consumption = ''
     if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
         mem_consumption.start_collect_memory_consumption()
     max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count']
+    tokenizer = model.get_tokenizer()
+    input_data = tokenizer.encode(input_text_list)
+    num_input_tokens = input_data.input_ids.shape[1]
+    if args['batch_size'] > 1:
+        out_str = '[warm-up]' if num == 0 else '[{}]'.format(num)
+        out_str += " Batch_size={}, ".format(args['batch_size'])
+        out_str += 'all input token size after padding: {} * {}, '.format(num_input_tokens, args['batch_size'])
+        if args['infer_count'] is not None:
+            out_str += 'all max_output_token_size: {} * {}'.format(args['infer_count'], args['batch_size'])
+        log.info(out_str)
     start = time.perf_counter()
     generation_result = model.generate(input_text_list, max_new_tokens=max_gen_tokens, num_beams=args["num_beams"], do_sample=False)
     end = time.perf_counter()

@@ -206,12 +206,12 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
         mem_consumption.clear_max_memory_consumption()

     generation_time = end - start
-    generated_tokens = [tokenizer(text).input_ids for text in generated_text]
+    generated_tokens = [tokenizer.encode(text).input_ids.data for text in generated_text]
     # Only text_gen need to minus length of input_data, because generated_text may include input_text
     num_tokens = 0
     result_md5_list = []
     for bs_idx in range(args['batch_size']):
-        generated_text_len = len(generated_tokens[bs_idx])
+        generated_text_len = generated_tokens[bs_idx].shape[-1]
         num_tokens += generated_text_len
         if generated_text_len > max_gen_tokens:
             log.error('Output token size is over max output token size!')

@@ -228,7 +228,13 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
         per_token_time = generation_time * 1000 / (num_tokens / args['batch_size'])
     else:
         log.warning("No generated tokens")
-    tm_list = np.array(perf_metrics.raw_metrics.m_durations) / 1000 / 1000
+    first_token_time = (perf_metrics.get_ttft().mean - perf_metrics.raw_metrics.tokenization_durations[-1] / 1000) / args["batch_size"]
+    second_tokens_durations = (
+        np.array(perf_metrics.raw_metrics.m_new_token_times[1:])
+        - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1]) / args["batch_size"]
+    ).tolist()
+
+    tm_list = np.array([first_token_time] + second_tokens_durations) / 1000
     log.debug('latency of all tokens:')
     [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)]
     tokenization_time = (

@@ -237,7 +243,7 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
     )
     iter_data = gen_output_data.gen_iterate_data(
         iter_idx=num,
-        in_size=input_token_size * args['batch_size'],
+        in_size=num_input_tokens * args['batch_size'],
         infer_count=len(tm_list),
         out_size=num_tokens,
         gen_time=generation_time,

@@ -288,12 +294,11 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg
     if args["output_dir"] is not None and num == 0:
         for bs_index, in_text in enumerate(input_text_list):
             llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id)
-    pt_inputs = tokenizer(input_text_list, return_tensors="pt")
-    input_token_size = pt_inputs.input_ids.shape[1]
     pipe_tokenizer = model.get_tokenizer()
     tok_encode_start = time.perf_counter()
     input_data = pipe_tokenizer.encode(input_text_list)
     tok_encode_end = time.perf_counter()
+    input_token_size = input_data.input_ids.shape[1]
     tok_encode_time = (tok_encode_end - tok_encode_start) * 1000
     if args['batch_size'] > 1:
         out_str = '[warm-up]' if num == 0 else '[{}]'.format(num)

@@ -310,7 +315,7 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg
     max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count']
     streamer.reset()
     start = time.perf_counter()
-    generated_tokens = model.generate(input_data, max_new_tokens=max_gen_tokens, num_beams=args["num_beams"], streamer=streamer).tokens
+    generated_tokens = model.generate(input_data, max_new_tokens=max_gen_tokens, num_beams=args["num_beams"], streamer=streamer, do_sample=False).tokens
     end = time.perf_counter()
     if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
         mem_consumption.end_collect_momory_consumption()
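The @@ -228 hunk above builds tm_list the same way for the text-generation path, except the first-token figure also subtracts the most recent tokenization duration: generate() is fed raw text here, so the reported TTFT includes tokenization time. A rough sketch of just that adjustment with made-up numbers (not llm_bench code; it assumes tokenization_durations is in microseconds, which is what the / 1000 before subtracting from the millisecond TTFT suggests):

    # Illustrative stand-ins for generation_result.perf_metrics (openvino_genai).
    ttft_ms = 130.0                             # perf_metrics.get_ttft().mean, milliseconds (assumed)
    tokenization_durations_us = [850.0, 900.0]  # raw_metrics.tokenization_durations, microseconds (assumed)
    batch_size = 1                              # args["batch_size"]

    # Strip the latest tokenization time out of TTFT, then normalize per batch element.
    first_token_time = (ttft_ms - tokenization_durations_us[-1] / 1000) / batch_size
    print(first_token_time)  # 129.1 -> first-token generation latency in ms, excluding tokenization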
