@@ -178,22 +178,22 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
     if args["output_dir"] is not None and num == 0:
         for bs_index, in_text in enumerate(input_text_list):
             llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id)
-    pt_inputs = tokenizer(input_text_list, return_tensors="pt")
-    input_token_size = pt_inputs.input_ids.shape[1]
-    if args['batch_size'] > 1:
-        out_str = '[warm-up]' if num == 0 else '[{}]'.format(num)
-        out_str += " Batch_size={}, ".format(args['batch_size'])
-        out_str += 'all input token size after padding: {} * {}, '.format(input_token_size, args['batch_size'])
-        if args['infer_count'] is not None:
-            out_str += 'all max_output_token_size: {} * {}'.format(args['infer_count'], args['batch_size'])
-        log.info(out_str)
-
     max_rss_mem_consumption = ''
     max_uss_mem_consumption = ''
     max_shared_mem_consumption = ''
     if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
         mem_consumption.start_collect_memory_consumption()
     max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count']
+    tokenizer = model.get_tokenizer()
+    input_data = tokenizer.encode(input_text_list)
+    num_input_tokens = input_data.input_ids.shape[1]
+    if args['batch_size'] > 1:
+        out_str = '[warm-up]' if num == 0 else '[{}]'.format(num)
+        out_str += " Batch_size={}, ".format(args['batch_size'])
+        out_str += 'all input token size after padding: {} * {}, '.format(num_input_tokens, args['batch_size'])
+        if args['infer_count'] is not None:
+            out_str += 'all max_output_token_size: {} * {}'.format(args['infer_count'], args['batch_size'])
+        log.info(out_str)
     start = time.perf_counter()
     generation_result = model.generate(input_text_list, max_new_tokens=max_gen_tokens, num_beams=args["num_beams"], do_sample=False)
     end = time.perf_counter()
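For context on the switch above from the HF tokenizer to the pipeline's own tokenizer, a minimal sketch of that path, assuming the `openvino_genai` Python API and a hypothetical converted model in `model_dir`:

```python
# Minimal sketch (assumptions: openvino_genai installed, model converted to "model_dir").
import openvino_genai as ov_genai

pipe = ov_genai.LLMPipeline("model_dir", "CPU")
tokenizer = pipe.get_tokenizer()

prompts = ["What is OpenVINO?"]
inputs = tokenizer.encode(prompts)            # TokenizedInputs; input_ids is an openvino.Tensor
num_input_tokens = inputs.input_ids.shape[1]  # padded length per prompt in the batch

result = pipe.generate(prompts, max_new_tokens=32, do_sample=False)
print(num_input_tokens, result.texts[0])
```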
@@ -206,12 +206,12 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
         mem_consumption.clear_max_memory_consumption()
 
     generation_time = end - start
-    generated_tokens = [tokenizer(text).input_ids for text in generated_text]
+    generated_tokens = [tokenizer.encode(text).input_ids.data for text in generated_text]
     # Only text_gen need to minus length of input_data, because generated_text may include input_text
     num_tokens = 0
     result_md5_list = []
     for bs_idx in range(args['batch_size']):
-        generated_text_len = len(generated_tokens[bs_idx])
+        generated_text_len = generated_tokens[bs_idx].shape[-1]
         num_tokens += generated_text_len
         if generated_text_len > max_gen_tokens:
             log.error('Output token size is over max output token size!')
@@ -228,7 +228,13 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
         per_token_time = generation_time * 1000 / (num_tokens / args['batch_size'])
     else:
         log.warning("No generated tokens")
-    tm_list = np.array(perf_metrics.raw_metrics.m_durations) / 1000 / 1000
+    first_token_time = (perf_metrics.get_ttft().mean - perf_metrics.raw_metrics.tokenization_durations[-1] / 1000) / args["batch_size"]
+    second_tokens_durations = (
+        np.array(perf_metrics.raw_metrics.m_new_token_times[1:])
+        - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1]) / args["batch_size"]
+    ).tolist()
+
+    tm_list = np.array([first_token_time] + second_tokens_durations) / 1000
     log.debug('latency of all tokens:')
     [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)]
     tokenization_time = (
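A hedged sketch of the per-token latency derivation introduced above, using the same `perf_metrics` fields; the unit conversions follow the hunk (raw tokenization durations in microseconds, TTFT in milliseconds), and the exact batch handling may differ from the final code:

```python
import numpy as np

def per_token_latencies_ms(perf_metrics, batch_size):
    """Approximate per-token latencies (ms) from openvino_genai performance metrics."""
    raw = perf_metrics.raw_metrics
    # First token: mean TTFT (ms) minus the last tokenization duration (us -> ms),
    # spread across the batch.
    first = (perf_metrics.get_ttft().mean - raw.tokenization_durations[-1] / 1000) / batch_size
    # Remaining tokens: deltas between consecutive new-token timestamps, per batch element.
    times = np.array(raw.m_new_token_times)
    rest = (times[1:] - times[:-1]) / batch_size
    return np.concatenate(([first], rest))
```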
@@ -237,7 +243,7 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
     )
     iter_data = gen_output_data.gen_iterate_data(
         iter_idx=num,
-        in_size=input_token_size * args['batch_size'],
+        in_size=num_input_tokens * args['batch_size'],
         infer_count=len(tm_list),
         out_size=num_tokens,
         gen_time=generation_time,
@@ -288,12 +294,11 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg
     if args["output_dir"] is not None and num == 0:
         for bs_index, in_text in enumerate(input_text_list):
             llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id)
-    pt_inputs = tokenizer(input_text_list, return_tensors="pt")
-    input_token_size = pt_inputs.input_ids.shape[1]
     pipe_tokenizer = model.get_tokenizer()
     tok_encode_start = time.perf_counter()
     input_data = pipe_tokenizer.encode(input_text_list)
     tok_encode_end = time.perf_counter()
+    input_token_size = input_data.input_ids.shape[1]
     tok_encode_time = (tok_encode_end - tok_encode_start) * 1000
     if args['batch_size'] > 1:
         out_str = '[warm-up]' if num == 0 else '[{}]'.format(num)
@@ -310,7 +315,7 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg
     max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count']
     streamer.reset()
     start = time.perf_counter()
-    generated_tokens = model.generate(input_data, max_new_tokens=max_gen_tokens, num_beams=args["num_beams"], streamer=streamer).tokens
+    generated_tokens = model.generate(input_data, max_new_tokens=max_gen_tokens, num_beams=args["num_beams"], streamer=streamer, do_sample=False).tokens
     end = time.perf_counter()
     if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
         mem_consumption.end_collect_momory_consumption()
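The streaming path above now also passes `do_sample=False`; a short sketch of the pattern, again assuming `openvino_genai` and a hypothetical `model_dir`:

```python
import openvino_genai as ov_genai

pipe = ov_genai.LLMPipeline("model_dir", "CPU")

def streamer(subword: str) -> bool:
    print(subword, end="", flush=True)
    return False  # returning True would stop generation early

# do_sample=False keeps decoding greedy, so repeated benchmark iterations yield
# the same tokens and comparable timings.
pipe.generate("What is OpenVINO?", max_new_tokens=64, do_sample=False, streamer=streamer)
```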