Commit 311766e

print [Px][Lx]
1 parent ac263eb commit 311766e
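
This commit threads the prompt index through the whisper benchmarking path so that each latency line is tagged with both the prompt index (P) and the position of the entry in the hook's time_data list (L), matching the "[Px][Lx]" commit title. A minimal sketch, using hypothetical values, of how the old and new log prefixes render:

# Illustrative only: the values below are hypothetical stand-ins.
iter_str = '0'    # iteration label passed down from print_metrics
prompt_idx = 1    # prompt index, newly threaded through from benchmark.py
idx = 2           # position of the entry in the hook's time_data list

# Before this commit:
print(f"[{iter_str}][{idx}] encoder token latency: ...")                  # -> [0][2] ...
# After this commit:
print(f"[{iter_str}][P{prompt_idx}][L{idx}] encoder token latency: ...")  # -> [0][P1][L2] ...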

3 files changed: +8 -7 lines changed


llm_bench/python/benchmark.py (+2 -1)
@@ -752,7 +752,8 @@ def run_speech_2txt_generation(pipe, args, num, md5_list, prompt_id, audio_promp
         max_rss_mem=max_rss_mem_consumption,
         max_shared_mem=max_shared_mem_consumption,
         max_uss_mem=max_uss_mem_consumption,
-        whisper=whisper_hook
+        whisper=whisper_hook,
+        prompt_idx=prompt_id
     )
     if num > 0:
         prev_md5 = md5_list[num - 1][prompt_id]

llm_bench/python/llm_bench_utils/hook_forward_whisper.py (+2 -2)
@@ -27,15 +27,15 @@ def get_time_infer_list(self):
         time_infer_list.insert(0, self.time_data[0]['enc_infer_time'])
         return time_infer_list
 
-    def get_whisper_latency(self, iter):
+    def get_whisper_latency(self, iter, prompt_idx):
         str = ''
         for idx, data in enumerate(self.time_data):
             enc_infer_time = data['enc_infer_time'] * 1000
             dec_token_count = len(data['dec_token_time'])
             dec_infer_count = len(data['dec_infer_time'])
             dec_token_time = sum(data['dec_token_time']) / dec_token_count * 1000 if dec_token_count > 1 else 0
             dec_infer_time = sum(data['dec_infer_time']) / dec_infer_count * 1000 if dec_infer_count > 1 else 0
-            str += f"[{iter}][{idx}] encoder token latency: {enc_infer_time:.2f} ms/token, " \
+            str += f"[{iter}][P{prompt_idx}][L{idx}] encoder token latency: {enc_infer_time:.2f} ms/token, " \
                    f"decoder tokens latency: {dec_token_time:.2f} ms/token, " \
                    f"decoder infers latency: {dec_infer_time:.2f} ms/infer, " \
                    f"decoder tokens count: {dec_token_count}, " \

llm_bench/python/llm_bench_utils/metrics_print.py (+4 -4)
@@ -6,7 +6,7 @@
 
 def print_metrics(
         iter_num, iter_data, tms=None, tms_infer=None, warm_up=False, max_rss_mem=-1, max_shared_mem=-1,
-        max_uss_mem=-1, stable_diffusion=None, tokenization_time=None, batch_size=1, whisper = None
+        max_uss_mem=-1, stable_diffusion=None, tokenization_time=None, batch_size=1, whisper = None, prompt_idx=-1
 ):
     iter_str = str(iter_num)
     if warm_up:
@@ -57,7 +57,7 @@ def print_metrics(
     if stable_diffusion is not None:
         print_stable_diffusion_infer_latency(iter_str, iter_data, stable_diffusion)
     if whisper is not None:
-        print_whisper_infer_latency(iter_str, whisper)
+        print_whisper_infer_latency(iter_str, whisper, prompt_idx)
     output_str = ''
     if max_rss_mem != '' and max_rss_mem > -1:
         output_str += 'Max rss memory cost: {:.2f}MBytes, '.format(max_rss_mem)
@@ -102,8 +102,8 @@ def print_stable_diffusion_infer_latency(iter_str, iter_data, stable_diffusion):
           f"vae decoder step count: {stable_diffusion.get_vae_decoder_step_count()}",)
 
 
-def print_whisper_infer_latency(iter_str, whisper):
-    print(f'{whisper.get_whisper_latency(iter_str)}')
+def print_whisper_infer_latency(iter_str, whisper, prompt_idx):
+    print(f'{whisper.get_whisper_latency(iter_str, prompt_idx)}')
 
 
 def print_ldm_unet_vqvae_infer_latency(iter_num, iter_data, tms=None, warm_up=False):
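
Taken together, the three diffs thread one new parameter end to end: benchmark.py passes prompt_idx=prompt_id into print_metrics, which forwards it to print_whisper_infer_latency, which hands it to the hook's get_whisper_latency. A minimal sketch of that chain, using a hypothetical stub in place of the real whisper hook:

# Hypothetical stub standing in for the real whisper hook; only the one
# method used by metrics_print is modelled here.
class StubWhisperHook:
    def get_whisper_latency(self, iter, prompt_idx):
        return f"[{iter}][P{prompt_idx}][L0] encoder token latency: 12.34 ms/token"

def print_whisper_infer_latency(iter_str, whisper, prompt_idx):
    print(f'{whisper.get_whisper_latency(iter_str, prompt_idx)}')

# As in benchmark.py, the prompt index rides along as a keyword argument:
print_whisper_infer_latency('0', StubWhisperHook(), prompt_idx=1)
# -> [0][P1][L0] encoder token latency: 12.34 ms/token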
