
Commit b42415f

[llm bench] Move calculation of memory consumption to memory_monitor tool
1 parent 69720e5 commit b42415f

14 files changed (+632, -242 lines)

tools/llm_bench/benchmark.py (+16, -8)
@@ -9,7 +9,7 @@
 from openvino import get_version
 import torch
 import traceback
-from llm_bench_utils.memory_profile import MemConsumption
+from llm_bench_utils.memory_profile import MemMonitorWrapper
 import llm_bench_utils.output_csv
 import llm_bench_utils.output_json
 import task.visual_language_generation as bench_vlm
@@ -19,8 +19,7 @@
 import task.speech_to_text_generation as bench_speech
 
 DEFAULT_TORCH_THREAD_NUMS = 16
-mem_consumption = MemConsumption()
-
+memory_monitor = MemMonitorWrapper()
 
 def num_iters_type(x):
     x = int(x)
@@ -85,6 +84,14 @@ def get_argprser():
         help='if the value is 1, output the maximum memory consumption in warm-up iterations. If the value is 2,'
         ' output the maximum memory consumption in all iterations.',
     )
+    parser.add_argument(
+        '-mc_dir',
+        '--memory_consumption_dir',
+        default=None,
+        required=False,
+        type=str,
+        help='Path to store memory consumption logs and chart.',
+    )
     parser.add_argument('-bs', '--batch_size', type=int, default=1, required=False, help='Batch size value')
     parser.add_argument('--num_beams', type=int, default=1, help='Number of beams in the decoding strategy, activates beam_search if greater than 1')
     parser.add_argument(
@@ -226,16 +233,17 @@ def main():
             f'{original_torch_thread_nums} to {torch.get_num_threads()}, avoid to use the CPU cores for OpenVINO inference.')
         log.info(out_str)
     if args.memory_consumption:
-        mem_consumption.start_collect_mem_consumption_thread()
+        memory_monitor.create_monitors()
+        if args.memory_consumption_dir:
+            memory_monitor.set_dir(args.memory_consumption_dir)
     try:
         if model_args['use_case'] in ['text_gen', 'code_gen']:
             iter_data_list, pretrain_time, iter_timestamp = CASE_TO_BENCH[model_args['use_case']](
                 model_path, framework, args.device, args.tokens_len, args.streaming, model_args,
-                args.num_iters, mem_consumption)
+                args.num_iters, memory_monitor)
         else:
             iter_data_list, pretrain_time, iter_timestamp = CASE_TO_BENCH[model_args['use_case']](
-                model_path, framework, args.device, model_args, args.num_iters,
-                mem_consumption)
+                model_path, framework, args.device, model_args, args.num_iters, memory_monitor)
         if args.report is not None or args.report_json is not None:
             model_precision = ''
             if framework == 'ov':
@@ -276,7 +284,7 @@ def main():
         exit(1)
     finally:
         if args.memory_consumption:
-            mem_consumption.end_collect_mem_consumption_thread()
+            memory_monitor.stop()
 
 
 if __name__ == '__main__':
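
Note: this diff shows only the call sites for the new wrapper, not its implementation. The sketch below is a hypothetical, minimal MemMonitorWrapper with the same surface (create_monitors, set_dir, stop) to illustrate the lifecycle benchmark.py now drives; the real class is imported from llm_bench_utils.memory_profile, and everything inside this sketch (the psutil sampling, the start method, the attribute names) is an assumption, not the actual tool.

import os
import threading
import time

import psutil


class MemMonitorWrapper:
    """Hypothetical stand-in for llm_bench_utils.memory_profile.MemMonitorWrapper."""

    def __init__(self, interval=0.5):
        self.interval = interval    # sampling period in seconds (assumed)
        self.save_dir = None        # destination for logs and the chart
        self._thread = None
        self._stop_event = threading.Event()
        self.max_rss_mem = 0        # peak resident set size, bytes
        self.max_sys_mem = 0        # peak system-wide used memory, bytes

    def create_monitors(self):
        # Called once up front in main(); reset the collected peaks.
        self._stop_event.clear()
        self.max_rss_mem = 0
        self.max_sys_mem = 0

    def set_dir(self, save_dir):
        # Backs the new -mc_dir/--memory_consumption_dir flag.
        os.makedirs(save_dir, exist_ok=True)
        self.save_dir = save_dir

    def start(self):
        # Presumably invoked from the per-task measurement code (not part of
        # this diff): sample memory on a background thread until stop().
        process = psutil.Process()

        def sample():
            while not self._stop_event.is_set():
                self.max_rss_mem = max(self.max_rss_mem, process.memory_info().rss)
                self.max_sys_mem = max(self.max_sys_mem, psutil.virtual_memory().used)
                time.sleep(self.interval)

        self._thread = threading.Thread(target=sample, daemon=True)
        self._thread.start()

    def stop(self):
        # main() calls this in its finally block to end collection.
        self._stop_event.set()
        if self._thread is not None:
            self._thread.join()

With the new flag, a measured run might be invoked as below; --memory_consumption_dir comes from this diff, while -m, -d, and the --memory_consumption long form are assumed from the argument names visible here:

python benchmark.py -m <model_dir> -d CPU --memory_consumption 2 --memory_consumption_dir ./mem_logs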

tools/llm_bench/llm_bench_utils/gen_output_data.py (+2, -4)
@@ -12,8 +12,7 @@ def gen_iterate_data(
     latency='',
     res_md5='',
     max_rss_mem='',
-    max_shared_mem='',
-    max_uss_mem='',
+    max_sys_mem='',
     prompt_idx='',
     tokenization_time=[],
     mm_embeddings_preparation_time=''
@@ -31,8 +30,7 @@ def gen_iterate_data(
     iter_data['first_token_infer_latency'] = -1
     iter_data['other_tokens_infer_avg_latency'] = -1
     iter_data['max_rss_mem_consumption'] = max_rss_mem
-    iter_data['max_shared_mem_consumption'] = max_shared_mem
-    iter_data['max_uss_mem_consumption'] = max_uss_mem
+    iter_data['max_sys_mem_consumption'] = max_sys_mem
     iter_data['prompt_idx'] = prompt_idx
     iter_data['tokenization_time'] = tokenization_time[0] if len(tokenization_time) > 0 else ''
     iter_data['detokenization_time'] = tokenization_time[1] if len(tokenization_time) > 1 else ''
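
This file collapses the two per-process peaks (max_shared_mem, max_uss_mem) into a single system-level peak, max_sys_mem. A minimal sketch of a call after this change, assuming the parameters before latency (outside this hunk) keep their defaults; the numbers are made up for illustration:

from llm_bench_utils.gen_output_data import gen_iterate_data

# Illustrative values; in the benchmark they come from the timing loop and
# from the memory monitor.
iter_data = gen_iterate_data(
    latency=123.4,
    max_rss_mem=2048.5,   # peak process RSS reported by the monitor
    max_sys_mem=4096.0,   # consolidated system-wide peak; replaces the old
                          # max_shared_mem / max_uss_mem pair
    prompt_idx=0,
)
assert iter_data['max_sys_mem_consumption'] == 4096.0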
