diff --git a/tools/llm_bench/benchmark.py b/tools/llm_bench/benchmark.py index 819879e9f4..1884dba15c 100644 --- a/tools/llm_bench/benchmark.py +++ b/tools/llm_bench/benchmark.py @@ -9,7 +9,7 @@ from openvino import get_version import torch import traceback -from llm_bench_utils.memory_profile import MemConsumption +from llm_bench_utils.memory_monitor import MemMonitorWrapper import llm_bench_utils.output_csv import llm_bench_utils.output_json import task.visual_language_generation as bench_vlm @@ -19,7 +19,7 @@ import task.speech_to_text_generation as bench_speech DEFAULT_TORCH_THREAD_NUMS = 16 -mem_consumption = MemConsumption() +memory_monitor = MemMonitorWrapper() def num_iters_type(x): @@ -87,11 +87,19 @@ def get_argprser(): ) parser.add_argument( "--memory_consumption_delay", - default=0.5, + default=None, required=False, type=float, help="delay for memory consumption check in seconds, smaller value will lead to more precised memory consumption, but may affects performance." - "It is not recommended to run memory consumption and performance benchmarking in the same time" + "It is not recommended to run memory consumption and performance benchmarking in the same time", + ) + parser.add_argument( + '-mc_dir', + '--memory_consumption_dir', + default=None, + required=False, + type=str, + help='Path to store memory consumption logs and chart.', ) parser.add_argument('-bs', '--batch_size', type=int, default=1, required=False, help='Batch size value') parser.add_argument('--num_beams', type=int, default=1, help='Number of beams in the decoding strategy, activates beam_search if greater than 1') @@ -233,22 +241,25 @@ def main(): if half_nums_of_torch_threads > DEFAULT_TORCH_THREAD_NUMS: torch.set_num_threads(DEFAULT_TORCH_THREAD_NUMS) else: + half_nums_of_torch_threads = int(half_nums_of_torch_threads) if int(half_nums_of_torch_threads) else 1 torch.set_num_threads(int(half_nums_of_torch_threads)) log.info(f"The num_beams is {model_args['num_beams']}, update Torch thread num from " f'{original_torch_thread_nums} to {torch.get_num_threads()}, avoid to use the CPU cores for OpenVINO inference.') log.info(out_str) if args.memory_consumption: - mem_consumption.delay = args.memory_consumption_delay - mem_consumption.start_collect_mem_consumption_thread() + if args.memory_consumption_delay: + memory_monitor.interval = args.memory_consumption_delay + memory_monitor.create_monitors() + if args.memory_consumption_dir: + memory_monitor.set_dir(args.memory_consumption_dir) try: if model_args['use_case'] in ['text_gen', 'code_gen']: iter_data_list, pretrain_time, iter_timestamp = CASE_TO_BENCH[model_args['use_case']]( model_path, framework, args.device, args.tokens_len, args.streaming, model_args, - args.num_iters, mem_consumption) + args.num_iters, memory_monitor) else: iter_data_list, pretrain_time, iter_timestamp = CASE_TO_BENCH[model_args['use_case']]( - model_path, framework, args.device, model_args, args.num_iters, - mem_consumption) + model_path, framework, args.device, model_args, args.num_iters, memory_monitor) if args.report is not None or args.report_json is not None: model_precision = '' if framework == 'ov': @@ -289,7 +300,7 @@ def main(): exit(1) finally: if args.memory_consumption: - mem_consumption.end_collect_mem_consumption_thread() + memory_monitor.stop() if __name__ == '__main__': diff --git a/tools/llm_bench/llm_bench_utils/gen_output_data.py b/tools/llm_bench/llm_bench_utils/gen_output_data.py index 594903912d..1287a0fd81 100644 ---
a/tools/llm_bench/llm_bench_utils/gen_output_data.py +++ b/tools/llm_bench/llm_bench_utils/gen_output_data.py @@ -12,8 +12,9 @@ def gen_iterate_data( latency='', res_md5='', max_rss_mem='', - max_shared_mem='', - max_uss_mem='', + max_rss_mem_increase='', + max_sys_mem='', + max_sys_mem_increase='', prompt_idx='', tokenization_time=[], mm_embeddings_preparation_time='' @@ -31,8 +32,9 @@ def gen_iterate_data( iter_data['first_token_infer_latency'] = -1 iter_data['other_tokens_infer_avg_latency'] = -1 iter_data['max_rss_mem_consumption'] = max_rss_mem - iter_data['max_shared_mem_consumption'] = max_shared_mem - iter_data['max_uss_mem_consumption'] = max_uss_mem + iter_data['max_rss_mem_increase'] = max_rss_mem_increase + iter_data['max_sys_mem_consumption'] = max_sys_mem + iter_data['max_sys_mem_increase'] = max_sys_mem_increase iter_data['prompt_idx'] = prompt_idx iter_data['tokenization_time'] = tokenization_time[0] if len(tokenization_time) > 0 else '' iter_data['detokenization_time'] = tokenization_time[1] if len(tokenization_time) > 1 else '' diff --git a/tools/llm_bench/llm_bench_utils/memory_monitor.py b/tools/llm_bench/llm_bench_utils/memory_monitor.py new file mode 100644 index 0000000000..db6f5fefe9 --- /dev/null +++ b/tools/llm_bench/llm_bench_utils/memory_monitor.py @@ -0,0 +1,419 @@ +# Copyright (c) 2025 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import atexit +import queue +import threading +import time +from enum import Enum +from functools import lru_cache +from functools import partial +from pathlib import Path +from typing import Callable, List, Optional, Tuple + +import psutil +import matplotlib +import matplotlib.pyplot as plt +import logging as log + + +# CUSTOM FIX TO AVOID ISSUE: RuntimeError: main thread is not in main loop +matplotlib.use('Agg') + + +class MemoryType(Enum): + RSS = "rss" + SYSTEM = "system" + + +class MemoryUnit(Enum): + B = "B" # byte + KiB = "KiB" # Kibibyte + MiB = "MiB" # Mibibyte + GiB = "GiB" # Gibibyte + KB = "KB" # Kilobyte + MB = "MB" # Megabyte + GB = "GB" # Gigabyte + + +@lru_cache +def system_memory_warning(): + # Log once + log.warning( + "Please note that MemoryType.SYSTEM in general is affected by other processes that change RAM availability." + ) + + +class MemoryMonitor: + def __init__( + self, + interval: Optional[float] = 0.1, + memory_type: Optional[MemoryType] = MemoryType.RSS, + memory_unit: Optional[MemoryUnit] = MemoryUnit.MiB, + include_child_processes: Optional[bool] = None, + ): + """ + Memory monitoring utility to measure python process memory footprint. After start() is called, it + creates a thread which runs in parallel and takes memory measurements every *interval* seconds using the + specified *memory_type* approach. When stop() is called, the memory measuring thread is stopped. The results + can be obtained by calling get_data(). Memory logs can be saved by calling save_memory_logs(). There are two + log files: one with data values in a .txt format and another one in a form of a 2D time-memory plot. 
+ + The memory monitor itself allocates some memory, especially during figure saving. It is advised to use it + for measuring large memory processes. + + :param interval: How frequently to take memory measurements (in seconds). + :param memory_type: Type of memory to log. Accepts two possible values: + - MemoryType.RSS: Resident Set Size is the portion of memory occupied by a process that is held in RAM. + Values are obtained through psutil library. If some data is read using mmap, RSS will report this data + as allocated, however this is not necessarily the case. + - MemoryType.SYSTEM: This metric is defined as the difference between total system virtual memory + and system available memory. Be aware that this way it is affected by other processes that can change + RAM availability. It is advised to call get_data(memory_from_zero=True) for this type of memory logging, + if one is interested in memory footprint for a certain process. This subtracts the starting memory from + all values. + + RSS and SYSTEM behave differently when mmap is used, e.g. during OV model loading. RSS will report data + which was read with mmap enabled as allocated, however this is not necessarily the case. SYSTEM does not + report memory loaded with mmap. So it can be used to analyze "pure" memory usage without contribution of + mmap pages which are actually free, but are reported as allocated by RSS. + :param memory_unit: Unit to report memory in. + :param include_child_processes: For MemoryType.RSS only: whether to include memory of child processes. If not + provided, child processes are counted. + """ + self.interval = interval + self.memory_type = memory_type + if memory_type == MemoryType.SYSTEM: + system_memory_warning() + elif memory_type == MemoryType.RSS: + if include_child_processes is None: + include_child_processes = True + else: + raise ValueError("Unknown memory type to log") + self.memory_unit = memory_unit + self.include_child_processes = include_child_processes + + self._monitoring_thread_should_stop = False + self._monitoring_in_progress = False + + self._memory_monitor_thread = None + self._memory_values_queue = None + self._stop_logging_atexit_fn = None + + def start(self, at_exit_fn: Optional[Callable] = None) -> "MemoryMonitor": + """ + Start memory monitoring. + + :param at_exit_fn: A callable to execute at program exit. Useful for providing a logs saving routine, e.g. + ``` + at_exit_fn = lambda: memory_monitor.save_memory_logs(*memory_monitor.get_data(), save_dir) + memory_monitor.start(at_exit_fn=at_exit_fn) + ``` + """ + if self._monitoring_in_progress: + raise Exception("Monitoring already in progress") + + self._memory_values_queue = queue.Queue() + self._monitoring_thread_should_stop = False + + self._memory_monitor_thread = threading.Thread(target=self._monitor_memory) + self._memory_monitor_thread.daemon = True + self._memory_monitor_thread.start() + if at_exit_fn: + self._stop_logging_atexit_fn = at_exit_fn + atexit.register(self._stop_logging_atexit_fn) + + self._monitoring_in_progress = True + + return self + + def stop(self): + """ + Stop memory monitoring. 
+ """ + if not self._monitoring_in_progress: + return + self._monitoring_thread_should_stop = True + self._monitoring_in_progress = False + self._memory_monitor_thread.join() + if self._stop_logging_atexit_fn is not None: + atexit.unregister(self._stop_logging_atexit_fn) + self._stop_logging_atexit_fn = None + + def get_data(self, memory_from_zero: Optional[bool] = False) -> Tuple[List, List]: + """ + :param memory_from_zero: Whether to normalize memory measurements by subtracting the first value. This way + the measurements will start with 0. Hence, is not very reliable and may actually result in negative values. + :returns: A tuple of list where the first element corresponds to measurements timestamps and the second one -- + to memory values. + """ + memory_usage_data = list(self._memory_values_queue.queue) + if len(memory_usage_data) == 0: + return [], [] + time_values, memory_values = tuple(zip(*memory_usage_data)) + time_values = _subtract_first_element(list(time_values)) + if memory_from_zero: + memory_values = _subtract_first_element(list(memory_values)) + + # Convert to target memory unit + memory_values = list(map(partial(_cast_bytes_to, memory_unit=self.memory_unit), memory_values)) + + return time_values, memory_values + + def save_memory_logs( + self, + time_values: List[float], + memory_values: List[float], + save_dir: Path, + plot_title: Optional[str] = "", + filename_suffix: Optional[str] = "", + ): + """ + Save memory logs as a text file and a 2D plot. + + :param time_values: Timestamps of the memory measurements. + :param memory_values: Memory measurements. + :param save_dir: Directory to save logs into. + :param plot_title: A title for a plot. + :param filename_suffix: A string suffix to give to the saved files. + """ + if not save_dir.exists(): + save_dir.mkdir(parents=True) + + filename_label = f"{self.memory_type.value}_memory_usage{filename_suffix}" + # Save measurements to text file + log_filepath = save_dir / f"{filename_label}.txt" + with open(log_filepath, "w") as log_file: + if len(time_values) == 0: + log_file.write("No measurements recorded.\nPlease make sure logging duration or interval were enough.") + return + for timestamp, memory_usage in zip(time_values, memory_values): + log_file.write(f"{timestamp} {memory_usage:.3f}\n") + + log_file.writelines( + [ + f"Total time: {time_values[-1] - time_values[0]}\n", + f"Max memory: {max(memory_values):.3f} ({self.memory_unit.value})", + ] + ) + + # Save measurements plot + self.save_memory_plot(log_filepath, plot_title) + + def save_memory_plot(self, log_filepath: Path, plot_title: Optional[str] = "", filename_suffix: Optional[str] = ""): + """ + Parse pre-saved txt file logs and plot a new figure based on this data. May be useful for re-plotting with + different title. + + :param log_filepath: A path to a .txt log file. + :param plot_title: A title to give to a plot. + :param filename_suffix: A string suffix to give to the saved figure. 
+ """ + with open(log_filepath, "r") as f: + lines = f.readlines() + time_values, memory_values = [], [] + for line in lines[:-2]: + time_value, memory_value = tuple(map(float, line.split(" "))) + time_values.append(time_value) + memory_values.append(memory_value) + + fig = plt.figure(figsize=(10, 6)) + plt.plot(time_values, memory_values) + plt.xlabel("Time (seconds)") + plt.ylabel(f"Memory Usage ({self.memory_type.value}, {self.memory_unit.value})") + plt.title(f"{plot_title} Max_{self.memory_type.value}: {max(memory_values):.2f} {self.memory_unit.value}") + plt.grid(True) + plt.tight_layout() + plt.savefig(str(log_filepath).replace(".txt", f"{filename_suffix}.png")) + plt.close(fig) + + def __enter__(self) -> "MemoryMonitor": + return self.start() + + def __exit__(self, exc_type, exc_val, exc_tb): + self.stop() + + def _monitor_memory(self): + while not self._monitoring_thread_should_stop: + _last_measurement_time = time.perf_counter() + if self.memory_type == MemoryType.RSS: + bytes_used = psutil.Process().memory_info().rss + if self.include_child_processes: + for child_process in psutil.Process().children(recursive=True): + bytes_used += psutil.Process(child_process.pid).memory_info().rss + elif self.memory_type == MemoryType.SYSTEM: + bytes_used = psutil.virtual_memory().total - psutil.virtual_memory().available + else: + raise Exception("Unknown memory type to log") + if self._monitoring_thread_should_stop: + break + self._memory_values_queue.put((time.perf_counter(), bytes_used)) + time.sleep(max(0.0, self.interval - (time.perf_counter() - _last_measurement_time))) + + +class memory_monitor_context: + def __init__( + self, + interval: Optional[float] = 0.01, + memory_unit: Optional[MemoryUnit] = MemoryUnit.MiB, + return_max_value: Optional[bool] = True, + save_dir: Optional[Path] = None, + ): + """ + A memory monitor context manager which monitors both RSS and SYSTEM memory types. After, it stores the + result for the maximum memory recorded if `return_max_value=True or the whole time-memory sequences. Works + by subtracting the first memory measurement from all the other ones so that the resulting sequence starts + from 0. Hence, it can actually return negative memory values. + + After exiting, the result is stored at .memory_data field -- a dict with memory types (RSS or SYSTEM) + as keys. The values are either a single float number if return_max_value is provided, or a tuple with time + and memory value lists. + + Additionally, memory logs may be saved by providing save_dir argument. + + :param interval: Interval in seconds to take measurements. + :param memory_unit: Memory unit. + :param return_max_value: Whether to return max value for each memory type or full memory sequences. + :param save_dir: If provided, will save memory logs at this location. 
+ """ + + self.memory_monitors = {} + for memory_type in [MemoryType.RSS, MemoryType.SYSTEM]: + self.memory_monitors[memory_type] = MemoryMonitor( + interval=interval, memory_type=memory_type, memory_unit=memory_unit + ) + self.return_max_value = return_max_value + self.save_dir = save_dir + + self.memory_data = {'full_mem': {}, 'from_zero': {}} + + def __enter__(self): + for mm in self.memory_monitors.values(): + mm.start() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + # Stop addition of new values as soon as possible + for mm in self.memory_monitors.values(): + mm._monitoring_thread_should_stop = True + + for mt, mm in self.memory_monitors.items(): + mm.stop() + for fz in [False, True]: + time_values, memory_values = mm.get_data(memory_from_zero=fz) + + mm_measure_type = 'from_zero' if fz else 'full_mem' + self.memory_data[mm_measure_type][mt] = max(memory_values) if self.return_max_value else (time_values, memory_values) + + if self.save_dir: + mm.save_memory_logs( + time_values, + memory_values, + save_dir=self.save_dir, + filename_suffix="_mem_increase" if fz else "", + ) + + +class MemMonitorWrapper(): + def __init__(self): + self.save_dir = None + + self.interval = 0.01 + self.memory_unit = MemoryUnit.MiB + + self.memory_types = [MemoryType.RSS, MemoryType.SYSTEM] + + self.memory_monitors = {} + self.memory_data = {'full_mem': {}, 'from_zero': {}} + + def create_monitors(self): + for memory_type in self.memory_types: + self.memory_monitors[memory_type] = MemoryMonitor( + interval=self.interval, memory_type=memory_type, memory_unit=self.memory_unit + ) + + def set_dir(self, dir): + if not Path(dir).exists(): + log.warning(f"Path to dir for memory consamption data is not exists {dir}, run without it.") + else: + self.save_dir = Path(dir) + + def start(self, delay=None): + self.memory_data = {'full_mem': {}, 'from_zero': {}} + for mm in self.memory_monitors.values(): + mm.start() + + # compilation could be very fast, apply delay + if delay: + time.sleep(delay) + else: + time.sleep(self.interval * 3) + + def stop_and_collect_data(self, dir_name='mem_monitor_log'): + self.stop() + + for mt, mm in self.memory_monitors.items(): + if not mm._memory_values_queue or len(mm._memory_values_queue.queue) == 0: + continue + + for from_zero in [False, True]: + time_values, memory_values = mm.get_data(memory_from_zero=from_zero) + + mm_measure_type = 'from_zero' if from_zero else 'full_mem' + self.memory_data[mm_measure_type][mt] = max(memory_values) + + if self.save_dir: + mm.save_memory_logs( + time_values, + memory_values, + save_dir=self.save_dir / dir_name, + filename_suffix="_mem_increase" if from_zero else "", + ) + + def stop(self): + # Stop addition of new values as soon as possible + for mm in self.memory_monitors.values(): + mm._monitoring_thread_should_stop = True + + for mm in self.memory_monitors.values(): + mm.stop() + + def get_data(self): + return (self.memory_data['full_mem'].get(MemoryType.RSS, -1), self.memory_data['from_zero'].get(MemoryType.RSS, -1), + self.memory_data['full_mem'].get(MemoryType.SYSTEM, -1), self.memory_data['from_zero'].get(MemoryType.SYSTEM, -1)) + + def log_data(self, comment): + max_rss_mem, max_rss_increase, max_sys_mem, max_sys_increase = self.get_data() + msg = (f"Max rss memory cost {comment}: {max_rss_mem:.2f}{self.memory_unit.value}, " + f"rss memory increase {comment}: {max_rss_increase:.2f}{self.memory_unit.value}, " + f"max system memory cost {comment}: {max_sys_mem:.2f}{self.memory_unit.value}, " + f"system memory increase 
{comment}: {max_sys_increase:.2f}{self.memory_unit.value}") + log.info(msg) + + +def _cast_bytes_to(bytes, memory_unit, round_to_int=False): + memory_unit_divisors = { + MemoryUnit.B: 1, + MemoryUnit.KiB: 2**10, + MemoryUnit.MiB: 2**20, + MemoryUnit.GiB: 2**30, + MemoryUnit.KB: 10**3, + MemoryUnit.MB: 10**6, + MemoryUnit.GB: 10**9, + } + result = bytes / memory_unit_divisors[memory_unit] + return int(result) if round_to_int else result + + +def _subtract_first_element(data): + for i in range(1, len(data)): + data[i] = data[i] - data[0] + data[0] = 0 + return data diff --git a/tools/llm_bench/llm_bench_utils/memory_profile.py b/tools/llm_bench/llm_bench_utils/memory_profile.py deleted file mode 100644 index 813e3e8489..0000000000 --- a/tools/llm_bench/llm_bench_utils/memory_profile.py +++ /dev/null @@ -1,96 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2023-2025 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -from threading import Event, Thread -import psutil -import time -import os -import sys - - -class MemConsumption: - def __init__(self): - """Initialize MemConsumption.""" - self.g_exit_get_mem_thread = False - self.g_end_collect_mem = False - self.g_max_rss_mem_consumption = -1 - self.g_max_uss_mem_consumption = -1 - self.g_max_shared_mem_consumption = -1 - self.g_event = Event() - self.g_data_event = Event() - self.delay = 0.5 - - def collect_memory_consumption(self): - """Collect the data.""" - while self.g_exit_get_mem_thread is False: - self.g_event.wait() - while True: - process = psutil.Process(os.getpid()) - try: - memory_full_info = process.memory_full_info() - rss_mem_data = memory_full_info.rss - if sys.platform.startswith('linux'): - shared_mem_data = memory_full_info.shared - uss_mem_data = rss_mem_data - shared_mem_data - elif sys.platform.startswith('win'): - uss_mem_data = memory_full_info.uss - shared_mem_data = rss_mem_data - uss_mem_data - else: - uss_mem_data = -1 - shared_mem_data = -1 - except Exception: - rss_mem_data = -1 - uss_mem_data = -1 - shared_mem_data = -1 - - if rss_mem_data > self.g_max_rss_mem_consumption: - self.g_max_rss_mem_consumption = rss_mem_data - if shared_mem_data > self.g_max_shared_mem_consumption: - self.g_max_shared_mem_consumption = shared_mem_data - if uss_mem_data > self.g_max_uss_mem_consumption: - self.g_max_uss_mem_consumption = uss_mem_data - self.g_data_event.set() - if self.g_end_collect_mem is True: - self.g_event.set() - self.g_event.clear() - self.g_end_collect_mem = False - break - time.sleep(self.delay) - - def start_collect_memory_consumption(self): - """Start collect.""" - self.g_end_collect_mem = False - self.g_event.set() - - def end_collect_momory_consumption(self): - """Stop collect.""" - self.g_end_collect_mem = True - self.g_event.wait() - - def get_max_memory_consumption(self): - """Return the data.""" - self.g_data_event.wait() - self.g_data_event.clear() - max_rss_mem = self.g_max_rss_mem_consumption / float(2**20) if self.g_max_rss_mem_consumption > -1 else -1 - max_shared_mem = self.g_max_shared_mem_consumption / float(2**20) if self.g_max_shared_mem_consumption > -1 else -1 - max_uss_mem = self.g_max_uss_mem_consumption / float(2**20) if self.g_max_uss_mem_consumption > -1 else -1 - return max_rss_mem, max_shared_mem, max_uss_mem - - def clear_max_memory_consumption(self): - """Clear MemConsumption.""" - self.g_max_rss_mem_consumption = -1 - self.g_max_uss_mem_consumption = -1 - self.g_max_shared_mem_consumption = -1 - - def start_collect_mem_consumption_thread(self): - """Start the thread.""" - 
self.t_mem_thread = Thread(target=self.collect_memory_consumption) - self.t_mem_thread.start() - - def end_collect_mem_consumption_thread(self): - """End the thread.""" - self.g_event.set() - self.g_data_event.set() - self.g_end_collect_mem = True - self.g_exit_get_mem_thread = True - self.t_mem_thread.join() diff --git a/tools/llm_bench/llm_bench_utils/metrics_print.py b/tools/llm_bench/llm_bench_utils/metrics_print.py index 8668e3bba2..ca0fc2d20f 100644 --- a/tools/llm_bench/llm_bench_utils/metrics_print.py +++ b/tools/llm_bench/llm_bench_utils/metrics_print.py @@ -5,8 +5,7 @@ def print_metrics( - iter_num, iter_data, tms=None, tms_infer=None, warm_up=False, max_rss_mem=-1, max_shared_mem=-1, - max_uss_mem=-1, stable_diffusion=None, tokenization_time=None, batch_size=1, prompt_idx=-1, whisper=None + iter_num, iter_data, tms=None, tms_infer=None, warm_up=False, stable_diffusion=None, tokenization_time=None, batch_size=1, prompt_idx=-1, whisper=None ): iter_str = str(iter_num) if warm_up: @@ -62,12 +61,14 @@ def print_metrics( if whisper is not None: print_whisper_infer_latency(iter_str, whisper, prompt_idx) output_str = '' - if max_rss_mem != '' and max_rss_mem > -1: - output_str += 'Max rss memory cost: {:.2f}MBytes, '.format(max_rss_mem) - if max_uss_mem != '' and max_uss_mem > -1: - output_str += 'max uss memory cost: {:.2f}MBytes, '.format(max_uss_mem) - if max_shared_mem != '' and max_shared_mem > -1: - output_str += 'max shared memory cost: {:.2f}MBytes'.format(max_shared_mem) + if iter_data['max_rss_mem_consumption'] != '' and iter_data['max_rss_mem_consumption'] > -1: + output_str += f"Max rss memory cost: {iter_data['max_rss_mem_consumption']:.2f}MBytes, " + if iter_data['max_rss_mem_increase'] != '' and iter_data['max_rss_mem_increase'] > -1: + output_str += f"rss memory increase: {iter_data['max_rss_mem_increase']:.2f}MBytes, " + if iter_data['max_sys_mem_consumption'] != '' and iter_data['max_sys_mem_consumption'] > -1: + output_str += f"max system memory cost: {iter_data['max_sys_mem_consumption']:.2f}MBytes, " + if iter_data['max_sys_mem_increase'] != '' and iter_data['max_sys_mem_increase'] > -1: + output_str += f"system memory increase: {iter_data['max_sys_mem_increase']:.2f}MBytes " if output_str != '': output_str = ' '.join([prefix, output_str]) log.info(output_str) diff --git a/tools/llm_bench/llm_bench_utils/output_csv.py b/tools/llm_bench/llm_bench_utils/output_csv.py index ea1402f82f..72ccb3f7d4 100644 --- a/tools/llm_bench/llm_bench_utils/output_csv.py +++ b/tools/llm_bench/llm_bench_utils/output_csv.py @@ -49,7 +49,7 @@ def output_comments(result, use_case, writer): 'max_rss_mem: max rss memory consumption;' ) comment_list.append( - 'max_shared_mem: max shared memory consumption;' + 'max_sys_mem: max system memory consumption;' ) for comments in comment_list: @@ -95,8 +95,7 @@ def gen_data_to_csv(result, iter_data, pretrain_time, iter_timestamp): first_token_infer_latency = iter_data['first_token_infer_latency'] other_token_infer_latency = iter_data['other_tokens_infer_avg_latency'] rss_mem = iter_data['max_rss_mem_consumption'] - uss_mem = iter_data['max_uss_mem_consumption'] - shared_mem = iter_data['max_shared_mem_consumption'] + sys_mem = iter_data['max_sys_mem_consumption'] token_time = iter_data['tokenization_time'] detoken_time = iter_data['detokenization_time'] result['iteration'] = str(iter_data['iteration']) @@ -124,8 +123,7 @@ def gen_data_to_csv(result, iter_data, pretrain_time, iter_timestamp): else: result['2nd_infer_avg_latency(ms)'] = 
round(other_token_infer_latency, 5) if other_token_infer_latency != '' else other_token_infer_latency result['max_rss_mem(MB)'] = round(rss_mem, 5) if rss_mem != '' else rss_mem - result['max_uss_mem(MB)'] = round(uss_mem, 5) if uss_mem != '' else uss_mem - result['max_shared_mem(MB)'] = round(shared_mem, 5) if shared_mem != '' else shared_mem + result['max_sys_mem(MB)'] = round(sys_mem, 5) if sys_mem != '' else sys_mem result['prompt_idx'] = iter_data['prompt_idx'] result['tokenization_time'] = round(token_time, 5) if token_time != '' else token_time result['detokenization_time'] = round(detoken_time, 5) if detoken_time != '' else detoken_time @@ -148,8 +146,7 @@ def write_result(report_file, model, framework, device, model_args, iter_data_li '2nd_avg_latency(ms)', 'precision', 'max_rss_mem(MB)', - 'max_uss_mem(MB)', - 'max_shared_mem(MB)', + 'max_sys_mem(MB)', 'prompt_idx', '1st_infer_latency(ms)', '2nd_infer_avg_latency(ms)', diff --git a/tools/llm_bench/llm_bench_utils/output_json.py b/tools/llm_bench/llm_bench_utils/output_json.py index 4a95a9e94d..08ea1c8e79 100644 --- a/tools/llm_bench/llm_bench_utils/output_json.py +++ b/tools/llm_bench/llm_bench_utils/output_json.py @@ -15,8 +15,7 @@ def write_result(report_file, model, framework, device, model_args, iter_data_li first_token_infer_latency = iter_data['first_token_infer_latency'] other_token_infer_latency = iter_data['other_tokens_infer_avg_latency'] rss_mem = iter_data['max_rss_mem_consumption'] - uss_mem = iter_data['max_uss_mem_consumption'] - shared_mem = iter_data['max_shared_mem_consumption'] + max_sys_mem = iter_data['max_sys_mem_consumption'] tokenization_time = iter_data['tokenization_time'] detokenization_time = iter_data['detokenization_time'] @@ -39,8 +38,7 @@ def write_result(report_file, model, framework, device, model_args, iter_data_li 'first_infer_latency': round(first_token_infer_latency, 5) if first_token_infer_latency != '' else first_token_infer_latency, 'second_infer_avg_latency': round(other_token_infer_latency, 5) if other_token_infer_latency != '' else other_token_infer_latency, 'max_rss_mem': round(rss_mem, 5) if rss_mem != '' else -1, - 'max_uss_mem': round(uss_mem, 5) if uss_mem != '' else -1, - 'max_shared_mem': round(shared_mem, 5) if shared_mem != '' else -1, + 'max_sys_mem': round(max_sys_mem, 5) if max_sys_mem != '' else -1, 'prompt_idx': iter_data['prompt_idx'], 'tokenization_time': round(tokenization_time, 5) if tokenization_time != '' else tokenization_time, 'detokenization_time': round(detokenization_time, 5) if detokenization_time != '' else detokenization_time, diff --git a/tools/llm_bench/llm_bench_utils/ov_utils.py b/tools/llm_bench/llm_bench_utils/ov_utils.py index 6aa3ec2395..011c56402f 100644 --- a/tools/llm_bench/llm_bench_utils/ov_utils.py +++ b/tools/llm_bench/llm_bench_utils/ov_utils.py @@ -102,7 +102,7 @@ def get_lora_config(lora_paths, lora_alphas, lora_mode=None): return adapter_config -def create_text_gen_model(model_path, device, **kwargs): +def create_text_gen_model(model_path, device, memory_monitor, **kwargs): """Create text generation model. - model_path: can be model_path or IR path @@ -130,7 +130,7 @@ def create_text_gen_model(model_path, device, **kwargs): log.warning(f"OpenVINO GenAI based benchmarking is not available for {model_type}. 
Will be switched to default benchmarking") else: log.info("Selected OpenVINO GenAI for benchmarking") - return create_genai_text_gen_model(model_path, device, ov_config, **kwargs) + return create_genai_text_gen_model(model_path, device, ov_config, memory_monitor, **kwargs) log.info("Selected Optimum Intel for benchmarking") remote_code = False try: @@ -138,6 +138,9 @@ def create_text_gen_model(model_path, device, **kwargs): except Exception: model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) remote_code = True + + if kwargs.get("mem_consumption"): + memory_monitor.start() start = time.perf_counter() ov_model = model_class.from_pretrained( model_path, @@ -148,6 +151,9 @@ def create_text_gen_model(model_path, device, **kwargs): trust_remote_code=remote_code ) end = time.perf_counter() + if kwargs.get("mem_consumption"): + memory_monitor.stop_and_collect_data('compilation_phase') + memory_monitor.log_data('for copmpilation phase') bench_hook = get_bench_hook(kwargs['num_beams'], ov_model) from_pretrained_time = end - start log.info(f'From pretrained time: {from_pretrained_time:.2f}s') @@ -173,7 +179,7 @@ def get_scheduler_config_genai(user_config, config_name="CB config"): return scheduler_config -def create_genai_text_gen_model(model_path, device, ov_config, **kwargs): +def create_genai_text_gen_model(model_path, device, ov_config, memory_monitor, **kwargs): import openvino_genai from transformers import AutoTokenizer from packaging.version import parse @@ -213,10 +219,15 @@ def create_genai_text_gen_model(model_path, device, ov_config, **kwargs): if adapter_config: ov_config['adapters'] = adapter_config + if kwargs.get("mem_consumption"): + memory_monitor.start() start = time.perf_counter() llm_pipe = openvino_genai.LLMPipeline(model_path, device.upper(), **ov_config) end = time.perf_counter() log.info(f'Pipeline initialization time: {end - start:.2f}s') + if kwargs.get("mem_consumption"): + memory_monitor.stop_and_collect_data('compilation_phase') + memory_monitor.log_data('for copmpilation phase') class TokenStreamer(openvino_genai.StreamerBase): def __init__(self, tokenizer): @@ -259,7 +270,7 @@ def convert_ov_tokenizer(tokenizer_path): export_tokenizer(hf_tokenizer, tokenizer_path) -def create_image_gen_model(model_path, device, **kwargs): +def create_image_gen_model(model_path, device, memory_monitor, **kwargs): model_index_data = {} with open(str(model_path / "model_index.json"), 'r') as f: model_index_data = json.load(f) @@ -278,8 +289,10 @@ def create_image_gen_model(model_path, device, **kwargs): else: if kwargs.get("genai", True) and is_genai_available(log_msg=True): log.info("Selected OpenVINO GenAI for benchmarking") - return create_genai_image_gen_model(model_path, device, ov_config, model_index_data, **kwargs) + return create_genai_image_gen_model(model_path, device, ov_config, model_index_data, memory_monitor, **kwargs) + if kwargs.get("mem_consumption"): + memory_monitor.start() log.info("Selected Optimum Intel for benchmarking") start = time.perf_counter() if kwargs.get("static_reshape", False): @@ -293,6 +306,9 @@ def create_image_gen_model(model_path, device, **kwargs): else: ov_model = model_class.from_pretrained(model_path, device=device, ov_config=ov_config) end = time.perf_counter() + if kwargs.get("mem_consumption"): + memory_monitor.stop_and_collect_data('compilation_phase') + memory_monitor.log_data('for copmpilation phase') from_pretrained_time = end - start log.info(f'From pretrained time: {from_pretrained_time:.2f}s') return ov_model, 
from_pretrained_time, False, None @@ -331,7 +347,7 @@ def get_genai_unet_model(model_index_data, model_path, device, ov_config): return unet -def create_genai_image_gen_model(model_path, device, ov_config, model_index_data, **kwargs): +def create_genai_image_gen_model(model_path, device, ov_config, model_index_data, memory_monitor, **kwargs): import openvino_genai class PerfCollector: @@ -406,6 +422,8 @@ def raw_metrics(self): orig_tokenizer = AutoTokenizer.from_pretrained(model_path, subfolder="tokenizer") callback.orig_tokenizer = orig_tokenizer + if kwargs.get("mem_consumption"): + memory_monitor.start() start = time.perf_counter() scheduler_type = model_index_data.get("scheduler", ["", ""])[1] @@ -454,11 +472,14 @@ def raw_metrics(self): image_gen_pipe = image_gen_pipeline_class(model_path, device.upper(), **ov_config) end = time.perf_counter() + if kwargs.get("mem_consumption"): + memory_monitor.stop_and_collect_data('compilation_phase') + memory_monitor.log_data('for compilation phase') log.info(f'Pipeline initialization time: {end - start:.2f}s') return image_gen_pipe, end - start, True, callback -def create_ldm_super_resolution_model(model_path, device, **kwargs): +def create_ldm_super_resolution_model(model_path, device, memory_monitor, **kwargs): core = Core() ov_config = kwargs['config'] core.set_property(ov_config) @@ -466,30 +487,40 @@ model_type = kwargs.get('model_type', default_model_type) model_class = OV_MODEL_CLASSES_MAPPING[model_type] model_path = Path(model_path) + if kwargs.get("mem_consumption"): + memory_monitor.start() start = time.perf_counter() ov_model = model_class(model_path, core, device.upper()) end = time.perf_counter() + if kwargs.get("mem_consumption"): + memory_monitor.stop_and_collect_data('compilation_phase') + memory_monitor.log_data('for compilation phase') from_pretrained_time = end - start log.info(f'From pretrained time: {from_pretrained_time:.2f}s') return ov_model, from_pretrained_time -def create_genai_speech_2_txt_model(model_path, device, **kwargs): +def create_genai_speech_2_txt_model(model_path, device, memory_monitor, **kwargs): import openvino_genai as ov_genai if kwargs.get("genai", True) is False: raise RuntimeError('==Failure the command line does not set --genai ==') if is_genai_available(log_msg=True) is False: raise RuntimeError('==Failure genai is not enable ==') + if kwargs.get("mem_consumption"): + memory_monitor.start() start = time.perf_counter() genai_pipe = ov_genai.WhisperPipeline(model_path, device.upper()) end = time.perf_counter() + if kwargs.get("mem_consumption"): + memory_monitor.stop_and_collect_data('compilation_phase') + memory_monitor.log_data('for compilation phase') from_pretrained_time = end - start log.info(f'From pretrained time: {from_pretrained_time:.2f}s') processor = AutoProcessor.from_pretrained(model_path) return genai_pipe, processor, from_pretrained_time, True -def create_speech_2txt_model(model_path, device, **kwargs): +def create_speech_2txt_model(model_path, device, memory_monitor, **kwargs): """Create speech generation model. - model_path: can be model_path or IR path - device: can be CPU @@ -509,14 +540,19 @@ def create_speech_2txt_model(model_path, device, **kwargs): log.warning("OpenVINO GenAI based benchmarking is not available for {model_type}. 
Will be switched to default bencmarking") else: log.info("Selected OpenVINO GenAI for benchmarking") - return create_genai_speech_2_txt_model(model_path, device, **kwargs) + return create_genai_speech_2_txt_model(model_path, device, memory_monitor, **kwargs) log.info("Selected Optimum Intel for benchmarking") + if kwargs.get("mem_consumption"): + memory_monitor.start() start = time.perf_counter() ov_model = model_class.from_pretrained( model_path, device=device ) end = time.perf_counter() + if kwargs.get("mem_consumption"): + memory_monitor.stop_and_collect_data('compilation_phase') + memory_monitor.log_data('for copmpilation phase') from_pretrained_time = end - start log.info(f'From pretrained time: {from_pretrained_time:.2f}s') processor = AutoProcessor.from_pretrained(model_path) @@ -546,7 +582,7 @@ def get_vlm_processor(model_path): return preprocessors -def create_genai_image_text_gen_model(model_path, device, ov_config, **kwargs): +def create_genai_image_text_gen_model(model_path, device, ov_config, memory_monitor, **kwargs): import openvino_genai if not (model_path / "openvino_tokenizer.xml").exists() or not (model_path / "openvino_detokenizer.xml").exists(): @@ -560,16 +596,21 @@ def create_genai_image_text_gen_model(model_path, device, ov_config, **kwargs): log.info("Continuous Batching mode activated") ov_config["scheduler_config"] = get_scheduler_config_genai(cb_config) + if kwargs.get("mem_consumption"): + memory_monitor.start() start = time.perf_counter() llm_pipe = openvino_genai.VLMPipeline(model_path, device.upper(), **ov_config) end = time.perf_counter() log.info("Selected OpenVINO GenAI for benchmarking") + if kwargs.get("mem_consumption"): + memory_monitor.stop_and_collect_data('compilation_phase') + memory_monitor.log_data('for copmpilation phase') log.info(f'Pipeline initialization time: {end - start:.2f}s') return llm_pipe, processor_config, end - start, None, True -def create_image_text_gen_model(model_path, device, **kwargs): +def create_image_text_gen_model(model_path, device, memory_monitor, **kwargs): model_path = Path(model_path) # specify the model path if model_path.name.endswith('xml'): @@ -590,7 +631,7 @@ def create_image_text_gen_model(model_path, device, **kwargs): remote_code = True if kwargs.get("genai", True) and is_genai_available(log_msg=True): try: - return create_genai_image_text_gen_model(model_path, device, ov_config, **kwargs) + return create_genai_image_text_gen_model(model_path, device, ov_config, memory_monitor, **kwargs) except Exception as exp: log.warning( f"Model type `{model_config.model_type}` is not supported by OpenVINO GenAI. 
" @@ -600,6 +641,8 @@ def create_image_text_gen_model(model_path, device, **kwargs): log.info("Selected Optimum Intel for benchmarking") model_class = OV_MODEL_CLASSES_MAPPING.get(DEFAULT_MODEL_CLASSES[kwargs['use_case']]) + if kwargs.get("mem_consumption"): + memory_monitor.start() start = time.perf_counter() ov_model = model_class.from_pretrained( model_path, @@ -609,6 +652,9 @@ def create_image_text_gen_model(model_path, device, **kwargs): trust_remote_code=remote_code ) end = time.perf_counter() + if kwargs.get("mem_consumption"): + memory_monitor.stop_and_collect_data('compilation_phase') + memory_monitor.log_data('for copmpilation phase') bench_hook = get_bench_hook(kwargs['num_beams'], ov_model) from_pretrained_time = end - start log.info(f'From pretrained time: {from_pretrained_time:.2f}s') diff --git a/tools/llm_bench/llm_bench_utils/pt_utils.py b/tools/llm_bench/llm_bench_utils/pt_utils.py index a22dec9578..cdc6a5b447 100644 --- a/tools/llm_bench/llm_bench_utils/pt_utils.py +++ b/tools/llm_bench/llm_bench_utils/pt_utils.py @@ -30,7 +30,9 @@ def torch_compile_child_module(model, child_modules, backend='openvino', dynamic return model -def run_torch_compile(model, backend='openvino', dynamic=None, options=None, child_modules=None): +def run_torch_compile(model, backend='openvino', dynamic=None, options=None, child_modules=None, memory_monitor=None): + if memory_monitor: + memory_monitor.start() if backend == 'pytorch': log.info(f'Running torch.compile() with {backend} backend') start = time.perf_counter() @@ -48,10 +50,13 @@ def run_torch_compile(model, backend='openvino', dynamic=None, options=None, chi end = time.perf_counter() compile_time = end - start log.info(f'Compiling model via torch.compile() took: {compile_time}') + if memory_monitor: + memory_monitor.stop_and_collect_data('compilation_phase') + memory_monitor.log_data('for from torch.compile() phase') return compiled_model -def create_text_gen_model(model_path, device, **kwargs): +def create_text_gen_model(model_path, device, memory_monitor, **kwargs): model_path = Path(model_path) from_pretrain_time = 0 if model_path.exists(): @@ -61,6 +66,8 @@ def create_text_gen_model(model_path, device, **kwargs): model_type = kwargs.get('model_type', default_model_type) model_class = PT_MODEL_CLASSES_MAPPING.get(model_type, PT_MODEL_CLASSES_MAPPING[default_model_type]) token_class = TOKENIZE_CLASSES_MAPPING.get(model_type, TOKENIZE_CLASSES_MAPPING[default_model_type]) + if kwargs.get("mem_consumption"): + memory_monitor.start() start = time.perf_counter() trust_remote_code = False try: @@ -72,6 +79,9 @@ def create_text_gen_model(model_path, device, **kwargs): tokenizer = token_class.from_pretrained(model_path, trust_remote_code=trust_remote_code) end = time.perf_counter() from_pretrain_time = end - start + if kwargs.get("mem_consumption"): + memory_monitor.stop_and_collect_data('from_pretrained_phase') + memory_monitor.log_data('for from pretrained phase') else: raise RuntimeError(f'==Failure ==: model path:{model_path} is not directory or directory is empty') else: @@ -119,12 +129,12 @@ def create_text_gen_model(model_path, device, **kwargs): options = json.loads(kwargs['torch_compile_options']) if kwargs['torch_compile_input_module']: child_modules = kwargs['torch_compile_input_module'].split(".") - compiled_model = run_torch_compile(model, backend, dynamic, options, child_modules) + compiled_model = run_torch_compile(model, backend, dynamic, options, child_modules, memory_monitor if kwargs.get("mem_consumption") else None) model 
= compiled_model return model, tokenizer, from_pretrain_time, bench_hook, False -def create_image_gen_model(model_path, device, **kwargs): +def create_image_gen_model(model_path, device, memory_monitor, **kwargs): model_path = Path(model_path) from_pretrain_time = 0 if model_path.exists(): @@ -132,10 +142,15 @@ log.info(f'Load image model from model path:{model_path}') model_type = DEFAULT_MODEL_CLASSES[kwargs['use_case']] model_class = PT_MODEL_CLASSES_MAPPING[model_type] + if kwargs.get("mem_consumption"): + memory_monitor.start() start = time.perf_counter() pipe = model_class.from_pretrained(model_path) pipe = set_bf16(pipe, device, **kwargs) end = time.perf_counter() + if kwargs.get("mem_consumption"): + memory_monitor.stop_and_collect_data('from_pretrained_phase') + memory_monitor.log_data('for from_pretrained phase') from_pretrain_time = end - start else: raise RuntimeError(f'==Failure ==: model path:{model_path} is not directory or directory is empty') @@ -158,12 +173,12 @@ if kwargs['torch_compile_backend']: backend = kwargs['torch_compile_backend'] - compiled_model = run_torch_compile(pipe, backend) + compiled_model = run_torch_compile(pipe, backend, memory_monitor=memory_monitor if kwargs.get("mem_consumption") else None) pipe = compiled_model return pipe, from_pretrain_time, False, None -def create_ldm_super_resolution_model(model_path, device, **kwargs): +def create_ldm_super_resolution_model(model_path, device, memory_monitor, **kwargs): model_path = Path(model_path) from_pretrain_time = 0 if model_path.exists(): @@ -174,6 +189,9 @@ start = time.perf_counter() pipe = model_class.from_pretrained(model_path) end = time.perf_counter() + if kwargs.get("mem_consumption"): + memory_monitor.stop_and_collect_data('from_pretrained_phase') + memory_monitor.log_data('for from_pretrained phase') from_pretrain_time = end - start else: raise RuntimeError(f'==Failure ==: model path:{model_path} is not directory or directory is empty') @@ -196,6 +214,6 @@ if kwargs['torch_compile_backend']: backend = kwargs['torch_compile_backend'] - compiled_model = run_torch_compile(pipe, backend) + compiled_model = run_torch_compile(pipe, backend, memory_monitor=memory_monitor if kwargs.get("mem_consumption") else None) pipe = compiled_model return pipe, from_pretrain_time diff --git a/tools/llm_bench/task/image_generation.py b/tools/llm_bench/task/image_generation.py index eb36a5496a..3c6246259a 100644 --- a/tools/llm_bench/task/image_generation.py +++ b/tools/llm_bench/task/image_generation.py @@ -90,10 +90,11 @@ def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list, result_md5_list = [] max_rss_mem_consumption = '' - max_uss_mem_consumption = '' - max_shared_mem_consumption = '' + max_sys_mem_consumption = '' + max_rss_mem_increase = '' + max_sys_mem_increase = '' if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: - mem_consumption.start_collect_memory_consumption() + mem_consumption.start() input_text_list = [input_text] * args['batch_size'] input_data = pipe.tokenizer(input_text, return_tensors='pt') @@ -108,9 +109,8 @@ res = pipe(input_text_list, **input_args, num_images_per_prompt=args['batch_size']).images end = time.perf_counter() if 
(args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: - mem_consumption.end_collect_momory_consumption() - max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption() - mem_consumption.clear_max_memory_consumption() + mem_consumption.stop_and_collect_data(f"{'P' + str(num) if num > 0 else 'warm-up'}_{proc_id}") + max_rss_mem_consumption, max_rss_mem_increase, max_sys_mem_consumption, max_sys_mem_increase = mem_consumption.get_data() for bs_idx in range(args['batch_size']): rslt_img_fn = llm_bench_utils.output_file.output_gen_image(res[bs_idx], args, image_id, num, bs_idx, proc_id, '.png') result_md5_list.append(hashlib.md5(Image.open(rslt_img_fn).tobytes(), usedforsecurity=False).hexdigest()) @@ -122,8 +122,9 @@ def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list, gen_time=generation_time, res_md5=result_md5_list, max_rss_mem=max_rss_mem_consumption, - max_shared_mem=max_shared_mem_consumption, - max_uss_mem=max_uss_mem_consumption, + max_rss_mem_increase=max_rss_mem_increase, + max_sys_mem=max_sys_mem_consumption, + max_sys_mem_increase=max_sys_mem_increase, prompt_idx=image_id, ) iter_data_list.append(iter_data) @@ -131,9 +132,6 @@ def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list, num, iter_data, warm_up=(num == 0), - max_rss_mem=max_rss_mem_consumption, - max_shared_mem=max_shared_mem_consumption, - max_uss_mem=max_uss_mem_consumption, stable_diffusion=stable_diffusion_hook, prompt_idx=image_id ) @@ -160,10 +158,11 @@ def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data result_md5_list = [] max_rss_mem_consumption = '' - max_uss_mem_consumption = '' - max_shared_mem_consumption = '' + max_sys_mem_consumption = '' + max_rss_mem_increase = '' + max_sys_mem_increase = '' if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: - mem_consumption.start_collect_memory_consumption() + mem_consumption.start() input_text_list = [input_text] * args['batch_size'] if num == 0 and args["output_dir"] is not None: @@ -183,9 +182,8 @@ def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data performance_metrics = callback if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: - mem_consumption.end_collect_momory_consumption() - max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption() - mem_consumption.clear_max_memory_consumption() + mem_consumption.stop_and_collect_data(f"{'P' + str(num) if num > 0 else 'warm-up'}_{proc_id}") + max_rss_mem_consumption, max_rss_mem_increase, max_sys_mem_consumption, max_sys_mem_increase = mem_consumption.get_data() for bs_idx in range(args['batch_size']): image = Image.fromarray(res[bs_idx]) rslt_img_fn = llm_bench_utils.output_file.output_gen_image(image, args, image_id, num, bs_idx, proc_id, '.png') @@ -198,8 +196,9 @@ def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data gen_time=generation_time, res_md5=result_md5_list, max_rss_mem=max_rss_mem_consumption, - max_shared_mem=max_shared_mem_consumption, - max_uss_mem=max_uss_mem_consumption, + max_rss_mem_increase=max_rss_mem_increase, + max_sys_mem=max_sys_mem_consumption, + max_sys_mem_increase=max_sys_mem_increase, prompt_idx=image_id, ) iter_data_list.append(iter_data) @@ -207,9 +206,6 @@ def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data num, 
iter_data, warm_up=(num == 0), - max_rss_mem=max_rss_mem_consumption, - max_shared_mem=max_shared_mem_consumption, - max_uss_mem=max_uss_mem_consumption, stable_diffusion=performance_metrics, prompt_idx=image_id ) @@ -218,7 +214,6 @@ def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data def run_image_generation_benchmark(model_path, framework, device, args, num_iters, mem_consumption): - input_image_list = get_image_prompt(args) if args['prompt_index'] is None: prompt_idx_list = [image_id for image_id, input_text in enumerate(input_image_list)] @@ -243,7 +238,7 @@ def run_image_generation_benchmark(model_path, framework, device, args, num_iter if "guidance_scale" in static_input_args: args["guidance_scale"] = static_input_args["guidance_scale"] - pipe, pretrain_time, use_genai, callback = FW_UTILS[framework].create_image_gen_model(model_path, device, **args) + pipe, pretrain_time, use_genai, callback = FW_UTILS[framework].create_image_gen_model(model_path, device, mem_consumption, **args) iter_data_list = [] if framework == "ov" and not use_genai: diff --git a/tools/llm_bench/task/speech_to_text_generation.py b/tools/llm_bench/task/speech_to_text_generation.py index cb34a81a2f..630df3f02b 100644 --- a/tools/llm_bench/task/speech_to_text_generation.py +++ b/tools/llm_bench/task/speech_to_text_generation.py @@ -24,8 +24,9 @@ def run_speech_2_txt_generation(input_param, args, md5_list, iter_data_list): result_md5_list = [] max_rss_mem_consumption = '' - max_uss_mem_consumption = '' - max_shared_mem_consumption = '' + max_sys_mem_consumption = '' + max_rss_mem_increase = '' + max_sys_mem_increase = '' pipe = input_param['pipe'] raw_speech = input_param['raw_speech'] num = input_param['iter_idx'] @@ -38,7 +39,7 @@ def run_speech_2_txt_generation(input_param, args, md5_list, iter_data_list): max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: - mem_consumption.start_collect_memory_consumption() + mem_consumption.start() if use_genai: start = time.perf_counter() result_text = pipe.generate( @@ -85,9 +86,8 @@ def run_speech_2_txt_generation(input_param, args, md5_list, iter_data_list): else: md5_list[num][speech_id] = result_md5_list if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: - mem_consumption.end_collect_momory_consumption() - max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption() - mem_consumption.clear_max_memory_consumption() + mem_consumption.stop_and_collect_data(f"{'P' + str(num) if num > 0 else 'warm-up'}") + max_rss_mem_consumption, max_rss_mem_increase, max_sys_mem_consumption, max_sys_mem_increase = mem_consumption.get_data() iter_data = gen_output_data.gen_iterate_data( iter_idx=num, @@ -95,8 +95,9 @@ def run_speech_2_txt_generation(input_param, args, md5_list, iter_data_list): gen_time=generation_time, res_md5=result_md5_list, max_rss_mem=max_rss_mem_consumption, - max_shared_mem=max_shared_mem_consumption, - max_uss_mem=max_uss_mem_consumption, + max_rss_mem_increase=max_rss_mem_increase, + max_sys_mem=max_sys_mem_consumption, + max_sys_mem_increase=max_sys_mem_increase, prompt_idx=speech_id, ) iter_data_list.append(iter_data) @@ -106,9 +107,6 @@ def run_speech_2_txt_generation(input_param, args, md5_list, iter_data_list): tms=tm_list, tms_infer=tm_infer_list, warm_up=(num == 0), - max_rss_mem=max_rss_mem_consumption, - 
max_shared_mem=max_shared_mem_consumption, - max_uss_mem=max_uss_mem_consumption, prompt_idx=speech_id, whisper=whisper_hook ) @@ -147,7 +145,7 @@ def run_speech_2_txt_benchmark(model_path, framework, device, args, num_iters, m if len(speech_list) == 0: raise RuntimeError('==Failure speech list is empty ==') log.info(f'Benchmarking iter nums(exclude warm-up): {num_iters}, speech file nums: {len(speech_file_list)}, speech idx: {speech_idx_list}') - pipe, processor, pretrain_time, use_genai = FW_UTILS[framework].create_speech_2txt_model(model_path, device, **args) + pipe, processor, pretrain_time, use_genai = FW_UTILS[framework].create_speech_2txt_model(model_path, device, mem_consumption, **args) md5_list = {num : {} for num in range(num_iters + 1)} iter_timestamp = model_utils.init_timestamp(num_iters, speech_list, speech_idx_list) input_param = { diff --git a/tools/llm_bench/task/super_resolution_generation.py b/tools/llm_bench/task/super_resolution_generation.py index c2f3cff6e4..3e2c35c59b 100644 --- a/tools/llm_bench/task/super_resolution_generation.py +++ b/tools/llm_bench/task/super_resolution_generation.py @@ -33,17 +33,17 @@ def run_ldm_super_resolution(img, num, pipe, args, framework, iter_data_list, im low_res_img = Image.open(img['prompt']).convert('RGB') low_res_img = low_res_img.resize((resize_image_width, resize_image_height)) max_rss_mem_consumption = '' - max_uss_mem_consumption = '' - max_shared_mem_consumption = '' + max_sys_mem_consumption = '' + max_rss_mem_increase = '' + max_sys_mem_increase = '' if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: - mem_consumption.start_collect_memory_consumption() + mem_consumption.start() start = time.perf_counter() res = pipe(low_res_img, num_inference_steps=nsteps, tm_list=tm_list) end = time.perf_counter() if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: - mem_consumption.end_collect_momory_consumption() - max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption() - mem_consumption.clear_max_memory_consumption() + mem_consumption.stop_and_collect_data(f"{'P' + str(num) if num > 0 else 'warm-up'}_{proc_id}") + max_rss_mem_consumption, max_rss_mem_increase, max_sys_mem_consumption, max_sys_mem_increase = mem_consumption.get_data() result_md5_list = [] if framework == 'ov': rslt_img_fn = llm_bench_utils.output_file.output_gen_image(res[0], args, image_id, num, None, proc_id, '.png') @@ -56,8 +56,9 @@ def run_ldm_super_resolution(img, num, pipe, args, framework, iter_data_list, im gen_time=generation_time, res_md5=result_md5_list, max_rss_mem=max_rss_mem_consumption, - max_shared_mem=max_shared_mem_consumption, - max_uss_mem=max_uss_mem_consumption, + max_rss_mem_increase=max_rss_mem_increase, + max_sys_mem=max_sys_mem_consumption, + max_sys_mem_increase=max_sys_mem_increase, prompt_idx=image_id, ) iter_data_list.append(iter_data) @@ -65,9 +66,6 @@ def run_ldm_super_resolution(img, num, pipe, args, framework, iter_data_list, im num, iter_data, warm_up=(num == 0), - max_rss_mem=max_rss_mem_consumption, - max_shared_mem=max_shared_mem_consumption, - max_uss_mem=max_uss_mem_consumption, prompt_idx=image_id ) metrics_print.print_generated(num, warm_up=(num == 0), generated=rslt_img_fn, prompt_idx=image_id) @@ -77,7 +75,7 @@ def run_ldm_super_resolution(img, num, pipe, args, framework, iter_data_list, im def run_ldm_super_resolution_benchmark(model_path, framework, device, args, num_iters, mem_consumption): if 
diff --git a/tools/llm_bench/task/super_resolution_generation.py b/tools/llm_bench/task/super_resolution_generation.py
index c2f3cff6e4..3e2c35c59b 100644
--- a/tools/llm_bench/task/super_resolution_generation.py
+++ b/tools/llm_bench/task/super_resolution_generation.py
@@ -33,17 +33,17 @@ def run_ldm_super_resolution(img, num, pipe, args, framework, iter_data_list, im
     low_res_img = Image.open(img['prompt']).convert('RGB')
     low_res_img = low_res_img.resize((resize_image_width, resize_image_height))
     max_rss_mem_consumption = ''
-    max_uss_mem_consumption = ''
-    max_shared_mem_consumption = ''
+    max_sys_mem_consumption = ''
+    max_rss_mem_increase = ''
+    max_sys_mem_increase = ''
     if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
-        mem_consumption.start_collect_memory_consumption()
+        mem_consumption.start()
     start = time.perf_counter()
     res = pipe(low_res_img, num_inference_steps=nsteps, tm_list=tm_list)
     end = time.perf_counter()
     if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
-        mem_consumption.end_collect_momory_consumption()
-        max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption()
-        mem_consumption.clear_max_memory_consumption()
+        mem_consumption.stop_and_collect_data(f"{'P' + str(num) if num > 0 else 'warm-up'}_{proc_id}")
+        max_rss_mem_consumption, max_rss_mem_increase, max_sys_mem_consumption, max_sys_mem_increase = mem_consumption.get_data()
     result_md5_list = []
     if framework == 'ov':
         rslt_img_fn = llm_bench_utils.output_file.output_gen_image(res[0], args, image_id, num, None, proc_id, '.png')
@@ -56,8 +56,9 @@ def run_ldm_super_resolution(img, num, pipe, args, framework, iter_data_list, im
         gen_time=generation_time,
         res_md5=result_md5_list,
         max_rss_mem=max_rss_mem_consumption,
-        max_shared_mem=max_shared_mem_consumption,
-        max_uss_mem=max_uss_mem_consumption,
+        max_rss_mem_increase=max_rss_mem_increase,
+        max_sys_mem=max_sys_mem_consumption,
+        max_sys_mem_increase=max_sys_mem_increase,
         prompt_idx=image_id,
     )
     iter_data_list.append(iter_data)
@@ -65,9 +66,6 @@ def run_ldm_super_resolution(img, num, pipe, args, framework, iter_data_list, im
         num,
         iter_data,
         warm_up=(num == 0),
-        max_rss_mem=max_rss_mem_consumption,
-        max_shared_mem=max_shared_mem_consumption,
-        max_uss_mem=max_uss_mem_consumption,
         prompt_idx=image_id
     )
     metrics_print.print_generated(num, warm_up=(num == 0), generated=rslt_img_fn, prompt_idx=image_id)
@@ -77,7 +75,7 @@ def run_ldm_super_resolution_benchmark(model_path, framework, device, args, num_iters, mem_consumption):
     if args["genai"]:
         log.warning("GenAI pipeline is not supported for this task. Switched on default benchmarking")
-    pipe, pretrain_time = FW_UTILS[framework].create_ldm_super_resolution_model(model_path, device, **args)
+    pipe, pretrain_time = FW_UTILS[framework].create_ldm_super_resolution_model(model_path, device, mem_consumption, **args)
     iter_data_list = []
     tm_list = []
     images = get_ldm_image_prompt(args)
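Illustrative sketch, not part of the patch: every FW_UTILS.create_*_model helper now receives the memory monitor as an extra positional argument before **args, presumably so that memory used while the model is loaded and compiled can be recorded too; the helper bodies are not shown in this diff, so the sketch below is only a guess at that shape, and the function names and label are hypothetical.

    import time

    def load_and_compile(model_path, device, **kwargs):
        # Placeholder for the framework-specific loading code (not shown in this diff).
        return object()

    def create_model_sketch(model_path, device, memory_monitor, **kwargs):
        # Hypothetical shape of a create_*_model helper after this patch: the monitor
        # is started around model load so compile-time memory could be recorded as well.
        memory_monitor.start()
        start = time.perf_counter()
        model = load_and_compile(model_path, device, **kwargs)
        pretrain_time = time.perf_counter() - start
        memory_monitor.stop_and_collect_data("compilation_phase")  # label chosen for the example
        return model, pretrain_time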
diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py
index e83b5eff34..10423f189a 100644
--- a/tools/llm_bench/task/text_generation.py
+++ b/tools/llm_bench/task/text_generation.py
@@ -49,10 +49,11 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list,
     log.info(out_str)
     max_rss_mem_consumption = ''
-    max_uss_mem_consumption = ''
-    max_shared_mem_consumption = ''
+    max_sys_mem_consumption = ''
+    max_rss_mem_increase = ''
+    max_sys_mem_increase = ''
     if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
-        mem_consumption.start_collect_memory_consumption()
+        mem_consumption.start()
     max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count']
     start = time.perf_counter()
     if streaming:
@@ -99,9 +100,8 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list,
         )
     end = time.perf_counter()
     if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
-        mem_consumption.end_collect_momory_consumption()
-        max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption()
-        mem_consumption.clear_max_memory_consumption()
+        mem_consumption.stop_and_collect_data(f"{'P' + str(num) if num > 0 else 'warm-up'}_{proc_id}")
+        max_rss_mem_consumption, max_rss_mem_increase, max_sys_mem_consumption, max_sys_mem_increase = mem_consumption.get_data()
     generation_time = end - start
     tok_decode_start = time.perf_counter()
@@ -156,8 +156,9 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list,
         latency=per_token_time,
         res_md5=result_md5_list,
         max_rss_mem=max_rss_mem_consumption,
-        max_shared_mem=max_shared_mem_consumption,
-        max_uss_mem=max_uss_mem_consumption,
+        max_rss_mem_increase=max_rss_mem_increase,
+        max_sys_mem=max_sys_mem_consumption,
+        max_sys_mem_increase=max_sys_mem_increase,
         prompt_idx=prompt_index,
         tokenization_time=(tok_encode_time, tok_decode_time)
     )
@@ -168,9 +169,6 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list,
         tm_list,
         tm_infer_list,
         warm_up=(num == 0),
-        max_rss_mem=max_rss_mem_consumption,
-        max_shared_mem=max_shared_mem_consumption,
-        max_uss_mem=max_uss_mem_consumption,
         tokenization_time=(tok_encode_time, tok_decode_time),
         batch_size=args['batch_size'],
         prompt_idx=prompt_index
@@ -195,10 +193,12 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
     for bs_index, in_text in enumerate(input_text_list):
         llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id)
     max_rss_mem_consumption = ''
-    max_uss_mem_consumption = ''
-    max_shared_mem_consumption = ''
+    max_sys_mem_consumption = ''
+    max_rss_mem_increase = ''
+    max_sys_mem_increase = ''
     if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
-        mem_consumption.start_collect_memory_consumption()
+        mem_consumption.start()
+
     max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count']
     tokenizer = model.get_tokenizer()
@@ -283,9 +283,8 @@ def token_printer():
     tokenization_time.append((detokenization_end - detokenization_start) * 1000)
     if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
-        mem_consumption.end_collect_momory_consumption()
-        max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption()
-        mem_consumption.clear_max_memory_consumption()
+        mem_consumption.stop_and_collect_data(f"{'P' + str(num) if num > 0 else 'warm-up'}_{proc_id}")
+        max_rss_mem_consumption, max_rss_mem_increase, max_sys_mem_consumption, max_sys_mem_increase = mem_consumption.get_data()
     generation_time = end - start
     # Only text_gen need to minus length of input_data, because generated_text may include input_text
@@ -328,8 +327,9 @@ def token_printer():
         latency=per_token_time,
         res_md5=result_md5_list,
         max_rss_mem=max_rss_mem_consumption,
-        max_shared_mem=max_shared_mem_consumption,
-        max_uss_mem=max_uss_mem_consumption,
+        max_rss_mem_increase=max_rss_mem_increase,
+        max_sys_mem=max_sys_mem_consumption,
+        max_sys_mem_increase=max_sys_mem_increase,
         prompt_idx=prompt_index,
         tokenization_time=tokenization_time
     )
@@ -340,9 +340,6 @@ def token_printer():
         tm_list,
         inference_durations,
         warm_up=(num == 0),
-        max_rss_mem=max_rss_mem_consumption,
-        max_shared_mem=max_shared_mem_consumption,
-        max_uss_mem=max_uss_mem_consumption,
         tokenization_time=tokenization_time,
         batch_size=args['batch_size'],
         prompt_idx=prompt_index
@@ -377,10 +374,11 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg
         out_str += 'all max_output_token_size: {} * {}'.format(args['infer_count'], args['batch_size'])
     log.info(out_str)
     max_rss_mem_consumption = ''
-    max_uss_mem_consumption = ''
-    max_shared_mem_consumption = ''
+    max_sys_mem_consumption = ''
+    max_rss_mem_increase = ''
+    max_sys_mem_increase = ''
     if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
-        mem_consumption.start_collect_memory_consumption()
+        mem_consumption.start()
     max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count']
     streamer.reset()
     gen_config = model.get_generation_config()
@@ -423,9 +421,8 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg
         generated_tokens = model.generate(input_data, gen_config, streamer=streamer).tokens
     end = time.perf_counter()
     if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
-        mem_consumption.end_collect_momory_consumption()
-        max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption()
-        mem_consumption.clear_max_memory_consumption()
+        mem_consumption.stop_and_collect_data(f"{'P' + str(num) if num > 0 else 'warm-up'}_{proc_id}")
+        max_rss_mem_consumption, max_rss_mem_increase, max_sys_mem_consumption, max_sys_mem_increase = mem_consumption.get_data()
     generation_time = end - start
     tok_decode_start = time.perf_counter()
     generated_text = pipe_tokenizer.decode(generated_tokens)
@@ -464,8 +461,9 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg
         latency=per_token_time,
         res_md5=result_md5_list,
         max_rss_mem=max_rss_mem_consumption,
-        max_shared_mem=max_shared_mem_consumption,
-        max_uss_mem=max_uss_mem_consumption,
+        max_rss_mem_increase=max_rss_mem_increase,
+        max_sys_mem=max_sys_mem_consumption,
+        max_sys_mem_increase=max_sys_mem_increase,
         prompt_idx=prompt_index,
         tokenization_time=(tok_encode_time, tok_decode_time)
     )
@@ -476,9 +474,6 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg
         tms=tm_list,
         tms_infer=None,
         warm_up=(num == 0),
-        max_rss_mem=max_rss_mem_consumption,
-        max_shared_mem=max_shared_mem_consumption,
-        max_uss_mem=max_uss_mem_consumption,
         tokenization_time=(tok_encode_time, tok_decode_time),
         batch_size=args['batch_size'],
         prompt_idx=prompt_index
@@ -495,7 +490,7 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg
 
 
 def run_text_generation_benchmark(model_path, framework, device, tokens_len, streaming, args, num_iters, mem_consumption):
-    model, tokenizer, pretrain_time, bench_hook, use_genai = FW_UTILS[framework].create_text_gen_model(model_path, device, **args)
+    model, tokenizer, pretrain_time, bench_hook, use_genai = FW_UTILS[framework].create_text_gen_model(model_path, device, mem_consumption, **args)
     model_precision = model_utils.get_model_precision(model_path.parts)
     iter_data_list = []
     md5_list = {num : {} for num in range(num_iters + 1)}
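Illustrative sketch, not part of the patch: the string handed to stop_and_collect_data() names the collected sample set, "warm-up" for iteration 0 and "P<n>" afterwards, with the process id appended in the per-process tasks (the speech task above omits the proc_id suffix). The helper below simply evaluates the same f-string used in these hunks.

    def collect_label(num, proc_id):
        # Same expression as in the per-process tasks of this patch.
        return f"{'P' + str(num) if num > 0 else 'warm-up'}_{proc_id}"

    print(collect_label(0, 0))  # -> warm-up_0  (warm-up pass)
    print(collect_label(2, 0))  # -> P2_0       (second measured iteration)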
diff --git a/tools/llm_bench/task/visual_language_generation.py b/tools/llm_bench/task/visual_language_generation.py
index 54b4467c14..6cd72f225c 100644
--- a/tools/llm_bench/task/visual_language_generation.py
+++ b/tools/llm_bench/task/visual_language_generation.py
@@ -59,10 +59,11 @@ def run_visual_language_generation_optimum(
     log.info(out_str)
     max_rss_mem_consumption = ''
-    max_uss_mem_consumption = ''
-    max_shared_mem_consumption = ''
+    max_sys_mem_consumption = ''
+    max_rss_mem_increase = ''
+    max_sys_mem_increase = ''
     if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
-        mem_consumption.start_collect_memory_consumption()
+        mem_consumption.start()
     max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count']
     start = time.perf_counter()
     if args['infer_count'] is not None and args['end_token_stopping'] is False:
@@ -86,9 +87,8 @@ def run_visual_language_generation_optimum(
         )
     end = time.perf_counter()
     if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
-        mem_consumption.end_collect_momory_consumption()
-        max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption()
-        mem_consumption.clear_max_memory_consumption()
+        mem_consumption.stop_and_collect_data(f"{'P' + str(num) if num > 0 else 'warm-up'}_{proc_id}")
+        max_rss_mem_consumption, max_rss_mem_increase, max_sys_mem_consumption, max_sys_mem_increase = mem_consumption.get_data()
     generation_time = end - start
     tok_decode_start = time.perf_counter()
@@ -138,8 +138,9 @@ def run_visual_language_generation_optimum(
         latency=per_token_time,
         res_md5=result_md5_list,
         max_rss_mem=max_rss_mem_consumption,
-        max_shared_mem=max_shared_mem_consumption,
-        max_uss_mem=max_uss_mem_consumption,
+        max_rss_mem_increase=max_rss_mem_increase,
+        max_sys_mem=max_sys_mem_consumption,
+        max_sys_mem_increase=max_sys_mem_increase,
         prompt_idx=prompt_index,
         tokenization_time=(tok_encode_time, tok_decode_time),
         mm_embeddings_preparation_time=tm_mm_embeddings
@@ -151,9 +152,6 @@ def run_visual_language_generation_optimum(
         tm_list,
         tm_infer_list,
         warm_up=(num == 0),
-        max_rss_mem=max_rss_mem_consumption,
-        max_shared_mem=max_shared_mem_consumption,
-        max_uss_mem=max_uss_mem_consumption,
         tokenization_time=(tok_encode_time, tok_decode_time),
         batch_size=args['batch_size'],
         prompt_idx=prompt_index
@@ -203,10 +201,11 @@ def run_visual_language_generation_genai(
     for bs_index, in_text in enumerate(prompts):
         llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id)
     max_rss_mem_consumption = ''
-    max_uss_mem_consumption = ''
-    max_shared_mem_consumption = ''
+    max_sys_mem_consumption = ''
+    max_rss_mem_increase = ''
+    max_sys_mem_increase = ''
     if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
-        mem_consumption.start_collect_memory_consumption()
+        mem_consumption.start()
     max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count']
     gen_config = model.get_generation_config()
     gen_config.max_new_tokens = max_gen_tokens
@@ -224,9 +223,8 @@ def run_visual_language_generation_genai(
     generated_text = generation_result.texts
     perf_metrics = generation_result.perf_metrics
     if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
-        mem_consumption.end_collect_momory_consumption()
-        max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption()
-        mem_consumption.clear_max_memory_consumption()
+        mem_consumption.stop_and_collect_data(f"{'P' + str(num) if num > 0 else 'warm-up'}_{proc_id}")
+        max_rss_mem_consumption, max_rss_mem_increase, max_sys_mem_consumption, max_sys_mem_increase = mem_consumption.get_data()
     generation_time = end - start
     result_md5_list = []
@@ -268,8 +266,9 @@ def run_visual_language_generation_genai(
         latency=per_token_time,
         res_md5=result_md5_list,
         max_rss_mem=max_rss_mem_consumption,
-        max_shared_mem=max_shared_mem_consumption,
-        max_uss_mem=max_uss_mem_consumption,
+        max_rss_mem_increase=max_rss_mem_increase,
+        max_sys_mem=max_sys_mem_consumption,
+        max_sys_mem_increase=max_sys_mem_increase,
         prompt_idx=prompt_index,
         tokenization_time=tokenization_time,
         mm_embeddings_preparation_time=perf_metrics.get_prepare_embeddings_duration().mean
@@ -282,9 +281,6 @@ def run_visual_language_generation_genai(
         tm_list.tolist(),
         inference_durations.tolist(),
         warm_up=(num == 0),
-        max_rss_mem=max_rss_mem_consumption,
-        max_shared_mem=max_shared_mem_consumption,
-        max_uss_mem=max_uss_mem_consumption,
         tokenization_time=tokenization_time,
         batch_size=args['batch_size'],
         prompt_idx=prompt_index
@@ -300,7 +296,7 @@ def run_visual_language_generation_genai(
 
 
 def run_visual_language_generation_benchmark(model_path, framework, device, args, num_iters, mem_consumption):
-    model, processor, pretrain_time, bench_hook, use_genai = FW_UTILS[framework].create_image_text_gen_model(model_path, device, **args)
+    model, processor, pretrain_time, bench_hook, use_genai = FW_UTILS[framework].create_image_text_gen_model(model_path, device, mem_consumption, **args)
     model_precision = model_utils.get_model_precision(model_path.parts)
     iter_data_list = []
     md5_list = {num : {} for num in range(num_iters + 1)}