
Commit f2baf19

Merge branch 'enable_perfspec_metrics' into fixes_for_build

2 parents: 2945c4d + 80bcc57

File tree

2 files changed: +104 -2 lines changed

gprofiler/client.py (+6)
gprofiler/system_metrics.py (+98 -2)

gprofiler/client.py (+6)
@@ -211,6 +211,12 @@ def submit_profile(
                 "mem_avg": metrics.mem_avg,
                 "spawn_time": get_iso8601_format_time_from_epoch_time(spawn_time),
                 "gpid": gpid,
+                "cpu_freq": metrics.cpu_freq,
+                "cpu_cpi": metrics.cpu_cpi,
+                "cpu_tma_fe_bound": metrics.cpu_tma_fe_bound,
+                "cpu_tma_be_bound": metrics.cpu_tma_be_bound,
+                "cpu_tma_bad_spec": metrics.cpu_tma_bad_spec,
+                "cpu_tma_retiring": metrics.cpu_tma_retiring,
             },
             timeout=self._upload_timeout,
             api_version="v2" if profile_api_version is None else profile_api_version,

gprofiler/system_metrics.py (+98 -2)
@@ -4,6 +4,8 @@
 from threading import Event, RLock, Thread
 from typing import List, Optional
 
+import statistics
+import subprocess
 import psutil
 
 from gprofiler.exceptions import ThreadStopTimeoutError
@@ -18,7 +20,18 @@ class Metrics:
     cpu_avg: Optional[float]
     # The average RAM usage between gProfiler cycles
     mem_avg: Optional[float]
-
+    # The CPU frequency between gProfiler cycles
+    cpu_freq: Optional[float]
+    # The CPI between gProfiler cycles
+    cpu_cpi: Optional[float]
+    # The CPU TMA frontend bound between gProfiler cycles
+    cpu_tma_fe_bound: Optional[float]
+    # The CPU TMA backend bound between gProfiler cycles
+    cpu_tma_be_bound: Optional[float]
+    # The CPU TMA bad speculation between gProfiler cycles
+    cpu_tma_bad_spec: Optional[float]
+    # The CPU TMA retiring between gProfiler cycles
+    cpu_tma_retiring: Optional[float]
 
 class SystemMetricsMonitorBase(metaclass=ABCMeta):
     @abstractmethod
@@ -40,8 +53,16 @@ def _get_cpu_utilization(self) -> Optional[float]:
         """
         raise NotImplementedError
 
+    @abstractmethod
+    def _get_hw_metrics(self) -> Optional[List[float]]:
+        """
+        Returns the hardware metrics collected since the last time this method was called.
+        """
+        raise NotImplementedError
+
     def get_metrics(self) -> Metrics:
-        return Metrics(self._get_cpu_utilization(), self._get_average_memory_utilization())
+        hw_metrics = self._get_hw_metrics() or [None] * 6
+        return Metrics(self._get_cpu_utilization(), self._get_average_memory_utilization(), *hw_metrics)
 
 
 class SystemMetricsMonitor(SystemMetricsMonitorBase):
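
Note: Metrics now takes eight positional fields, so a positional call in get_metrics is easy to misorder silently. Keyword-based construction is one defensive alternative; a sketch, with monitor standing in for any SystemMetricsMonitorBase subclass:

    # Sketch only: keyword arguments guard against field-order drift.
    hw = monitor._get_hw_metrics() or [None] * 6  # degrade gracefully before the first sample
    metrics = Metrics(
        cpu_avg=monitor._get_cpu_utilization(),
        mem_avg=monitor._get_average_memory_utilization(),
        cpu_freq=hw[0],
        cpu_cpi=hw[1],
        cpu_tma_fe_bound=hw[2],
        cpu_tma_be_bound=hw[3],
        cpu_tma_bad_spec=hw[4],
        cpu_tma_retiring=hw[5],
    )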
@@ -51,22 +72,56 @@ def __init__(self, stop_event: Event, polling_rate_seconds: int = DEFAULT_POLLING_RATE_SECONDS):
         self._stop_event = stop_event
         self._thread: Optional[Thread] = None
         self._lock = RLock()
+        self._perfspect_thread: Optional[Thread] = None
+        self._hw_metrics = {
+            'cpu_freq': [],
+            'cpu_cpi': [],
+            'cpu_tma_fe_bound': [],
+            'cpu_tma_be_bound': [],
+            'cpu_tma_bad_spec': [],
+            'cpu_tma_retiring': [],
+        }
+        self._ps_process = None
 
         self._get_cpu_utilization()  # Call this once to set the necessary data
 
     def start(self) -> None:
         assert self._thread is None, "SystemMetricsMonitor is already running"
+        assert self._perfspect_thread is None, "PerfSpect is already running"
         assert not self._stop_event.is_set(), "Stop condition is already set (perhaps gProfiler was already stopped?)"
         self._thread = Thread(target=self._continuously_poll_memory, args=(self._polling_rate_seconds,))
         self._thread.start()
 
+        ps_cmd = [
+            '/tmp/perfspect', 'metrics', '--metrics',
+            '"CPU operating frequency (in GHz)","CPI","TMA_Frontend_Bound(%)","TMA_Bad_Speculation(%)","TMA_Backend_Bound(%)","TMA_Retiring(%)"',
+            '--duration', '0', '--live', '--format', 'csv', '--interval', '10',
+        ]
+        self._ps_process = subprocess.Popen(ps_cmd, stdout=subprocess.PIPE)
+        # ps_stdout, ps_stderr = ps_process.communicate()
+        # try:
+        #     # wait 2 seconds to ensure it starts
+        #     ps_process.wait(2)
+        # except subprocess.TimeoutExpired:
+        #     pass
+        # else:
+        #     raise Exception(f"Command {ps_cmd} exited unexpectedly with {ps_process.returncode}")
+        self._perfspect_thread = Thread(target=self._continuously_poll_perfspect, args=(self._polling_rate_seconds,))
+        self._perfspect_thread.start()
+
     def stop(self) -> None:
         assert self._thread is not None, "SystemMetricsMonitor is not running"
+        assert self._perfspect_thread is not None, "PerfSpect is not running"
         assert self._stop_event.is_set(), "Stop event was not set before stopping the SystemMetricsMonitor"
         self._thread.join(STOP_TIMEOUT_SECONDS)
         if self._thread.is_alive():
             raise ThreadStopTimeoutError("Timed out while waiting for the SystemMetricsMonitor internal thread to stop")
         self._thread = None
+        self._ps_process.kill()
+        self._perfspect_thread.join(STOP_TIMEOUT_SECONDS)
+        if self._perfspect_thread.is_alive():
+            raise ThreadStopTimeoutError("Timed out while waiting for the SystemMetricsMonitor PerfSpect thread to stop")
+        self._perfspect_thread = None
 
     def _continuously_poll_memory(self, polling_rate_seconds: int) -> None:
         while not self._stop_event.is_set():
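
Note: the commented-out block in start() sketches an intended liveness check: fail fast if the PerfSpect binary exits right away (missing binary, insufficient perf permissions). A minimal working version of that idea, assuming the same ps_cmd and a 2-second grace period:

    import subprocess

    ps_process = subprocess.Popen(ps_cmd, stdout=subprocess.PIPE)
    try:
        # A healthy PerfSpect run keeps streaming, so this wait is expected to time out.
        ps_process.wait(timeout=2)
    except subprocess.TimeoutExpired:
        pass  # still running: treat the start as successful
    else:
        # Exited within the grace period: surface the failure with the exit code.
        raise RuntimeError(f"Command {ps_cmd} exited unexpectedly with {ps_process.returncode}")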
@@ -75,6 +130,22 @@ def _continuously_poll_memory(self, polling_rate_seconds: int) -> None:
             self._mem_percentages.append(current_ram_percent)
             self._stop_event.wait(timeout=polling_rate_seconds)
 
+    def _continuously_poll_perfspect(self, polling_rate_seconds: int) -> None:
+        while not self._stop_event.is_set():
+            metrics_str = self._ps_process.stdout.readline().decode()
+            print(metrics_str)
+            if metrics_str.startswith('TS,SKT,CPU,CID'):  # skip the CSV header row
+                continue
+            metric_values = metrics_str.split(',')
+            if len(metric_values) >= 10:  # full data row: 4 identity columns + 6 metrics
+                self._hw_metrics['cpu_freq'].append(float(metric_values[4]))
+                self._hw_metrics['cpu_cpi'].append(float(metric_values[5]))
+                self._hw_metrics['cpu_tma_fe_bound'].append(float(metric_values[6]))
+                self._hw_metrics['cpu_tma_bad_spec'].append(float(metric_values[7]))
+                self._hw_metrics['cpu_tma_be_bound'].append(float(metric_values[8]))
+                self._hw_metrics['cpu_tma_retiring'].append(float(metric_values[9]))
+            self._stop_event.wait(timeout=polling_rate_seconds)
+
     def _get_average_memory_utilization(self) -> Optional[float]:
         # Make sure there's only one thread that takes out the values
         # NOTE - Since there's currently only a single consumer, this is not necessary but is done to support multiple
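
Note: the indices in _continuously_poll_perfspect encode an assumption about PerfSpect's live CSV layout: four leading identity columns (TS,SKT,CPU,CID) followed by the six requested metrics in the order they were passed on the command line. A self-contained sketch of that parsing step (the sample row is illustrative, not captured output):

    # Hypothetical PerfSpect live-CSV data row; values are made up.
    line = "1714138800,0,all,,2.8,0.95,18.2,6.3,41.7,33.8"

    METRIC_KEYS = ["cpu_freq", "cpu_cpi", "cpu_tma_fe_bound",
                   "cpu_tma_bad_spec", "cpu_tma_be_bound", "cpu_tma_retiring"]

    def parse_perfspect_line(line: str) -> dict:
        """Map the six metric columns (indices 4-9) to their metric names."""
        fields = line.strip().split(",")
        if line.startswith("TS,SKT,CPU,CID") or len(fields) < 10:
            return {}  # header or malformed row
        return {key: float(value) for key, value in zip(METRIC_KEYS, fields[4:10])}

    assert parse_perfspect_line(line)["cpu_cpi"] == 0.95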
@@ -92,6 +163,28 @@ def _get_cpu_utilization(self) -> float:
         # See `psutil.cpu_percent` documentation.
         return psutil.cpu_percent(interval=None)
 
+    def _get_hw_metrics(self) -> Optional[List[float]]:
+        current_length = len(self._hw_metrics['cpu_freq'])  # one snapshot length so all six means cover the same samples
+        if current_length == 0:
+            return None
+
+        metric_list = []
+        metric_list.append(statistics.mean(self._hw_metrics['cpu_freq'][:current_length]))
+        metric_list.append(statistics.mean(self._hw_metrics['cpu_cpi'][:current_length]))
+        metric_list.append(statistics.mean(self._hw_metrics['cpu_tma_fe_bound'][:current_length]))
+        metric_list.append(statistics.mean(self._hw_metrics['cpu_tma_be_bound'][:current_length]))
+        metric_list.append(statistics.mean(self._hw_metrics['cpu_tma_bad_spec'][:current_length]))
+        metric_list.append(statistics.mean(self._hw_metrics['cpu_tma_retiring'][:current_length]))
+
+        self._hw_metrics['cpu_freq'] = []
+        self._hw_metrics['cpu_cpi'] = []
+        self._hw_metrics['cpu_tma_fe_bound'] = []
+        self._hw_metrics['cpu_tma_be_bound'] = []
+        self._hw_metrics['cpu_tma_bad_spec'] = []
+        self._hw_metrics['cpu_tma_retiring'] = []
+
+        return metric_list
+
 
 class NoopSystemMetricsMonitor(SystemMetricsMonitorBase):
     def start(self) -> None:
@@ -105,3 +198,6 @@ def _get_average_memory_utilization(self) -> Optional[float]:
 
     def _get_cpu_utilization(self) -> Optional[float]:
         return None
+
+    def _get_hw_metrics(self) -> Optional[List[float]]:
+        return None
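
Note: as a design alternative, the mean-and-reset bookkeeping in _get_hw_metrics can be collapsed into a loop over the dict, since its insertion order already matches the Metrics field order. A hedged sketch (assuming, as in the committed code, that a single reader drains the buffers):

    import statistics
    from typing import Dict, List, Optional

    def drain_hw_metrics(hw_metrics: Dict[str, List[float]]) -> Optional[List[float]]:
        """Average each metric's samples, clear the buffers, and return the means
        in dict insertion order (freq, cpi, fe_bound, be_bound, bad_spec, retiring)."""
        if not hw_metrics["cpu_freq"]:
            return None  # no PerfSpect samples collected yet
        means = [statistics.mean(samples) for samples in hw_metrics.values()]
        for samples in hw_metrics.values():
            samples.clear()
        return means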
