4
4
from threading import Event , RLock , Thread
5
5
from typing import List , Optional
6
6
7
+ import subprocess
7
8
import psutil
8
9
9
10
from gprofiler .exceptions import ThreadStopTimeoutError
@@ -18,7 +19,18 @@ class Metrics:
18
19
cpu_avg : Optional [float ]
19
20
# The average RAM usage between gProfiler cycles
20
21
mem_avg : Optional [float ]
21
-
22
+ # The CPU frequency between gProfiler cycles
23
+ cpu_freq : Optional [float ]
24
+ # The CPI between gProfiler cycles
25
+ cpu_cpi : Optional [float ]
26
+ # The CPU TMA frontend bound between gProfiler cycles
27
+ cpu_tma_fe_bound : Optional [float ]
28
+ # The CPU TMA backend bound between gProfiler cycles
29
+ cpu_tma_be_bound : Optional [float ]
30
+ # The CPU TMA bad speculation between gProfiler cycles
31
+ cpu_tma_bad_spec : Optional [float ]
32
+ # The CPU TMA retiring between gProfiler cycles
33
+ cpu_tma_retiring : Optional [float ]
22
34
23
35
class SystemMetricsMonitorBase (metaclass = ABCMeta ):
24
36
@abstractmethod
@@ -40,8 +52,16 @@ def _get_cpu_utilization(self) -> Optional[float]:
40
52
"""
41
53
raise NotImplementedError
42
54
55
@abstractmethod
def _get_hw_metrics(self) -> Optional[List[float]]:
    """
    Returns the hardware metrics gathered since the last time this method was called.

    The result is consumed positionally by get_metrics(), in this order: CPU frequency, CPI,
    TMA frontend bound, TMA backend bound, TMA bad speculation, TMA retiring.
    Implementations return None when no hardware metrics are available.
    """
    raise NotImplementedError
61
+
43
62
def get_metrics(self) -> Metrics:
    """
    Collect the current system metrics into a single Metrics record.

    The hardware metrics are optional: when _get_hw_metrics() returns None the six hardware
    fields of the record are filled with None instead of failing on the subscript.
    """
    hw_metrics = self._get_hw_metrics()
    if hw_metrics is None:
        # Six placeholders: frequency, CPI and the four TMA values, in the order Metrics declares them.
        hw_metrics = [None] * 6
    return Metrics(
        self._get_cpu_utilization(),
        self._get_average_memory_utilization(),
        hw_metrics[0],
        hw_metrics[1],
        hw_metrics[2],
        hw_metrics[3],
        hw_metrics[4],
        hw_metrics[5],
    )
45
65
46
66
47
67
class SystemMetricsMonitor (SystemMetricsMonitorBase ):
@@ -51,22 +71,45 @@ def __init__(self, stop_event: Event, polling_rate_seconds: int = DEFAULT_POLLIN
51
71
self ._stop_event = stop_event
52
72
self ._thread : Optional [Thread ] = None
53
73
self ._lock = RLock ()
74
+ self ._perfspect_thread : Optional [Thread ] = None
75
+ self ._hw_metrics = {'cpu_freq' :[], 'cpu_cpi' :[], 'cpu_tma_fe_bound' :[], 'cpu_tma_be_bound' :[], 'cpu_tma_bad_spec' :[], 'cpu_tma_retiring' :[]}
76
+ self ._ps_process = None
54
77
55
78
self ._get_cpu_utilization () # Call this once to set the necessary data
56
79
57
80
def start(self) -> None:
    """
    Start background collection: launch perfspect, then spawn the memory-polling thread and the
    thread that reads perfspect's output.

    Raises whatever subprocess.Popen raises (e.g. FileNotFoundError when the perfspect binary is
    missing); in that case no thread has been started yet, so the monitor is left fully stopped.
    """
    assert self._thread is None, "SystemMetricsMonitor is already running"
    assert self._perfspect_thread is None, "Perfspect is already running"
    assert not self._stop_event.is_set(), "Stop condition is already set (perhaps gProfiler was already stopped?)"

    # NOTE(review): no shell is involved, so the double quotes inside the --metrics value reach
    # perfspect literally - confirm perfspect's flag parsing strips them.
    ps_cmd = ['/tmp/perfspect', 'metrics', '--metrics', '"CPU operating frequency (in GHz)","CPI","TMA_Frontend_Bound(%)","TMA_Bad_Speculation(%)","TMA_Backend_Bound(%)","TMA_Retiring(%)"', '--duration', '0', '--live', '--format', 'csv', '--interval', '10']
    # Launch the process before any thread: a failure here must not leave a half-started monitor
    # with the memory thread already running.
    self._ps_process = subprocess.Popen(ps_cmd, stdout=subprocess.PIPE)

    self._thread = Thread(target=self._continuously_poll_memory, args=(self._polling_rate_seconds,))
    self._thread.start()

    self._perfspect_thread = Thread(target=self._continuously_poll_perfspect, args=(self._polling_rate_seconds,))
    self._perfspect_thread.start()
99
+
63
100
def stop(self) -> None:
    """
    Stop both background threads and the perfspect process.

    The stop event must already be set. Raises ThreadStopTimeoutError when either thread fails
    to finish within STOP_TIMEOUT_SECONDS.
    """
    assert self._thread is not None, "SystemMetricsMonitor is not running"
    assert self._perfspect_thread is not None, "Perfspect is not running"
    assert self._stop_event.is_set(), "Stop event was not set before stopping the SystemMetricsMonitor"
    try:
        self._thread.join(STOP_TIMEOUT_SECONDS)
        if self._thread.is_alive():
            raise ThreadStopTimeoutError("Timed out while waiting for the SystemMetricsMonitor internal thread to stop")
        self._thread = None
    finally:
        # Kill perfspect even when the memory thread timed out, otherwise the process is leaked.
        # Killing it also ends the reader thread's blocking readline() with EOF.
        if self._ps_process is not None:
            self._ps_process.kill()
            # Reap the child so it does not linger as a zombie.
            self._ps_process.wait()
    self._perfspect_thread.join(STOP_TIMEOUT_SECONDS)
    if self._perfspect_thread.is_alive():
        raise ThreadStopTimeoutError("Timed out while waiting for the SystemMetricsMonitor Perfspect thread to stop")
    self._perfspect_thread = None
    if self._ps_process is not None:
        # Close the pipe only after the reader thread is gone, so it never reads a closed file.
        if self._ps_process.stdout is not None:
            self._ps_process.stdout.close()
        self._ps_process = None
70
113
71
114
def _continuously_poll_memory (self , polling_rate_seconds : int ) -> None :
72
115
while not self ._stop_event .is_set ():
@@ -75,6 +118,22 @@ def _continuously_poll_memory(self, polling_rate_seconds: int) -> None:
75
118
self ._mem_percentages .append (current_ram_percent )
76
119
self ._stop_event .wait (timeout = polling_rate_seconds )
77
120
121
def _continuously_poll_perfspect(self, polling_rate_seconds: int) -> None:
    """
    Read CSV rows from the perfspect process' stdout and record the hardware metrics they carry.

    Runs until the stop event is set or perfspect's output reaches EOF. Header rows, blank lines
    and rows that are too short or non-numeric are skipped instead of crashing the thread.

    :param polling_rate_seconds: how long to wait on the stop event after each recorded row
    """
    # Destination list -> CSV column index, in the order the values are appended.
    metric_columns = (
        ('cpu_freq', 4),
        ('cpu_cpi', 5),
        ('cpu_tma_fe_bound', 6),
        ('cpu_tma_bad_spec', 7),
        ('cpu_tma_be_bound', 8),
        ('cpu_tma_retiring', 9),
    )
    while not self._stop_event.is_set():
        raw_line = self._ps_process.stdout.readline()
        if not raw_line:
            # EOF: the process exited or was killed, nothing more will ever arrive.
            break
        metrics_str = raw_line.decode(errors="replace").strip()
        if not metrics_str or metrics_str.startswith('TS,SKT,CPU,CID'):
            continue
        metric_values = metrics_str.split(',')
        try:
            # Parse the whole row before storing anything so the lists always stay aligned.
            parsed_row = [(name, float(metric_values[column])) for name, column in metric_columns]
        except (IndexError, ValueError):
            continue
        for name, value in parsed_row:
            self._hw_metrics[name].append(value)
        self._stop_event.wait(timeout=polling_rate_seconds)
136
+
78
137
def _get_average_memory_utilization (self ) -> Optional [float ]:
79
138
# Make sure there's only one thread that takes out the values
80
139
# NOTE - Since there's currently only a single consumer, this is not necessary but is done to support multiple
@@ -92,6 +151,28 @@ def _get_cpu_utilization(self) -> float:
92
151
# See `psutil.cpu_percent` documentation.
93
152
return psutil .cpu_percent (interval = None )
94
153
154
def _get_hw_metrics(self) -> Optional[List[float]]:
    """
    Returns the mean of each hardware metric recorded since the last call, or None when no
    complete sample is available.

    The list order is: CPU frequency, CPI, TMA frontend bound, TMA backend bound,
    TMA bad speculation, TMA retiring. Consumed samples are removed from the buffers.
    """
    metric_names = (
        'cpu_freq',
        'cpu_cpi',
        'cpu_tma_fe_bound',
        'cpu_tma_be_bound',
        'cpu_tma_bad_spec',
        'cpu_tma_retiring',
    )
    buffers = [self._hw_metrics[name] for name in metric_names]
    # The writer appends one value per list for every row, so the shortest list tells how many
    # rows are complete; a row that is only partially appended is left for the next call.
    current_length = min(len(values) for values in buffers)
    if current_length == 0:
        return None

    metric_list = [statistics.mean(values[:current_length]) for values in buffers]

    # Drop only what was averaged; samples appended in the meantime are kept.
    for values in buffers:
        del values[:current_length]

    return metric_list
175
+
95
176
96
177
class NoopSystemMetricsMonitor (SystemMetricsMonitorBase ):
97
178
def start (self ) -> None :
@@ -105,3 +186,6 @@ def _get_average_memory_utilization(self) -> Optional[float]:
105
186
106
187
def _get_cpu_utilization(self) -> Optional[float]:
    """
    The no-op monitor never measures CPU utilization, so there is nothing to report.
    """
    return None
189
+
190
def _get_hw_metrics(self) -> Optional[List[float]]:
    """
    The no-op monitor never collects hardware metrics, so there is nothing to report.
    """
    return None
0 commit comments