4
4
from threading import Event , RLock , Thread
5
5
from typing import List , Optional
6
6
7
+ import os
7
8
import subprocess
8
9
import psutil
9
10
@@ -20,17 +21,17 @@ class Metrics:
20
21
# The average RAM usage between gProfiler cycles
21
22
mem_avg : Optional [float ]
22
23
# The CPU frequency between gProfiler cycles
23
- cpu_freq : Optional [float ]
24
+ cpu_freq : Optional [float ] = None
24
25
# The CPI between gProfiler cycles
25
- cpu_cpi : Optional [float ]
26
+ cpu_cpi : Optional [float ] = None
26
27
# The CPU TMA frontend bound between gProfiler cycles
27
- cpu_tma_fe_bound : Optional [float ]
28
+ cpu_tma_fe_bound : Optional [float ] = None
28
29
# The CPU TMA backend bound between gProfiler cycles
29
- cpu_tma_be_bound : Optional [float ]
30
+ cpu_tma_be_bound : Optional [float ] = None
30
31
# The CPU TMA bad speculation between gProfiler cycles
31
- cpu_tma_bad_spec : Optional [float ]
32
+ cpu_tma_bad_spec : Optional [float ] = None
32
33
# The CPU TMA retiring between gProfiler cycles
33
- cpu_tma_retiring : Optional [float ]
34
+ cpu_tma_retiring : Optional [float ] = None
34
35
35
36
class SystemMetricsMonitorBase (metaclass = ABCMeta ):
36
37
@abstractmethod
@@ -61,11 +62,19 @@ def _get_hw_metrics(self) -> Optional[List[float]]:
61
62
62
63
def get_metrics(self) -> Metrics:
    """Build a Metrics snapshot from the current utilization readings.

    The hardware metrics are only passed to ``Metrics`` when
    ``_get_hw_metrics`` returns exactly six values; otherwise only the CPU
    and memory utilization are passed and the remaining ``Metrics`` fields
    keep their defaults.
    """
    # Query the hardware metrics first, matching the original call order.
    hw_metrics = self._get_hw_metrics()
    cpu_utilization = self._get_cpu_utilization()
    memory_utilization = self._get_average_memory_utilization()

    # _get_hw_metrics is annotated Optional, so guard against None (and any
    # unexpected length) instead of indexing blindly.
    if hw_metrics and len(hw_metrics) == 6:
        return Metrics(cpu_utilization, memory_utilization, *hw_metrics)
    return Metrics(cpu_utilization, memory_utilization)
65
69
66
70
67
71
class SystemMetricsMonitor (SystemMetricsMonitorBase ):
68
- def __init__ (self , stop_event : Event , polling_rate_seconds : int = DEFAULT_POLLING_INTERVAL_SECONDS ):
72
+ def __init__ (
73
+ self ,
74
+ stop_event : Event ,
75
+ polling_rate_seconds : int = DEFAULT_POLLING_INTERVAL_SECONDS ,
76
+ perfspect_path : str = None
77
+ ):
69
78
self ._polling_rate_seconds = polling_rate_seconds
70
79
self ._mem_percentages : List [float ] = []
71
80
self ._stop_event = stop_event
@@ -74,42 +83,46 @@ def __init__(self, stop_event: Event, polling_rate_seconds: int = DEFAULT_POLLIN
74
83
self ._perfspect_thread : Optional [Thread ] = None
75
84
self ._hw_metrics = {'cpu_freq' :[], 'cpu_cpi' :[], 'cpu_tma_fe_bound' :[], 'cpu_tma_be_bound' :[], 'cpu_tma_bad_spec' :[], 'cpu_tma_retiring' :[]}
76
85
self ._ps_process = None
86
+ self ._perfspect_path = perfspect_path
77
87
78
88
self ._get_cpu_utilization () # Call this once to set the necessary data
79
89
80
90
def start(self) -> None:
    """Start the background monitoring threads.

    Always starts the thread running ``_continuously_poll_memory``. When a
    perfspect path was configured, also launches the perfspect executable as
    a subprocess (stdout piped) and starts the thread running
    ``_continuously_poll_perfspect``.

    Raises:
        AssertionError: if the monitor (or perfspect) is already running, or
            if the stop event is already set.
    """
    # Validate every precondition up front so that a failed check cannot
    # leave the memory-polling thread running with nothing to stop it.
    assert self._thread is None, "SystemMetricsMonitor is already running"
    assert not self._stop_event.is_set(), "Stop condition is already set (perhaps gProfiler was already stopped?)"
    if self._perfspect_path:
        assert self._perfspect_thread is None, "Perfspect is already running"

    self._thread = Thread(target=self._continuously_poll_memory, args=(self._polling_rate_seconds,))
    self._thread.start()

    if not self._perfspect_path:
        return

    # NOTE(review): the metric list names Bad_Speculation before
    # Backend_Bound; confirm the CSV column indices used by
    # _continuously_poll_perfspect match this order.
    ps_cmd = [
        self._perfspect_path,
        'metrics',
        '--metrics',
        '"CPU operating frequency (in GHz)","CPI","TMA_Frontend_Bound(%)","TMA_Bad_Speculation(%)","TMA_Backend_Bound(%)","TMA_Retiring(%)"',
        '--duration',
        '0',
        '--live',
        '--format',
        'csv',
        '--interval',
        '10',
    ]
    self._ps_process = subprocess.Popen(ps_cmd, stdout=subprocess.PIPE)
    self._perfspect_thread = Thread(
        target=self._continuously_poll_perfspect, args=(self._polling_rate_seconds,)
    )
    self._perfspect_thread.start()
99
110
100
111
def stop(self) -> None:
    """Stop the background monitoring threads.

    Joins the memory-polling thread and, when a perfspect path was
    configured, kills the perfspect subprocess and joins its polling thread.

    Raises:
        AssertionError: if the monitor is not running, the stop event was
            not set beforehand, or perfspect was configured but its thread
            was never started.
        ThreadStopTimeoutError: if either thread is still alive after
            ``STOP_TIMEOUT_SECONDS``.
    """
    assert self._thread is not None, "SystemMetricsMonitor is not running"
    assert self._stop_event.is_set(), "Stop event was not set before stopping the SystemMetricsMonitor"
    self._thread.join(STOP_TIMEOUT_SECONDS)
    if self._thread.is_alive():
        raise ThreadStopTimeoutError("Timed out while waiting for the SystemMetricsMonitor internal thread to stop")
    self._thread = None

    if not self._perfspect_path:
        return

    # The process handle is None if launching perfspect failed in start();
    # skip it rather than raising AttributeError on None.
    if self._ps_process is not None:
        self._ps_process.kill()
        # Reap the killed child so it does not linger as a zombie.
        self._ps_process.wait()

    assert self._perfspect_thread is not None, "Perfspect is not running"
    self._perfspect_thread.join(STOP_TIMEOUT_SECONDS)
    if self._perfspect_thread.is_alive():
        raise ThreadStopTimeoutError("Timed out while waiting for the SystemMetricsMonitor Perfspect thread to stop")
    self._perfspect_thread = None
113
126
114
127
def _continuously_poll_memory (self , polling_rate_seconds : int ) -> None :
115
128
while not self ._stop_event .is_set ():
@@ -125,7 +138,7 @@ def _continuously_poll_perfspect(self, polling_rate_seconds: int) -> None:
125
138
if metrics_str .startswith ('TS,SKT,CPU,CID' ):
126
139
continue
127
140
metric_values = metrics_str .split (',' )
128
- if len (metric_values ) > 0 :
141
+ if len (metric_values ) > 0 and metric_values [ 0 ] != '' :
129
142
self ._hw_metrics ['cpu_freq' ].append (float (metric_values [4 ]))
130
143
self ._hw_metrics ['cpu_cpi' ].append (float (metric_values [5 ]))
131
144
self ._hw_metrics ['cpu_tma_fe_bound' ].append (float (metric_values [6 ]))
@@ -172,7 +185,7 @@ def _get_hw_metrics(self) -> List[float]:
172
185
self ._hw_metrics ['cpu_tma_retiring' ] = []
173
186
174
187
return metric_list
175
-
188
+
176
189
177
190
class NoopSystemMetricsMonitor (SystemMetricsMonitorBase ):
178
191
def start (self ) -> None :
0 commit comments