Skip to content

Commit 0d2f80b

Browse files
committed
(1) added an argument for the path to the PerfSpect binary; (2) disabled PerfSpect on aarch64 and Windows; (3) fixed the case where PerfSpect metrics are not collected
1 parent f2baf19 commit 0d2f80b

File tree

3 files changed

+71
-31
lines changed

3 files changed

+71
-31
lines changed

gprofiler/main.py

+24-2
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@
5757
from gprofiler.metadata.external_metadata import ExternalMetadataStaleError, read_external_metadata
5858
from gprofiler.metadata.metadata_collector import get_current_metadata, get_static_metadata
5959
from gprofiler.metadata.system_metadata import get_hostname, get_run_mode, get_static_system_info
60-
from gprofiler.platform import is_linux, is_windows
60+
from gprofiler.platform import is_linux, is_windows, is_aarch64
6161
from gprofiler.profiler_state import ProfilerState
6262
from gprofiler.profilers.factory import get_profilers
6363
from gprofiler.profilers.profiler_base import NoopProfiler, ProcessProfilerBase, ProfilerInterface
@@ -137,6 +137,7 @@ def __init__(
137137
controller_process: Optional[Process] = None,
138138
external_metadata_path: Optional[Path] = None,
139139
heartbeat_file_path: Optional[Path] = None,
140+
perfspect_path: Optional[Path] = None,
140141
):
141142
self._output_dir = output_dir
142143
self._flamegraph = flamegraph
@@ -157,6 +158,7 @@ def __init__(
157158
self._duration = duration
158159
self._external_metadata_path = external_metadata_path
159160
self._heartbeat_file_path = heartbeat_file_path
161+
self._perfspect_path = perfspect_path
160162
if self._collect_metadata:
161163
self._static_metadata = get_static_metadata(self._spawn_time, user_args, self._external_metadata_path)
162164
self._executor = concurrent.futures.ThreadPoolExecutor(max_workers=10)
@@ -179,7 +181,8 @@ def __init__(
179181
self._usage_logger = usage_logger
180182
if self._collect_metrics:
181183
self._system_metrics_monitor: SystemMetricsMonitorBase = SystemMetricsMonitor(
182-
self._profiler_state.stop_event
184+
stop_event = self._profiler_state.stop_event,
185+
perfspect_path = perfspect_path
183186
)
184187
else:
185188
self._system_metrics_monitor = NoopSystemMetricsMonitor()
@@ -834,6 +837,17 @@ def parse_cmd_args() -> configargparse.Namespace:
834837
"The file modification indicates the last snapshot time.",
835838
)
836839

840+
if is_linux() and not is_aarch64():
841+
hw_metrics_options = parser.add_argument_group("hardware metrics")
842+
hw_metrics_options.add_argument(
843+
"--perfspect-path",
844+
type=str,
845+
dest="intel_perfspect_path",
846+
default=None,
847+
help="Enable HW metrics collection with Intel PerfSpect."
848+
" Provide path to PerfSpect binary to enable collection.",
849+
)
850+
837851
args = parser.parse_args()
838852

839853
args.perf_inject = args.nodejs_mode == "perf"
@@ -1103,6 +1117,13 @@ def main() -> None:
11031117
if args.heartbeat_file is not None:
11041118
heartbeat_file_path = Path(args.heartbeat_file)
11051119

1120+
# Resolve the optional PerfSpect binary location.
# The --perfspect-path option is only registered when is_linux() and not is_aarch64(),
# so on other platforms the attribute is absent from `args`; getattr avoids an
# AttributeError there and simply leaves PerfSpect disabled.
perfspect_path: Optional[Path] = None
intel_perfspect_path = getattr(args, "intel_perfspect_path", None)
if intel_perfspect_path is not None:
    perfspect_path = Path(intel_perfspect_path)
    if not perfspect_path.is_file():
        # The option's dest is "intel_perfspect_path"; `args.perfspect_path` does not exist,
        # so report the value the user actually supplied.
        logger.error(f"Perfspect binary file {intel_perfspect_path} does not exist!")
        sys.exit(1)
1126+
11061127
try:
11071128
log_system_info()
11081129
except Exception:
@@ -1187,6 +1208,7 @@ def main() -> None:
11871208
processes_to_profile=processes_to_profile,
11881209
external_metadata_path=external_metadata_path,
11891210
heartbeat_file_path=heartbeat_file_path,
1211+
perfspect_path=perfspect_path,
11901212
)
11911213
logger.info("gProfiler initialized and ready to start profiling")
11921214
if args.continuous:

gprofiler/platform.py

+5
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515
#
16+
import platform
1617
import sys
1718
from functools import lru_cache
1819

@@ -28,3 +29,7 @@ def is_windows() -> bool:
2829
@lru_cache(maxsize=None)
def is_linux() -> bool:
    """Return True when ``sys.platform`` equals ``LINUX_PLATFORM_NAME``.

    The result is memoized by ``lru_cache``, so the comparison runs once per process.
    """
    current_platform = sys.platform
    return current_platform == LINUX_PLATFORM_NAME
32+
33+
@lru_cache(maxsize=None)
def is_aarch64() -> bool:
    """Return True when ``platform.machine()`` reports ``"aarch64"``.

    The result is memoized by ``lru_cache``, so the machine type is queried once per process.
    """
    machine_type = platform.machine()
    return machine_type == "aarch64"

gprofiler/system_metrics.py

+42-29
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from threading import Event, RLock, Thread
55
from typing import List, Optional
66

7+
import os
78
import subprocess
89
import psutil
910

@@ -20,17 +21,17 @@ class Metrics:
2021
# The average RAM usage between gProfiler cycles
2122
mem_avg: Optional[float]
2223
# The CPU frequency between gProfiler cycles
23-
cpu_freq: Optional[float]
24+
cpu_freq: Optional[float] = None
2425
# The CPI between gProfiler cycles
25-
cpu_cpi: Optional[float]
26+
cpu_cpi: Optional[float] = None
2627
# The CPU TMA frontend bound between gProfiler cycles
27-
cpu_tma_fe_bound: Optional[float]
28+
cpu_tma_fe_bound: Optional[float] = None
2829
# The CPU TMA backend bound between gProfiler cycles
29-
cpu_tma_be_bound: Optional[float]
30+
cpu_tma_be_bound: Optional[float] = None
3031
# The CPU TMA bad speculation between gProfiler cycles
31-
cpu_tma_bad_spec: Optional[float]
32+
cpu_tma_bad_spec: Optional[float] = None
3233
# The CPU TMA retiring between gProfiler cycles
33-
cpu_tma_retiring: Optional[float]
34+
cpu_tma_retiring: Optional[float] = None
3435

3536
class SystemMetricsMonitorBase(metaclass=ABCMeta):
3637
@abstractmethod
@@ -61,11 +62,19 @@ def _get_hw_metrics(self) -> Optional[List[float]]:
6162

6263
def get_metrics(self) -> Metrics:
    """Build a ``Metrics`` snapshot from the monitor's current readings.

    CPU and memory utilization are always included. The hardware metrics are
    included only when ``_get_hw_metrics()`` returns exactly six values; otherwise
    the corresponding ``Metrics`` fields keep their ``None`` defaults.

    Returns:
        Metrics: the combined system (and, when available, hardware) metrics.
    """
    # Evaluation order (hardware, CPU, memory) is kept as in the original call.
    hw_metrics = self._get_hw_metrics()
    cpu_utilization = self._get_cpu_utilization()
    memory_utilization = self._get_average_memory_utilization()

    # `_get_hw_metrics` may return None (or an incomplete list) when nothing was
    # collected; indexing it unconditionally would raise.
    if hw_metrics and len(hw_metrics) == 6:
        return Metrics(cpu_utilization, memory_utilization, *hw_metrics)
    return Metrics(cpu_utilization, memory_utilization)
6569

6670

6771
class SystemMetricsMonitor(SystemMetricsMonitorBase):
68-
def __init__(self, stop_event: Event, polling_rate_seconds: int = DEFAULT_POLLING_INTERVAL_SECONDS):
72+
def __init__(
73+
self,
74+
stop_event: Event,
75+
polling_rate_seconds: int = DEFAULT_POLLING_INTERVAL_SECONDS,
76+
perfspect_path: str = None
77+
):
6978
self._polling_rate_seconds = polling_rate_seconds
7079
self._mem_percentages: List[float] = []
7180
self._stop_event = stop_event
@@ -74,42 +83,46 @@ def __init__(self, stop_event: Event, polling_rate_seconds: int = DEFAULT_POLLIN
7483
self._perfspect_thread: Optional[Thread] = None
7584
self._hw_metrics = {'cpu_freq':[], 'cpu_cpi':[], 'cpu_tma_fe_bound':[], 'cpu_tma_be_bound':[], 'cpu_tma_bad_spec':[], 'cpu_tma_retiring':[]}
7685
self._ps_process = None
86+
self._perfspect_path = perfspect_path
7787

7888
self._get_cpu_utilization() # Call this once to set the necessary data
7989

8090
def start(self) -> None:
    """Start background collection.

    Always starts the memory-polling thread. When a PerfSpect path was configured,
    also launches the PerfSpect binary with its stdout piped and starts the thread
    that runs ``_continuously_poll_perfspect``.
    """
    assert self._thread is None, "SystemMetricsMonitor is already running"
    assert not self._stop_event.is_set(), "Stop condition is already set (perhaps gProfiler was already stopped?)"
    if self._perfspect_path:
        # Verified before anything is started, so a failed precondition does not
        # leave the memory-polling thread running.
        assert self._perfspect_thread is None, "Perfspect is already running"

    self._thread = Thread(target=self._continuously_poll_memory, args=(self._polling_rate_seconds,))
    self._thread.start()

    if not self._perfspect_path:
        return

    # No shell is involved (argument list, default shell=False), so the embedded double
    # quotes in the metric list are passed to PerfSpect literally.
    # NOTE(review): confirm PerfSpect expects quoted metric names, and that the CSV column
    # order assumed by the reader thread matches what PerfSpect emits for this list.
    ps_cmd = [
        # The caller may hand over a pathlib.Path; normalize to str for the argv list.
        str(self._perfspect_path),
        'metrics',
        '--metrics',
        '"CPU operating frequency (in GHz)","CPI","TMA_Frontend_Bound(%)","TMA_Bad_Speculation(%)","TMA_Backend_Bound(%)","TMA_Retiring(%)"',
        '--duration',
        '0',
        '--live',
        '--format',
        'csv',
        '--interval',
        '10',
    ]
    self._ps_process = subprocess.Popen(ps_cmd, stdout=subprocess.PIPE)
    self._perfspect_thread = Thread(target=self._continuously_poll_perfspect, args=(self._polling_rate_seconds,))
    self._perfspect_thread.start()
99110

100111
def stop(self) -> None:
    """Stop background collection started by ``start()``.

    Joins the memory-polling thread and, when a PerfSpect path was configured,
    kills and reaps the PerfSpect process and joins its reader thread.

    Raises:
        ThreadStopTimeoutError: if either thread is still alive after
            ``STOP_TIMEOUT_SECONDS``.
    """
    assert self._thread is not None, "SystemMetricsMonitor is not running"
    assert self._stop_event.is_set(), "Stop event was not set before stopping the SystemMetricsMonitor"
    self._thread.join(STOP_TIMEOUT_SECONDS)
    if self._thread.is_alive():
        raise ThreadStopTimeoutError("Timed out while waiting for the SystemMetricsMonitor internal thread to stop")
    self._thread = None

    if self._perfspect_path:
        # Guard against a monitor whose PerfSpect process was never created
        # (e.g. Popen raised in start()); calling .kill() on None would raise.
        if self._ps_process is not None:
            self._ps_process.kill()
            # Reap the killed child so it does not linger as a zombie process.
            self._ps_process.wait()
        assert self._perfspect_thread is not None, "Perfspect is not running"
        self._perfspect_thread.join(STOP_TIMEOUT_SECONDS)
        if self._perfspect_thread.is_alive():
            raise ThreadStopTimeoutError("Timed out while waiting for the SystemMetricsMonitor Perfspect thread to stop")
        self._perfspect_thread = None
        # Dropped only after the reader thread has exited, so the thread never
        # observes a None process handle.
        self._ps_process = None
113126

114127
def _continuously_poll_memory(self, polling_rate_seconds: int) -> None:
115128
while not self._stop_event.is_set():
@@ -125,7 +138,7 @@ def _continuously_poll_perfspect(self, polling_rate_seconds: int) -> None:
125138
if metrics_str.startswith('TS,SKT,CPU,CID'):
126139
continue
127140
metric_values = metrics_str.split(',')
128-
if len(metric_values) > 0:
141+
if len(metric_values) > 0 and metric_values[0] != '':
129142
self._hw_metrics['cpu_freq'].append(float(metric_values[4]))
130143
self._hw_metrics['cpu_cpi'].append(float(metric_values[5]))
131144
self._hw_metrics['cpu_tma_fe_bound'].append(float(metric_values[6]))
@@ -172,7 +185,7 @@ def _get_hw_metrics(self) -> List[float]:
172185
self._hw_metrics['cpu_tma_retiring'] = []
173186

174187
return metric_list
175-
188+
176189

177190
class NoopSystemMetricsMonitor(SystemMetricsMonitorBase):
178191
def start(self) -> None:

0 commit comments

Comments
 (0)