diff --git a/CHANGELOG.md b/CHANGELOG.md index 22768bd1..634da6b0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -135,46 +135,124 @@ GPU: 0 ### Changed - -- **All `amd-smi set` and `amd-smi reset` options are now mutually exclusive**. - - Users can only use one set option at a time now. +- **All `amd-smi set` and `amd-smi reset` options are now mutually exclusive**. + - Users can only use one set option at a time now. - **Python API for `amdsmi_get_energy_count()` will change the name for the `power` field to `energy_accumulator`**. +- **Added violation status output for Graphics Clock Below Host Limit to our CLI: `amdsmi_get_violation_status()`, `amd-smi metric --throttle`, and `amd-smi monitor --violation`.** + ***Only available for MI300+ ASICs.*** + Users can retrieve violation statuses through either our Python or C++ APIs. + Additionally, we have added the capability to view these outputs conveniently through `amd-smi metric --throttle` and `amd-smi monitor --violation`. + Example outputs are listed below (below is for reference, output is subject to change): + + ```shell + $ amd-smi monitor --violation + GPU PVIOL TVIOL TVIOL_ACTIVE PHOT_TVIOL VR_TVIOL HBM_TVIOL GFX_CLKVIOL + 0 0 % 0 % False 0 % 0 % 0 % 0 % + 1 0 % 0 % False 0 % 0 % 0 % 0 % + ... 
+ ``` + + ```shell + $ amd-smi metric --throttle + GPU: 0 + THROTTLE: + ACCUMULATION_COUNTER: 11240028 + PROCHOT_ACCUMULATED: 0 + PPT_ACCUMULATED: 0 + SOCKET_THERMAL_ACCUMULATED: 0 + VR_THERMAL_ACCUMULATED: 0 + HBM_THERMAL_ACCUMULATED: 0 + GFX_CLK_BELOW_HOST_LIMIT_ACCUMULATED: N/A + PROCHOT_VIOLATION_STATUS: NOT ACTIVE + PPT_VIOLATION_STATUS: NOT ACTIVE + SOCKET_THERMAL_VIOLATION_STATUS: NOT ACTIVE + VR_THERMAL_VIOLATION_STATUS: NOT ACTIVE + HBM_THERMAL_VIOLATION_STATUS: NOT ACTIVE + GFX_CLK_BELOW_HOST_LIMIT_VIOLATION_STATUS: N/A + PROCHOT_VIOLATION_ACTIVITY: 0 % + PPT_VIOLATION_ACTIVITY: 0 % + SOCKET_THERMAL_VIOLATION_ACTIVITY: 0 % + VR_THERMAL_VIOLATION_ACTIVITY: 0 % + HBM_THERMAL_VIOLATION_ACTIVITY: 0 % + GFX_CLK_BELOW_HOST_LIMIT_VIOLATION_ACTIVITY: 0 % + + GPU: 1 + THROTTLE: + ACCUMULATION_COUNTER: 11238232 + PROCHOT_ACCUMULATED: 0 + PPT_ACCUMULATED: 0 + SOCKET_THERMAL_ACCUMULATED: 0 + VR_THERMAL_ACCUMULATED: 0 + HBM_THERMAL_ACCUMULATED: 0 + GFX_CLK_BELOW_HOST_LIMIT_ACCUMULATED: 0 + PROCHOT_VIOLATION_STATUS: NOT ACTIVE + PPT_VIOLATION_STATUS: NOT ACTIVE + SOCKET_THERMAL_VIOLATION_STATUS: NOT ACTIVE + VR_THERMAL_VIOLATION_STATUS: NOT ACTIVE + HBM_THERMAL_VIOLATION_STATUS: NOT ACTIVE + GFX_CLK_BELOW_HOST_LIMIT_VIOLATION_STATUS: NOT ACTIVE + PROCHOT_VIOLATION_ACTIVITY: 0 % + PPT_VIOLATION_ACTIVITY: 0 % + SOCKET_THERMAL_VIOLATION_ACTIVITY: 0 % + VR_THERMAL_VIOLATION_ACTIVITY: 0 % + HBM_THERMAL_VIOLATION_ACTIVITY: 0 % + GFX_CLK_BELOW_HOST_LIMIT_VIOLATION_ACTIVITY: 0 % + ... + ``` + +- **Updated API `amdsmi_get_violation_status()` structure and CLI `amdsmi_violation_status_t` to include GFX Clk below host limit** +Updated structure `amdsmi_violation_status_t`: + + ```C + typedef struct { + ... + uint64_t acc_gfx_clk_below_host_limit; //!< Current graphic clock below host limit count; Max uint64 means unsupported + ... 
+ uint64_t per_gfx_clk_below_host_limit; //!< Graphics clock below host limit violation % (greater than 0% is a violation); Max uint64 means unsupported + ... + uint8_t active_gfx_clk_below_host_limit; //!< Graphics clock below host limit violation; 1 = active 0 = not active; Max uint8 means unsupported + ... + } amdsmi_violation_status_t; + ``` + - **Updated API `amdsmi_get_gpu_vram_info()` structure and CLI `amd-smi static --vram`** -Updated structure `amdsmi_vram_info_t`: -```C -typedef struct { - amdsmi_vram_type_t vram_type; - amdsmi_vram_vendor_type_t vram_vendor; - uint64_t vram_size; - uint32_t vram_bit_width; - uint64_t vram_max_bandwidth; //!< The VRAM max bandwidth at current memory clock (GB/s) - uint64_t reserved[4]; -} amdsmi_vram_info_t; - -amdsmi_status_t amdsmi_get_gpu_vram_info(amdsmi_processor_handle processor_handle, amdsmi_vram_info_t *info) -``` -Example CLI output: -```shell -$ amd-smi static --vram -GPU: 0 - VRAM: - TYPE: GDDR6 - VENDOR: N/A - SIZE: 16368 MB - BIT_WIDTH: 256 - MAX_BANDWIDTH: 1555 GB/s -GPU: 1 - VRAM: - TYPE: GDDR6 - VENDOR: N/A - SIZE: 30704 MB - BIT_WIDTH: 256 - MAX_BANDWIDTH: 1555 GB/s -... +Updated structure `amdsmi_vram_info_t`: -``` + ```C + typedef struct { + amdsmi_vram_type_t vram_type; + amdsmi_vram_vendor_type_t vram_vendor; + uint64_t vram_size; + uint32_t vram_bit_width; + uint64_t vram_max_bandwidth; //!< The VRAM max bandwidth at current memory clock (GB/s) + uint64_t reserved[4]; + } amdsmi_vram_info_t; + + amdsmi_status_t amdsmi_get_gpu_vram_info(amdsmi_processor_handle processor_handle, amdsmi_vram_info_t *info) + ``` + + Example CLI output: + + ```shell + $ amd-smi static --vram + GPU: 0 + VRAM: + TYPE: GDDR6 + VENDOR: N/A + SIZE: 16368 MB + BIT_WIDTH: 256 + MAX_BANDWIDTH: 1555 GB/s + GPU: 1 + VRAM: + TYPE: GDDR6 + VENDOR: N/A + SIZE: 30704 MB + BIT_WIDTH: 256 + MAX_BANDWIDTH: 1555 GB/s + ... 
+ ``` ### Removed diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index e2534c7b..8556361c 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -2277,7 +2277,7 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No 'socket_thermal_accumulated': "N/A", 'vr_thermal_accumulated': "N/A", 'hbm_thermal_accumulated': "N/A", - 'gfx_below_host_limit_acc': "N/A", + 'gfx_clk_below_host_limit_accumulated': "N/A", # violation status values - active/not active 'prochot_violation_status': "N/A", @@ -2285,13 +2285,15 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No 'socket_thermal_violation_status': "N/A", 'vr_thermal_violation_status': "N/A", 'hbm_thermal_violation_status': "N/A", + 'gfx_clk_below_host_limit_violation_status': "N/A", # violation activity values - percent 'prochot_violation_activity': "N/A", 'ppt_violation_activity': "N/A", 'socket_thermal_violation_activity': "N/A", 'vr_thermal_violation_activity': "N/A", - 'hbm_thermal_violation_activity': "N/A" + 'hbm_thermal_violation_activity': "N/A", + 'gfx_clk_below_host_limit_violation_activity': "N/A", } try: @@ -2302,18 +2304,21 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No throttle_status['socket_thermal_accumulated'] = violation_status['acc_socket_thrm'] throttle_status['vr_thermal_accumulated'] = violation_status['acc_vr_thrm'] throttle_status['hbm_thermal_accumulated'] = violation_status['acc_hbm_thrm'] + throttle_status['gfx_clk_below_host_limit_accumulated'] = violation_status['acc_gfx_clk_below_host_limit'] throttle_status['prochot_violation_status'] = violation_status['active_prochot_thrm'] throttle_status['ppt_violation_status'] = violation_status['active_ppt_pwr'] throttle_status['socket_thermal_violation_status'] = violation_status['active_socket_thrm'] throttle_status['vr_thermal_violation_status'] = violation_status['active_vr_thrm'] 
throttle_status['hbm_thermal_violation_status'] = violation_status['active_hbm_thrm'] + throttle_status['gfx_clk_below_host_limit_violation_status'] = violation_status['active_gfx_clk_below_host_limit'] throttle_status['prochot_violation_activity'] = violation_status['per_prochot_thrm'] throttle_status['ppt_violation_activity'] = violation_status['per_ppt_pwr'] throttle_status['socket_thermal_violation_activity'] = violation_status['per_socket_thrm'] throttle_status['vr_thermal_violation_activity'] = violation_status['per_vr_thrm'] throttle_status['hbm_thermal_violation_activity'] = violation_status['per_hbm_thrm'] + throttle_status['gfx_clk_below_host_limit_violation_activity'] = violation_status['per_gfx_clk_below_host_limit'] except amdsmi_exception.AmdSmiLibraryException as e: values_dict['throttle'] = throttle_status @@ -5274,6 +5279,7 @@ def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None, "phot_tviol": "N/A", "vr_tviol": "N/A", "hbm_tviol": "N/A", + "gfx_clkviol": "N/A", } try: violations = amdsmi_interface.amdsmi_get_violation_status(args.gpu) @@ -5283,6 +5289,7 @@ def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None, violation_status['phot_tviol'] = violations['per_prochot_thrm'] violation_status['vr_tviol'] = violations['per_vr_thrm'] violation_status['hbm_tviol'] = violations['per_hbm_thrm'] + violation_status['gfx_clkviol'] = violations['per_gfx_clk_below_host_limit'] except amdsmi_exception.AmdSmiLibraryException as e: monitor_values['pviol'] = violation_status['pviol'] monitor_values['tviol'] = violation_status['tviol'] @@ -5290,32 +5297,40 @@ def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None, monitor_values['phot_tviol'] = violation_status['phot_tviol'] monitor_values['vr_tviol'] = violation_status['vr_tviol'] monitor_values['hbm_tviol'] = violation_status['hbm_tviol'] + monitor_values['gfx_clkviol'] = violation_status['gfx_clkviol'] logging.debug("Failed to get 
violation status on gpu %s | %s", gpu_id, e.get_error_info()) violation_status_unit = "%" + kPVIOL_MAX_WIDTH = 7 kTVIOL_MAX_WIDTH = 7 kTVIOL_ACTIVE_MAX_WIDTH = 14 - kPVIOL_MAX_WIDTH = 7 kPHOT_MAX_WIDTH = 12 kVR_MAX_WIDTH = 10 kHBM_MAX_WIDTH = 11 + kGFXC_MAX_WIDTH = 13 for key, value in violation_status.items(): - if key == "tviol_active": - monitor_values[key] = value - elif key != "tviol_active": - monitor_values[key] = self.helpers.unit_format(self.logger, violation_status[key], violation_status_unit) + if value != "N/A": + if key == "tviol_active": + monitor_values[key] = value + else: + monitor_values[key] = self.helpers.unit_format(self.logger, violation_status[key], violation_status_unit) + else: + monitor_values[key] = violation_status[key] + if self.logger.is_human_readable_format(): monitor_values['pviol'] = monitor_values['pviol'].rjust(kPVIOL_MAX_WIDTH, ' ') monitor_values['tviol'] = monitor_values['tviol'].rjust(kTVIOL_MAX_WIDTH, ' ') monitor_values['phot_tviol'] = monitor_values['phot_tviol'].rjust(kPHOT_MAX_WIDTH, ' ') monitor_values['vr_tviol'] = monitor_values['vr_tviol'].rjust(kVR_MAX_WIDTH, ' ') monitor_values['hbm_tviol'] = monitor_values['hbm_tviol'].rjust(kHBM_MAX_WIDTH, ' ') + monitor_values['gfx_clkviol'] = monitor_values['gfx_clkviol'].rjust(kGFXC_MAX_WIDTH, ' ') self.logger.table_header += 'PVIOL'.rjust(kPVIOL_MAX_WIDTH, ' ') self.logger.table_header += 'TVIOL'.rjust(kTVIOL_MAX_WIDTH, ' ') self.logger.table_header += 'TVIOL_ACTIVE'.rjust(kTVIOL_ACTIVE_MAX_WIDTH, ' ') self.logger.table_header += 'PHOT_TVIOL'.rjust(kPHOT_MAX_WIDTH, ' ') self.logger.table_header += 'VR_TVIOL'.rjust(kVR_MAX_WIDTH, ' ') self.logger.table_header += 'HBM_TVIOL'.rjust(kHBM_MAX_WIDTH, ' ') + self.logger.table_header += 'GFX_CLKVIOL'.rjust(kGFXC_MAX_WIDTH, ' ') self.logger.store_output(args.gpu, 'values', monitor_values) diff --git a/amdsmi_cli/amdsmi_logger.py b/amdsmi_cli/amdsmi_logger.py index 38633f46..2a6f0c7b 100644 --- a/amdsmi_cli/amdsmi_logger.py +++ 
b/amdsmi_cli/amdsmi_logger.py @@ -180,6 +180,8 @@ def _convert_json_to_tabular(self, json_object: Dict[str, any]): table_values += string_value.rjust(10) elif key == "hbm_tviol": table_values += string_value.rjust(11) + elif key == "gfx_clkviol": + table_values += string_value.rjust(13) elif key == "process_list": #Add an additional padding between the first instance of GPU and NAME table_values += ' ' diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index 819cf0a4..addf071c 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -526,18 +526,22 @@ typedef struct { uint64_t acc_socket_thrm; //!< TVIOL; Current accumulated Socket thermal count; Max uint64 means unsupported uint64_t acc_vr_thrm; //!< Current accumulated voltage regulator count; Max uint64 means unsupported uint64_t acc_hbm_thrm; //!< Current accumulated High Bandwidth Memory (HBM) thermal count; Max uint64 means unsupported + uint64_t acc_gfx_clk_below_host_limit; //!< Current graphic clock below host limit count; Max uint64 means unsupported uint64_t per_prochot_thrm; //!< Processor hot violation % (greater than 0% is a violation); Max uint64 means unsupported uint64_t per_ppt_pwr; //!< PVIOL; Package Power Tracking (PPT) violation % (greater than 0% is a violation); Max uint64 means unsupported uint64_t per_socket_thrm; //!< TVIOL; Socket thermal violation % (greater than 0% is a violation); Max uint64 means unsupported uint64_t per_vr_thrm; //!< Voltage regulator violation % (greater than 0% is a violation); Max uint64 means unsupported uint64_t per_hbm_thrm; //!< High Bandwidth Memory (HBM) thermal violation % (greater than 0% is a violation); Max uint64 means unsupported + uint64_t per_gfx_clk_below_host_limit; //!< Graphics clock below host limit violation % (greater than 0% is a violation); Max uint64 means unsupported uint8_t active_prochot_thrm; //!< Processor hot violation; 1 = active 0 = not active; Max uint8 means unsupported uint8_t active_ppt_pwr; //!< 
Package Power Tracking (PPT) violation; 1 = active 0 = not active; Max uint8 means unsupported uint8_t active_socket_thrm; //!< Socket thermal violation; 1 = active 0 = not active; Max uint8 means unsupported uint8_t active_vr_thrm; //!< Voltage regulator violation; 1 = active 0 = not active; Max uint8 means unsupported uint8_t active_hbm_thrm; //!< High Bandwidth Memory (HBM) thermal violation; 1 = active 0 = not active; Max uint8 means unsupported - uint64_t reserved[30]; // Reserved for new violation info + uint8_t active_gfx_clk_below_host_limit; //!< Graphics clock below host limit violation; 1 = active 0 = not active; Max uint8 means unsupported + uint64_t reserved[3]; // Reserved for new violation info } amdsmi_violation_status_t; + typedef struct { amdsmi_range_t supported_freq_range; amdsmi_range_t current_freq_range; diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 1ae18573..a6e74f89 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -28,6 +28,7 @@ import sys import math from time import localtime, asctime, time +import json MAX_NUM_PROCESSES = 1024 @@ -1559,7 +1560,9 @@ def amdsmi_get_hsmp_metrics_table( "mtbl_ppt_residency_acc": mtbl.ppt_residency_acc, "mtbl_socket_thm_residency_acc": mtbl.socket_thm_residency_acc, "mtbl_vr_thm_residency_acc": mtbl.vr_thm_residency_acc, - "mtbl_hbm_thm_residency_acc": mtbl.hbm_thm_residency_acc + "mtbl_hbm_thm_residency_acc": mtbl.hbm_thm_residency_acc, + "mtbl_gfx_clk_below_host_residency_acc": mtbl.gfx_clk_below_host_residency_acc, + "mtbl_low_utilization_residency_acc": mtbl.low_utilization_residency_acc } def amdsmi_first_online_core_on_cpu_socket( @@ -2035,7 +2038,7 @@ def amdsmi_get_violation_status( processor_handle, ctypes.byref(violation_status)) ) - return { + dict_return = { "reference_timestamp": _validate_if_max_uint(violation_status.reference_timestamp, MaxUIntegerTypes.UINT64_T), "violation_timestamp": 
_validate_if_max_uint(violation_status.violation_timestamp, MaxUIntegerTypes.UINT64_T), "acc_counter": _validate_if_max_uint(violation_status.acc_counter, MaxUIntegerTypes.UINT64_T), @@ -2044,17 +2047,21 @@ def amdsmi_get_violation_status( "acc_socket_thrm": _validate_if_max_uint(violation_status.acc_socket_thrm, MaxUIntegerTypes.UINT64_T), #TVIOL "acc_vr_thrm": _validate_if_max_uint(violation_status.acc_vr_thrm, MaxUIntegerTypes.UINT64_T), "acc_hbm_thrm": _validate_if_max_uint(violation_status.acc_hbm_thrm, MaxUIntegerTypes.UINT64_T), + "acc_gfx_clk_below_host_limit": _validate_if_max_uint(violation_status.acc_gfx_clk_below_host_limit, MaxUIntegerTypes.UINT64_T), "per_prochot_thrm": _validate_if_max_uint(violation_status.per_prochot_thrm, MaxUIntegerTypes.UINT64_T, isActivity=True), "per_ppt_pwr": _validate_if_max_uint(violation_status.per_ppt_pwr, MaxUIntegerTypes.UINT64_T, isActivity=True), #PVIOL "per_socket_thrm": _validate_if_max_uint(violation_status.per_socket_thrm, MaxUIntegerTypes.UINT64_T, isActivity=True), #TVIOL "per_vr_thrm": _validate_if_max_uint(violation_status.per_vr_thrm, MaxUIntegerTypes.UINT64_T, isActivity=True), "per_hbm_thrm": _validate_if_max_uint(violation_status.per_hbm_thrm, MaxUIntegerTypes.UINT64_T, isActivity=True), + "per_gfx_clk_below_host_limit": _validate_if_max_uint(violation_status.per_gfx_clk_below_host_limit, MaxUIntegerTypes.UINT64_T, isActivity=True), "active_prochot_thrm": _validate_if_max_uint(violation_status.active_prochot_thrm, MaxUIntegerTypes.UINT8_T, isBool=True), "active_ppt_pwr": _validate_if_max_uint(violation_status.active_ppt_pwr, MaxUIntegerTypes.UINT8_T, isBool=True), #PVIOL "active_socket_thrm": _validate_if_max_uint(violation_status.active_socket_thrm, MaxUIntegerTypes.UINT8_T, isBool=True), #TVIOL "active_vr_thrm": _validate_if_max_uint(violation_status.active_vr_thrm, MaxUIntegerTypes.UINT8_T, isBool=True), - "active_hbm_thrm": _validate_if_max_uint(violation_status.active_hbm_thrm, 
MaxUIntegerTypes.UINT8_T, isBool=True) + "active_hbm_thrm": _validate_if_max_uint(violation_status.active_hbm_thrm, MaxUIntegerTypes.UINT8_T, isBool=True), + "active_gfx_clk_below_host_limit": _validate_if_max_uint(violation_status.active_gfx_clk_below_host_limit, MaxUIntegerTypes.UINT8_T, isBool=True), } + return dict_return def amdsmi_get_gpu_total_ecc_count( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py index 4c343771..9075461d 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -727,18 +727,21 @@ class struct_amdsmi_violation_status_t(Structure): ('acc_socket_thrm', ctypes.c_uint64), ('acc_vr_thrm', ctypes.c_uint64), ('acc_hbm_thrm', ctypes.c_uint64), + ('acc_gfx_clk_below_host_limit', ctypes.c_uint64), ('per_prochot_thrm', ctypes.c_uint64), ('per_ppt_pwr', ctypes.c_uint64), ('per_socket_thrm', ctypes.c_uint64), ('per_vr_thrm', ctypes.c_uint64), ('per_hbm_thrm', ctypes.c_uint64), + ('per_gfx_clk_below_host_limit', ctypes.c_uint64), ('active_prochot_thrm', ctypes.c_ubyte), ('active_ppt_pwr', ctypes.c_ubyte), ('active_socket_thrm', ctypes.c_ubyte), ('active_vr_thrm', ctypes.c_ubyte), ('active_hbm_thrm', ctypes.c_ubyte), - ('PADDING_0', ctypes.c_ubyte * 3), - ('reserved', ctypes.c_uint64 * 30), + ('active_gfx_clk_below_host_limit', ctypes.c_ubyte), + ('PADDING_0', ctypes.c_ubyte * 2), + ('reserved', ctypes.c_uint64 * 3), ] amdsmi_violation_status_t = struct_amdsmi_violation_status_t @@ -791,6 +794,19 @@ class struct_amdsmi_bdf_t(Structure): class struct_amdsmi_pcie_info_t(Structure): pass +class struct_pcie_static_(Structure): + pass + +struct_pcie_static_._pack_ = 1 # source:False +struct_pcie_static_._fields_ = [ + ('max_pcie_width', ctypes.c_uint16), + ('PADDING_0', ctypes.c_ubyte * 2), + ('max_pcie_speed', ctypes.c_uint32), + ('pcie_interface_version', ctypes.c_uint32), + ('slot_type', amdsmi_card_form_factor_t), + ('reserved', 
ctypes.c_uint64 * 10), +] + class struct_pcie_metric_(Structure): pass @@ -811,19 +827,6 @@ class struct_pcie_metric_(Structure): ('reserved', ctypes.c_uint64 * 12), ] -class struct_pcie_static_(Structure): - pass - -struct_pcie_static_._pack_ = 1 # source:False -struct_pcie_static_._fields_ = [ - ('max_pcie_width', ctypes.c_uint16), - ('PADDING_0', ctypes.c_ubyte * 2), - ('max_pcie_speed', ctypes.c_uint32), - ('pcie_interface_version', ctypes.c_uint32), - ('slot_type', amdsmi_card_form_factor_t), - ('reserved', ctypes.c_uint64 * 10), -] - struct_amdsmi_pcie_info_t._pack_ = 1 # source:False struct_amdsmi_pcie_info_t._fields_ = [ ('pcie_static', struct_pcie_static_), @@ -1120,16 +1123,6 @@ class struct_amdsmi_engine_usage_t(Structure): class struct_amdsmi_proc_info_t(Structure): pass -class struct_engine_usage_(Structure): - pass - -struct_engine_usage_._pack_ = 1 # source:False -struct_engine_usage_._fields_ = [ - ('gfx', ctypes.c_uint64), - ('enc', ctypes.c_uint64), - ('reserved', ctypes.c_uint32 * 12), -] - class struct_memory_usage_(Structure): pass @@ -1141,6 +1134,16 @@ class struct_memory_usage_(Structure): ('reserved', ctypes.c_uint32 * 10), ] +class struct_engine_usage_(Structure): + pass + +struct_engine_usage_._pack_ = 1 # source:False +struct_engine_usage_._fields_ = [ + ('gfx', ctypes.c_uint64), + ('enc', ctypes.c_uint64), + ('reserved', ctypes.c_uint32 * 12), +] + struct_amdsmi_proc_info_t._pack_ = 1 # source:False struct_amdsmi_proc_info_t._fields_ = [ ('name', ctypes.c_char * 256), diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index 5c418b3e..2f06f531 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -629,18 +629,21 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha violation_status->acc_socket_thrm = std::numeric_limits::max(); violation_status->acc_vr_thrm = std::numeric_limits::max(); violation_status->acc_hbm_thrm = std::numeric_limits::max(); + 
violation_status->acc_gfx_clk_below_host_limit = std::numeric_limits::max(); violation_status->per_prochot_thrm = std::numeric_limits::max(); violation_status->per_ppt_pwr = std::numeric_limits::max(); violation_status->per_socket_thrm = std::numeric_limits::max(); violation_status->per_vr_thrm = std::numeric_limits::max(); violation_status->per_hbm_thrm = std::numeric_limits::max(); + violation_status->per_gfx_clk_below_host_limit = std::numeric_limits::max(); violation_status->active_prochot_thrm = std::numeric_limits::max(); violation_status->active_ppt_pwr = std::numeric_limits::max(); violation_status->active_socket_thrm = std::numeric_limits::max(); violation_status->active_vr_thrm = std::numeric_limits::max(); violation_status->active_hbm_thrm = std::numeric_limits::max(); + violation_status->active_gfx_clk_below_host_limit = std::numeric_limits::max(); const auto p1 = std::chrono::system_clock::now(); auto current_time = std::chrono::duration_cast( @@ -664,8 +667,18 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha return r; } + // default to 0xffffffff as not supported + uint32_t partitition_id = std::numeric_limits::max(); + auto tmp_partition_id = uint32_t(0); + amdsmi_status_t status = rsmi_wrapper(rsmi_dev_partition_id_get, processor_handle, &(tmp_partition_id)); + // Do not return early if this value fails + // continue to try getting all info + if (status == AMDSMI_STATUS_SUCCESS) { + partitition_id = tmp_partition_id; + } + amdsmi_gpu_metrics_t metric_info_a = {}; - amdsmi_status_t status = amdsmi_get_gpu_metrics_info( + status = amdsmi_get_gpu_metrics_info( processor_handle, &metric_info_a); if (status != AMDSMI_STATUS_SUCCESS) { std::ostringstream ss; @@ -680,7 +693,9 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha && metric_info_a.ppt_residency_acc == std::numeric_limits::max() && metric_info_a.socket_thm_residency_acc == std::numeric_limits::max() && 
metric_info_a.vr_thm_residency_acc == std::numeric_limits::max() - && metric_info_a.hbm_thm_residency_acc == std::numeric_limits::max()) { + && metric_info_a.hbm_thm_residency_acc == std::numeric_limits::max() + && (metric_info_a.xcp_stats->gfx_below_host_limit_acc[partitition_id] + == std::numeric_limits::max())) { ss << __PRETTY_FUNCTION__ << " | ASIC does not support throttle violations!, " << "returning AMDSMI_STATUS_NOT_SUPPORTED"; @@ -705,33 +720,38 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha violation_status->acc_socket_thrm = metric_info_b.socket_thm_residency_acc; violation_status->acc_vr_thrm = metric_info_b.vr_thm_residency_acc; violation_status->acc_hbm_thrm = metric_info_b.hbm_thm_residency_acc; + violation_status->acc_gfx_clk_below_host_limit + = metric_info_b.xcp_stats->gfx_below_host_limit_acc[partitition_id]; ss << __PRETTY_FUNCTION__ << " | " << "[gpu_metrics A] metric_info_a.accumulation_counter: " << std::dec - << metric_info_a.accumulation_counter + << metric_info_a.accumulation_counter << "\n" << "; metric_info_a.prochot_residency_acc: " << std::dec - << metric_info_a.prochot_residency_acc + << metric_info_a.prochot_residency_acc << "\n" << "; metric_info_a.ppt_residency_acc (pviol): " << std::dec - << metric_info_a.ppt_residency_acc + << metric_info_a.ppt_residency_acc << "\n" << "; metric_info_a.socket_thm_residency_acc (tviol): " << std::dec - << metric_info_a.socket_thm_residency_acc + << metric_info_a.socket_thm_residency_acc << "\n" << "; metric_info_a.vr_thm_residency_acc: " << std::dec - << metric_info_a.vr_thm_residency_acc + << metric_info_a.vr_thm_residency_acc << "\n" << "; metric_info_a.hbm_thm_residency_acc: " << std::dec << metric_info_a.hbm_thm_residency_acc << "\n" + << "; metric_info_b.xcp_stats->gfx_below_host_limit_acc[" << partitition_id << "]: " + << std::dec << metric_info_a.xcp_stats->gfx_below_host_limit_acc[partitition_id] << "\n" << " [gpu_metrics B] 
metric_info_b.accumulation_counter: " << std::dec - << metric_info_b.accumulation_counter + << metric_info_b.accumulation_counter << "\n" << "; metric_info_b.prochot_residency_acc: " << std::dec - << metric_info_b.prochot_residency_acc + << metric_info_b.prochot_residency_acc << "\n" << "; metric_info_b.ppt_residency_acc (pviol): " << std::dec - << metric_info_b.ppt_residency_acc + << metric_info_b.ppt_residency_acc << "\n" << "; metric_info_b.socket_thm_residency_acc (tviol): " << std::dec - << metric_info_b.socket_thm_residency_acc + << metric_info_b.socket_thm_residency_acc << "\n" << "; metric_info_b.vr_thm_residency_acc: " << std::dec - << metric_info_b.vr_thm_residency_acc + << metric_info_b.vr_thm_residency_acc << "\n" << "; metric_info_b.hbm_thm_residency_acc: " << std::dec - << metric_info_b.hbm_thm_residency_acc - << "\n"; + << metric_info_b.hbm_thm_residency_acc << "\n" + << "; metric_info_b.xcp_stats->gfx_below_host_limit_acc[" << partitition_id << "]: " + << std::dec << metric_info_b.xcp_stats->gfx_below_host_limit_acc[partitition_id] << "\n"; LOG_DEBUG(ss); if ( (metric_info_b.prochot_residency_acc != std::numeric_limits::max() @@ -842,6 +862,28 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha << violation_status->active_hbm_thrm << "\n"; LOG_DEBUG(ss); } + if ( (metric_info_b.xcp_stats->gfx_below_host_limit_acc[partitition_id] != std::numeric_limits::max() + || metric_info_a.xcp_stats->gfx_below_host_limit_acc[partitition_id] != std::numeric_limits::max()) + && (metric_info_b.xcp_stats->gfx_below_host_limit_acc[partitition_id] >= metric_info_a.xcp_stats->gfx_below_host_limit_acc[partitition_id]) + && ((metric_info_b.accumulation_counter - metric_info_a.accumulation_counter) > 0) ) { + violation_status->per_gfx_clk_below_host_limit = + (((metric_info_b.xcp_stats->gfx_below_host_limit_acc[partitition_id] - + metric_info_a.xcp_stats->gfx_below_host_limit_acc[partitition_id]) * 100) / + 
(metric_info_b.accumulation_counter - metric_info_a.accumulation_counter)); + + if (violation_status->per_gfx_clk_below_host_limit > 0) { + violation_status->active_gfx_clk_below_host_limit = 1; + violation_status->violation_timestamp = kFASTEST_POLL_TIME_MS; + } else { + violation_status->active_gfx_clk_below_host_limit = 0; + } + ss << __PRETTY_FUNCTION__ << " | " + << "ENTERED gfx_clk_below_host_residency_acc | per_gfx_clk_below_host_limit: " << std::dec + << violation_status->per_gfx_clk_below_host_limit + << "%; active_ppt_pwr = " << std::dec + << violation_status->active_gfx_clk_below_host_limit << "\n"; + LOG_DEBUG(ss); + } ss << __PRETTY_FUNCTION__ << " | " << "RETURNING AMDSMI_STATUS_SUCCESS | " @@ -859,6 +901,8 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha << violation_status->per_vr_thrm << "; violation_status->per_hbm_thrm (%): " << std::dec << violation_status->per_hbm_thrm + << "; violation_status->per_gfx_clk_below_host_limit (%): " << std::dec + << violation_status->per_gfx_clk_below_host_limit << "; violation_status->active_prochot_thrm (bool): " << std::dec << static_cast(violation_status->active_prochot_thrm) << "; violation_status->active_ppt_pwr (bool): " << std::dec @@ -869,6 +913,8 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha << static_cast(violation_status->active_vr_thrm) << "; violation_status->active_hbm_thrm (bool): " << std::dec << static_cast(violation_status->active_hbm_thrm) + << "; violation_status->active_gfx_clk_below_host_limit (bool): " << std::dec + << static_cast(violation_status->active_gfx_clk_below_host_limit) << "\n"; LOG_INFO(ss); diff --git a/tests/python_unittest/integration_test.py b/tests/python_unittest/integration_test.py index 265f4466..925a3bd0 100755 --- a/tests/python_unittest/integration_test.py +++ b/tests/python_unittest/integration_test.py @@ -893,6 +893,8 @@ def test_get_violation_status(self): violation_status['acc_vr_thrm'])) 
print(" Current HBM Thrm Accumulated (Count): {}".format( violation_status['acc_hbm_thrm'])) + print(" Current GFX CLK Below Host Limit Accumulated (Count): {}".format( + violation_status['acc_gfx_clk_below_host_limit'])) print(" Prochot Thrm Violation (%): {}".format( violation_status['per_prochot_thrm'])) @@ -904,6 +906,8 @@ def test_get_violation_status(self): violation_status['per_vr_thrm'])) print(" HBM Thrm Violation (%): {}".format( violation_status['per_hbm_thrm'])) + print(" GFX CLK Below Host Limit Violation (%): {}".format( + violation_status['per_gfx_clk_below_host_limit'])) print(" Prochot Thrm Violation (bool): {}".format( violation_status['active_prochot_thrm'])) @@ -915,6 +919,8 @@ def test_get_violation_status(self): violation_status['active_vr_thrm'])) print(" HBM Thrm Violation (bool): {}".format( violation_status['active_hbm_thrm'])) + print(" GFX CLK Below Host Limit Violation (bool): {}".format( + violation_status['active_gfx_clk_below_host_limit'])) print() self.tearDown()