diff --git a/config/hwloc.m4 b/config/hwloc.m4 index 4e9560f042..84117858c1 100644 --- a/config/hwloc.m4 +++ b/config/hwloc.m4 @@ -1304,8 +1304,9 @@ return rsmi_init(0); hwloc_rsmi_warning=no], [AC_MSG_RESULT([no]) hwloc_rsmi_warning=yes], - [AC_MSG_RESULT([don't know (cross-compiling)])])], - [hwloc_rsmi_happy=no]) + [AC_MSG_RESULT([don't know (cross-compiling)])]) + AC_CHECK_DECLS([rsmi_dev_partition_id_get],,[:],[[#include <rocm_smi/rocm_smi.h>]]) + ], [hwloc_rsmi_happy=no]) LDFLAGS="$LDFLAGS_save" LIBS="$LIBS_save" ], [hwloc_rsmi_happy=no]) diff --git a/hwloc/topology-rsmi.c b/hwloc/topology-rsmi.c index 12ff6527c1..05ff41b25a 100644 --- a/hwloc/topology-rsmi.c +++ b/hwloc/topology-rsmi.c @@ -122,6 +122,32 @@ static int get_device_pci_info(uint32_t dv_ind, uint64_t *bdfid) return 0; } +/* + * Get the partition ID of the GPU + * + * dv_ind (IN) The device index + * partid (OUT) partition ID of GPU devices + */ +static int get_device_partition_id(uint32_t dv_ind, uint32_t *partid) +{ +#if HAVE_DECL_RSMI_DEV_PARTITION_ID_GET + rsmi_status_t rsmi_rc = rsmi_dev_partition_id_get(dv_ind, partid); + + if (rsmi_rc != RSMI_STATUS_SUCCESS) { + if (HWLOC_SHOW_ALL_ERRORS()) { + const char *status_string; + rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); + fprintf(stderr, "hwloc/rsmi: GPU(%u): Failed to get partition ID: %s\n", (unsigned)dv_ind, status_string); + } + return -1; + } + return 0; +#else + errno = ENOSYS; + return -1; +#endif +} + /* * Get the PCI link speed of the GPU * @@ -365,6 +391,16 @@ hwloc_rsmi_discover(struct hwloc_backend *backend, struct hwloc_disc_status *dst device = ((bdfid & 0xff)>>3) & 0x1f; func = bdfid & 0x7; parent = hwloc_pci_find_parent_by_busid(topology, domain, bus, device, func); + if ((!parent || parent->type != HWLOC_OBJ_PCI_DEVICE) && func > 0) { + /* Partitioned MI devices may return the partition ID in a fake BDF func, + * hence we would fail to find a pcidev parent. + * Try with func=0 instead. 
+ */ + uint32_t partid = 0; + get_device_partition_id(i, &partid); + if (func == partid) + parent = hwloc_pci_find_parent_by_busid(topology, domain, bus, device, 0); + } if (parent && parent->type == HWLOC_OBJ_PCI_DEVICE) get_device_pci_linkspeed(i, &parent->attr->pcidev.linkspeed); if (!parent) diff --git a/tests/hwloc/ports/Makefile.am b/tests/hwloc/ports/Makefile.am index 645cd1d4c4..cea80b4313 100644 --- a/tests/hwloc/ports/Makefile.am +++ b/tests/hwloc/ports/Makefile.am @@ -170,7 +170,8 @@ nodist_libhwloc_port_rsmi_la_SOURCES = topology-rsmi.c libhwloc_port_rsmi_la_SOURCES = \ include/rsmi/rocm_smi/rocm_smi.h libhwloc_port_rsmi_la_CPPFLAGS = $(common_CPPFLAGS) \ - -I$(HWLOC_top_srcdir)/tests/hwloc/ports/include/rsmi + -I$(HWLOC_top_srcdir)/tests/hwloc/ports/include/rsmi \ + -DHAVE_DECL_RSMI_DEV_PARTITION_ID_GET=1 nodist_libhwloc_port_levelzero_la_SOURCES = topology-levelzero.c libhwloc_port_levelzero_la_SOURCES = \ diff --git a/tests/hwloc/ports/include/rsmi/rocm_smi/rocm_smi.h b/tests/hwloc/ports/include/rsmi/rocm_smi/rocm_smi.h index c671926132..580b4273c5 100644 --- a/tests/hwloc/ports/include/rsmi/rocm_smi/rocm_smi.h +++ b/tests/hwloc/ports/include/rsmi/rocm_smi/rocm_smi.h @@ -1,5 +1,5 @@ /* - * Copyright © 2013-2021 Inria. All rights reserved. + * Copyright © 2013-2024 Inria. All rights reserved. * Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. * Written by Advanced Micro Devices, * See COPYING in top-level directory. 
@@ -79,6 +79,7 @@ rsmi_status_t rsmi_dev_name_get(uint32_t dv_ind, char *name, size_t len); rsmi_status_t rsmi_dev_serial_number_get(uint32_t dv_ind, char *serial_num, uint32_t len); rsmi_status_t rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *id); rsmi_status_t rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *bandwidth); +rsmi_status_t rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id); rsmi_status_t rsmi_dev_xgmi_hive_id_get(uint32_t dv_ind, uint64_t *hive_id); rsmi_status_t rsmi_topo_get_link_type(uint32_t dv_ind_src, uint32_t dv_ind_dst, uint64_t *hops, RSMI_IO_LINK_TYPE *type);