From 55f288d0df0c7d2593807412ede68ec867987d56 Mon Sep 17 00:00:00 2001 From: Adam Goldman Date: Mon, 3 Mar 2025 10:03:04 -0500 Subject: [PATCH] prov/psm3: update provider to sync with IEFS 12.0.0.0.36 Updates: - GPU HAL - Removed AVX requirement, now will only warn. - Improved PSM3_RDMA modes 2 & 3's performance. - Improved NIC selection algorithms Signed-off-by: Adam Goldman (cherry picked from commit f09b96d88fc8ff420ce01aad6d77a8b6a7ef8062) --- prov/psm3/COPYING | 1 + prov/psm3/Makefile.am | 5 +- prov/psm3/Makefile.include | 20 +- prov/psm3/VERSION | 2 +- prov/psm3/configure.ac | 31 +- prov/psm3/configure.m4 | 43 +- prov/psm3/debian/changelog | 2 +- prov/psm3/libpsm3-fi.spec.in | 2 +- prov/psm3/psm3/Makefile.include | 20 +- prov/psm3/psm3/gpu/psm_gpu_cuda.c | 2025 ++++++++++ prov/psm3/psm3/gpu/psm_gpu_hal.c | 422 ++ prov/psm3/psm3/gpu/psm_gpu_hal.h | 817 ++++ prov/psm3/psm3/gpu/psm_gpu_oneapi_ze.c | 3548 +++++++++++++++++ prov/psm3/psm3/hal_sockets/sockets_ep.c | 26 +- prov/psm3/psm3/hal_sockets/sockets_ep.h | 2 +- prov/psm3/psm3/hal_sockets/sockets_gdrcpy.c | 24 +- prov/psm3/psm3/hal_sockets/sockets_hal.c | 68 +- prov/psm3/psm3/hal_sockets/sockets_hal.h | 4 +- .../psm3/hal_sockets/sockets_hal_inline_i.h | 14 +- prov/psm3/psm3/hal_sockets/sockets_recvhdrq.c | 2 +- prov/psm3/psm3/hal_sockets/sockets_spio.c | 44 +- prov/psm3/psm3/hal_verbs/verbs_ep.c | 297 +- prov/psm3/psm3/hal_verbs/verbs_ep.h | 14 +- prov/psm3/psm3/hal_verbs/verbs_gdrcpy.c | 27 +- prov/psm3/psm3/hal_verbs/verbs_hal.c | 49 +- prov/psm3/psm3/hal_verbs/verbs_hal.h | 4 +- prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h | 77 +- prov/psm3/psm3/hal_verbs/verbs_recvhdrq.c | 4 +- prov/psm3/psm3/hal_verbs/verbs_spio.c | 84 +- prov/psm3/psm3/include/linux-i386/sysdep.h | 28 - prov/psm3/psm3/include/utils_debug.h | 6 +- prov/psm3/psm3/include/utils_env.h | 5 + prov/psm3/psm3/include/utils_user.h | 19 +- prov/psm3/psm3/psm.c | 1069 +---- prov/psm3/psm3/psm2.h | 26 +- prov/psm3/psm3/psm2_hal.c | 6 +- prov/psm3/psm3/psm2_hal.h | 26 +- prov/psm3/psm3/psm2_hal_inline_t.h | 8 +- prov/psm3/psm3/psm2_hal_loopback.c | 12 +- prov/psm3/psm3/psm2_mq.h | 8 +- prov/psm3/psm3/psm_config.h | 30 +- prov/psm3/psm3/psm_context.c | 102 +- prov/psm3/psm3/psm_ep.c | 57 +- prov/psm3/psm3/psm_ep.h | 20 +- prov/psm3/psm3/psm_help.h | 16 +- prov/psm3/psm3/psm_mpool.c | 20 +- prov/psm3/psm3/psm_mpool.h | 2 +- prov/psm3/psm3/psm_mq.c | 78 +- prov/psm3/psm3/psm_mq_internal.h | 41 +- prov/psm3/psm3/psm_mq_recv.c | 69 +- prov/psm3/psm3/psm_nic_select.c | 885 ++-- prov/psm3/psm3/psm_nic_select.h | 42 +- prov/psm3/psm3/psm_oneapi_ze.c | 1040 ----- prov/psm3/psm3/psm_rndv_mod.c | 402 +- prov/psm3/psm3/psm_rndv_mod.h | 30 +- prov/psm3/psm3/psm_sysbuf.c | 84 +- prov/psm3/psm3/psm_sysbuf.h | 2 +- prov/psm3/psm3/psm_user.h | 1213 +----- prov/psm3/psm3/psm_utils.c | 117 +- prov/psm3/psm3/psm_utils.h | 32 +- prov/psm3/psm3/psm_verbs_mr.c | 175 +- prov/psm3/psm3/psm_verbs_mr.h | 16 +- prov/psm3/psm3/ptl.h | 32 +- .../psm3/ptl_am/am_cuda_memhandle_cache.c | 515 --- .../psm3/ptl_am/am_cuda_memhandle_cache.h | 91 - .../psm3/ptl_am/am_oneapi_memhandle_cache.c | 696 ---- .../psm3/ptl_am/am_oneapi_memhandle_cache.h | 97 - prov/psm3/psm3/ptl_am/am_reqrep_shmem.c | 558 +-- prov/psm3/psm3/ptl_am/psm_am_internal.h | 54 +- prov/psm3/psm3/ptl_am/ptl.c | 133 +- prov/psm3/psm3/ptl_ips/ips_config.h | 6 +- prov/psm3/psm3/ptl_ips/ips_expected_proto.h | 17 +- prov/psm3/psm3/ptl_ips/ips_proto.c | 216 +- prov/psm3/psm3/ptl_ips/ips_proto.h | 36 +- prov/psm3/psm3/ptl_ips/ips_proto_connect.h | 24 
+- prov/psm3/psm3/ptl_ips/ips_proto_expected.c | 77 +- prov/psm3/psm3/ptl_ips/ips_proto_mq.c | 138 +- prov/psm3/psm3/ptl_ips/ips_proto_params.h | 7 +- prov/psm3/psm3/ptl_ips/ips_proto_recv.c | 3 + prov/psm3/psm3/ptl_ips/ips_scb.c | 4 +- prov/psm3/psm3/ptl_ips/ips_scb.h | 6 +- prov/psm3/psm3/ptl_ips/ptl.c | 17 +- prov/psm3/psm3/ptl_ips/ptl_rcvthread.c | 44 +- prov/psm3/psm3/ptl_self/ptl.c | 25 +- prov/psm3/psm3/utils/utils_debug.c | 2 +- prov/psm3/psm3/utils/utils_dsa.c | 4 +- prov/psm3/psm3/utils/utils_dwordcpy-x86_64.c | 4 +- prov/psm3/src/psm3_revision.c.in | 4 - prov/psm3/src/psmx3.h | 1 + prov/psm3/src/psmx3_atomic.c | 36 +- prov/psm3/src/psmx3_attr.c | 36 +- prov/psm3/src/psmx3_ep.c | 50 + prov/psm3/src/psmx3_init.c | 23 +- prov/psm3/src/psmx3_rma.c | 84 +- prov/psm3/src/psmx3_wait.c | 75 +- 95 files changed, 8734 insertions(+), 7745 deletions(-) create mode 120000 prov/psm3/COPYING create mode 100644 prov/psm3/psm3/gpu/psm_gpu_cuda.c create mode 100644 prov/psm3/psm3/gpu/psm_gpu_hal.c create mode 100644 prov/psm3/psm3/gpu/psm_gpu_hal.h create mode 100644 prov/psm3/psm3/gpu/psm_gpu_oneapi_ze.c mode change 100755 => 100644 prov/psm3/psm3/hal_sockets/sockets_ep.c mode change 100755 => 100644 prov/psm3/psm3/hal_sockets/sockets_hal.h mode change 100755 => 100644 prov/psm3/psm3/hal_sockets/sockets_recvhdrq.c delete mode 100644 prov/psm3/psm3/psm_oneapi_ze.c delete mode 100644 prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.c delete mode 100644 prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.h delete mode 100644 prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.c delete mode 100644 prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.h diff --git a/prov/psm3/COPYING b/prov/psm3/COPYING new file mode 120000 index 00000000000..7d29222e4ca --- /dev/null +++ b/prov/psm3/COPYING @@ -0,0 +1 @@ +../../COPYING \ No newline at end of file diff --git a/prov/psm3/Makefile.am b/prov/psm3/Makefile.am index cec9bddede3..80def139e48 100644 --- a/prov/psm3/Makefile.am +++ b/prov/psm3/Makefile.am @@ -1,6 +1,6 @@ # # Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2017-2018 Intel Corporation, Inc. All right reserved. +# Copyright (c) 2017-2024 Intel Corporation, Inc. All right reserved. # Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All rights reserved. 
# (C) Copyright 2020 Hewlett Packard Enterprise Development LP # @@ -97,6 +97,7 @@ common_srcs = \ shared/var.c \ shared/abi_1_0.c + if MACOS common_srcs += shared/osx/osd.c common_srcs += shared/unix/osd.c @@ -230,7 +231,7 @@ src_libpsm3_fi_la_LDFLAGS += -lpsm2 endif !HAVE_PSM3_SRC if !EMBEDDED -src_libpsm3_fi_la_LDFLAGS += -version-info 24:0:23 +src_libpsm3_fi_la_LDFLAGS += -version-info 25:0:24 endif prov_install_man_pages = man/man7/fi_psm3.7 diff --git a/prov/psm3/Makefile.include b/prov/psm3/Makefile.include index 9a7ef74370a..257af00e361 100644 --- a/prov/psm3/Makefile.include +++ b/prov/psm3/Makefile.include @@ -52,14 +52,11 @@ noinst_LTLIBRARIES += \ prov/psm3/psm3/libptl_self.la \ prov/psm3/psm3/libhal_verbs.la \ prov/psm3/psm3/libhal_sockets.la \ + prov/psm3/psm3/libgpu.la \ prov/psm3/psm3/libpsm3i.la prov_psm3_psm3_libptl_am_la_SOURCES = \ prov/psm3/psm3/ptl_am/am_config.h \ - prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.c \ - prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.h \ - prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.c \ - prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.h \ prov/psm3/psm3/ptl_am/am_reqrep.c \ prov/psm3/psm3/ptl_am/am_reqrep_shmem.c \ prov/psm3/psm3/ptl_am/cmarw.h \ @@ -191,6 +188,17 @@ prov_psm3_psm3_libhal_sockets_la_CPPFLAGS = \ prov_psm3_psm3_libhal_sockets_la_CFLAGS = \ $(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags) +prov_psm3_psm3_libgpu_la_SOURCES = \ + prov/psm3/psm3/gpu/psm_gpu_hal.c \ + prov/psm3/psm3/gpu/psm_gpu_hal.h \ + prov/psm3/psm3/gpu/psm_gpu_cuda.c \ + prov/psm3/psm3/gpu/psm_gpu_oneapi_ze.c +prov_psm3_psm3_libgpu_la_CPPFLAGS = \ + -I$(top_srcdir)/prov/psm3/psm3/gpu/ \ + $(AM_CPPFLAGS) $(psm3_CPPFLAGS) $(_psm3_cppflags) +prov_psm3_psm3_libgpu_la_CFLAGS = \ + $(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags) + prov_psm3_psm3_libpsm3i_la_SOURCES = \ prov/psm3/psm3/psm.c \ prov/psm3/psm3/psm_am.c \ @@ -218,7 +226,6 @@ prov_psm3_psm3_libpsm3i_la_SOURCES = \ prov/psm3/psm3/psm_netutils.h \ prov/psm3/psm3/psm_nic_select.c \ prov/psm3/psm3/psm_nic_select.h \ - prov/psm3/psm3/psm_oneapi_ze.c \ prov/psm3/psm3/psm_perf.c \ prov/psm3/psm3/psm_perf.h \ prov/psm3/psm3/psm_rndv_mod.c \ @@ -263,6 +270,7 @@ prov_psm3_psm3_libpsm3i_la_LIBADD = \ prov/psm3/psm3/libptl_ips.la \ prov/psm3/psm3/libptl_self.la \ prov/psm3/psm3/libhal_verbs.la \ + prov/psm3/psm3/libgpu.la \ prov/psm3/psm3/libhal_sockets.la prov_psm3_psm3_libpsm3i_la_DEPENDENCIES = \ @@ -271,6 +279,7 @@ prov_psm3_psm3_libpsm3i_la_DEPENDENCIES = \ prov/psm3/psm3/libptl_ips.la \ prov/psm3/psm3/libptl_self.la \ prov/psm3/psm3/libhal_verbs.la \ + prov/psm3/psm3/libgpu.la \ prov/psm3/psm3/libhal_sockets.la # Mirror EXTRA_DIST to end of file @@ -288,6 +297,7 @@ chksum_srcs += \ $(prov_psm3_psm3_libutils_la_SOURCES) \ $(prov_psm3_psm3_libhal_verbs_la_SOURCES) \ $(prov_psm3_psm3_libhal_sockets_la_SOURCES) \ + $(prov_psm3_psm3_libgpu_la_SOURCES) \ $(prov_psm3_psm3_libpsm3i_la_SOURCES) \ $(prov_psm3_extra_dist) diff --git a/prov/psm3/VERSION b/prov/psm3/VERSION index 8cb63b0114c..a38fee63f9d 100644 --- a/prov/psm3/VERSION +++ b/prov/psm3/VERSION @@ -1 +1 @@ -3_7_0_0 +4_0_0_0 diff --git a/prov/psm3/configure.ac b/prov/psm3/configure.ac index 53569e8e510..18a02468985 100644 --- a/prov/psm3/configure.ac +++ b/prov/psm3/configure.ac @@ -143,7 +143,7 @@ AS_IF([test "x$enable_psm3_rc" = "xcheck"], AS_IF([test "x$enable_psm3_rc" = "xyes"], [ AS_IF([test "x$enable_psm3_verbs" = "xyes"], - [CPPFLAGS="$CPPFLAGS -DUSE_RC"], + [CPPFLAGS="$CPPFLAGS -DUSE_RC -DUSE_RDMA_READ"], [AC_MSG_ERROR([User RC QPs requires Verbs HAL 
active])]) ]) AS_IF([test "x$enable_psm3_src" = "xyes"], @@ -690,12 +690,14 @@ AS_IF([test "$have_oneapi_ze" = "1"], LIBS="$LIBS $ze_LIBS" dnl - Check for zeMemPutIpcHandle after ZE added to LIBS/*FLAGS + save_LDFLAGS="$LDFLAGS" + LDFLAGS="$LDFLAGS -lze_loader" AC_MSG_CHECKING([for zeMemPutIpcHandle support in level-zero]) AC_LINK_IFELSE( [AC_LANG_PROGRAM([[ #include ]],[[ - ze_context_handle_t hContext; + ze_context_handle_t hContext = NULL; ze_ipc_mem_handle_t handle; (void)zeMemPutIpcHandle(hContext, handle); ]]) @@ -703,8 +705,10 @@ AS_IF([test "$have_oneapi_ze" = "1"], AC_MSG_RESULT(yes) have_oneapi_zeMemPutIpcHandle=1 CPPFLAGS="$CPPFLAGS -DPSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE" + LDFLAGS="$save_LDFLAGS" ],[ AC_MSG_RESULT(no) + LDFLAGS="$save_LDFLAGS" ]) ]) @@ -910,12 +914,9 @@ AS_IF([test ! -z "$CC" && ( test "x${CC%% *}" = "xicc" || test "x${CC%% *}" = "x [ dnl ICC/ICX CFLAGS="$CFLAGS -Werror -xATOM_SSE4.2 -DPSM_AVX512 -fpic -fPIC -D_GNU_SOURCE -DPACK_STRUCT_STL=packed," LDFLAGS="$LDFLAGS -Wc,-static-intel" - PSM3_MARCH="avx2" ], [ dnl GCC/other - CFLAGS="$CFLAGS -Werror -mavx2 -fpic -fPIC -funwind-tables -Wformat -Wformat-security" - PSM3_MARCH="avx2" + CFLAGS="$CFLAGS -Werror -msse4.2 -fpic -fPIC -funwind-tables -Wformat -Wformat-security" ]) -AC_DEFINE_UNQUOTED([PSM3_MARCH], ["$PSM3_MARCH"], [PSM3 built with instruction set]) AS_IF([test ! -z "$PSM_CPPFLAGS"], [CPPFLAGS="$CPPFLAGS $PSM_CPPFLAGS"], []) AS_IF([test ! -z "$PSM_CFLAGS"], [CFLAGS="$CFLAGS $PSM_CFLAGS"], []) @@ -936,12 +937,10 @@ AC_DEFINE([HAVE_XPMEM], 0, [Ignore HAVE_XPMEM]) dnl Provider-specific checks dnl FI_PROVIDER_INIT -AC_DEFINE([HAVE_BGQ], 0, [Ignore HAVE_BGQ]) -AC_DEFINE([HAVE_BGQ_DL], 0, [Ignore HAVE_BGQ_DL]) +AC_DEFINE([HAVE_CXI], 0, [Ignore HAVE_CXI]) +AC_DEFINE([HAVE_CXI_DL], 0, [Ignore HAVE_CXI_DL]) AC_DEFINE([HAVE_EFA], 0, [Ignore HAVE_EFA]) AC_DEFINE([HAVE_EFA_DL], 0, [Ignore HAVE_EFA_DL]) -AC_DEFINE([HAVE_GNI], 0, [Ignore HAVE_GNI]) -AC_DEFINE([HAVE_GNI_DL], 0, [Ignore HAVE_GNI_DL]) AC_DEFINE([HAVE_MRAIL], 0, [Ignore HAVE_MRAIL]) AC_DEFINE([HAVE_MRAIL_DL], 0, [Ignore HAVE_MRAIL_DL]) AC_DEFINE([HAVE_NET], 0, [Ignore HAVE_NET]) @@ -954,8 +953,6 @@ AC_DEFINE([HAVE_PSM2_DL], 0, [Ignore HAVE_PSM2_DL]) dnl FI_PROVIDER_SETUP([psm3]) AC_DEFINE([HAVE_OPX], 0, [Ignore HAVE_OPX]) AC_DEFINE([HAVE_OPX_DL], 0, [Ignore HAVE_OPX_DL]) -AC_DEFINE([HAVE_RSTREAM], 0, [Ignore HAVE_RSTREAM]) -AC_DEFINE([HAVE_RSTREAM_DL], 0, [Ignore HAVE_RSTREAM_DL]) AC_DEFINE([HAVE_RXD], 0, [Ignore HAVE_RXD]) AC_DEFINE([HAVE_RXD_DL], 0, [Ignore HAVE_RXD_DL]) AC_DEFINE([HAVE_RXM], 0, [Ignore HAVE_RXM]) @@ -974,8 +971,6 @@ AC_DEFINE([HAVE_UCX], 0, [Ignore HAVE_UCX]) AC_DEFINE([HAVE_UCX_DL], 0, [Ignore HAVE_UCX_DL]) AC_DEFINE([HAVE_UDP], 0, [Ignore HAVE_UDP]) AC_DEFINE([HAVE_UDP_DL], 0, [Ignore HAVE_UDP_DL]) -AC_DEFINE([HAVE_USNIC], 0, [Ignore HAVE_USNIC]) -AC_DEFINE([HAVE_USNIC_DL], 0, [Ignore HAVE_USNIC_DL]) AC_DEFINE([HAVE_VERBS], 0, [Ignore HAVE_VERBS]) AC_DEFINE([HAVE_VERBS_DL], 0, [Ignore HAVE_VERBS_DL]) dnl FI_PROVIDER_FINI @@ -991,8 +986,12 @@ AM_COND_IF([HAVE_PSM3_SRC], AS_IF([test -z "${PSM3_IEFS_VERSION}"], [PSM3_IEFS_VERSION="${PACKAGE_VERSION}$(whoami)"]) PSM3_IEFS_VERSION=$(echo "${PSM3_IEFS_VERSION}" | tr '.' 
'_') PSM3_GIT_HASH="$(git rev-parse HEAD)" - RPM_RELEASE=$(echo "${PSM3_IEFS_VERSION}" | cut -d'_' -f5) - RELEASE_VER=$(echo "${PSM3_IEFS_VERSION}" | cut -d'_' -f1-4 | sed 's/_/./g') + RPM_RELEASE=$(echo "${PSM3_IEFS_VERSION}" | tr -s '@<:@A-Z@:>@' '_' | cut -d'_' -f5) + RELEASE_VER=$(echo "${PSM3_IEFS_VERSION}" | tr -s '@<:@A-Z@:>@' '_' | cut -d'_' -f1-4 | sed 's/_/./g') + char=$(echo "${PSM3_IEFS_VERSION}" | tr -dc '@<:@A-Z@:>@' | tr '@<:@A-Z@:>@' '@<:@a-z@:>@') + AS_IF([test -n "$char"], [ + RPM_RELEASE="0${char}${RPM_RELEASE}" + ]) AS_IF([test x"${RELEASE_VER}" = x"${PACKAGE_VERSION}"], [], [ AC_MSG_NOTICE([Release Tag does not match VERSION file]) AC_MSG_NOTICE([${RELEASE_VER} != ${PACKAGE_VERSION}]) diff --git a/prov/psm3/configure.m4 b/prov/psm3/configure.m4 index 5c8c083f7dc..1fd157f7b58 100644 --- a/prov/psm3/configure.m4 +++ b/prov/psm3/configure.m4 @@ -20,7 +20,6 @@ AC_DEFUN([FI_PSM3_CONFIGURE],[ PSM3_HAL_INST="" PSM3_HAL_CNT=0 - PSM3_MARCH="" psm3_happy=1 AS_IF([test x"$enable_psm3" != x"no"], @@ -57,7 +56,7 @@ AC_DEFUN([FI_PSM3_CONFIGURE],[ AS_IF([test "x$enable_psm3_rc" = "xyes"], [ AS_IF([test "x$enable_psm3_verbs" = "xyes"], - [psm3_CPPFLAGS="$psm3_CPPFLAGS -DUSE_RC"], + [psm3_CPPFLAGS="$psm3_CPPFLAGS -DUSE_RC -DUSE_RDMA_READ"], [AC_MSG_ERROR([User RC QPs requires Verbs HAL active])]) ]) @@ -121,39 +120,16 @@ AC_DEFUN([FI_PSM3_CONFIGURE],[ ],[ AC_MSG_RESULT([yes]) PSM3_ARCH_CFLAGS="-msse4.2" - PSM3_MARCH="sse4.2" ],[ psm3_happy=0 AC_MSG_RESULT([no]) - AC_MSG_NOTICE([psm3 requires minimum of avx instruction set to build]) + AC_MSG_NOTICE([psm3 requires minimum of sse4.2 instruction set to build]) ]) CFLAGS=$save_CFLAGS - AC_MSG_CHECKING([for -mavx support]) + AC_MSG_CHECKING([for -mavx2 support (recommended)]) save_CFLAGS=$CFLAGS - CFLAGS="$PSM3_STRIP_OPTFLAGS -mavx -O0" - AC_LINK_IFELSE( - [AC_LANG_PROGRAM( - [[#include ]], - [[unsigned long long _a[4] = {1ULL,2ULL,3ULL,4ULL}; - __m256i vA = _mm256_loadu_si256((__m256i *)_a); - __m256i vB; - _mm256_store_si256(&vB, vA); - return 0;]]) - ],[ - AC_MSG_RESULT([yes]) - PSM3_ARCH_CFLAGS="-mavx" - PSM3_MARCH="avx" - ],[ - psm3_happy=0 - AC_MSG_RESULT([no]) - AC_MSG_NOTICE([psm3 requires minimum of avx instruction set to build]) - ]) - CFLAGS=$save_CFLAGS - - AC_MSG_CHECKING([for -mavx2 support]) - save_CFLAGS=$CFLAGS - CFLAGS="$PSM3_STRIP_OPTFLAGS -mavx2 -O0" + CFLAGS="$PSM3_STRIP_OPTFLAGS -O0" AC_LINK_IFELSE( [AC_LANG_PROGRAM( [[#include ]], @@ -164,10 +140,9 @@ AC_DEFUN([FI_PSM3_CONFIGURE],[ return 0;]]) ],[ AC_MSG_RESULT([yes]) - PSM3_ARCH_CFLAGS="-mavx2" - PSM3_MARCH="avx2" ],[ AC_MSG_RESULT([no]) + AC_MSG_NOTICE([psm3 recommends minimum of avx2 instruction set for best performance]) ]) CFLAGS=$save_CFLAGS @@ -227,20 +202,24 @@ AC_DEFUN([FI_PSM3_CONFIGURE],[ AS_IF([test "$have_oneapi_ze" = "1"], [ + save_LDFLAGS="$LDFLAGS" + LDFLAGS="$LDFLAGS -lze_loader" AC_MSG_CHECKING([for zeMemPutIpcHandle support in level-zero]) AC_LINK_IFELSE( [AC_LANG_PROGRAM([[ #include ]],[[ - ze_context_handle_t hContext; + ze_context_handle_t hContext = NULL; ze_ipc_mem_handle_t handle; (void)zeMemPutIpcHandle(hContext, handle); ]]) ],[ AC_MSG_RESULT(yes) psm3_CPPFLAGS="$psm3_CPPFLAGS -DPSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE" + LDFLAGS="$save_LDFLAGS" ],[ AC_MSG_RESULT(no) + LDFLAGS="$save_LDFLAGS" ]) ]) @@ -413,8 +392,6 @@ AC_DEFUN([FI_PSM3_CONFIGURE],[ AC_SUBST(psm3_LIBS) AC_SUBST(PSM3_HAL_CNT) AC_SUBST(PSM3_HAL_INST) - AC_DEFINE_UNQUOTED([PSM3_MARCH], ["$PSM3_MARCH"], [PSM3 built with instruction set]) - AC_SUBST(PSM3_MARCH) 
PSM3_IEFS_VERSION=m4_normalize(m4_esyscmd([cat prov/psm3/VERSION])) AC_SUBST(PSM3_IEFS_VERSION) diff --git a/prov/psm3/debian/changelog b/prov/psm3/debian/changelog index 52852ac0f5e..399de39bf55 100644 --- a/prov/psm3/debian/changelog +++ b/prov/psm3/debian/changelog @@ -1,4 +1,4 @@ -libpsm3-fi (11.7.0.0-110) unstable; urgency=medium +libpsm3-fi (12.0.0.0-36) unstable; urgency=medium * Initial release diff --git a/prov/psm3/libpsm3-fi.spec.in b/prov/psm3/libpsm3-fi.spec.in index b24d4c13a63..282a84e2b34 100644 --- a/prov/psm3/libpsm3-fi.spec.in +++ b/prov/psm3/libpsm3-fi.spec.in @@ -62,7 +62,7 @@ rm -rf %{buildroot} %files %defattr(-,root,root,-) %{_libdir}/libfabric/%{name}* -%doc README +%doc README COPYING %exclude %{_libdir}/libfabric/*.a %exclude %{_libdir}/libfabric/*.la %exclude %{_libdir}/pkgconfig diff --git a/prov/psm3/psm3/Makefile.include b/prov/psm3/psm3/Makefile.include index fd253089532..a8c87fd261a 100644 --- a/prov/psm3/psm3/Makefile.include +++ b/prov/psm3/psm3/Makefile.include @@ -17,14 +17,11 @@ noinst_LTLIBRARIES += \ psm3/libptl_self.la \ psm3/libhal_verbs.la \ psm3/libhal_sockets.la \ + psm3/libgpu.la \ psm3/libpsm3i.la psm3_libptl_am_la_SOURCES = \ psm3/ptl_am/am_config.h \ - psm3/ptl_am/am_cuda_memhandle_cache.c \ - psm3/ptl_am/am_cuda_memhandle_cache.h \ - psm3/ptl_am/am_oneapi_memhandle_cache.c \ - psm3/ptl_am/am_oneapi_memhandle_cache.h \ psm3/ptl_am/am_reqrep.c \ psm3/ptl_am/am_reqrep_shmem.c \ psm3/ptl_am/cmarw.h \ @@ -156,6 +153,17 @@ psm3_libhal_sockets_la_CPPFLAGS = \ psm3_libhal_sockets_la_CFLAGS = \ $(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags) +psm3_libgpu_la_SOURCES = \ + psm3/gpu/psm_gpu_hal.c \ + psm3/gpu/psm_gpu_hal.h \ + psm3/gpu/psm_gpu_cuda.c \ + psm3/gpu/psm_gpu_oneapi_ze.c +psm3_libgpu_la_CPPFLAGS = \ + -I$(top_srcdir)/psm3/gpu/ \ + $(AM_CPPFLAGS) $(psm3_CPPFLAGS) $(_psm3_cppflags) +psm3_libgpu_la_CFLAGS = \ + $(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags) + psm3_libpsm3i_la_SOURCES = \ psm3/psm.c \ psm3/psm_am.c \ @@ -183,7 +191,6 @@ psm3_libpsm3i_la_SOURCES = \ psm3/psm_netutils.h \ psm3/psm_nic_select.c \ psm3/psm_nic_select.h \ - psm3/psm_oneapi_ze.c \ psm3/psm_perf.c \ psm3/psm_perf.h \ psm3/psm_rndv_mod.c \ @@ -228,6 +235,7 @@ psm3_libpsm3i_la_LIBADD = \ psm3/libptl_ips.la \ psm3/libptl_self.la \ psm3/libhal_verbs.la \ + psm3/libgpu.la \ psm3/libhal_sockets.la psm3_libpsm3i_la_DEPENDENCIES = \ @@ -236,6 +244,7 @@ psm3_libpsm3i_la_DEPENDENCIES = \ psm3/libptl_ips.la \ psm3/libptl_self.la \ psm3/libhal_verbs.la \ + psm3/libgpu.la \ psm3/libhal_sockets.la _psm3_extra_dist = \ @@ -252,5 +261,6 @@ chksum_srcs += \ $(psm3_libutils_la_SOURCES) \ $(psm3_libhal_verbs_la_SOURCES) \ $(psm3_libhal_sockets_la_SOURCES) \ + $(psm3_libgpu_la_SOURCES) \ $(psm3_libpsm3i_la_SOURCES) \ $(_psm3_extra_dist) diff --git a/prov/psm3/psm3/gpu/psm_gpu_cuda.c b/prov/psm3/psm3/gpu/psm_gpu_cuda.c new file mode 100644 index 00000000000..7b122134680 --- /dev/null +++ b/prov/psm3/psm3/gpu/psm_gpu_cuda.c @@ -0,0 +1,2025 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2024 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. 
+ + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2024 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2024 Intel Corporation. All rights reserved. 
*/ + +#include +#include +#include /* cpu_set */ +#include /* isalpha */ +#include +#include + +#include "psm_user.h" + +#ifdef PSM_CUDA +#include +#include +#include +#include + +#if CUDA_VERSION < 7000 +#error Please update CUDA driver, required minimum version is 7.0 +#endif + +#include "psm2_hal.h" +#include "psm_mq_internal.h" +#include "ptl_am/psm_am_internal.h" +#include "ptl_ips/ips_proto.h" +#include "ptl_ips/ips_expected_proto.h" + +// cuCtxSetFlags(CU_CTX_SYNC_MEMOPS) was introduced in CUDA 12.1.0 +#define PSM3_CUDA_HAVE_CTX_SYNC_MEMOPS (CUDA_VERSION >= 12010) + +// if defined, do not use cuMemHostRegister for malloced pipeline +// copy bounce buffers +// otherwise, use cuMemHostRegister when malloc buffer +//#define PSM3_NO_CUDA_REGISTER + +// default value for PSM3_GPU_THRESH_RNDV +#define PSM3_CUDA_GPU_THRESH_RNDV 8000 +// default value for PSM3_GPU_RNDV_NIC_WINDOW when using Cuda GPU +#define PSM3_CUDA_RNDV_NIC_WINDOW_DEFAULT "2097152" +// default value for PSM3_GPUDIRECT_RDMA_SEND_LIMIT +#define PSM3_CUDA_GPUDIRECT_RDMA_SEND_LIMIT_DEFAULT UINT_MAX +// default value for PSM3_GPUDIRECT_RDMA_RECV_LIMIT +#define PSM3_CUDA_GPUDIRECT_RDMA_RECV_LIMIT_DEFAULT UINT_MAX +// default value for PSM3_MQ_RNDV_SHM_GPU_THRESH +// Threshold for GPU rendezvous (aka scale-up transfer vs via CPU shared mem +#define PSM3_CUDA_MQ_RNDV_SHM_GPU_THRESH 63 + +/* CUDA Driver Library */ +static void *psm3_cuda_lib; +static int psm3_cuda_lib_version; +/* CUDA Runtime (psm3_cudart) Library */ +static void *psm3_cudart_lib; +static int psm3_cuda_runtime_ver; + +/* This is a global cuda context + * stored to provide hints during a cuda failure + * due to a null cuda context. + */ +CUcontext psm3_cu_ctxt; + +#ifdef PSM_HAVE_RNDV_MOD +static int psm3_cuda_gpu_pin_check; // PSM3_GPU_PIN_CHECK +static uint64_t *psm3_cuda_gpu_bars; +static int psm3_cuda_num_gpu_bars = 0; +static uint64_t psm3_cuda_min_gpu_bar_size; + +static uint64_t psm3_cuda_get_nvidia_bar_addr(int domain, int bus, int slot); +#endif + +typedef enum { + PSM3_CPE_REJECT = 0, + PSM3_CPE_IGNORE = 1, + PSM3_CPE_OBEY = 2, +} psm3_cuda_permitted_enforcement_t; + +static psm3_cuda_permitted_enforcement_t psm3_cuda_permitted_enforcement = PSM3_CPE_IGNORE; + +typedef enum { + PSM3_CUDA_SYNC_CTX = 0, + PSM3_CUDA_SYNC_PTR = 1, + PSM3_CUDA_SYNC_PTR_RELAXED = 2, + PSM3_CUDA_SYNC_NONE = 3, +} psm3_cuda_sync_mode_t; + +static psm3_cuda_sync_mode_t psm3_cuda_sync_mode = PSM3_CUDA_SYNC_CTX; + +/* function pointers from dlopen access to cuda shared library */ +#define PSM3_CUDA_SYM_FP(name) PSM3_CONCAT(psm3_cuda_, name) +static CUresult (*PSM3_CUDA_SYM_FP(cuInit))(unsigned int Flags ); +static CUresult (*PSM3_CUDA_SYM_FP(cuCtxDetach))(CUcontext c); +static CUresult (*PSM3_CUDA_SYM_FP(cuCtxGetCurrent))(CUcontext *c); +static CUresult (*PSM3_CUDA_SYM_FP(cuCtxSetCurrent))(CUcontext c); +#if PSM3_CUDA_HAVE_CTX_SYNC_MEMOPS +static CUresult (*PSM3_CUDA_SYM_FP(cuCtxSetFlags))(unsigned int flags); +#endif +static CUresult (*PSM3_CUDA_SYM_FP(cuPointerGetAttribute))(void *data, CUpointer_attribute pa, CUdeviceptr p); +static CUresult (*PSM3_CUDA_SYM_FP(cuPointerGetAttributes))(unsigned int count, CUpointer_attribute *attrs, void **data, CUdeviceptr ptr); +static CUresult (*PSM3_CUDA_SYM_FP(cuPointerSetAttribute))(void *data, CUpointer_attribute pa, CUdeviceptr p); +static CUresult (*PSM3_CUDA_SYM_FP(cuDeviceCanAccessPeer))(int *canAccessPeer, CUdevice dev, CUdevice peerDev); +static CUresult (*PSM3_CUDA_SYM_FP(cuDeviceGet))(CUdevice* device, int ordinal); +static 
CUresult (*PSM3_CUDA_SYM_FP(cuDeviceGetAttribute))(int* pi, CUdevice_attribute attrib, CUdevice dev); +static CUresult (*PSM3_CUDA_SYM_FP(cuDriverGetVersion))(int* driverVersion); +static CUresult (*PSM3_CUDA_SYM_FP(cuDeviceGetCount))(int* count); +static CUresult (*PSM3_CUDA_SYM_FP(cuStreamCreate))(CUstream* phStream, unsigned int Flags); +static CUresult (*PSM3_CUDA_SYM_FP(cuStreamDestroy))(CUstream phStream); +static CUresult (*PSM3_CUDA_SYM_FP(cuStreamSynchronize))(CUstream phStream); +static CUresult (*PSM3_CUDA_SYM_FP(cuEventCreate))(CUevent* phEvent, unsigned int Flags); +static CUresult (*PSM3_CUDA_SYM_FP(cuEventDestroy))(CUevent hEvent); +static CUresult (*PSM3_CUDA_SYM_FP(cuEventQuery))(CUevent hEvent); +static CUresult (*PSM3_CUDA_SYM_FP(cuEventRecord))(CUevent hEvent, CUstream hStream); +static CUresult (*PSM3_CUDA_SYM_FP(cuEventSynchronize))(CUevent hEvent); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemRetainAllocationHandle))(CUmemGenericAllocationHandle *h, CUdeviceptr p); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemRelease))(CUmemGenericAllocationHandle h); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemHostAlloc))(void** pp, size_t bytesize, unsigned int Flags); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemFreeHost))(void* p); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemHostRegister))(void* p, size_t bytesize, unsigned int Flags); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemHostUnregister))(void* p); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemcpy))(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemcpyDtoD))(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemcpyDtoH))(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemcpyHtoD))(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemcpyDtoHAsync))(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemcpyHtoDAsync))(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount, CUstream hStream); +static CUresult (*PSM3_CUDA_SYM_FP(cuIpcGetMemHandle))(CUipcMemHandle* pHandle, CUdeviceptr dptr); +static CUresult (*PSM3_CUDA_SYM_FP(cuIpcOpenMemHandle))(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags); +static CUresult (*PSM3_CUDA_SYM_FP(cuIpcCloseMemHandle))(CUdeviceptr dptr); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemGetAddressRange))(CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr); +static CUresult (*PSM3_CUDA_SYM_FP(cuDevicePrimaryCtxGetState))(CUdevice dev, unsigned int* flags, int* active); +static CUresult (*PSM3_CUDA_SYM_FP(cuDevicePrimaryCtxRetain))(CUcontext* pctx, CUdevice dev); +static CUresult (*PSM3_CUDA_SYM_FP(cuCtxGetDevice))(CUdevice* device); +static CUresult (*PSM3_CUDA_SYM_FP(cuDevicePrimaryCtxRelease))(CUdevice device); +static CUresult (*PSM3_CUDA_SYM_FP(cuGetErrorString))(CUresult error, const char **pStr); +static cudaError_t (*PSM3_CUDA_SYM_FP(cudaRuntimeGetVersion))(int* runtimeVersion); + +/* statistics counting each cuda call PSM3 makes */ +#define PSM3_CUDA_SYM_COUNT(name) PSM3_CONCAT(psm3_cuda_count_, name) +static uint64_t PSM3_CUDA_SYM_COUNT(cuInit); +static uint64_t PSM3_CUDA_SYM_COUNT(cuCtxDetach); +static uint64_t PSM3_CUDA_SYM_COUNT(cuCtxGetCurrent); +static uint64_t PSM3_CUDA_SYM_COUNT(cuCtxSetCurrent); +#if PSM3_CUDA_HAVE_CTX_SYNC_MEMOPS +static uint64_t PSM3_CUDA_SYM_COUNT(cuCtxSetFlags); +#endif +static uint64_t 
PSM3_CUDA_SYM_COUNT(cuPointerGetAttribute); +static uint64_t PSM3_CUDA_SYM_COUNT(cuPointerGetAttributes); +static uint64_t PSM3_CUDA_SYM_COUNT(cuPointerSetAttribute); +static uint64_t PSM3_CUDA_SYM_COUNT(cuDeviceCanAccessPeer); +static uint64_t PSM3_CUDA_SYM_COUNT(cuDeviceGet); +static uint64_t PSM3_CUDA_SYM_COUNT(cuDeviceGetAttribute); +static uint64_t PSM3_CUDA_SYM_COUNT(cuDriverGetVersion); +static uint64_t PSM3_CUDA_SYM_COUNT(cuDeviceGetCount); +static uint64_t PSM3_CUDA_SYM_COUNT(cuStreamCreate); +static uint64_t PSM3_CUDA_SYM_COUNT(cuStreamDestroy); +static uint64_t PSM3_CUDA_SYM_COUNT(cuStreamSynchronize); +static uint64_t PSM3_CUDA_SYM_COUNT(cuEventCreate); +static uint64_t PSM3_CUDA_SYM_COUNT(cuEventDestroy); +static uint64_t PSM3_CUDA_SYM_COUNT(cuEventQuery); +static uint64_t PSM3_CUDA_SYM_COUNT(cuEventRecord); +static uint64_t PSM3_CUDA_SYM_COUNT(cuEventSynchronize); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemRetainAllocationHandle); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemRelease); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemHostAlloc); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemFreeHost); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemHostRegister); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemHostUnregister); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemcpy); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemcpyDtoD); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemcpyDtoH); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemcpyHtoD); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemcpyDtoHAsync); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemcpyHtoDAsync); +static uint64_t PSM3_CUDA_SYM_COUNT(cuIpcGetMemHandle); +static uint64_t PSM3_CUDA_SYM_COUNT(cuIpcOpenMemHandle); +static uint64_t PSM3_CUDA_SYM_COUNT(cuIpcCloseMemHandle); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemGetAddressRange); +static uint64_t PSM3_CUDA_SYM_COUNT(cuDevicePrimaryCtxGetState); +static uint64_t PSM3_CUDA_SYM_COUNT(cuDevicePrimaryCtxRetain); +static uint64_t PSM3_CUDA_SYM_COUNT(cuCtxGetDevice); +static uint64_t PSM3_CUDA_SYM_COUNT(cuDevicePrimaryCtxRelease); +static uint64_t PSM3_CUDA_SYM_COUNT(cuGetErrorString); +static uint64_t PSM3_CUDA_SYM_COUNT(cudaRuntimeGetVersion); + +/* Set the context-level SYNC_MEMOPS flag (as opposed to the pointer + * attribute) + */ +static void psm3_cuda_ctx_set_sync_memops(void) +{ +#if PSM3_CUDA_HAVE_CTX_SYNC_MEMOPS + if (psm3_cuda_sync_mode != PSM3_CUDA_SYNC_CTX) + return; + CUresult err = PSM3_CUDA_SYM_FP(cuCtxSetFlags)(CU_CTX_SYNC_MEMOPS); + if_pf (err != CUDA_SUCCESS) + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Failed to set CUDA context flag: SYNC_MEMOPS\n"); +#endif +} + +static int psm3_cuda_check_set_cuda_ctxt(void) +{ + CUresult err; + CUcontext tmpctxt = {0}; + + if (unlikely(!PSM3_CUDA_SYM_FP(cuCtxGetCurrent) || !PSM3_CUDA_SYM_FP(cuCtxSetCurrent))) + return 0; + + err = PSM3_CUDA_SYM_FP(cuCtxGetCurrent)(&tmpctxt); + if (likely(!err)) { + if (unlikely(!tmpctxt && psm3_cu_ctxt)) { + err = PSM3_CUDA_SYM_FP(cuCtxSetCurrent)(psm3_cu_ctxt); + if (likely(!err)) + psm3_cuda_ctx_set_sync_memops(); + return !!err; + } else if (unlikely(tmpctxt && !psm3_cu_ctxt)) { + psm3_cu_ctxt = tmpctxt; + psm3_cuda_ctx_set_sync_memops(); + } + } + return 0; +} + +/* Make sure have a real GPU job. Set psm3_cu_ctxt if available */ +PSMI_ALWAYS_INLINE( +int psm3_cuda_check_have_cuda_ctxt(void)) +{ + if (! 
psm3_cu_ctxt) { + if (unlikely(psm3_cuda_check_set_cuda_ctxt())) { + psm3_handle_error(PSMI_EP_NORETURN, + PSM2_INTERNAL_ERR, "Failed to set/synchronize" + " CUDA context.\n"); + } + } + return (psm3_cu_ctxt != NULL); +} + +/** + * execute the specified function and return the result without error handling + */ +#define PSM3_CUDA_EXEC_ASSUME_CONTEXT(func, args...) \ + ({ \ + PSM3_CONCAT(psm3_cuda_count_, func)++; \ + (CUresult)PSM3_CONCAT(psm3_cuda_, func)(args); \ + }) + +#define PSM3_CUDA_EXEC(func, args...) \ + ({ \ + if (unlikely(psm3_cuda_check_set_cuda_ctxt())) { \ + psm3_handle_error( \ + PSMI_EP_NORETURN, \ + PSM2_INTERNAL_ERR, \ + "Failed to set/synchronize CUDA context.\n"); \ + } \ + PSM3_CUDA_EXEC_ASSUME_CONTEXT(func, args); \ + }) + +/** + * apply boilerplate non-fatal error handling to the indicated error + */ +#define PSM3_CUDA_ERROR(func, cudaerr, log_level) \ + do { \ + const char *pStr = NULL; \ + PSM3_CUDA_SYM_COUNT(cuGetErrorString)++; \ + PSM3_CUDA_SYM_FP(cuGetErrorString)(cudaerr, &pStr); \ + _HFI_##log_level( \ + "CUDA failure: %s() (at %s:%d) returned %d: %s\n", \ + #func, __FILE__, __LINE__, cudaerr, \ + pStr ? pStr : "Unknown"); \ + } while (0) + +/** + * check for errors, do necessary boilerplate, then fail hard + */ +#define PSM3_CUDA_CHECK(func, cudaerr) \ + do { \ + if (cudaerr != CUDA_SUCCESS) { \ + PSM3_CUDA_ERROR(func, cudaerr, ERROR); \ + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ + "Error returned from CUDA function %s.\n", #func); \ + } \ + } while (0) + +/** + * execute the CUDA function and handle any errors with failure + */ +#define PSM3_CUDA_CALL(func, args...) \ + do { \ + CUresult cudaerr = PSM3_CUDA_EXEC(func, args); \ + PSM3_CUDA_CHECK(func, cudaerr); \ + } while (0) + +/** + * Similar to PSM3_CUDA_CALL() except does not error out + * if func(args) returns CUDA_SUCCESS or except_err + * + * Invoker must provide 'CUresult cudaerr' in invoked scope + * so invoker can inspect whether cudaerr == CUDA_SUCCESS or + * cudaerr == except_err after expanded code is executed. + * + * As except_err is an allowed value, message is printed at + * DBG level. + */ +#define PSM3_CUDA_CALL_EXCEPT(except_err, func, args...) 
\ + ({ \ + CUresult cudaerr; \ + do { \ + cudaerr = PSM3_CUDA_EXEC(func, args); \ + if (cudaerr == except_err) { \ + PSM3_CUDA_ERROR(func, cudaerr, ERROR); \ + break; \ + } \ + PSM3_CUDA_CHECK(func, cudaerr); \ + } while (0); \ + cudaerr; \ + }) + +#define PSM3_CUDA_CHECK_EVENT(event, cudaerr) do { \ + PSM3_CUDA_SYM_COUNT(cuEventQuery)++; \ + cudaerr = PSM3_CUDA_SYM_FP(cuEventQuery)(event); \ + if ((cudaerr != CUDA_SUCCESS) && (cudaerr != CUDA_ERROR_NOT_READY)) { \ + const char *pStr = NULL; \ + PSM3_CUDA_SYM_COUNT(cuGetErrorString)++; \ + PSM3_CUDA_SYM_FP(cuGetErrorString)(cudaerr, &pStr); \ + _HFI_ERROR( \ + "CUDA failure: %s() (at %s:%d) returned %d: %s\n", \ + "cuEventQuery", __FILE__, __LINE__, cudaerr, \ + pStr?pStr:"Unknown"); \ + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ + "Error returned from CUDA function cuEventQuery.\n");\ + } \ + } while (0) + +// resolve a cuda shared library symbol +#define PSM3_CUDA_DLSYM(psm3_cuda_lib,func) do { \ + PSM3_CONCAT(psm3_cuda_, func) = dlsym(psm3_cuda_lib, STRINGIFY(func)); \ + if (!PSM3_CONCAT(psm3_cuda_, func)) { \ + psm3_handle_error(PSMI_EP_NORETURN, \ + PSM2_INTERNAL_ERR, \ + " Unable to resolve %s symbol" \ + " in CUDA libraries.\n",STRINGIFY(func)); \ + } \ +} while (0) + +static int psm3_cuda_lib_load() +{ + psm2_error_t err = PSM2_OK; + char *dlerr; + + PSM2_LOG_MSG("entering"); + _HFI_DBG("Loading CUDA library.\n"); + + psm3_cuda_lib = dlopen("libcuda.so.1", RTLD_LAZY); + if (!psm3_cuda_lib) { + dlerr = dlerror(); + _HFI_ERROR("Unable to open libcuda.so.1. Error %s\n", + dlerr ? dlerr : "no dlerror()"); + goto fail; + } + + PSM3_CUDA_SYM_FP(cuDriverGetVersion) = dlsym(psm3_cuda_lib, "cuDriverGetVersion"); + + if (!PSM3_CUDA_SYM_FP(cuDriverGetVersion)) { + _HFI_ERROR + ("Unable to resolve symbols in CUDA libraries.\n"); + goto fail; + } + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuGetErrorString);// for PSM3_CUDA_CALL + + PSM3_CUDA_CALL(cuDriverGetVersion, &psm3_cuda_lib_version); + if (psm3_cuda_lib_version < 7000) { + _HFI_ERROR("Please update CUDA driver, required minimum version is 7.0\n"); + goto fail; + } + + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuInit); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuCtxGetCurrent); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuCtxDetach); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuCtxSetCurrent); +#if PSM3_CUDA_HAVE_CTX_SYNC_MEMOPS + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuCtxSetFlags); +#endif + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuPointerGetAttribute); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuPointerGetAttributes); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuPointerSetAttribute); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuDeviceCanAccessPeer); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuDeviceGetAttribute); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuDeviceGet); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuDeviceGetCount); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuStreamCreate); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuStreamDestroy); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuStreamSynchronize); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuEventCreate); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuEventDestroy); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuEventQuery); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuEventRecord); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuEventSynchronize); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemRetainAllocationHandle); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemRelease); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemHostAlloc); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemFreeHost); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemHostRegister); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemHostUnregister); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemcpy); + 
PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemcpyDtoD); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemcpyDtoH); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemcpyHtoD); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemcpyDtoHAsync); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemcpyHtoDAsync); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuIpcGetMemHandle); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuIpcOpenMemHandle); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuIpcCloseMemHandle); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemGetAddressRange); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuDevicePrimaryCtxGetState); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuDevicePrimaryCtxRetain); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuDevicePrimaryCtxRelease); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuCtxGetDevice); + + /* CUDA Runtime */ + psm3_cudart_lib = dlopen("libcudart.so", RTLD_LAZY); + if (!psm3_cudart_lib) { + dlerr = dlerror(); + _HFI_ERROR("Unable to open libcudart.so. Error %s\n", + dlerr ? dlerr : "no dlerror()"); + goto fail; + } + PSM3_CUDA_DLSYM(psm3_cudart_lib, cudaRuntimeGetVersion); + + PSM2_LOG_MSG("leaving"); + return err; +fail: + if (psm3_cuda_lib) + dlclose(psm3_cuda_lib); + if (psm3_cudart_lib) + dlclose(psm3_cudart_lib); + err = psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unable to load CUDA library.\n"); + return err; +} + +static void psm3_cuda_stats_register() +{ +#define PSM3_CUDA_COUNT_DECLU64(func) \ + PSMI_STATS_DECLU64(#func, NULL, &PSM3_CONCAT(psm3_cuda_count_, func)) + + struct psmi_stats_entry entries[] = { + PSM3_CUDA_COUNT_DECLU64(cuInit), + PSM3_CUDA_COUNT_DECLU64(cuCtxDetach), + PSM3_CUDA_COUNT_DECLU64(cuCtxGetCurrent), + PSM3_CUDA_COUNT_DECLU64(cuCtxSetCurrent), +#if PSM3_CUDA_HAVE_CTX_SYNC_MEMOPS + PSM3_CUDA_COUNT_DECLU64(cuCtxSetFlags), +#endif + PSM3_CUDA_COUNT_DECLU64(cuPointerGetAttribute), + PSM3_CUDA_COUNT_DECLU64(cuPointerGetAttributes), + PSM3_CUDA_COUNT_DECLU64(cuPointerSetAttribute), + PSM3_CUDA_COUNT_DECLU64(cuDeviceCanAccessPeer), + PSM3_CUDA_COUNT_DECLU64(cuDeviceGet), + PSM3_CUDA_COUNT_DECLU64(cuDeviceGetAttribute), + PSM3_CUDA_COUNT_DECLU64(cuDriverGetVersion), + PSM3_CUDA_COUNT_DECLU64(cuDeviceGetCount), + PSM3_CUDA_COUNT_DECLU64(cuStreamCreate), + PSM3_CUDA_COUNT_DECLU64(cuStreamDestroy), + PSM3_CUDA_COUNT_DECLU64(cuStreamSynchronize), + PSM3_CUDA_COUNT_DECLU64(cuEventCreate), + PSM3_CUDA_COUNT_DECLU64(cuEventDestroy), + PSM3_CUDA_COUNT_DECLU64(cuEventQuery), + PSM3_CUDA_COUNT_DECLU64(cuEventRecord), + PSM3_CUDA_COUNT_DECLU64(cuEventSynchronize), + PSM3_CUDA_COUNT_DECLU64(cuMemRetainAllocationHandle), + PSM3_CUDA_COUNT_DECLU64(cuMemRelease), + PSM3_CUDA_COUNT_DECLU64(cuMemHostAlloc), + PSM3_CUDA_COUNT_DECLU64(cuMemFreeHost), + PSM3_CUDA_COUNT_DECLU64(cuMemHostRegister), + PSM3_CUDA_COUNT_DECLU64(cuMemHostUnregister), + PSM3_CUDA_COUNT_DECLU64(cuMemcpy), + PSM3_CUDA_COUNT_DECLU64(cuMemcpyDtoD), + PSM3_CUDA_COUNT_DECLU64(cuMemcpyDtoH), + PSM3_CUDA_COUNT_DECLU64(cuMemcpyHtoD), + PSM3_CUDA_COUNT_DECLU64(cuMemcpyDtoHAsync), + PSM3_CUDA_COUNT_DECLU64(cuMemcpyHtoDAsync), + PSM3_CUDA_COUNT_DECLU64(cuIpcGetMemHandle), + PSM3_CUDA_COUNT_DECLU64(cuIpcOpenMemHandle), + PSM3_CUDA_COUNT_DECLU64(cuIpcCloseMemHandle), + PSM3_CUDA_COUNT_DECLU64(cuMemGetAddressRange), + PSM3_CUDA_COUNT_DECLU64(cuDevicePrimaryCtxGetState), + PSM3_CUDA_COUNT_DECLU64(cuDevicePrimaryCtxRetain), + PSM3_CUDA_COUNT_DECLU64(cuCtxGetDevice), + PSM3_CUDA_COUNT_DECLU64(cuDevicePrimaryCtxRelease), + PSM3_CUDA_COUNT_DECLU64(cuGetErrorString), + PSM3_CUDA_COUNT_DECLU64(cudaRuntimeGetVersion), + }; +#undef PSM3_CUDA_COUNT_DECLU64 + + psm3_stats_register_type("PSM_Cuda_call_statistics", + 
"Count of CUDA calls per API entry point for the whole process.\n" + "When using an NVIDIA GPU, PSM3 may call lower level CUDA " + "APIs to access or transfer application buffers in GPU memory.", + PSMI_STATSTYPE_GPU, + entries, PSMI_HOWMANY(entries), NULL, + &PSM3_CUDA_SYM_COUNT(cuInit), NULL); /* context must != NULL */ +} + +#ifdef PSM_HAVE_RNDV_MOD +static void psm3_cuda_get_bars(void) +{ + int num_devices, dev; + union psmi_envvar_val env; + + psm3_getenv("PSM3_GPU_PIN_CHECK", + "Enable sanity check of physical addresses mapped into GPU BAR space (Enabled by default)", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)1, &env); + psm3_cuda_gpu_pin_check = env.e_int; + + PSM3_CUDA_CALL(cuDeviceGetCount, &num_devices); + psm3_cuda_gpu_bars = psmi_calloc(PSMI_EP_NONE, UNDEFINED, num_devices, sizeof(psm3_cuda_gpu_bars[0])); + if (! psm3_cuda_gpu_bars) + return; // psmi_calloc will have exited for Out of Memory + + if (psm3_cuda_gpu_pin_check) + psm3_cuda_num_gpu_bars = num_devices; + + for (dev = 0; dev < num_devices; dev++) { + CUdevice device; + int domain, bus, slot; + + PSM3_CUDA_CALL(cuDeviceGet, &device, dev); + PSM3_CUDA_CALL(cuDeviceGetAttribute, + &domain, + CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, + device); + PSM3_CUDA_CALL(cuDeviceGetAttribute, + &bus, + CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, + device); + PSM3_CUDA_CALL(cuDeviceGetAttribute, + &slot, + CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, + device); + psm3_cuda_gpu_bars[dev] = psm3_cuda_get_nvidia_bar_addr(domain, bus, slot); + } +} +#endif /* PSM_HAVE_RNDV_MOD */ + +static void psm3_cuda_init_env_cpe(void) +{ + union psmi_envvar_val val; + + int ret = psm3_getenv_range("PSM3_CUDA_PERMITTED_ENFORCEMENT", + "Enforcement policy for the CUDA_PERMITTED endpoint flag\n", + " 0: REJECT attempts to modify as an error\n" + " 1: IGNORE attempts to modify, feigning success (default)\n" + " 2: OBEY by restricting CUDA usage", + PSMI_ENVVAR_LEVEL_HIDDEN, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)PSM3_CPE_IGNORE, + (union psmi_envvar_val)PSM3_CPE_REJECT, + (union psmi_envvar_val)PSM3_CPE_OBEY, + NULL, + NULL, + &val); + + if (!ret) + psm3_cuda_permitted_enforcement = (psm3_cuda_permitted_enforcement_t)val.e_uint; +} + +static void psm3_cuda_init_env_sync(void) +{ + union psmi_envvar_val val; + + int ret = psm3_getenv_range("PSM3_CUDA_SYNC", + "Policy for setting synchroniation attributes.\n", + " 0 CTX: attempt to set context-level SYNC_MEMOPS on CUDA 12.1 or better\n" + " otherwise, set pointer-level SYNC_MEMOPS\n" + " 1 PTR: always set pointer-level SYNC_MEMOPS\n" + " 2 PTR_RELAXED: always set pointer-level SYNC_MEMOPS,\n" + " but ignore 801 (not supported, expected for VMM allocs)\n" + " 3 NONE: never set SYNC_MEMOPS\n", + PSMI_ENVVAR_LEVEL_HIDDEN, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)PSM3_CUDA_SYNC_CTX, + (union psmi_envvar_val)PSM3_CUDA_SYNC_CTX, + (union psmi_envvar_val)PSM3_CUDA_SYNC_NONE, + NULL, + NULL, + &val); + + if (!ret) + psm3_cuda_sync_mode = (psm3_cuda_sync_mode_t)val.e_uint; +} + +static psm2_error_t psm3_cuda_initialize(void) +{ + psm2_error_t err = PSM2_OK; + + PSM2_LOG_MSG("entering"); + _HFI_DBG("Enabling CUDA support.\n"); + + psm3_cuda_stats_register(); + + err = psm3_cuda_lib_load(); + if (err != PSM2_OK) + goto fail; + + PSM3_CUDA_CALL(cuInit, 0); + + PSM3_CUDA_CALL(cudaRuntimeGetVersion, &psm3_cuda_runtime_ver); + +#ifdef PSM_HAVE_RNDV_MOD + psm3_cuda_get_bars(); +#endif + if (! 
psm3_gpu_thresh_rndv) // sockets HAL could set new default + psm3_gpu_thresh_rndv = PSM3_CUDA_GPU_THRESH_RNDV; + psm3_gpu_rndv_nic_window_default = PSM3_CUDA_RNDV_NIC_WINDOW_DEFAULT; + psm3_gpu_gpudirect_rdma_send_limit_default = PSM3_CUDA_GPUDIRECT_RDMA_SEND_LIMIT_DEFAULT; + psm3_gpu_gpudirect_rdma_recv_limit_default = PSM3_CUDA_GPUDIRECT_RDMA_RECV_LIMIT_DEFAULT; + psm3_gpu_mq_rndv_shm_gpu_thresh_default = PSM3_CUDA_MQ_RNDV_SHM_GPU_THRESH; + + psm3_cuda_init_env_cpe(); + psm3_cuda_init_env_sync(); + + PSM2_LOG_MSG("leaving"); + return err; +fail: + err = psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unable to initialize PSM3 CUDA support.\n"); + return err; +} + +static void psm3_cuda_finalize(void) +{ + psm3_stats_deregister_type(PSMI_STATSTYPE_GPU, &PSM3_CUDA_SYM_COUNT(cuInit)); +} + +static void psm3_cuda_ep_open(void) +{ + // nothing to do +} + +static void psm3_cuda_ep_close(void) +{ + // nothing to do +} + +static void psm3_cuda_identify(char *accel_vers, size_t size) +{ + char cudart_ver[64] = "unknown"; + if (psm3_cuda_runtime_ver) + snprintf(cudart_ver, sizeof(cudart_ver), "%d.%d", + psm3_cuda_runtime_ver / 1000, (psm3_cuda_runtime_ver % 1000) / 10); + snprintf(accel_vers, size, "%s %s CUDA Runtime %s built against interface %d.%d\n", + psm3_get_mylabel(), psm3_ident_tag, + cudart_ver, CUDA_VERSION / 1000, (CUDA_VERSION % 1000) / 10); +} + +static int psm3_cuda_p2p_supported() +{ + static int p2p_supported = -1; // -1 indicates "unset" + if (likely(p2p_supported > -1)) + return p2p_supported; + + p2p_supported = 0; + + /* Check which devices the current device has p2p access to. */ + CUdevice current_device; + CUcontext current_context; + int num_devices, dev_idx; + PSM3_CUDA_CALL(cuDeviceGetCount, &num_devices); + + if (num_devices > 1) { + PSM3_CUDA_CALL(cuCtxGetCurrent, ¤t_context); + if (current_context == NULL) { + _HFI_INFO("Unable to find active CUDA context, assuming P2P not supported\n"); + return 0; + } + PSM3_CUDA_CALL(cuCtxGetDevice, ¤t_device); + } + + for (dev_idx = 0; dev_idx < num_devices; dev_idx++) { + CUdevice device; + PSM3_CUDA_CALL(cuDeviceGet, &device, dev_idx); + + if (num_devices > 1 && device != current_device) { + int canAccessPeer = 0; + PSM3_CUDA_CALL(cuDeviceCanAccessPeer, &canAccessPeer, + current_device, device); + + if (canAccessPeer != 1) + _HFI_DBG("CUDA device %d does not support P2P from current device (Non-fatal error)\n", dev_idx); + else + p2p_supported |= (1 << dev_idx); + } else { + /* Always support p2p on the same GPU */ + psm3_my_gpu_device = dev_idx; + p2p_supported |= (1 << dev_idx); + } + } + + _HFI_DBG("returning (0x%x), device 0x%x (%d)\n", p2p_supported, (1 << psm3_my_gpu_device), psm3_my_gpu_device); + return p2p_supported; +} + +static int psm3_cuda_gpudirect_supported() +{ + static int device_support_gpudirect = -1; // -1 indicates unset + + if (likely(device_support_gpudirect > -1)) return device_support_gpudirect; + + int num_devices, dev; + + /* Check if all devices support GPU Direct RDMA based on version. 
*/ + PSM3_CUDA_CALL(cuDeviceGetCount, &num_devices); + + device_support_gpudirect = 1; + + for (dev = 0; dev < num_devices; dev++) { + CUdevice device; + PSM3_CUDA_CALL(cuDeviceGet, &device, dev); + + int major; + PSM3_CUDA_CALL(cuDeviceGetAttribute, &major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device); + if (major < 3) { + device_support_gpudirect = 0; + _HFI_INFO("CUDA device %d does not support GPUDirect RDMA (Non-fatal error)\n", dev); + } + } + + return device_support_gpudirect; +} + +static void psm3_cuda_using_rv_for_mrs(void) +{ + // nothing to do +} + +static void psm3_cuda_verify_GPU_capabilities(void) +{ + static int device_support_unified_addr = -1; // -1 indicates "unchecked" + // we confirm the GPU supports unified addressing since this + // allows a GPU address alone to be sufficient to identify the GPU device + if (likely(device_support_unified_addr > -1)) return; + + int num_devices, dev; + + /* Check if all devices support Unified Virtual Addressing. */ + PSM3_CUDA_CALL(cuDeviceGetCount, &num_devices); + + device_support_unified_addr = 1; + + for (dev = 0; dev < num_devices; dev++) { + CUdevice device; + PSM3_CUDA_CALL(cuDeviceGet, &device, dev); + int unifiedAddressing; + PSM3_CUDA_CALL(cuDeviceGetAttribute, + &unifiedAddressing, + CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, + device); + + if (unifiedAddressing !=1) { + psm3_handle_error(PSMI_EP_NORETURN, PSM2_EP_DEVICE_FAILURE, + "CUDA device %d does not support Unified Virtual Addressing.\n", + dev); + } + } + + return; +} + +static void psm3_cuda_get_pci_addr(uint32_t *domain_p, uint32_t *bus_p, + uint32_t *dev_p, uint32_t *func_p) +{ + int domain, bus, dev; + int num_devices; + CUdevice device; + + PSM3_CUDA_CALL(cuDeviceGetCount, &num_devices); + _HFI_DBG("%d Cuda GPUs found\n", num_devices); + if (! num_devices) + return; + + if (num_devices == 1) { + PSM3_CUDA_CALL(cuDeviceGet, &device, 0); + } else { + // all GPUs will be visible to process, see if app chose one first + CUcontext ctxt = {0}; + if (! PSM3_CUDA_SYM_FP(cuCtxGetCurrent) || PSM3_CUDA_SYM_FP(cuCtxGetCurrent)(&ctxt) || ! ctxt) { + _HFI_DBG("Unable to get Cuda ctxt\n"); + //PSM3_CUDA_CALL(cuDeviceGet, &device, 0); + return; + } else { + PSM3_CUDA_CALL(cuCtxGetDevice, &device); + } + } + _HFI_DBG("Using Cuda GPU %d\n", device); + PSM3_CUDA_CALL(cuDeviceGetAttribute, + &domain, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, device); + PSM3_CUDA_CALL(cuDeviceGetAttribute, + &bus, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, device); + PSM3_CUDA_CALL(cuDeviceGetAttribute, + &dev, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, device); + *domain_p = domain; + *bus_p = bus; + *dev_p = dev; + *func_p = 0; +} + +#ifdef PSM_HAVE_RNDV_MOD +// The second BAR address is where the GPU will map GPUDirect memory. +// The beginning of this BAR is reserved for non-GPUDirect uses. +// However, it has been observed that in some multi-process +// pinning failures, HED-2035, the nvidia_p2p_get_pages can foul up +// it's IOMMU after which the next successful pin will incorrectly +// return the 1st physical address of the BAR for the pinned pages. +// In this case it will report this same physical address for other GPU virtual +// addresses and cause RDMA to use the wrong memory. +// As a workaround, we gather the Region 1 BAR address start for each +// GPU and if we see this address returned as the phys_addr of a mmapped +// GPUDirect Copy or the iova of a GPU MR we fail the job before it can +// corrupt any more application data. 
+static uint64_t psm3_cuda_get_nvidia_bar_addr(int domain, int bus, int slot) +{ + char sysfs[100]; + int ret; + FILE *f; + unsigned long long start_addr, end_addr, bar_size; + + ret = snprintf(sysfs, sizeof(sysfs), + "/sys/class/pci_bus/%04x:%02x/device/%04x:%02x:%02x.0/resource", + domain, bus, domain, bus, slot); + psmi_assert_always(ret < sizeof(sysfs)); + f = fopen(sysfs, "r"); + if (! f) { + if (psm3_cuda_gpu_pin_check) { + _HFI_ERROR("Unable to open %s for GPU BAR Address: %s\n", + sysfs, strerror(errno)); + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Unable to get GPU BAR address\n"); + } + return 0; + } + // for each BAR region, start, end and flags are listed in hex + // nVidia uses the 2nd BAR region (aka Region #1) to map peer to peer + // accesses into it's potentially larger GPU local memory space + ret = fscanf(f, "%*x %*x %*x %llx %llx", &start_addr, &end_addr); + if (ret != 2) { + if (psm3_cuda_gpu_pin_check) { + _HFI_ERROR("Unable to get GPU BAR Address from %s: %s\n", + sysfs, strerror(errno)); + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Unable to get GPU BAR address\n"); + } + fclose(f); + return 0; + } + fclose(f); + + bar_size = (end_addr - start_addr) + 1; + _HFI_DBG("GPU BAR Addr from %s is 0x%llx - 0x%llx (size 0x%llx)\n", sysfs, start_addr, end_addr, bar_size); + if (! psm3_cuda_min_gpu_bar_size || bar_size < psm3_cuda_min_gpu_bar_size) + psm3_cuda_min_gpu_bar_size = bar_size; + return start_addr; +} + +static uint64_t psm3_cuda_min_bar_size(void) +{ + // for ONEAPI can return 0 for now, implement later + return psm3_cuda_min_gpu_bar_size; +} + +static psm2_error_t psm3_cuda_check_phys_addr(uint64_t phys_addr) +{ + int i; + for (i=0; i < psm3_cuda_num_gpu_bars; i++) { + if (phys_addr == psm3_cuda_gpu_bars[i]) { + _HFI_ERROR("Incorrect Physical Address (0x%"PRIx64") returned by nVidia driver. PSM3 exiting to avoid data corruption. 
Job may be rerun with PSM3_GPUDIRECT=0 to avoid this issue.\n", + phys_addr); + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Incorrect Physical Address returned by nVidia driver\n"); + psmi_assert_always(0); + return PSM2_INTERNAL_ERR; + } + } + return PSM2_OK; +} + +static void psm3_cuda_roundup_gdrcopy(unsigned long buf, size_t size, + uintptr_t *pageaddr_p, uint64_t *pagelen_p) +{ + *pageaddr_p = buf & GPU_PAGE_MASK; + *pagelen_p = (uint64_t) (PSMI_GPU_PAGESIZE + + ((buf + size - 1) & GPU_PAGE_MASK) - *pageaddr_p); +} + +#ifdef PSM_HAVE_REG_MR +static void psm3_cuda_roundup_rv_reg_mr(struct psm2_ep *ep, + void **addr_p, uint64_t *length_p, int access) +{ + uint64_t addr_in = (uint64_t)*addr_p; + + *addr_p = (void *)ROUNDDOWN64P2(addr_in, PSMI_GPU_PAGESIZE); + *length_p = ROUNDUP64P2(addr_in + *length_p, PSMI_GPU_PAGESIZE) - (uint64_t)*addr_p; +} + +// add Cuda specific information to the mparams in prep for the +// RV_IOCTL_REG_MEM ioctl to rv +// For Cuda, no additional information is needed +static int psm3_cuda_init_rv_reg_mr_params( + void *addr, uint64_t length, int access, + struct rv_mem_params *mparams, + union psm3_verbs_mr_gpu_specific *gpu_specific, + union psm3_gpu_rv_reg_mmap_mem_scratchpad *scratchpad) +{ + // nothing to do + return 0; +} +#endif // PSM_HAVE_REG_MR + +// add Cuda specific information to the params in prep for the +// RV_IOCTL_PIN_MMAP ioctl to rv +// For Cuda, no additional information is needed +static int psm3_cuda_init_rv_pin_mmap_params( + void *addr, uint64_t length, int access, + struct rv_gpu_mem_params *params, + union psm3_gpu_rv_reg_mmap_mem_scratchpad *scratchpad) +{ + // nothing to do + return 0; +} + +// cleanup Cuda specific scratchpad from +// psm3_cuda_init_rv_reg_mr_params or +// psm3_cuda_init_rv_pin_mmap_params +// called on success or error path, makes sure not to polute errno +// as it can reflect the earlier error for the error path in caller. 
+static void psm3_cuda_rv_reg_mmap_cleanup( + void *addr, uint64_t length, int access, + union psm3_gpu_rv_reg_mmap_mem_scratchpad *scratchpad) +{ + // nothing to do +} +#endif /* PSM_HAVE_RNDV_MOD */ + +#ifdef PSM_HAVE_REG_MR +// compare GPU specific fields in verbs MR cache entry +static int psm3_cuda_cmp_mr(const union psm3_verbs_mr_gpu_specific *a, + const union psm3_verbs_mr_gpu_specific *b) +{ + // nothing to do + return 0; +} + +// initialize GPU specific fields in verbs MR cache entry +static void psm3_cuda_init_mr(void *addr, uint64_t length, int access, + union psm3_verbs_mr_gpu_specific *gpu_specific) +{ + // nothing to do +} +#endif /* PSM_HAVE_REG_MR */ + +static void psm3_cuda_fetch_ctxt(void) +{ + PSM3_CUDA_CALL(cuCtxGetCurrent, &psm3_cu_ctxt); +} + +// ensure psm3_cu_ctxt reflects our most recent psm3_cu_ctxt +static void psm3_cuda_refresh_ctxt(void) +{ + if (psm3_cu_ctxt) + PSM3_CUDA_CALL(cuCtxSetCurrent, psm3_cu_ctxt); +} + +static void psm3_cuda_register_hostmem(void *buf, uint32_t size) +{ +#ifndef PSM3_NO_CUDA_REGISTER + // By registering memory with Cuda, we make + // cuMemcpy run faster for copies + if (psm3_cuda_check_have_cuda_ctxt()) { + PSM3_CUDA_CALL(cuMemHostRegister, + buf, size, CU_MEMHOSTALLOC_PORTABLE); + } +#endif +} + +static void psm3_cuda_unregister_hostmem(void *buf) +{ +#ifndef PSM3_NO_CUDA_REGISTER + if (psm3_cu_ctxt) { + /* ignore NOT_REGISTERED in case cuda initialized late */ + /* ignore other errors as context could be destroyed before this */ + CUresult cudaerr = PSM3_CUDA_EXEC_ASSUME_CONTEXT(cuMemHostUnregister, buf); + if (cudaerr) + PSM3_CUDA_ERROR(cuMemHostUnregister, cudaerr, DBG); + } +#endif +} + +static int psm3_cuda_is_gpu_mem(const void *ptr) +{ + CUresult cudaerr; + CUpointer_attribute attrs[] = { + CU_POINTER_ATTRIBUTE_MEMORY_TYPE, + CU_POINTER_ATTRIBUTE_IS_MANAGED, + }; + CUmemorytype mt = 0; + uint64_t managed = 0; + void *resp[] = { &mt, &managed }; + + static_assert(PSMI_HOWMANY(attrs) == PSMI_HOWMANY(resp), + "attribute count must equal response count"); + + cudaerr = PSM3_CUDA_SYM_FP(cuPointerGetAttributes)( + PSMI_HOWMANY(attrs), attrs, resp, (CUdeviceptr)ptr); + PSM3_CUDA_SYM_COUNT(cuPointerGetAttributes) += 1; + return cudaerr == CUDA_SUCCESS && mt == CU_MEMORYTYPE_DEVICE && !managed; +} + +static void psm3_cuda_prepare_HtoD_memcpys(struct ips_protoexp *protoexp) +{ + protoexp->gpu_specific.cudastream_recv = NULL; +} + +static void psm3_cuda_prepare_DtoH_memcpys(struct ips_proto *proto) +{ + proto->gpu_specific.cudastream_send = NULL; +} + +static void psm3_cuda_shutdown_HtoD_memcpys(struct ips_protoexp *protoexp) +{ + if (protoexp->gpu_specific.cudastream_recv != NULL) { + PSM3_CUDA_CALL(cuStreamDestroy, protoexp->gpu_specific.cudastream_recv); + } +} + +static void psm3_cuda_shutdown_DtoH_memcpys(struct ips_proto *proto) +{ + if (proto->gpu_specific.cudastream_send) { + PSM3_CUDA_CALL(cuStreamDestroy, proto->gpu_specific.cudastream_send); + } +} + +static void psm3_cuda_memcpy_HtoD_start(struct ips_protoexp *protoexp, + struct ips_gpu_hostbuf *ghb, uint32_t len) +{ + if (protoexp->gpu_specific.cudastream_recv == NULL) { + PSM3_CUDA_CALL(cuStreamCreate, &protoexp->gpu_specific.cudastream_recv, + CU_STREAM_NON_BLOCKING); + } + PSM3_CUDA_CALL(cuMemcpyHtoDAsync, (CUdeviceptr)ghb->gpu_buf, ghb->host_buf, + len, protoexp->gpu_specific.cudastream_recv); + if (ghb->gpu_specific.cuda_copy_status == NULL) { + PSM3_CUDA_CALL(cuEventCreate, &ghb->gpu_specific.cuda_copy_status, CU_EVENT_DEFAULT); + } + PSM3_CUDA_CALL(cuEventRecord, 
ghb->gpu_specific.cuda_copy_status, protoexp->gpu_specific.cudastream_recv); +} + +static void psm3_cuda_memcpy_DtoH_start(struct ips_proto *proto, + struct ips_gpu_hostbuf *ghb, uint32_t len) +{ + if (proto->gpu_specific.cudastream_send == NULL) { + PSM3_CUDA_CALL(cuStreamCreate, &proto->gpu_specific.cudastream_send, + CU_STREAM_NON_BLOCKING); + } + if (ghb->gpu_specific.cuda_copy_status == NULL) { + PSM3_CUDA_CALL(cuEventCreate, &ghb->gpu_specific.cuda_copy_status, CU_EVENT_DEFAULT); + } + PSM3_CUDA_CALL(cuMemcpyDtoHAsync, ghb->host_buf, (CUdeviceptr)ghb->gpu_buf, + len, proto->gpu_specific.cudastream_send); + PSM3_CUDA_CALL(cuEventRecord, ghb->gpu_specific.cuda_copy_status, proto->gpu_specific.cudastream_send); +} + +static int psm3_cuda_memcpy_done(struct ips_gpu_hostbuf *ghb) +{ + CUresult status; + PSM3_CUDA_CHECK_EVENT(ghb->gpu_specific.cuda_copy_status, status); + return (status == CUDA_SUCCESS); +} + +static void psm3_cuda_hostbuf_lazy_init(struct ips_gpu_hostbuf *ghb) +{ + ghb->gpu_specific.cuda_copy_status = NULL; +} + +static void psm3_cuda_hostbuf_reset(struct ips_gpu_hostbuf *ghb) +{ + // nothing to do +} + +static void psm3_cuda_hostbuf_destroy(struct ips_gpu_hostbuf *ghb) +{ + if (ghb->gpu_specific.cuda_copy_status != NULL) { + PSM3_CUDA_CALL(cuEventDestroy, ghb->gpu_specific.cuda_copy_status); + } + if (ghb->host_buf != NULL) { + PSM3_CUDA_CALL(cuMemFreeHost, ghb->host_buf); + } +} + +static void psm3_cuda_memcpy_DtoD(void *dstptr, const void *srcptr, uint32_t len) +{ + PSM3_CUDA_CALL(cuMemcpyDtoD, (CUdeviceptr)dstptr, (CUdeviceptr)srcptr, len); +} + +static void psm3_cuda_memcpy_HtoD(void *dstptr, const void *srcptr, uint32_t len) +{ + PSM3_CUDA_CALL(cuMemcpyHtoD, (CUdeviceptr)dstptr, srcptr, len); +} + +static void psm3_cuda_memcpy_DtoH(void *dstptr, const void *srcptr, uint32_t len) +{ + PSM3_CUDA_CALL(cuMemcpyDtoH, dstptr, (CUdeviceptr)srcptr, len); +} + +static void psm3_cuda_memcpy(void *dstptr, const void *srcptr, uint32_t len) +{ + PSM3_CUDA_CALL(cuMemcpy, (CUdeviceptr)dstptr, (CUdeviceptr)srcptr, len); +} + +static void psm3_cuda_synchronize_memcpy(void) +{ + PSM3_CUDA_CALL(cuStreamSynchronize, 0); +} + +/* + * CUDA documentation dictates the use of SYNC_MEMOPS attribute when the buffer + * pointer received into PSM has been allocated by the application and is the + * target of GPUDirect DMA operations. + * + * Normally, CUDA is permitted to implicitly execute synchronous memory + * operations as asynchronous operations, relying on commands arriving via CUDA + * for proper sequencing. GDR, however, bypasses CUDA, enabling races, e.g. + * cuMemcpy sequenced before a GDR operation. + * + * SYNC_MEMOPS avoids this optimization. + * + * Note that allocations via the "VMM" API, i.e. cuMemCreate, do not support the + * SYNC_MEMOPS pointer attribute, and will return 801 (not supported). If we're + * using the newer context-level sync flag available in CUDA 12.1+ to avoid this + * issue, we will not set the pointer-level sync flag here. 
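 *
 * Rough illustration (not a verbatim excerpt from this file) of the
 * pointer-level flag that psm3_cuda_mark_buf_synchronous() sets:
 *
 *   int sync = 1;
 *   cuPointerSetAttribute(&sync, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
 *                         (CUdeviceptr)buf);
 *
 * For a cuMemCreate/VMM allocation this call fails with
 * CUDA_ERROR_NOT_SUPPORTED (801), which is exactly the error the
 * PSM3_CUDA_SYNC_PTR_RELAXED mode below tolerates.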
+ */ +static void psm3_cuda_mark_buf_synchronous(const void *buf) +{ + bool check_for_not_supported = false; + + switch (psm3_cuda_sync_mode) { + case PSM3_CUDA_SYNC_CTX: +#if PSM3_CUDA_HAVE_CTX_SYNC_MEMOPS + // sync set at the context-level; nothing to do here + return; +#else + // otherwise, intentional fall through to PTR behavior +#endif + case PSM3_CUDA_SYNC_PTR: + // pointer level sync, handling all errors + break; + case PSM3_CUDA_SYNC_PTR_RELAXED: + // pointer level sync, ignoring not supported + check_for_not_supported = true; + break; + case PSM3_CUDA_SYNC_NONE: + return; + } + + CUresult cudaerr; + int true_flag = 1; + + cudaerr = PSM3_CUDA_EXEC(cuPointerSetAttribute, + &true_flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)buf); + + if_pf (check_for_not_supported && cudaerr == CUDA_ERROR_NOT_SUPPORTED) { +#ifdef PSM_DEBUG + // query the handle just to be sure it is in fact a VMM alloc + CUmemGenericAllocationHandle h; + PSM3_CUDA_CALL(cuMemRetainAllocationHandle, &h, (CUdeviceptr)buf); + PSM3_CUDA_CALL(cuMemRelease, h); +#endif + return; + } + + PSM3_CUDA_CHECK(cuPointerSetAttribute, cudaerr); + return; +} + +static void psm3_cuda_host_alloc(void **ret_ptr, uint32_t size) +{ + PSM3_CUDA_CALL(cuMemHostAlloc, (void **)ret_ptr, size, + CU_MEMHOSTALLOC_PORTABLE); +} + +static void psm3_cuda_host_free(void *ptr) +{ + PSM3_CUDA_CALL(cuMemFreeHost, (void *)ptr); +} + +static int psm3_cuda_gpu_addr_send_mr(struct psm2_mq_req *mqreq) +{ + return mqreq->is_buf_gpu_mem && ! mqreq->gpu_hostbuf_used; +} + +static int psm3_cuda_gpu_addr_recv_mr(struct ips_tid_recv_desc *tidrecvc, + int gpu_hostbuf_used) +{ + return tidrecvc->is_ptr_gpu_backed; +} + +//*************************************************************************** +//cuda support for PSM3_DEVICES "shm", via an IPC handle cache and Cuda IPC +//In platforms with NVLINK between GPUs, Cuda IPC will use NVLINK. + +#define CUDA_MEMHANDLE_CACHE_SIZE 64 + +/* + * rbtree cruft + */ +struct _cl_map_item; + +typedef struct +{ + unsigned long start; /* start virtual address */ + CUipcMemHandle cuda_ipc_handle; /* cuda ipc mem handle */ + CUdeviceptr cuda_ipc_dev_ptr;/* Cuda device pointer */ + psm2_epid_t epid; + struct _cl_map_item* i_prev; /* idle queue previous */ + struct _cl_map_item* i_next; /* idle queue next */ +}__attribute__ ((aligned (128))) psm3_rbtree_cuda_memhandle_cache_mapitem_pl_t; + +typedef struct { + uint32_t nelems; /* number of elements in the cache */ +} psm3_rbtree_cuda_memhandle_cache_map_pl_t; + +/* + * Custom comparator + */ +typedef psm3_rbtree_cuda_memhandle_cache_mapitem_pl_t psm3_cuda_cache_item; + +static int psm3_cuda_cache_key_cmp(const psm3_cuda_cache_item *a, const psm3_cuda_cache_item *b) +{ + // we use epid as part of cache key so multi-ep and multi-process jobs + // can have a better cache hit rate. In some cases we may end up with + // cache entries for the same buffer with different epid's all within the + // same multi-ep rank, but this does no harm other than to waste some + // cache space. By including epid in key_cmp we have a chance to have + // separate cache entries for the same sbuf address in different + // sender's GPU virtual address space. + switch (psm3_epid_cmp_internal(a->epid, b->epid)) { + case -1: return -1; + case 1: return 1; + default: + break; + } + + // The sender has used cuMemGetAddressRange to normalize the address + // so we can simply compare the start address of the allocation. 
+ // Note cuIpcOpenMemHandle only needs the start address as well, so we + // ignore length + if (a->start < b->start) + return -1; + if (b->start < a->start) + return 1; + + return 0; +} + + +/* + * Necessary rbtree cruft + */ +#define RBTREE_MI_PL psm3_rbtree_cuda_memhandle_cache_mapitem_pl_t +#define RBTREE_MAP_PL psm3_rbtree_cuda_memhandle_cache_map_pl_t +#define RBTREE_CMP(a,b) psm3_cuda_cache_key_cmp((a), (b)) +#define RBTREE_ASSERT psmi_assert +#define RBTREE_MAP_COUNT(PAYLOAD_PTR) ((PAYLOAD_PTR)->nelems) +#define RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR + +#include "psm3_rbtree.h" +#include "psm3_rbtree.c" + +/* + * Convenience rbtree cruft + */ +#define NELEMS(cache) ((cache)->map.payload.nelems) + +#define IHEAD(cache) ((cache)->map.root) +#define LAST(cache) (IHEAD(cache)->payload.i_prev) +#define FIRST(cache) (IHEAD(cache)->payload.i_next) +#define INEXT(x) ((x)->payload.i_next) +#define IPREV(x) ((x)->payload.i_prev) + +/* + * Actual module data + */ +struct psm3_cuda_memhandle_cache { + cl_qmap_t map; + mpool_t mpool; + uint32_t size; + psm2_mq_stats_t *stats; +}; +typedef struct psm3_cuda_memhandle_cache *psm3_cuda_memhandle_cache_t; + +static void psm3_print_cuda_memhandle_cache_stats(psm2_mq_stats_t *stats) +{ + _HFI_DBG("limit=%lu,maxelems=%lu,hit=%lu,miss=%lu,evict=%lu,remove=%lu,clear=%lu\n", + stats->gpu_ipc_cache_limit, stats->gpu_ipc_cache_max_nelems, + stats->gpu_ipc_cache_hit, stats->gpu_ipc_cache_miss, + stats->gpu_ipc_cache_evict, stats->gpu_ipc_cache_remove, + stats->gpu_ipc_cache_clear); +} + +/* + * This is the callback function when mempool are resized or destroyed. + * Upon calling cache fini mpool is detroyed which in turn calls this callback + * which helps in closing all memhandles. + */ +static void +psm3_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj) +{ + cl_map_item_t* memcache_item = (cl_map_item_t*)obj; + if (!is_alloc) { + if(memcache_item->payload.start) + PSM3_CUDA_CALL(cuIpcCloseMemHandle, + memcache_item->payload.cuda_ipc_dev_ptr); + } +} + +/* + * Creating mempool for cuda memhandle cache nodes. + */ +static psm2_error_t +psm3_cuda_memhandle_mpool_alloc(psm3_cuda_memhandle_cache_t cache, + uint32_t memcache_size) +{ + psm2_error_t err; + if (memcache_size < 1) + return PSM2_PARAM_ERR; + + cache->size = memcache_size; + /* Creating a memory pool of size PSM3_CUDA_MEMCACHE_SIZE + * which includes the Root and NIL items + */ + cache->mpool = psm3_mpool_create_for_gpu(sizeof(cl_map_item_t), + cache->size, + cache->size, 0, + UNDEFINED, NULL, NULL, + psm3_cuda_memhandle_cache_alloc_func, + NULL); + if (cache->mpool == NULL) { + err = psm3_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, + "Couldn't allocate CUDA host receive buffer pool"); + return err; + } + return PSM2_OK; +} + +/* + * allocate and initialize memhandle cache + * including rbtree. + */ +static psm2_error_t psm3_cuda_memhandle_cache_alloc( + psm3_cuda_memhandle_cache_t *cachep, uint32_t memcache_size, + psm2_mq_stats_t *stats) +{ + cl_map_item_t *root = NULL, *nil_item = NULL; + + *cachep = (psm3_cuda_memhandle_cache_t)psmi_calloc( + NULL, UNDEFINED, 1, sizeof(**cachep)); + if (! 
*cachep) + return PSM2_NO_MEMORY; + + psm2_error_t err = psm3_cuda_memhandle_mpool_alloc(*cachep, memcache_size); + if (err != PSM2_OK) + goto fail; + + root = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); + if (root == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + nil_item = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); + if (nil_item == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + + nil_item->payload.start = 0; + nil_item->payload.epid = psm3_epid_zeroed_internal(); + ips_cl_qmap_init(&(*cachep)->map,root,nil_item); + NELEMS(*cachep) = 0; + + (*cachep)->stats = stats; + + stats->gpu_ipc_cache_limit = memcache_size; + stats->gpu_ipc_cache_nelems = 0; + stats->gpu_ipc_cache_max_nelems = 0; + stats->gpu_ipc_cache_hit = 0; + stats->gpu_ipc_cache_miss = 0; + stats->gpu_ipc_cache_evict = 0; + stats->gpu_ipc_cache_remove = 0; + stats->gpu_ipc_cache_clear = 0; + + return PSM2_OK; + +fail: + if (nil_item) + psmi_free(nil_item); + if (root) + psmi_free(root); + if ((*cachep)->mpool) + psm3_mpool_destroy((*cachep)->mpool); + psmi_free(*cachep); + return err; +} + +static void psm3_cuda_memhandle_cache_free(psm3_cuda_memhandle_cache_t cache) +{ + psm3_print_cuda_memhandle_cache_stats(cache->stats); + + if (cache->map.nil_item) + psmi_free(cache->map.nil_item); + if (cache->map.root) + psmi_free(cache->map.root); + if (cache->mpool) + psm3_mpool_destroy(cache->mpool); + psmi_free(cache); +} + +/* + * Insert at the head of Idleq. + */ +static void +psm3_cuda_idleq_insert(psm3_cuda_memhandle_cache_t cache, cl_map_item_t* memcache_item) +{ + if (FIRST(cache) == NULL) { + FIRST(cache) = memcache_item; + LAST(cache) = memcache_item; + return; + } + INEXT(FIRST(cache)) = memcache_item; + IPREV(memcache_item) = FIRST(cache); + FIRST(cache) = memcache_item; + INEXT(FIRST(cache)) = NULL; + return; +} + +/* + * Remove least recent used element. + */ +static void +psm3_cuda_idleq_remove_last(psm3_cuda_memhandle_cache_t cache, cl_map_item_t* memcache_item) +{ + if (!INEXT(memcache_item)) { + LAST(cache) = NULL; + FIRST(cache) = NULL; + } else { + LAST(cache) = INEXT(memcache_item); + IPREV(LAST(cache)) = NULL; + } + // Null-out now-removed memcache_item's next and prev pointers out of + // an abundance of caution + INEXT(memcache_item) = IPREV(memcache_item) = NULL; +} + +static void +psm3_cuda_idleq_remove(psm3_cuda_memhandle_cache_t cache, cl_map_item_t* memcache_item) +{ + if (LAST(cache) == memcache_item) { + psm3_cuda_idleq_remove_last(cache, memcache_item); + } else if (FIRST(cache) == memcache_item) { + FIRST(cache) = IPREV(memcache_item); + INEXT(FIRST(cache)) = NULL; + } else { + INEXT(IPREV(memcache_item)) = INEXT(memcache_item); + IPREV(INEXT(memcache_item)) = IPREV(memcache_item); + } + // Null-out now-removed memcache_item's next and prev pointers out of + // an abundance of caution + INEXT(memcache_item) = IPREV(memcache_item) = NULL; +} + +static void +psm3_cuda_idleq_reorder(psm3_cuda_memhandle_cache_t cache, cl_map_item_t* memcache_item) +{ + if (FIRST(cache) == memcache_item && LAST(cache) == memcache_item ) { + return; + } + psm3_cuda_idleq_remove(cache, memcache_item); + psm3_cuda_idleq_insert(cache, memcache_item); + return; +} + +/* + * After a successful cache hit, item is validated by doing a + * memcmp on the handle stored and the handle we receive from the + * sender. If the validation fails the item is removed from the idleq, + * the rbtree, is put back into the mpool and cuIpcCloseMemHandle function + * is called. 
+ * Cuda ipcMemHandles for distinct allocations are unique, even if the + * allocation was at the same address. So this check catches stale cache + * entries. + */ +static psm2_error_t +psm3_cuda_memhandle_cache_validate(psm3_cuda_memhandle_cache_t cache, + cl_map_item_t* memcache_item, + uintptr_t sbuf, CUipcMemHandle* handle, + psm2_epid_t epid) +{ + psmi_assert(!psm3_epid_cmp_internal(epid, memcache_item->payload.epid)); + psmi_assert(sbuf == memcache_item->payload.start); + if (0 == memcmp(handle, &memcache_item->payload.cuda_ipc_handle, + sizeof(CUipcMemHandle))) { + return PSM2_OK; + } + _HFI_DBG("cache collision: new entry start=%lu\n", sbuf); + + cache->stats->gpu_ipc_cache_remove++; + ips_cl_qmap_remove_item(&cache->map, memcache_item); + cache->stats->gpu_ipc_cache_nelems--; + PSM3_CUDA_CALL(cuIpcCloseMemHandle, + memcache_item->payload.cuda_ipc_dev_ptr); + psm3_cuda_idleq_remove(cache, memcache_item); + memset(memcache_item, 0, sizeof(*memcache_item)); + psm3_mpool_put(memcache_item); + return PSM2_OK_NO_PROGRESS; +} + +/* + * Current eviction policy: Least Recently Used. + */ +static void +psm3_cuda_memhandle_cache_evict(psm3_cuda_memhandle_cache_t cache) +{ + cache->stats->gpu_ipc_cache_evict++; + cl_map_item_t *p_item = LAST(cache); + _HFI_VDBG("Removing (epid=%s,start=%lu,dev_ptr=0x%llX,it=%p) from cuda_memhandle_cachemap.\n", + psm3_epid_fmt_internal(p_item->payload.epid, 0), p_item->payload.start, + p_item->payload.cuda_ipc_dev_ptr, p_item); + ips_cl_qmap_remove_item(&cache->map, p_item); + cache->stats->gpu_ipc_cache_nelems--; + PSM3_CUDA_CALL(cuIpcCloseMemHandle, p_item->payload.cuda_ipc_dev_ptr); + psm3_cuda_idleq_remove_last(cache, p_item); + memset(p_item, 0, sizeof(*p_item)); + psm3_mpool_put(p_item); +} + +static psm2_error_t +psm3_cuda_memhandle_cache_register(psm3_cuda_memhandle_cache_t cache, + uintptr_t sbuf, CUipcMemHandle* handle, + psm2_epid_t epid, + CUdeviceptr cuda_ipc_dev_ptr) +{ + if (NELEMS(cache) == cache->size) + psm3_cuda_memhandle_cache_evict(cache); + + cl_map_item_t* memcache_item = psm3_mpool_get(cache->mpool); + /* memcache_item cannot be NULL as we evict + * before the call to mpool_get. Check has + * been fixed to help with klockwork analysis. + */ + if (memcache_item == NULL) + return PSM2_NO_MEMORY; + memcache_item->payload.start = sbuf; + memcache_item->payload.cuda_ipc_handle = *handle; + memcache_item->payload.cuda_ipc_dev_ptr = cuda_ipc_dev_ptr; + memcache_item->payload.epid = epid; + ips_cl_qmap_insert_item(&cache->map, memcache_item); + cache->stats->gpu_ipc_cache_nelems++; + if (cache->stats->gpu_ipc_cache_nelems > cache->stats->gpu_ipc_cache_max_nelems) + cache->stats->gpu_ipc_cache_max_nelems = cache->stats->gpu_ipc_cache_nelems; + psm3_cuda_idleq_insert(cache, memcache_item); + return PSM2_OK; +} + +static void psm3_cuda_memhandle_cache_clear(psm3_cuda_memhandle_cache_t cache) +{ + _HFI_DBG("Closing all handles, clearing cuda_memhandle_cachemap and idleq. NELEMS=%u\n", NELEMS(cache)); + while (NELEMS(cache)) { + psm3_cuda_memhandle_cache_evict(cache); + } + cache->stats->gpu_ipc_cache_clear++; + _HFI_DBG("Closed all handles, cleared cuda_memhandle_cachemap and idleq. NELEMS=%u\n", NELEMS(cache)); +} + +/* + * The key used to search the cache is the senders buf address pointer and + * epid. The sender will have used cuMemGetAddressRange + * to find the start of the memory containing the buffer (supplied as sbuf). + * Upon match, we must validate the entry we find and may need to replace it. 
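 *
 * In outline, the function below behaves roughly like (simplified sketch,
 * not the exact code):
 *
 *   item = rbtree_search(cache, {sbuf, epid});
 *   if (item && validate(item, handle) == PSM2_OK)   // hit: handle unchanged
 *       return item->cuda_ipc_dev_ptr;               // reuse the open mapping
 *   // miss, or stale entry just closed by validate()
 *   err = cuIpcOpenMemHandle(&dev_ptr, *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
 *   if (err == CUDA_ERROR_ALREADY_MAPPED)            // stale mappings linger
 *       clear the whole cache, then open again
 *   register(cache, sbuf, handle, epid, dev_ptr);    // LRU-evict first if full
 *   return dev_ptr;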
+ */ +static CUdeviceptr +psm3_cuda_memhandle_acquire(psm3_cuda_memhandle_cache_t cache, + uintptr_t sbuf, CUipcMemHandle* handle, + psm2_epid_t epid) +{ + _HFI_VDBG("sbuf=%lu,handle=%p,epid=%s\n", + sbuf, handle, psm3_epid_fmt_internal(epid, 0)); + + CUdeviceptr cuda_ipc_dev_ptr; + if(! cache) { + PSM3_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr, + *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); + return cuda_ipc_dev_ptr; + } + + psm3_cuda_cache_item key = { + .start = (unsigned long) sbuf, + .epid = epid + }; + + /* + * preconditions: + * 1) buffer [start,epid) may or may not be in cachemap already + * 2) there are no duplicate entries in cachemap + * postconditions: + * 1) buffer is in cachemap with same handle, epid + * 2) there are no duplicate entries in cachemap + * + * The key used to search the cache is the senders buf address pointer + * and epid. + * Upon a succesful hit in the cache, additional validation is required + * as the handle could be stale. + */ + cl_map_item_t *p_item = ips_cl_qmap_searchv(&cache->map, &key); + if (p_item->payload.start) { + // confirm the entry for sbuf matches the handle and is not stale + if (psm3_cuda_memhandle_cache_validate(cache, p_item, sbuf, handle, epid) == PSM2_OK) { + cache->stats->gpu_ipc_cache_hit++; + psm3_cuda_idleq_reorder(cache, p_item); + return p_item->payload.cuda_ipc_dev_ptr; + } + + // buffer found was stale psm3_cuda_memhandle_cache_validate() + // closed and removed existing entry. + // Should find no more duplicates +#ifdef PSM_DEBUG + p_item = ips_cl_qmap_searchv(&cache->map, &key); + psmi_assert(! p_item->payload.start); +#endif + } + cache->stats->gpu_ipc_cache_miss++; + + CUresult cudaerr = PSM3_CUDA_CALL_EXCEPT( + CUDA_ERROR_ALREADY_MAPPED, + cuIpcOpenMemHandle, + &cuda_ipc_dev_ptr, + *handle, + CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); + + if (cudaerr == CUDA_ERROR_ALREADY_MAPPED) { + // remote memory already mapped. Close all handles, clear cache, + // and try again + psm3_cuda_memhandle_cache_clear(cache); + PSM3_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr, *handle, + CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); + } + + psm3_cuda_memhandle_cache_register(cache, sbuf, handle, + epid, cuda_ipc_dev_ptr); + return cuda_ipc_dev_ptr; +} + +static void +psm3_cuda_memhandle_release(psm3_cuda_memhandle_cache_t cache, + CUdeviceptr cuda_ipc_dev_ptr) +{ + if(! cache) + PSM3_CUDA_CALL(cuIpcCloseMemHandle, cuda_ipc_dev_ptr); + return; +} +// end of CUDA IPC MemHandle Cache +//*************************************************************************** + + +// RTS and CTS processing functions for PSM3_DEVICES "shm" to pass +// Cuda IPC handles and permit use of NVLINK for intra-node transfers +static psm2_error_t psm3_cuda_shm_init(struct ptl_am *ptl, + psm2_mq_stats_t *stats) +{ + // TBD - should we have generic names for these env variables + // PSM3_GPU_MEMCACHE_ENABLED, PSM3_GPU_MEMCACHE_SIZE? 
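	// Usage sketch (the values are only examples): PSM3_CUDA_MEMCACHE_ENABLED=1
	// with PSM3_CUDA_MEMCACHE_SIZE=128 keeps up to 128 peer IPC mappings open
	// per shm ptl, evicting least-recently-used entries beyond that;
	// PSM3_CUDA_MEMCACHE_ENABLED=0 skips the cache, so every GPU RTS opens and
	// then closes its own mapping.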
+ union psmi_envvar_val env_memcache_enabled; + + psm3_getenv("PSM3_CUDA_MEMCACHE_ENABLED", + "PSM cuda ipc memhandle cache enabled (default is enabled)", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)1, &env_memcache_enabled); + if (env_memcache_enabled.e_uint) { + union psmi_envvar_val env_memcache_size; + + psm3_getenv("PSM3_CUDA_MEMCACHE_SIZE", + "Size of the cuda ipc memhandle cache ", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)CUDA_MEMHANDLE_CACHE_SIZE, + &env_memcache_size); + return psm3_cuda_memhandle_cache_alloc( + (psm3_cuda_memhandle_cache_t*)&ptl->memhandle_cache, + env_memcache_size.e_uint, stats); + } + return PSM2_OK; +} + +static void psm3_cuda_shm_finalize(struct ptl_am *ptl) +{ + if (ptl->memhandle_cache) + psm3_cuda_memhandle_cache_free((psm3_cuda_memhandle_cache_t)ptl->memhandle_cache); + ptl->memhandle_cache = NULL; + return; +} + +static psm2_error_t psm3_cuda_shm_epaddr_add(struct ptl_am *ptl, + struct am_epaddr *am_epaddr) +{ + // nothing to do + return PSM2_OK; +} + +static void psm3_cuda_shm_epaddr_free(struct am_epaddr *am_epaddr) +{ + // nothing to do +} + +static int psm3_cuda_shm_dev_fds_needed() +{ + // don't need to exchange dev_fds + return 0; +} + +static void psm3_cuda_shm_dev_fds_send(struct ptl_am *ptl, struct am_epaddr *am_epaddr) +{ + // nothing to do +} + +static psm2_error_t psm3_cuda_shm_dev_fds_connreq_poll(struct ptl_am *ptl, struct am_ptl_connection_req *req) +{ + // nothing to do + return PSM2_OK; +} + +static psm2_error_t psm3_cuda_shm_dev_fds_check_exchanged(struct ptl_am *ptl, struct am_ptl_connection_req *req, int index) +{ + // nothing to do + return PSM2_OK; +} + +static psm2_error_t psm3_cuda_shm_dev_fds_poll(struct ptl_am *ptl, psm2_error_t res) +{ + // nothing to do + return res; +} + +// On Sender, place the IPC handle in the RTS +// We put offset in the basic "args" parameters and the actual +// IPC handle as payload due to it's size +// Callers expect payload_size >0 when using GPU IPC and key off non-zero +// payload size in RTS to identify a GPU IPC RTS +// Save in the req the needed information about IPC resources allocated here +// so psm3_cuda_process_cts and release them. +static psm2_error_t psm3_cuda_shm_build_rts(struct ptl_am *ptl, + psm2_mq_req_t req, int *narg_p, + psm2_amarg_t *args, void **payload_p, size_t *payload_size_p, + union am_gpu_rts_payload *info) +{ + CUdeviceptr buf_base_ptr; + void *buf = req->req_data.buf; + PSM3_CUDA_CALL(cuMemGetAddressRange, &buf_base_ptr, NULL, (CUdeviceptr)buf); + + /* Offset in GPU buffer from which we copy data, we have to + * send it separetly because this offset is lost + * when cuIpcGetMemHandle is called */ + req->gpu_specific.cuda_ipc_offset = (uint32_t)((uintptr_t)buf - (uintptr_t)buf_base_ptr); + args[2].u32w0 = (uint32_t)req->gpu_specific.cuda_ipc_offset; + + PSM3_CUDA_CALL(cuIpcGetMemHandle, &req->gpu_specific.cuda_ipc_handle, (CUdeviceptr) buf); + *narg_p = 5; + *payload_p = (void*)&req->gpu_specific.cuda_ipc_handle; + *payload_size_p = sizeof(CUipcMemHandle); + req->gpu_specific.cuda_ipc_handle_attached = 1; + return PSM2_OK; +} + +// On receiver, pull IPC information out of the RTS which our peer build using +// psm3_cuda_shm_build_rts. 
Information is saved to the req for subsequent +// processing after tag matching via psm3_cuda_shm_rtsmatch +static void psm3_cuda_shm_process_rts(psm2_mq_req_t req, void *buf, size_t len, + int narg, psm2_amarg_t *args) +{ + req->gpu_specific.cuda_ipc_handle = *((CUipcMemHandle*)buf); + psmi_assert(len == sizeof(CUipcMemHandle)); + req->gpu_specific.cuda_ipc_handle_attached = 1; + req->gpu_specific.cuda_ipc_offset = args[2].u32w0; +} + +// On receiver, use GPU IPC to copy data from the sender to this process +// This is called on the receiver after psm3_cuda_process_rts has parsed the +// incoming RTS and tag matching has matched the RTS with a receive buffer and +// populated the req with information about the matched receiver buffer +static int psm3_cuda_shm_rtsmatch(struct ptl_am *ptl, psm2_mq_req_t req) +{ + if (req->gpu_specific.cuda_ipc_handle_attached) { + CUdeviceptr cuda_ipc_dev_ptr = psm3_cuda_memhandle_acquire( + ptl->memhandle_cache, + req->rts_sbuf - req->gpu_specific.cuda_ipc_offset, + (CUipcMemHandle*)&req->gpu_specific.cuda_ipc_handle, + req->rts_peer->epid); + cuda_ipc_dev_ptr = cuda_ipc_dev_ptr + req->gpu_specific.cuda_ipc_offset; + /* cuMemcpy into the receive side buffer + * based on its location */ + if (req->is_buf_gpu_mem) { + /*PSM3_GPU_MEMCPY_DTOD*/ + psm3_cuda_memcpy_DtoD(req->req_data.buf, (void*)cuda_ipc_dev_ptr, + req->req_data.recv_msglen); + //PSM3_GPU_SYNCHRONIZE_MEMCPY(); + psm3_cuda_synchronize_memcpy(); + } else { + /*PSM3_GPU_MEMCPY_DTOH*/ + psm3_cuda_memcpy_DtoH(req->req_data.buf, (void*)cuda_ipc_dev_ptr, + req->req_data.recv_msglen); + } + psm3_cuda_memhandle_release(ptl->memhandle_cache, + cuda_ipc_dev_ptr - req->gpu_specific.cuda_ipc_offset); + req->gpu_specific.cuda_ipc_handle_attached = 0; + return 1; + } + return 0; +} + +// On sender, we have now received the CTS corresponding to an RTS +// we may have built in psm3_cuda_build_rts. All we need to do here is release +// the resources we allocated in psm3_cuda_build_rts. We saved the necessary +// information tracking those resources in the send req. 
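// For reference, a simplified sketch of the shm GPU IPC flow these hooks form:
//   sender:   shm_build_rts()   - cuIpcGetMemHandle + offset placed in the RTS
//   receiver: shm_process_rts() - stash handle/offset in the matched req
//             shm_rtsmatch()    - acquire the mapping, cuMemcpyDtoD/DtoH into
//                                 the posted buffer, release the mapping
//   sender:   shm_process_cts() - drop per-req IPC state (nothing held for CUDA)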
+// Returns: +// 0 - the req was not for a GPU IO +// 1 - the req was for a GPU IO and we have released the resources +static int psm3_cuda_shm_process_cts(psm2_mq_req_t req) +{ + if (req->gpu_specific.cuda_ipc_handle_attached) { + // no need to release any Cuda resources + req->gpu_specific.cuda_ipc_handle_attached = 0; + return 1; + } + return 0; +} +// end of RTS and CTS processing functions for PSM3_DEVICES "shm" +//*************************************************************************** + +static psm2_error_t psm3_cuda_get_cuda_permitted(struct psm2_ep *ep, bool *enable) +{ + switch (psm3_cuda_permitted_enforcement) { + case PSM3_CPE_REJECT: + _HFI_DBG("GET(CUDA_PERMITTED) rejected\n"); + return PSM2_PARAM_ERR; + case PSM3_CPE_IGNORE: + case PSM3_CPE_OBEY: + *enable = ep->gpu_specific.cuda_permitted; + return PSM2_OK; + } + + _HFI_ERROR("PSM3_CUDA_PERMITTED_ENFORCEMENT invalid: %u\n", + psm3_cuda_permitted_enforcement); + return PSM2_PARAM_ERR; +} + +static psm2_error_t psm3_cuda_set_cuda_permitted(struct psm2_ep *ep, bool enable) +{ + switch (psm3_cuda_permitted_enforcement) { + case PSM3_CPE_REJECT: + _HFI_DBG("SET(CUDA_PERMITTED) rejected\n"); + return PSM2_PARAM_ERR; + case PSM3_CPE_IGNORE: + case PSM3_CPE_OBEY: + ep->gpu_specific.cuda_permitted = enable; + return PSM2_OK; + } + + _HFI_ERROR("PSM3_CUDA_PERMITTED_ENFORCEMENT invalid: %u\n", + psm3_cuda_permitted_enforcement); + return PSM2_PARAM_ERR; +} + +static bool psm3_cuda_is_memcpy_permitted(struct psm2_ep *ep) +{ + switch (psm3_cuda_permitted_enforcement) { + case PSM3_CPE_REJECT: + // REJECT behaves as though the CUDA_PERMITTED option doesn't exist, + // so behave as per legacy and allow memcpy + return true; + case PSM3_CPE_IGNORE: + // IGNORE behaves as though CUDA_PERMITTED is always true + return true; + case PSM3_CPE_OBEY: + // OBEY requires we honor the config set by the user + return ep->gpu_specific.cuda_permitted; + } + + _HFI_ERROR("PSM3_CUDA_PERMITTED_ENFORCEMENT invalid: %u\n", + psm3_cuda_permitted_enforcement); + return true; +} + +struct psm3_gpu_hal psm3_cuda_hal = { + .type = "cuda", +#ifdef PSM_HAVE_RNDV_MOD + .rv_major_rev_fail = 0, + .rv_minor_rev_fail = 0, + .rv_capability_expected = RV_CAP_NVIDIA_GPU, + .hal_cap_expected = PSM_HAL_CAP_NVIDIA_GPU, +#endif + .ghfp_initialize = psm3_cuda_initialize, + .ghfp_finalize = psm3_cuda_finalize, + .ghfp_ep_open = psm3_cuda_ep_open, + .ghfp_ep_close = psm3_cuda_ep_close, + .ghfp_identify = psm3_cuda_identify, + .ghfp_verify_GPU_capabilities = psm3_cuda_verify_GPU_capabilities, + .ghfp_p2p_supported = psm3_cuda_p2p_supported, + .ghfp_gpudirect_supported = psm3_cuda_gpudirect_supported, + .ghfp_using_rv_for_mrs = psm3_cuda_using_rv_for_mrs, + .ghfp_get_pci_addr = psm3_cuda_get_pci_addr, +#ifdef PSM_HAVE_RNDV_MOD + .ghfp_min_bar_size = psm3_cuda_min_bar_size, + .ghfp_check_phys_addr = psm3_cuda_check_phys_addr, + .ghfp_roundup_gdrcopy = psm3_cuda_roundup_gdrcopy, +#ifdef PSM_HAVE_REG_MR + .ghfp_roundup_rv_reg_mr = psm3_cuda_roundup_rv_reg_mr, + .ghfp_init_rv_reg_mr_params = psm3_cuda_init_rv_reg_mr_params, +#endif + .ghfp_init_rv_pin_mmap_params = psm3_cuda_init_rv_pin_mmap_params, + .ghfp_rv_reg_mmap_cleanup = psm3_cuda_rv_reg_mmap_cleanup, +#endif /* PSM_HAVE_RNDV_MOD */ +#ifdef PSM_HAVE_REG_MR + .ghfp_cmp_mr = psm3_cuda_cmp_mr, + .ghfp_init_mr = psm3_cuda_init_mr, +#endif + .ghfp_fetch_ctxt = psm3_cuda_fetch_ctxt, + .ghfp_refresh_ctxt = psm3_cuda_refresh_ctxt, + .ghfp_register_hostmem = psm3_cuda_register_hostmem, + .ghfp_unregister_hostmem = 
psm3_cuda_unregister_hostmem, + .ghfp_is_gpu_mem = psm3_cuda_is_gpu_mem, + .ghfp_prepare_HtoD_memcpys = psm3_cuda_prepare_HtoD_memcpys, + .ghfp_prepare_DtoH_memcpys = psm3_cuda_prepare_DtoH_memcpys, + .ghfp_shutdown_HtoD_memcpys = psm3_cuda_shutdown_HtoD_memcpys, + .ghfp_shutdown_DtoH_memcpys = psm3_cuda_shutdown_DtoH_memcpys, + .ghfp_memcpy_HtoD_start = psm3_cuda_memcpy_HtoD_start, + .ghfp_memcpy_DtoH_start = psm3_cuda_memcpy_DtoH_start, + .ghfp_memcpy_done = psm3_cuda_memcpy_done, + .ghfp_hostbuf_lazy_init = psm3_cuda_hostbuf_lazy_init, + .ghfp_hostbuf_reset = psm3_cuda_hostbuf_reset, + .ghfp_hostbuf_destroy = psm3_cuda_hostbuf_destroy, + .ghfp_memcpy_DtoD = psm3_cuda_memcpy_DtoD, + .ghfp_memcpy_HtoD = psm3_cuda_memcpy_HtoD, + .ghfp_memcpy_DtoH = psm3_cuda_memcpy_DtoH, + .ghfp_memcpy = psm3_cuda_memcpy, + .ghfp_synchronize_memcpy = psm3_cuda_synchronize_memcpy, + .ghfp_mark_buf_synchronous = psm3_cuda_mark_buf_synchronous, + .ghfp_host_alloc = psm3_cuda_host_alloc, + .ghfp_host_free = psm3_cuda_host_free, + .ghfp_gpu_addr_send_mr = psm3_cuda_gpu_addr_send_mr, + .ghfp_gpu_addr_recv_mr = psm3_cuda_gpu_addr_recv_mr, + // functions for PSM3_DEVICES "shm" RTS/CTS processing to enable + // use of GPU specific scale-up transfers within the given server + .ghfp_shm_init = psm3_cuda_shm_init, + .ghfp_shm_finalize = psm3_cuda_shm_finalize, + .ghfp_shm_epaddr_add = psm3_cuda_shm_epaddr_add, + .ghfp_shm_epaddr_free = psm3_cuda_shm_epaddr_free, + .ghfp_shm_dev_fds_needed = psm3_cuda_shm_dev_fds_needed, + .ghfp_shm_dev_fds_send = psm3_cuda_shm_dev_fds_send, + .ghfp_shm_dev_fds_connreq_poll = psm3_cuda_shm_dev_fds_connreq_poll, + .ghfp_shm_dev_fds_check_exchanged = psm3_cuda_shm_dev_fds_check_exchanged, + .ghfp_shm_dev_fds_poll = psm3_cuda_shm_dev_fds_poll, + .ghfp_shm_build_rts = psm3_cuda_shm_build_rts, + .ghfp_shm_process_rts = psm3_cuda_shm_process_rts, + .ghfp_shm_rtsmatch = psm3_cuda_shm_rtsmatch, + .ghfp_shm_process_cts = psm3_cuda_shm_process_cts, + .ghfp_get_cuda_permitted = psm3_cuda_get_cuda_permitted, + .ghfp_set_cuda_permitted = psm3_cuda_set_cuda_permitted, + .ghfp_is_memcpy_permitted = psm3_cuda_is_memcpy_permitted, +}; + +#endif /* PSM_CUDA */ diff --git a/prov/psm3/psm3/gpu/psm_gpu_hal.c b/prov/psm3/psm3/gpu/psm_gpu_hal.c new file mode 100644 index 00000000000..e2c24b90cd7 --- /dev/null +++ b/prov/psm3/psm3/gpu/psm_gpu_hal.c @@ -0,0 +1,422 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2024 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2024 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2024 Intel Corporation. All rights reserved. */ + +#include +#include +#include /* cpu_set */ +#include /* isalpha */ +#include + +#include "psm_user.h" + +#ifdef PSM_HAVE_GPU + +#ifdef PSM_HAVE_RNDV_MOD +#ifndef RV_CAP_GPU_DIRECT +#error "Inconsistent build. RV_CAP_GPU_DIRECT must be defined for GPU builds. Must use GPU enabled rv headers" +#endif +#include "psm2_hal.h" +#endif /* PSM_HAVE_RNDV_MOD */ + +int psm3_my_gpu_device; // up to 10 bits identifying GPU within server + +int psm3_gpu_is_gdr_copy_enabled; +uint32_t psm3_gpu_gdr_copy_limit_send; +uint32_t psm3_gpu_gdr_copy_limit_recv; +int psm3_gpu_is_gpudirect_enabled = 0; +int psm3_gpu_is_driver_gpudirect_enabled = 0; +uint32_t psm3_gpu_gpudirect_send_limit; + +/* All GPU transfers beyond this threshold use + * RNDV protocol. It is mostly a send side knob. + */ +uint32_t psm3_gpu_thresh_rndv; + +uint32_t psm3_gpu_gpudirect_rdma_send_limit; +uint32_t psm3_gpu_gpudirect_rdma_send_limit_default; + +uint32_t psm3_gpu_gpudirect_rdma_recv_limit; +uint32_t psm3_gpu_gpudirect_rdma_recv_limit_default; + +int psm3_gpu_is_driver_gpudirect_enabled; + +// default value for PSM3_GPU_RNDV_NIC_WINDOW +const char *psm3_gpu_rndv_nic_window_default = NULL; + +// default value for PSM3_MQ_RNDV_SHM_GPU_THRESH +// Threshold for GPU rendezvous (aka scale-up transfer vs via CPU shared mem +unsigned psm3_gpu_mq_rndv_shm_gpu_thresh_default; + +uint64_t psm3_gpu_cache_evict; // in bytes + +#ifdef PSM_HAVE_RNDV_MOD +void psm3_gpu_rv_cap_string(char *buf, size_t size, uint64_t capability) +{ + int offset = 0; + buf[0] = '\0'; + offset += snprintf(buf+offset, size-offset, (capability & RV_CAP_NVIDIA_GPU)?" cuda":""); + if (size > offset) { + offset += snprintf(buf+offset, size-offset, (capability & RV_CAP_INTEL_GPU)?" oneapi-ze":""); + } +} + +// Based on the RV capability supported, add to the ptl_ips HAL capability. +// Should only be called within an ptl_ips HAL once it has decided it will +// open rv. 
+void psm3_gpu_rv_set_hal_cap(uint64_t capability) +{ + if (capability & RV_CAP_NVIDIA_GPU & PSM3_GPU_RV_CAPABILITY_EXPECTED) + psmi_hal_add_cap(PSM_HAL_CAP_NVIDIA_GPU); + if (capability & RV_CAP_INTEL_GPU & PSM3_GPU_RV_CAPABILITY_EXPECTED) + psmi_hal_add_cap(PSM_HAL_CAP_INTEL_GPU); + +} + +static void psm3_gpu_roundup_gdrcopy(unsigned long buf, size_t size, + uintptr_t *pageaddr_p, uint64_t *pagelen_p) +{ + *pageaddr_p = (uintptr_t)buf; + *pagelen_p = (uint64_t)size; +} +#endif /* PSM_HAVE_RNDV_MOD */ + +uint32_t psm3_gpu_query_feature_mask(void) +{ + uint32_t res =0; +#ifdef PSM_CUDA + res |= PSM2_INFO_QUERY_FEATURE_CUDA; +#endif +#ifdef PSM_ONEAPI + res |= PSM2_INFO_QUERY_FEATURE_ONEAPI; +#endif + return res; +} + +// noop function for everything in HAL when no GPU selected +static psm2_error_t psm3_gpu_noop(void) +{ + return PSM2_OK; +} + +static int psm3_gpu_true(void) +{ + return 1; +} + +static int psm3_gpu_zero(void) +{ + return 0; +} + +#ifdef PSM_HAVE_RNDV_MOD +static uint64_t psm3_gpu_zero64(void) +{ + return 0; +} +#endif + +struct psm3_gpu_hal psm3_gpu_noop_hal = { + .type = "none", +#ifdef PSM_HAVE_RNDV_MOD + .rv_major_rev_fail = 0, + .rv_minor_rev_fail = 0, + .rv_capability_expected = 0, + .hal_cap_expected = 0, +#endif + .ghfp_initialize = (psm2_error_t (*)(void))psm3_gpu_noop, + .ghfp_finalize = (void (*)(void))psm3_gpu_noop, + .ghfp_ep_open = (void (*)(void))psm3_gpu_noop, + .ghfp_ep_close = (void (*)(void))psm3_gpu_noop, + .ghfp_identify = (void (*)(char *accel_vers, size_t size))psm3_gpu_noop, + .ghfp_verify_GPU_capabilities = (void (*)(void))psm3_gpu_noop, + .ghfp_p2p_supported = (int (*)(void))psm3_gpu_zero, + .ghfp_gpudirect_supported = (int (*)(void))psm3_gpu_zero, + .ghfp_using_rv_for_mrs = (void (*)(void))psm3_gpu_noop, + .ghfp_get_pci_addr = (void (*)(uint32_t *domain_p, uint32_t *bus_p, + uint32_t *dev_p, uint32_t *func_p))psm3_gpu_noop, +#ifdef PSM_HAVE_RNDV_MOD + .ghfp_min_bar_size = (uint64_t (*)(void))psm3_gpu_zero64, + .ghfp_check_phys_addr = (psm2_error_t (*)(uint64_t phys_addr))psm3_gpu_noop, + .ghfp_roundup_gdrcopy = (void (*)(unsigned long buf, size_t size, + uintptr_t *pageaddr_p, uint64_t *pagelen_p))psm3_gpu_roundup_gdrcopy, +#ifdef PSM_HAVE_REG_MR + .ghfp_roundup_rv_reg_mr = (void (*)(struct psm2_ep *ep, + void **addr_, uint64_t *length_p, int access))psm3_gpu_noop, + .ghfp_init_rv_reg_mr_params = (int (*)(void *addr, uint64_t length, int access, + struct rv_mem_params *mparams, + union psm3_verbs_mr_gpu_specific *gpu_specific, + union psm3_gpu_rv_reg_mmap_mem_scratchpad *scratchpad))psm3_gpu_zero, +#endif + .ghfp_init_rv_pin_mmap_params = (int (*)(void *addr, uint64_t length, int access, + struct rv_gpu_mem_params *params, + union psm3_gpu_rv_reg_mmap_mem_scratchpad *scratchpad))psm3_gpu_zero, + .ghfp_rv_reg_mmap_cleanup = (void (*)(void *addr, uint64_t length, int access, + union psm3_gpu_rv_reg_mmap_mem_scratchpad *scratchpad))psm3_gpu_noop, +#endif /* PSM_HAVE_RNDV_MOD */ +#ifdef PSM_HAVE_REG_MR + .ghfp_cmp_mr = (int (*)(const union psm3_verbs_mr_gpu_specific *a, + const union psm3_verbs_mr_gpu_specific *b))psm3_gpu_zero, + + .ghfp_init_mr = (void (*)(void *addr, uint64_t length, int access, + union psm3_verbs_mr_gpu_specific *gpu_specific))psm3_gpu_noop, +#endif + .ghfp_fetch_ctxt = (void (*)(void))psm3_gpu_noop, + .ghfp_refresh_ctxt = (void (*)(void))psm3_gpu_noop, + .ghfp_register_hostmem = (void (*)(void *buf, uint32_t size))psm3_gpu_noop, + .ghfp_unregister_hostmem = (void (*)(void *buf))psm3_gpu_noop, + .ghfp_is_gpu_mem = (int (*)(const 
void *ptr))psm3_gpu_zero, + .ghfp_prepare_HtoD_memcpys = (void (*)(struct ips_protoexp *protoexp))psm3_gpu_noop, + .ghfp_prepare_DtoH_memcpys = (void (*)(struct ips_proto *proto))psm3_gpu_noop, + .ghfp_shutdown_HtoD_memcpys = (void (*)(struct ips_protoexp *protoexp))psm3_gpu_noop, + .ghfp_shutdown_DtoH_memcpys = (void (*)(struct ips_proto *proto))psm3_gpu_noop, + .ghfp_memcpy_HtoD_start = (void (*)(struct ips_protoexp *protoexp, + struct ips_gpu_hostbuf *ghb, uint32_t len))psm3_gpu_noop, + .ghfp_memcpy_DtoH_start = (void (*)(struct ips_proto *proto, + struct ips_gpu_hostbuf *ghb, uint32_t len))psm3_gpu_noop, + .ghfp_memcpy_done = (int (*)(struct ips_gpu_hostbuf *ghb))psm3_gpu_true, + .ghfp_hostbuf_lazy_init = (void (*)(struct ips_gpu_hostbuf *ghb))psm3_gpu_noop, + .ghfp_hostbuf_reset = (void (*)(struct ips_gpu_hostbuf *ghb))psm3_gpu_noop, + .ghfp_hostbuf_destroy = (void (*)(struct ips_gpu_hostbuf *ghb))psm3_gpu_noop, + .ghfp_memcpy_DtoD = (void (*)(void *dstptr, const void *srcptr, uint32_t len))psm3_gpu_noop, + .ghfp_memcpy_HtoD = (void (*)(void *dstptr, const void *srcptr, uint32_t len))psm3_gpu_noop, + .ghfp_memcpy_DtoH = (void (*)(void *dstptr, const void *srcptr, uint32_t len))psm3_gpu_noop, + .ghfp_memcpy = (void (*)(void *dstptr, const void *srcptr, uint32_t len))psm3_gpu_noop, + .ghfp_synchronize_memcpy = (void (*)(void))psm3_gpu_noop, + .ghfp_mark_buf_synchronous = (void (*)(const void *buf))psm3_gpu_noop, + .ghfp_host_alloc = (void (*)(void **ret_ptr, uint32_t size))psm3_gpu_noop, + .ghfp_host_free = (void (*)(void *ptr))psm3_gpu_noop, + .ghfp_gpu_addr_send_mr = (int (*)(struct psm2_mq_req *mqreq))psm3_gpu_noop, + .ghfp_gpu_addr_recv_mr = (int (*)(struct ips_tid_recv_desc *tidrecvc, + int gpu_hostbuf_used))psm3_gpu_noop, + // functions for PSM3_DEVICES "shm" RTS/CTS processing to enable + // use of GPU specific scale-up transfers within the given server + .ghfp_shm_init = (psm2_error_t (*)(struct ptl_am *ptl, + psm2_mq_stats_t *stats))psm3_gpu_noop, + .ghfp_shm_finalize = (void (*)(struct ptl_am *ptl))psm3_gpu_noop, + .ghfp_shm_epaddr_add = (psm2_error_t (*)(struct ptl_am *ptl, + struct am_epaddr *am_epaddr))psm3_gpu_noop, + .ghfp_shm_epaddr_free = (void (*)(struct am_epaddr *am_epaddr))psm3_gpu_noop, + .ghfp_shm_dev_fds_needed = (int (*)(void))psm3_gpu_zero, + .ghfp_shm_dev_fds_send = (void (*)(struct ptl_am *ptl, + struct am_epaddr *am_epaddr))psm3_gpu_noop, + .ghfp_shm_dev_fds_connreq_poll = (psm2_error_t (*)(struct ptl_am *ptl, + struct am_ptl_connection_req *req))psm3_gpu_noop, + .ghfp_shm_dev_fds_check_exchanged = (psm2_error_t (*)(struct ptl_am *ptl, + struct am_ptl_connection_req *req, int index))psm3_gpu_noop, + .ghfp_shm_dev_fds_poll = (psm2_error_t (*)(struct ptl_am *ptl, psm2_error_t res))psm3_gpu_noop, + .ghfp_shm_build_rts = (psm2_error_t (*)(struct ptl_am *ptl, + psm2_mq_req_t req, int *narg_p, + psm2_amarg_t *args, void **payload_p, size_t *payload_size_p, + union am_gpu_rts_payload *info_p))psm3_gpu_noop, + .ghfp_shm_process_rts = (void (*)(psm2_mq_req_t req, void *buf, size_t len, + int narg, psm2_amarg_t *args))psm3_gpu_noop, + .ghfp_shm_rtsmatch = (int (*)(struct ptl_am *ptl, psm2_mq_req_t req))psm3_gpu_zero, + .ghfp_shm_process_cts = (int (*)(psm2_mq_req_t sreq))psm3_gpu_zero, + .ghfp_get_cuda_permitted = (psm2_error_t (*)(struct psm2_ep *ep, bool *enable))psm3_gpu_zero, + .ghfp_set_cuda_permitted = (psm2_error_t (*)(struct psm2_ep *ep, bool enable))psm3_gpu_zero, + .ghfp_is_memcpy_permitted = (bool (*)(struct psm2_ep *ep))psm3_gpu_zero, +}; + +struct 
psm3_gpu_hal *psm3_gpu_hal = &psm3_gpu_noop_hal; + +// parse additional options and threshholds for GPU data movement +static void psm3_gpu_env_init(void) +{ + int ret; + + union psmi_envvar_val env_enable_gdr_copy; + psm3_getenv("PSM3_GDRCOPY", + "Enable (set envvar to 1) for gdr copy support in PSM (Enabled by default)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)1, &env_enable_gdr_copy); + psm3_gpu_is_gdr_copy_enabled = env_enable_gdr_copy.e_int; + + union psmi_envvar_val env_gpu_thresh_rndv; + ret = psm3_getenv_range("PSM3_GPU_THRESH_RNDV", + "RNDV protocol is used for GPU send message sizes greater than the threshold", + NULL, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)psm3_gpu_thresh_rndv, + (union psmi_envvar_val)0, (union psmi_envvar_val)UINT32_MAX, + NULL, NULL, &env_gpu_thresh_rndv); + if (ret > 0) { // used default + /* + * For backward compatibility, check if the old variable name is set. + * Priority order: New name > old name > default value. + */ + psm3_getenv("PSM3_CUDA_THRESH_RNDV", + "[Deprecated, use PSM3_GPU_THRESH_RNDV]" + " RNDV protocol is used for GPU send message sizes greater than the threshold", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)psm3_gpu_thresh_rndv, + &env_gpu_thresh_rndv); + } + + psm3_gpu_thresh_rndv = env_gpu_thresh_rndv.e_uint; + + + union psmi_envvar_val env_gdr_copy_limit_send; + psm3_getenv("PSM3_GDRCOPY_LIMIT_SEND", + "GDR Copy is turned off on the send side" + " for message sizes greater than the limit" + " or larger than 1 MTU\n", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)GDR_COPY_LIMIT_SEND, &env_gdr_copy_limit_send); + psm3_gpu_gdr_copy_limit_send = env_gdr_copy_limit_send.e_int; + + if (psm3_gpu_gdr_copy_limit_send < 8 || psm3_gpu_gdr_copy_limit_send > psm3_gpu_thresh_rndv) + psm3_gpu_gdr_copy_limit_send = max(GDR_COPY_LIMIT_SEND, psm3_gpu_thresh_rndv); + + union psmi_envvar_val env_gdr_copy_limit_recv; + psm3_getenv("PSM3_GDRCOPY_LIMIT_RECV", + "GDR Copy is turned off on the recv side" + " for message sizes greater than the limit\n", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)GDR_COPY_LIMIT_RECV, &env_gdr_copy_limit_recv); + psm3_gpu_gdr_copy_limit_recv = env_gdr_copy_limit_recv.e_int; + + if (psm3_gpu_gdr_copy_limit_recv < 8) + psm3_gpu_gdr_copy_limit_recv = GDR_COPY_LIMIT_RECV; + + if (!psm3_gpu_is_gdr_copy_enabled) + psm3_gpu_gdr_copy_limit_send = psm3_gpu_gdr_copy_limit_recv = 0; +} + +psm2_error_t psm3_gpu_initialize(void) +{ +// TBD - what if customer exports CUDA and ONEAPI in a build with both? +// TBD - how to interpret GPU_DIRECT when build has both enabled? 
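// Selection sketch (summarizing the checks below, not additional logic):
//   PSM3_CUDA=1 or PSM3_GPUDIRECT nonzero      -> psm3_cuda_hal (CUDA builds)
//   PSM3_ONEAPI_ZE=1 or PSM3_GPUDIRECT nonzero -> psm3_oneapi_ze_hal (OneAPI builds)
//   neither                                    -> psm3_gpu_noop_hal stays selected
// In a build with both GPU types enabled, the OneAPI check runs second and
// would take precedence, which is part of what the TBDs here are about.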
+// maybe we need to have a HAL function to check if any devices available +#ifdef PSM_CUDA + union psmi_envvar_val env_enable_cuda; + + psm3_getenv("PSM3_CUDA", + "Enable (set envvar to 1) for cuda support in PSM (Disabled by default)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)0, &env_enable_cuda); + // order important, always parse gpudirect + if (psmi_parse_gpudirect() || env_enable_cuda.e_int) { + psm2_error_t err; + // establish HAL for Cuda + psm3_gpu_hal = &psm3_cuda_hal; + err = psm3_cuda_hal.ghfp_initialize(); + if (err != PSM2_OK) + return err; + psm3_gpu_env_init(); + } +#else /* PSM_CUDA */ + /* PSM3_CUDA is not allowed for this build, so we check it's + * presence but don't want to use psm3_getenv since we don't + * want it to appear in PSM3_VERBOSE_ENV help text + */ + int enable_cuda = 0; + if (psm3_parse_str_int(psm3_env_get("PSM3_CUDA"), &enable_cuda, + INT_MIN, INT_MAX) == -2 + || enable_cuda) { + _HFI_INFO("WARNING: PSM built without CUDA enabled, PSM3_CUDA unavailable\n"); + } +#endif /* PSM_CUDA */ +#ifdef PSM_ONEAPI + union psmi_envvar_val env_enable_oneapi; + psm3_getenv("PSM3_ONEAPI_ZE", + "Enable (set envvar to 1) for OneAPI Level Zero (ZE) support in PSM (Disabled by default)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)0, &env_enable_oneapi); + if (psmi_parse_gpudirect() || env_enable_oneapi.e_int) { + psm2_error_t err; + // establish HAL for Cuda + psm3_gpu_hal = &psm3_oneapi_ze_hal; + err = psm3_oneapi_ze_hal.ghfp_initialize(); + if (err != PSM2_OK) + return err; + psm3_gpu_env_init(); + } +#else /* PSM_ONEAPI */ + /* PSM3_ONEAPI_ZE is not allowed for this build, so we check it's + * presence but don't want to use psm3_getenv since we don't + * want it to appear in PSM3_VERBOSE_ENV help text + */ + int enable_oneapi = 0; + if (psm3_parse_str_int(psm3_env_get("PSM3_ONEAPI_ZE"), &enable_oneapi, + INT_MIN, INT_MAX) == -2 + || enable_oneapi) { + _HFI_INFO("WARNING: PSM built without ONEAPI_ZE enabled, PSM3_ONEAPI_ZE unavailable\n"); + } +#endif /* PSM_ONEAPI */ + return PSM2_OK; +} + +#else /* PSM_HAVE_GPU */ + +psm2_error_t psm3_gpu_initialize(void) +{ + /* PSM3_GPUDIRECT is not allowed for this build, so we check it's + * presence but don't want to use psm3_getenv since we don't + * want it to appear in PSM3_VERBOSE_ENV help text + * Note we check here, rather than in ips_proto_init, because + * PSM3_GPUDIERECT can enable GPU for ptl_am (shm) as well as ips, + * so if a user attempted a non-GPU build single node run with + * PSM3_GPUDIRECT=1 and expected GPU handling in shm, they would not + * get the behavior they expected + */ + unsigned int gpudirect = 0; + if (psm3_parse_str_uint(psm3_env_get("PSM3_GPUDIRECT"), &gpudirect, + 0, UINT_MAX) == -2 + || gpudirect) { + _HFI_INFO("WARNING: PSM built with neither ONEAPI_ZE nor CUDA enabled, PSM3_GPUDIRECT unavailable\n"); + } + return PSM2_OK; // just a warning, non-fatal +} + +#endif /* PSM_HAVE_GPU */ diff --git a/prov/psm3/psm3/gpu/psm_gpu_hal.h b/prov/psm3/psm3/gpu/psm_gpu_hal.h new file mode 100644 index 00000000000..dccf99032d7 --- /dev/null +++ b/prov/psm3/psm3/gpu/psm_gpu_hal.h @@ -0,0 +1,817 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2024 Intel Corporation. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2024 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2024 Intel Corporation. All rights reserved. */ + +#ifndef _PSMI_IN_USER_H +#error psm_gpu_hal.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSMI_GPU_HAL_H +#define _PSMI_GPU_HAL_H + +#ifdef PSM_HAVE_GPU + +#ifdef PSM_HAVE_RNDV_MOD +#include + +#if defined(PSM_ONEAPI) +#ifndef RV_IOCTL_CAPABILITY +// TBD we could have configure test this and disable PSM3_HAVE_RNDV_MOD +// or perhaps even disable/fail oneapi in configure +#error "PSM_ONEAPI requires rv_user_ioctls.h 1.3 (w/GPU 1.2) or later" +#endif +#endif + +/* we test *_GPU_DIRECT since those defines + * control the rv module ioctl header file interface + * This establishes the build time RV GPUs which could be supported. + */ +#if defined(NVIDIA_GPU_DIRECT) || defined(INTEL_GPU_DIRECT) + +#ifndef RV_CAP_GPU_DIRECT +#error "Inconsistent build. RV_CAP_GPU_DIRECT must be defined for GPU builds. 
Must use GPU enabled rv headers" +#endif + +#ifdef INTEL_GPU_DIRECT +#define PSM3_RV_GPU_TYPES_INTEL " oneapi-ze" +#else +#define PSM3_RV_GPU_TYPES_INTEL +#endif +#ifdef NVIDIA_GPU_DIRECT +#define PSM3_RV_GPU_TYPES_NVIDIA " cuda" +#else +#define PSM3_RV_GPU_TYPES_NVIDIA +#endif + +#define PSM3_RV_GPU_TYPES PSM3_RV_GPU_TYPES_INTEL PSM3_RV_GPU_TYPES_NVIDIA + +#define PSM3_GPU_FMT_RV_GPU_VER " gpu v%u.%u" PSM3_RV_GPU_TYPES +#define PSM3_GPU_OUT_RV_GPU_VER \ + , psm3_rv_get_gpu_user_major_bldtime_version() \ + , psm3_rv_get_gpu_user_minor_bldtime_version() +#else +#define PSM3_GPU_FMT_RV_GPU_VER +#define PSM3_GPU_OUT_RV_GPU_VER +#endif + +#endif /* PSM_HAVE_RNDV_MOD */ + + +#ifdef PSM_ONEAPI +#include +#include + +#define MAX_ZE_DEVICES 8 +#define PSM3_GPU_TYPES_ONEAPI " oneapi-ze" +#else +#define PSM3_GPU_TYPES_ONEAPI +#endif + +#ifdef PSM_CUDA +#include +#include +//#include +#define PSM3_GPU_TYPES_CUDA " cuda" +#else +#define PSM3_GPU_TYPES_CUDA +#endif + +// build time PSM3 GPU types included +#define PSM3_GPU_TYPES PSM3_GPU_TYPES_ONEAPI PSM3_GPU_TYPES_CUDA + +// GPU specific fields within psm2_ep_t +union psm2_ep_gpu_specific { +#ifdef PSM_CUDA + struct { + bool cuda_permitted; + }; +#endif /* PSM_CUDA */ +}; + +// GPU specific fields within psm2_mq_req for use during PSM3 shm IPC +union psm2_mq_req_gpu_specific { +#ifdef PSM_ONEAPI + struct { + union { + ze_ipc_mem_handle_t ze_ipc_handle; // for sender req + uint32_t ze_handle; // receiver req pidfd or gem_handle + }; + uint8_t ze_handle_attached; + uint8_t ze_alloc_type; + uint32_t ze_ipc_offset; +#ifndef PSM_HAVE_PIDFD + uint32_t ze_device_index; +#endif + uint64_t ze_alloc_id; + }; +#endif /* PSM_ONEAPI */ +#ifdef PSM_CUDA + struct { + CUipcMemHandle cuda_ipc_handle; + uint8_t cuda_ipc_handle_attached; + uint32_t cuda_ipc_offset; + }; +#endif /* PSM_CUDA */ +}; + +// GPU specific fields within ips_gpu_hostbuf.gpu_specific +// for use during PSM3 GPU Direct copy pipeline +union gpu_hostbuf_gpu_specific { +#ifdef PSM_ONEAPI + struct { + ze_event_pool_handle_t ze_event_pool; + ze_command_list_handle_t ze_command_lists[MAX_ZE_DEVICES]; + ze_event_handle_t ze_copy_status; + int ze_cur_dev_inx; + }; +#endif /* PSM_ONEAPI */ +#ifdef PSM_CUDA + struct { + CUevent cuda_copy_status; + }; +#endif /* PSM_CUDA */ +}; + +// GPU specific fields within ptl_am.gpu_specific +// for use during PSM3 shm IPC +union ptl_am_gpu_specific { +#ifdef PSM_ONEAPI + struct { +#ifndef PSM_HAVE_PIDFD + char *ze_listen_sockname; // /dev/shm filename for ze_ipc_socket + int ze_ipc_socket; // AF_UNIX listener sock to recv GPU Dev FDs + int ze_need_dev_fds_poll; // are there outstanding dev_fds to be polled +#endif + }; +#endif /* PSM_ONEAPI */ +#ifdef PSM_CUDA + struct { + // nothing needed + }; +#endif /* PSM_CUDA */ +}; + +// GPU specific fields within am_epaddr.gpu_specific +// for use during PSM3 shm IPC +union am_epaddr_gpu_specific { +#ifdef PSM_ONEAPI + struct { +#ifdef PSM_HAVE_PIDFD + int ze_pidfd; + int ze_pad; // align to 64 bits +#else + int ze_num_peer_fds; + int ze_peer_fds[MAX_ZE_DEVICES]; + int ze_sock_connected_state; + /* ze_sock_connected_state state definitions */ +#define ZE_SOCK_NOT_CONNECTED 0 +#define ZE_SOCK_DEV_FDS_SENT 1 +#define ZE_SOCK_DEV_FDS_SENT_AND_RECD 2 + int ze_sock; + int ze_pad; // align to 64 bits +#endif + }; +#endif /* PSM_ONEAPI */ +#ifdef PSM_CUDA + struct { + // nothing needed + }; +#endif /* PSM_CUDA */ +}; + +// GPU specific fields for use as RTS payload +// during PSM3 shm IPC +union am_gpu_rts_payload { +#ifdef PSM_ONEAPI 
+ struct am_oneapi_ze_rts_payload { + uint32_t ze_handle; /* GEM handle or file descriptor */ + uint8_t ze_alloc_type; /* allocation type */ + } ze; +#endif /* PSM_ONEAPI */ +#ifdef PSM_CUDA + struct { + // nothing needed + }; +#endif /* PSM_CUDA */ +}; + +// GPU specific fields within ips_protoexp.gpu_specific +// for use during PSM3 rendezvous RDMA +union ips_protoexp_gpu_specific { +#ifdef PSM_ONEAPI + struct { + /* Will not be usd if psm3_oneapi_immed_async_copy */ + ze_command_queue_handle_t ze_cq_recvs[MAX_ZE_DEVICES]; + }; +#endif /* PSM_ONEAPI */ +#ifdef PSM_CUDA + struct { + CUstream cudastream_recv; + }; +#endif /* PSM_CUDA */ +}; + +// GPU specific fields within ips_proto.gpu_specific +// for use during PSM3 rendezvous RDMA +union ips_proto_gpu_specific { +#ifdef PSM_ONEAPI + struct { + /* Will not be usd if psm3_oneapi_immed_async_copy */ + ze_command_queue_handle_t ze_cq_sends[MAX_ZE_DEVICES]; + }; +#endif /* PSM_ONEAPI */ +#ifdef PSM_CUDA + struct { + CUstream cudastream_send; + }; +#endif /* PSM_CUDA */ +}; + +#ifdef PSM_HAVE_REG_MR +// GPU specific fields within psm3_verbs_mr +union psm3_verbs_mr_gpu_specific { +#ifdef PSM_ONEAPI + struct { + uint64_t ze_alloc_id; + uint64_t ze_base_addr; + }; +#define PSM3_GPU_MRC_FMT " id %"PRIu64" base 0x%"PRIx64 +#define PSM3_GPU_OUT_MRC(gpu_specific) ,(gpu_specific)->ze_alloc_id, (gpu_specific)->ze_base_addr +#endif /* PSM_ONEAPI */ +#ifdef PSM_CUDA + struct { + // nothing needed + }; +#define PSM3_GPU_MRC_FMT "" +#define PSM3_GPU_OUT_MRC(gpu_specific) +#endif /* PSM_CUDA */ +}; +#endif /* PSM_HAVE_REG_MR */ + +#ifdef PSM_HAVE_RNDV_MOD +// scratch pad to save information needed in PSM3_GPU_RV_REG_MMAP_CLEANUP +// This holds transient information which is allocated during +// PSM3_GPU_INIT_RV_REG_MR_PARAMS and PSM3_GPU_INIT_RV_PIN_AND MMAP_PARAMS +// and then released via PSM3_GPU_RV_REG_MMAP_CLEANUP immediately +// after successful or failed RV registration or mmap +union psm3_gpu_rv_reg_mmap_mem_scratchpad { +#ifdef PSM_ONEAPI + struct { + ze_ipc_mem_handle_t ze_ipc_handle; + uint64_t ze_handle_fd; + }; +#endif /* PSM_ONEAPI */ +#ifdef PSM_CUDA + struct { + // nothing needed + }; +#endif /* PSM_CUDA */ +}; +#endif /* PSM_HAVE_RNDV_MOD */ + +struct psm2_ep; +struct ips_proto; +struct ips_protoexp; +struct ips_gpu_hostbuf; +struct ips_tid_recv_desc; +struct psm2_mq_req; +struct ptl_am;; +struct am_epaddr;; +struct am_ptl_connection_req; + + +extern int psm3_my_gpu_device; // up to 10 bits identifying GPU within server + +extern int psm3_gpu_is_gdr_copy_enabled; +/* This limit dictates when the sender turns off + * GDR Copy and uses SDMA. The limit needs to be less than equal + * GPU RNDV threshold (psm3_gpu_thresh_rndv) + * set to 0 if GDR Copy disabled + */ +extern uint32_t psm3_gpu_gdr_copy_limit_send; +/* This limit dictates when the reciever turns off + * GDR Copy. The limit needs to be less than equal + * GPU RNDV threshold (psm3_gpu_thresh_rndv) + * set to 0 if GDR Copy disabled + */ +extern uint32_t psm3_gpu_gdr_copy_limit_recv; +extern int psm3_gpu_is_gpudirect_enabled; // only for use during parsing of other params +extern int psm3_gpu_is_driver_gpudirect_enabled; // only for use during parsing of other params + +/* All GPU transfers beyond this threshold use + * RNDV protocol. It is mostly a send side knob. 
+ */ +extern uint32_t psm3_gpu_thresh_rndv; + +extern uint32_t psm3_gpu_gpudirect_rdma_send_limit; +extern uint32_t psm3_gpu_gpudirect_rdma_send_limit_default; + +extern uint32_t psm3_gpu_gpudirect_rdma_recv_limit; +extern uint32_t psm3_gpu_gpudirect_rdma_recv_limit_default; + +// default value for PSM3_GPU_RNDV_NIC_WINDOW +extern const char *psm3_gpu_rndv_nic_window_default; + +// default value for PSM3_MQ_RNDV_SHM_GPU_THRESH +// Threshold for GPU rendezvous (aka scale-up transfer vs via CPU shared mem) +extern unsigned psm3_gpu_mq_rndv_shm_gpu_thresh_default; + +extern uint64_t psm3_gpu_cache_evict; + +extern struct psm3_gpu_hal { + const char *type; +#ifdef PSM_HAVE_RNDV_MOD + uint16_t rv_major_rev_fail; + uint16_t rv_minor_rev_fail; + uint64_t rv_capability_expected; + uint32_t hal_cap_expected; +#endif + psm2_error_t (*ghfp_initialize)(void); + void (*ghfp_finalize)(void); + void (*ghfp_ep_open)(void); + void (*ghfp_ep_close)(void); + void (*ghfp_identify)(char *accel_vers, size_t size); + void (*ghfp_verify_GPU_capabilities)(void); + int (*ghfp_p2p_supported)(void); + int (*ghfp_gpudirect_supported)(void); + void (*ghfp_using_rv_for_mrs)(void); + void (*ghfp_get_pci_addr)(uint32_t *domain, uint32_t *bus, + uint32_t *dev, uint32_t *func); +#ifdef PSM_HAVE_RNDV_MOD + uint64_t (*ghfp_min_bar_size)(void); + psm2_error_t (*ghfp_check_phys_addr)(uint64_t phys_addr); + void (*ghfp_roundup_gdrcopy)(unsigned long buf, size_t size, + uintptr_t *pageaddr_p, uint64_t *pagelen_p); +#ifdef PSM_HAVE_REG_MR + void (*ghfp_roundup_rv_reg_mr)(struct psm2_ep *ep, + void **addr_p, uint64_t *length_p, int access); + int (*ghfp_init_rv_reg_mr_params)(void *addr, uint64_t length, int access, + struct rv_mem_params *mparams, + union psm3_verbs_mr_gpu_specific *gpu_specific, + union psm3_gpu_rv_reg_mmap_mem_scratchpad *scratchpad); +#endif + int (*ghfp_init_rv_pin_mmap_params)(void *addr, uint64_t length, int access, + struct rv_gpu_mem_params *params, + union psm3_gpu_rv_reg_mmap_mem_scratchpad *scratchpad); + void (*ghfp_rv_reg_mmap_cleanup)(void *addr, uint64_t length, int access, + union psm3_gpu_rv_reg_mmap_mem_scratchpad *scratchpad); +#endif /* PSM_HAVE_RNDV_MOD */ +#ifdef PSM_HAVE_REG_MR + int (*ghfp_cmp_mr)(const union psm3_verbs_mr_gpu_specific *a, + const union psm3_verbs_mr_gpu_specific *b); + void (*ghfp_init_mr)(void *addr, uint64_t length, int access, + union psm3_verbs_mr_gpu_specific *gpu_specific); +#endif + void (*ghfp_fetch_ctxt)(void); + void (*ghfp_refresh_ctxt)(void); + void (*ghfp_register_hostmem)(void *buf, uint32_t size); + void (*ghfp_unregister_hostmem)(void *buf); + int (*ghfp_is_gpu_mem)(const void *ptr); + void (*ghfp_prepare_HtoD_memcpys)(struct ips_protoexp *protoexp); + void (*ghfp_prepare_DtoH_memcpys)(struct ips_proto *proto); + void (*ghfp_shutdown_HtoD_memcpys)(struct ips_protoexp *protoexp); + void (*ghfp_shutdown_DtoH_memcpys)(struct ips_proto *proto); + void (*ghfp_memcpy_HtoD_start)(struct ips_protoexp *protoexp, + struct ips_gpu_hostbuf *ghb, uint32_t len); + void (*ghfp_memcpy_DtoH_start)(struct ips_proto *proto, + struct ips_gpu_hostbuf *ghb, uint32_t len); + int (*ghfp_memcpy_done)(struct ips_gpu_hostbuf *ghb); + void (*ghfp_hostbuf_lazy_init)(struct ips_gpu_hostbuf *ghb); + void (*ghfp_hostbuf_reset)(struct ips_gpu_hostbuf *ghb); + void (*ghfp_hostbuf_destroy)(struct ips_gpu_hostbuf *ghb); + void (*ghfp_memcpy_DtoD)(void *dstptr, const void *srcptr, uint32_t len); + void (*ghfp_memcpy_HtoD)(void *dstptr, const void *srcptr, uint32_t len); + void 
(*ghfp_memcpy_DtoH)(void *dstptr, const void *srcptr, uint32_t len); + void (*ghfp_memcpy)(void *dstptr, const void *srcptr, uint32_t len); + void (*ghfp_synchronize_memcpy)(void); + void (*ghfp_mark_buf_synchronous)(const void *buf); +// TBD should it be unsigned size instead? + void (*ghfp_host_alloc)(void **ret_ptr, uint32_t size); + void (*ghfp_host_free)(void *ptr); + // should the send buffer be treated as GPU memory + int (*ghfp_gpu_addr_send_mr)(struct psm2_mq_req *mqreq); + // should the recv buffer be treated as GPU memory + int (*ghfp_gpu_addr_recv_mr)(struct ips_tid_recv_desc *tidrecvc, + int gpu_hostbuf_used); + // functions for PSM3_DEVICES "shm" RTS/CTS processing to enable + // use of GPU specific scale-up transfers within the given server + psm2_error_t (*ghfp_shm_init)(struct ptl_am *ptl, + psm2_mq_stats_t *stats); + void (*ghfp_shm_finalize)(struct ptl_am *ptl); + psm2_error_t (*ghfp_shm_epaddr_add)(struct ptl_am *ptl, + struct am_epaddr *am_epaddr); + void (*ghfp_shm_epaddr_free)(struct am_epaddr *am_epaddr); + int (*ghfp_shm_dev_fds_needed)(void); + void (*ghfp_shm_dev_fds_send)(struct ptl_am *ptl, + struct am_epaddr *am_epaddr); + psm2_error_t (*ghfp_shm_dev_fds_connreq_poll)(struct ptl_am *ptl, + struct am_ptl_connection_req *req); + psm2_error_t (*ghfp_shm_dev_fds_check_exchanged)(struct ptl_am *ptl, + struct am_ptl_connection_req *req, int index); + psm2_error_t (*ghfp_shm_dev_fds_poll)(struct ptl_am *ptl, psm2_error_t res); + psm2_error_t (*ghfp_shm_build_rts)(struct ptl_am *ptl, + psm2_mq_req_t req, int *narg_p, + psm2_amarg_t *args, void **payload_p, size_t *payload_size_p, + union am_gpu_rts_payload *info_p); + void (*ghfp_shm_process_rts)(psm2_mq_req_t req, void *buf, size_t len, + int narg, psm2_amarg_t *args); + int (*ghfp_shm_rtsmatch)(struct ptl_am *ptl, psm2_mq_req_t req); + int (*ghfp_shm_process_cts)(psm2_mq_req_t sreq); + psm2_error_t (*ghfp_get_cuda_permitted)(struct psm2_ep *ep, bool *enable); + psm2_error_t (*ghfp_set_cuda_permitted)(struct psm2_ep *ep, bool enable); + bool (*ghfp_is_memcpy_permitted)(struct psm2_ep *ep); +} *psm3_gpu_hal; + +extern struct psm3_gpu_hal psm3_gpu_noop_hal; + +#ifdef PSM_CUDA +extern struct psm3_gpu_hal psm3_cuda_hal; +#endif + +#ifdef PSM_ONEAPI +extern struct psm3_gpu_hal psm3_oneapi_ze_hal; +#endif + +#ifdef PSM_HAVE_RNDV_MOD +extern void psm3_gpu_rv_cap_string(char *buf, size_t size, uint64_t capability); +extern void psm3_gpu_rv_set_hal_cap(uint64_t capability); +#endif + +extern uint32_t psm3_gpu_query_feature_mask(void); +extern psm2_error_t psm3_gpu_initialize(void); + +#define PSM3_GPU_TYPE (psm3_gpu_hal->type) + +#define PSM3_GPU_IS_ENABLED (psm3_gpu_hal != &psm3_gpu_noop_hal) + +#define PSM3_GPU_IS_GDR_COPY_ENABLED (psm3_gpu_is_gdr_copy_enabled) +#define PSM3_GPU_IS_DRIVER_GPUDIRECT_ENABLED (psm3_gpu_is_driver_gpudirect_enabled) + +// Only valid if called for a GPU buffer +#define PSMI_USE_GDR_COPY_RECV(len) \ + ((len) >=1 && (len) <= psm3_gpu_gdr_copy_limit_recv) + +#ifdef PSM_HAVE_RNDV_MOD +// RV GPU API version <= this unacceptable +#define PSM3_GPU_RV_MAJOR_REV_FAIL \ + (psm3_gpu_hal->rv_major_rev_fail) +#define PSM3_GPU_RV_MINOR_REV_FAIL \ + (psm3_gpu_hal->rv_minor_rev_fail) + +// capability bit corresponding to the GPU type which was selected by +// PSM3_GPU_INITIALIZE +#define PSM3_GPU_RV_CAPABILITY_EXPECTED \ + (psm3_gpu_hal->rv_capability_expected) +// ptl_ips HAL capability bit corresponding to the GPU type which was selected +// by PSM3_GPU_INITIALIZE +#define PSM3_GPU_HAL_CAP_EXPECTED \ + 
(psm3_gpu_hal->hal_cap_expected) + +// not a HAL function table call, +// return a string representing the GPU(s) +// supported by the given RV reported runtime capability mask +#define PSM3_GPU_RV_CAP_STRING(buf, size, capability) \ + psm3_gpu_rv_cap_string(buf, size, capability) + +// not a HAL function table call, +// Based on the RV capability supported, add to the ptl_ips HAL capability. +// Should only be called within an ptl_ips HAL once it has decided it will +// open rv. +#define PSM3_GPU_RV_SET_HAL_CAP(capability) \ + psm3_gpu_rv_set_hal_cap(capability) +#endif /* PSM_HAVE_RNDV_MOD */ + +// not a HAL function table call, +// indicates features available in the build of PSM3 +#define PSM3_GPU_QUERY_FEATURE_MASK() \ + psm3_gpu_query_feature_mask() + +// Initialization is unique, we will check which HALs are available +// and selected and setup psm3_gpu_hal and then initialize the +// selected HAL +#define PSM3_GPU_INITIALIZE() psm3_gpu_initialize() + +// These are all front ends to the GPU HAL function table +#define PSM3_GPU_FINALIZE() \ + (psm3_gpu_hal->ghfp_finalize)() +#define PSM3_GPU_EP_OPEN() \ + (psm3_gpu_hal->ghfp_ep_open)() +#define PSM3_GPU_EP_CLOSE() \ + (psm3_gpu_hal->ghfp_ep_close)() + +#define PSM3_GPU_IDENTIFY(accel_vers, size) \ + (psm3_gpu_hal->ghfp_identify)(accel_vers, size) +#define PSM3_GPU_VERIFY_CAPABILITIES() \ + (psm3_gpu_hal->ghfp_verify_GPU_capabilities)() +#define PSM3_GPU_P2P_SUPPORTED() \ + (psm3_gpu_hal->ghfp_p2p_supported)() +#define PSM3_GPU_GPUDIRECT_SUPPORTED() \ + (psm3_gpu_hal->ghfp_gpudirect_supported)() +#define PSM3_GPU_USING_RV_FOR_MRS() \ + (psm3_gpu_hal->ghfp_using_rv_for_mrs)() +#define PSM3_GPU_GET_PCI_ADDR(domain_p, bus_p, dev_p, func_p) \ + (psm3_gpu_hal->ghfp_get_pci_addr)(domain_p, bus_p, dev_p, func_p) +#ifdef PSM_HAVE_RNDV_MOD +#define PSM3_GPU_MIN_BAR_SIZE(void) \ + (psm3_gpu_hal->ghfp_min_bar_size)() +#define PSM3_GPU_CHECK_PHYS_ADDR(phys_addr) \ + (psm3_gpu_hal->ghfp_check_phys_addr)(phys_addr) +#define PSM3_GPU_ROUNDUP_GDRCOPY(buf, size, pageaddr_p, pagelen_p) \ + (psm3_gpu_hal->ghfp_roundup_gdrcopy)(buf, size, pageaddr_p, pagelen_p) +#ifdef PSM_HAVE_REG_MR +#define PSM3_GPU_ROUNDUP_RV_REG_MR(ep, addr_p, length_p, access) \ + (psm3_gpu_hal->ghfp_roundup_rv_reg_mr)(ep, addr_p, length_p, access) +#endif +#define PSM3_GPU_INIT_RV_REG_MR_PARAMS(addr, length, access, mparams, \ + gpu_specific, scratchpad) \ + (psm3_gpu_hal->ghfp_init_rv_reg_mr_params)(addr, length, access, \ + mparams, gpu_specific, scratchpad) +#define PSM3_GPU_INIT_RV_PIN_MMAP_PARAMS(addr, length, access, params, \ + scratchpad) \ + (psm3_gpu_hal->ghfp_init_rv_pin_mmap_params)(addr, length, access, \ + params, scratchpad) +#define PSM3_GPU_RV_REG_MMAP_CLEANUP(addr, length, access, scratchpad)\ + (psm3_gpu_hal->ghfp_rv_reg_mmap_cleanup)(addr, length, access, \ + scratchpad) +#endif /* PSM_HAVE_RNDV_MOD */ +#ifdef PSM_HAVE_REG_MR +#define PSM3_GPU_CMP_MR(a, b) \ + (psm3_gpu_hal->ghfp_cmp_mr)(a, b) +#define PSM3_GPU_INIT_MR(addr, length, access, gpu_specific) \ + (psm3_gpu_hal->ghfp_init_mr)(addr, length, access, gpu_specific) +#endif /* PSM_HAVE_RNDV_MOD */ +// if GPU HAL needs it, fetch current context of process and save internal to +// GPU HAL for use in later calls. Used by rcvthread at thread start +// to ensure GPU APIs have a context if needed +#define PSM3_GPU_FETCH_CTXT(void) \ + (psm3_gpu_hal->ghfp_fetch_ctxt)(void) +// if GPU HAL needs it, refresh current context of process based on copy +// internal to HAL. 
Used by rcvthread at thread interrupt callback +// to ensure GPU APIs have a context if needed +#define PSM3_GPU_REFRESH_CTXT(void) \ + (psm3_gpu_hal->ghfp_refresh_ctxt)(void) +// These calls permit the GPU specific code to preregister host memory +// which was malloc()'ed. This can speed up GPU memcpy for some GPUs +#define PSM3_GPU_REGISTER_HOSTMEM(buf, size) \ + (psm3_gpu_hal->ghfp_register_hostmem)(buf, size) +#define PSM3_GPU_UNREGISTER_HOSTMEM(buf) \ + (psm3_gpu_hal->ghfp_unregister_hostmem)(buf) +// TBD - this is called alot, but seems we need to use function ptr +// instead of macro, TBD if will affect latency, cost is probably +// in function called, not in actual call/ret overhead +#define PSM3_IS_GPU_MEM(ptr) \ + (psm3_gpu_hal->ghfp_is_gpu_mem)(ptr) +#define PSM3_IS_BUFFER_GPU_MEM(buf, len) ((len) && PSM3_IS_GPU_MEM(buf)) +#define PSM3_GPU_PREPARE_HTOD_MEMCPYS(protoexp) \ + (psm3_gpu_hal->ghfp_prepare_HtoD_memcpys)(protoexp) +#define PSM3_GPU_PREPARE_DTOH_MEMCPYS(proto) \ + (psm3_gpu_hal->ghfp_prepare_DtoH_memcpys)(proto) +#define PSM3_GPU_SHUTDOWN_HTOD_MEMCPYS(protoexp) \ + (psm3_gpu_hal->ghfp_shutdown_HtoD_memcpys)(protoexp) +#define PSM3_GPU_SHUTDOWN_DTOH_MEMCPYS(proto) \ + (psm3_gpu_hal->ghfp_shutdown_DtoH_memcpys)(proto) +#define PSM3_GPU_MEMCPY_HTOD_START(proto, ghb, len) \ + (psm3_gpu_hal->ghfp_memcpy_HtoD_start)(proto, ghb, len) +#define PSM3_GPU_MEMCPY_DTOH_START(proto, ghb, len) \ + (psm3_gpu_hal->ghfp_memcpy_DtoH_start)(proto, ghb, len) +#define PSM3_GPU_MEMCPY_DONE(ghb) \ + (psm3_gpu_hal->ghfp_memcpy_done)(ghb) +#define PSM3_GPU_HOSTBUF_LAZY_INIT(ghb) \ + (psm3_gpu_hal->ghfp_hostbuf_lazy_init)(ghb) +#define PSM3_GPU_HOSTBUF_RESET(ghb) \ + (psm3_gpu_hal->ghfp_hostbuf_reset)(ghb) +#define PSM3_GPU_HOSTBUF_DESTROY(ghb) \ + (psm3_gpu_hal->ghfp_hostbuf_destroy)(ghb) +#define PSM3_GPU_MEMCPY_DTOD(dstptr, srcptr, len) \ + (psm3_gpu_hal->ghfp_memcpy_DtoD)(dstptr, srcptr, len) +#define PSM3_GPU_MEMCPY_HTOD(dstptr, srcptr, len) \ + (psm3_gpu_hal->ghfp_memcpy_HtoD)(dstptr, srcptr, len) +#define PSM3_GPU_MEMCPY_DTOH(dstptr, srcptr, len) \ + (psm3_gpu_hal->ghfp_memcpy_DtoH)(dstptr, srcptr, len) +#define PSM3_GPU_MEMCPY(dstptr, srcptr, len) \ + (psm3_gpu_hal->ghfp_memcpy)(dstptr, srcptr, len) +#define PSM3_GPU_SYNCHRONIZE_MEMCPY() \ + (psm3_gpu_hal->ghfp_synchronize_memcpy)() +#define PSM3_GPU_MARK_BUF_SYNCHRONOUS(buf) \ + (psm3_gpu_hal->ghfp_mark_buf_synchronous)(buf) +#define PSM3_GPU_HOST_ALLOC(ret_ptr, size) \ + (psm3_gpu_hal->ghfp_host_alloc)(ret_ptr, size) +#define PSM3_GPU_HOST_FREE(ptr) \ + (psm3_gpu_hal->ghfp_host_free)(ptr) +#define PSM3_GPU_ADDR_SEND_MR(mqreq) \ + (psm3_gpu_hal->ghfp_gpu_addr_send_mr)(mqreq) +#define PSM3_GPU_ADDR_RECV_MR(tidrecvc, gpu_hostbuf_used) \ + (psm3_gpu_hal->ghfp_gpu_addr_recv_mr)(tidrecvc, gpu_hostbuf_used) +// functions for PSM3_DEVICES "shm" RTS/CTS processing to enable +// use of GPU specific scale-up transfers within the given server +#define PSM3_GPU_SHM_INIT(ptl, stats) \ + (psm3_gpu_hal->ghfp_shm_init)(ptl, stats) +#define PSM3_GPU_SHM_FINALIZE(ptl) \ + (psm3_gpu_hal->ghfp_shm_finalize)(ptl) +#define PSM3_GPU_SHM_EPADDR_ADD(ptl, am_epaddr) \ + (psm3_gpu_hal->ghfp_shm_epaddr_add)(ptl, am_epaddr) +#define PSM3_GPU_SHM_EPADDR_FREE(am_epaddr) \ + (psm3_gpu_hal->ghfp_shm_epaddr_free)(am_epaddr) +#define PSM3_GPU_SHM_DEV_FDS_NEEDED() \ + (psm3_gpu_hal->ghfp_shm_dev_fds_needed)() +#define PSM3_GPU_SHM_DEV_FDS_SEND(ptl, am_epaddr) \ + (psm3_gpu_hal->ghfp_shm_dev_fds_send)(ptl, am_epaddr) +#define PSM3_GPU_SHM_DEV_FDS_CONNREQ_POLL(ptl, req) \ + 
(psm3_gpu_hal->ghfp_shm_dev_fds_connreq_poll)(ptl, req)
+#define PSM3_GPU_SHM_DEV_FDS_CHECK_EXCHANGED(ptl, req, index) \
+	(psm3_gpu_hal->ghfp_shm_dev_fds_check_exchanged)(ptl, req, index)
+#define PSM3_GPU_SHM_DEV_FDS_POLL(ptl, res) \
+	(psm3_gpu_hal->ghfp_shm_dev_fds_poll)(ptl, res)
+#define PSM3_GPU_SHM_BUILD_RTS(ptl, req, narg_p, args, payload_p, payload_size_p, info_p) \
+	(psm3_gpu_hal->ghfp_shm_build_rts)(ptl, req, narg_p, args, payload_p, \
+			payload_size_p, info_p)
+#define PSM3_GPU_SHM_PROCESS_RTS(req, buf, len, narg, args) \
+	(psm3_gpu_hal->ghfp_shm_process_rts)(req, buf, len, narg, args)
+#define PSM3_GPU_SHM_RTSMATCH(ptl, req) \
+	(psm3_gpu_hal->ghfp_shm_rtsmatch)(ptl, req)
+#define PSM3_GPU_SHM_PROCESS_CTS(sreq) \
+	(psm3_gpu_hal->ghfp_shm_process_cts)(sreq)
+#define PSM3_GPU_GET_CUDA_PERMITTED(ep, enable) \
+	(psm3_gpu_hal->ghfp_get_cuda_permitted)(ep, enable)
+#define PSM3_GPU_SET_CUDA_PERMITTED(ep, enable) \
+	(psm3_gpu_hal->ghfp_set_cuda_permitted)(ep, enable)
+#define PSM3_GPU_IS_MEMCPY_PERMITTED(ep) \
+	(psm3_gpu_hal->ghfp_is_memcpy_permitted)(ep)
+
+#else /* PSM_HAVE_GPU */
+// GPU omitted from build
+
+#define PSM3_GPU_FMT_RV_GPU_VER
+#define PSM3_GPU_OUT_RV_GPU_VER
+
+#define PSM3_GPU_TYPES
+
+#define PSM3_GPU_IS_ENABLED (0)
+
+#define PSM3_GPU_IS_GDR_COPY_ENABLED (0)
+#define PSM3_GPU_IS_DRIVER_GPUDIRECT_ENABLED (0)
+
+#ifdef PSM_HAVE_RNDV_MOD
+#define PSM3_GPU_RV_MAJOR_REV_FAIL (0)
+#define PSM3_GPU_RV_MINOR_REV_FAIL (0)
+
+#define PSM3_GPU_RV_CAPABILITY_EXPECTED (0)
+#define PSM3_GPU_HAL_CAP_EXPECTED (0)
+
+// we output " gpu unknown" since this being called means RV supports a GPU
+// but PSM3 build does not
+#define PSM3_GPU_RV_CAP_STRING(buf, size, capability) \
+	(void)snprintf(buf, size, " gpu unknown");
+
+#define PSM3_GPU_RV_SET_HAL_CAP(capability) do { } while (0)
+#endif /* PSM_HAVE_RNDV_MOD */
+
+// this is unique, indicates features available in the build of PSM3
+#define PSM3_GPU_QUERY_FEATURE_MASK() (0)
+
+#define PSM3_GPU_TYPE "none"
+
+// Initialization is unique, we will check for GPU related parameters
+// and warn the user
+#define PSM3_GPU_INITIALIZE() (PSM2_OK)
+
+// These are all front ends to the GPU HAL function table
+// GPU omitted from build, so all HAL functions are no-ops
+// this avoids need for callers to check if GPU enabled and reduces clutter
+
+#define PSM3_GPU_FINALIZE() do { } while (0)
+#define PSM3_GPU_EP_OPEN() do { } while (0)
+#define PSM3_GPU_EP_CLOSE() do { } while (0)
+
+#define PSM3_GPU_IDENTIFY(accel_vers, size) \
+	do { accel_vers[0] = '\0'; } while (0)
+#define PSM3_GPU_VERIFY_CAPABILITIES() do { } while (0)
+#define PSM3_GPU_P2P_SUPPORTED() (0)
+#define PSM3_GPU_GPUDIRECT_SUPPORTED() (0)
+#define PSM3_GPU_USING_RV_FOR_MRS() do { } while (0)
+#define PSM3_GPU_IS_DRIVER_GPUDIRECT_DISABLED() (1)
+#define PSM3_GPU_GET_PCI_ADDR(domain_p, bus_p, dev_p, func_p) \
+	do { *domain_p = 0; *bus_p = 0; *dev_p = 0; *func_p = 0; } while (0)
+#ifdef PSM_HAVE_RNDV_MOD
+#define PSM3_GPU_MIN_BAR_SIZE(void) (0)
+#define PSM3_GPU_CHECK_PHYS_ADDR(phys_addr) (PSM2_OK)
+#define PSM3_GPU_ROUNDUP_GDRCOPY(buf, size, pageaddr_p, pagelen_p) \
+	do { *pageaddr_p = (uintptr_t)buf; *pagelen_p = (uint64_t)size; } while (0)
+#ifdef PSM_HAVE_REG_MR
+#define PSM3_GPU_ROUNDUP_RV_REG_MR(ep, addr_p, length_p, access) do { } while (0)
+#endif
+#define PSM3_GPU_INIT_RV_REG_MR_PARAMS(addr, length, access, mparams, \
+		gpu_specific, scratchpad) (0)
+#define PSM3_GPU_INIT_RV_PIN_MMAP_PARAMS(addr, length, access, params, \
+		scratchpad) (0)
+#define 
PSM3_GPU_RV_REG_MMAP_CLEANUP(addr, length, access, scratchpad)\ + do { } while (0) +#endif /* PSM_HAVE_RNDV_MOD */ +#ifdef PSM_HAVE_REG_MR +#define PSM3_GPU_CMP_MR(a, b) (0) +#define PSM3_GPU_INIT_MR(addr, length, access, gpu_specific) \ + do { } while (0) +#endif +#define PSM3_GPU_FETCH_CTXT() do { } while (0) +#define PSM3_GPU_REFRESH_CTXT() do { } while (0) +#define PSM3_GPU_REGISTER_HOSTMEM(buf, size) do { } while (0) +#define PSM3_GPU_UNREGISTER_HOSTMEM(buf) do { } while (0) +#define PSM3_IS_GPU_MEM(ptr) (0) +#define PSM3_IS_BUFFER_GPU_MEM(buf, len) (0) +// maybe some of these should be psmi_assert instead since should not +// be called if didn't have a GPU +#define PSM3_GPU_PREPARE_HTOD_MEMCPYS(protoexp) do { } while (0) +#define PSM3_GPU_PREPARE_DTOH_MEMCPYS(proto) do { } while (0) +#define PSM3_GPU_SHUTDOWN_HTOD_MEMCPYS(protoexp) do { } while (0) +#define PSM3_GPU_SHUTDOWN_DTOH_MEMCPYS(proto) do { } while (0) +#define PSM3_GPU_MEMCPY_HTOD_START(proto, ghb, len) do { } while (0) +#define PSM3_GPU_MEMCPY_DTOH_START(proto, ghb, len) do { } while (0) +#define PSM3_GPU_MEMCPY_DONE(ghb) (1) +#define PSM3_GPU_HOSTBUF_LAZY_INIT(ghb) do { } while (0) +#define PSM3_GPU_HOSTBUF_RESET(ghb) do { } while (0) +#define PSM3_GPU_HOSTBUF_DESTROY(ghb) do { } while (0) +#define PSM3_GPU_MEMCPY_DTOD(dstptr, srcptr, len) do { } while (0) +#define PSM3_GPU_MEMCPY_HTOD(dstptr, srcptr, len) do { } while (0) +#define PSM3_GPU_MEMCPY_DTOH(dstptr, srcptr, len) do { } while (0) +#define PSM3_GPU_MEMCPY(dstptr, srcptr, len) do { } while (0) +#define PSM3_GPU_SYNCHRONIZE_MEMCPY() do { } while (0) +#define PSM3_GPU_MARK_BUF_SYNCHRONOUS(buf) do { } while (0) +#define PSM3_GPU_HOST_ALLOC(ret_ptr, size) do { *(ret_ptr) = NULL; } while (0) +#define PSM3_GPU_HOST_FREE(ptr) do { } while (0) +#define PSM3_GPU_ADDR_SEND_MR(mqreq) (0) +#define PSM3_GPU_ADDR_RECV_MR(tidrecvc, gpu_hostbuf_used) (0) +// functions for PSM3_DEVICES "shm" RTS/CTS processing to enable +// use of GPU specific scale-up transfers within the given server +#define PSM3_GPU_SHM_INIT(ptl, stats) (PSM2_OK) +#define PSM3_GPU_SHM_FINALIZE(ptl) (PSM2_OK) +#define PSM3_GPU_SHM_EPADDR_ADD(ptl, amadddr) (PSM2_OK) +#define PSM3_GPU_SHM_EPADDR_FREE(amadddr) do { } while (0) +#define PSM3_GPU_SHM_DEV_FDS_NEEDED() (0) +#define PSM3_GPU_SHM_DEV_FDS_SEND(ptl, am_epaddr) do { } while (0) +#define PSM3_GPU_SHM_DEV_FDS_CONNREQ_POLL(ptl, req) (PSM2_OK) +#define PSM3_GPU_SHM_DEV_FDS_CHECK_EXCHANGED(ptl, req, index) (PSM2_OK) +#define PSM3_GPU_SHM_DEV_FDS_POLL(ptl, res) (res) +#define PSM3_GPU_SHM_BUILD_RTS(ptl, req, narg_p, args, payload_p, payload_size_p, info_p) \ + (PSM2_OK) +#define PSM3_GPU_SHM_PROCESS_RTS(req, buf, len, narg, args) \ + do { } while(0) +#define PSM3_GPU_SHM_RTSMATCH(ptl, req) (0) +#define PSM3_GPU_SHM_PROCESS_CTS(sreq) (0) +#define PSM3_GPU_GET_CUDA_PERMITTED(ep, enable) ({ *(enable) = true; PSM2_OK; }) +#define PSM3_GPU_SET_CUDA_PERMITTED(ep, enable) (PSM2_OK) +#define PSM3_GPU_IS_MEMCPY_PERMITTED(ep) (false) + +#endif /* PSM_HAVE_GPU */ + +#endif /* _PSMI_GPU_HAL_H */ diff --git a/prov/psm3/psm3/gpu/psm_gpu_oneapi_ze.c b/prov/psm3/psm3/gpu/psm_gpu_oneapi_ze.c new file mode 100644 index 00000000000..98e20d86cd8 --- /dev/null +++ b/prov/psm3/psm3/gpu/psm_gpu_oneapi_ze.c @@ -0,0 +1,3548 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2024 Intel Corporation. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2024 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2024 Intel Corporation. All rights reserved. 
*/ + +#include +#include +#include /* cpu_set */ +#include /* isalpha */ +#include + +#include "psm_user.h" + +#ifdef PSM_ONEAPI +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "psm2_hal.h" +#include "psm_mq_internal.h" +#include "ptl_am/psm_am_internal.h" +#include "ptl_ips/ips_proto.h" +#include "ptl_ips/ips_expected_proto.h" +#include "psmi_wrappers.h" +#include +#ifdef HAVE_DRM +#include +#include +#endif +#ifdef HAVE_LIBDRM +#include +#include +#endif +#ifdef PSM_HAVE_PIDFD +#include +#endif + +// if defined, use malloc for pipeline copy bounce buffers +// otherwise, use zeMemAllocHost +//#define PSM3_USE_ONEAPI_MALLOC + +// if defined, do not use zexDriverImportExternalPointer for malloced pipeline +// copy bounce buffers +// otherwise, use zexDriverImportExternalPointer when malloc buffer +//#define PSM3_NO_ONEAPI_IMPORT + +// default value for PSM3_GPU_THRESH_RNDV +#define PSM3_ONEAPI_ZE_GPU_THRESH_RNDV 8000 +// default value for PSM3_GPU_RNDV_NIC_WINDOW when using OneApi Level Zero GPU +#define PSM3_ONEAPI_ZE_RNDV_NIC_WINDOW_DEFAULT "131072:524287,262144:1048575,524288" +// default value for PSM3_GPUDIRECT_RDMA_SEND_LIMIT +#define PSM3_ONEAPI_ZE_GPUDIRECT_RDMA_SEND_LIMIT_DEFAULT UINT_MAX +// default value for PSM3_GPUDIRECT_RDMA_RECV_LIMIT +#define PSM3_ONEAPI_ZE_GPUDIRECT_RDMA_RECV_LIMIT_DEFAULT 1 +// default value for PSM3_MQ_RNDV_SHM_GPU_THRESH +// Threshold for GPU rendezvous (aka scale-up transfer vs via CPU shared mem +#define PSM3_ONEAPI_ZE_MQ_RNDV_SHM_GPU_THRESH 127 + +struct psm3_oneapi_ze_dev_ctxt { + ze_device_handle_t dev; + int dev_index; /* Index in psm3_oneapi_ze_devices[] */ + uint32_t ordinal; /* CmdQGrp ordinal for the 1st copy_only engine */ + uint32_t index; /* Cmdqueue index within the CmdQGrp */ + uint32_t num_queues; /* Number of queues in the CmdQGrp */ + // for most sync copies + ze_command_queue_handle_t cq; // NULL if psm3_oneapi_ze_immed_sync_copy + ze_command_list_handle_t cl; + // fields below are only used for large DTOD sync copy so can do 2 + // parallel async copies then wait for both + ze_event_handle_t copy_status0; + ze_event_handle_t copy_status1; + ze_command_list_handle_t async_cl0; + ze_command_list_handle_t async_cl1; + ze_command_queue_handle_t async_cq0;// NULL if psm3_oneapi_ze_immed_sync_copy + ze_command_queue_handle_t async_cq1;// NULL if psm3_oneapi_ze_immed_sync_copy + ze_event_pool_handle_t event_pool; +}; + +static ze_driver_handle_t psm3_oneapi_ze_driver = NULL; +static struct psm3_oneapi_ze_dev_ctxt psm3_oneapi_ze_devices[MAX_ZE_DEVICES]; +static int psm3_num_oneapi_ze_devices = 0; +static struct psm3_oneapi_ze_dev_ctxt *psm3_oneapi_ze_cur_dev = NULL; + +/* ZE Loader(zel) And Runtime(ze) Library */ +static void *psm3_oneapi_ze_lib; +static ze_api_version_t psm3_oneapi_ze_api_version = 0; +static zel_version_t psm3_oneapi_ze_lib_version = { }; + +/* This is a global oneapi_ze context + */ +static ze_context_handle_t psm3_oneapi_ze_context = NULL; + +#ifndef PSM_HAVE_PIDFD +static int psm3_ze_dev_fds[MAX_ZE_DEVICES]; +static int psm3_num_ze_dev_fds; +#endif +static int psm3_oneapi_ze_immed_sync_copy; +static int psm3_oneapi_ze_immed_async_copy; +static unsigned psm3_oneapi_parallel_dtod_copy_thresh; + +#ifdef PSM_HAVE_RNDV_MOD +// PSM3_RV_GPU_IGNORE_ALLOC_ID allows internal testing of GPU caching in RV +// =0 -> default, alloc_id used to identify new buffers which have same +// virt addr as an existing cache entry. 
In which case a cache miss +// and invalidation of the old cache entry occurs. +// =1 -> an alloc_id of 0 is always used. This has been demonstrated to +// cause false cache hits which can lead to landing data in safe but +// incorrect pages. Useful only for development experiments and tests. +// =2 -> for cache miss performance testing. This will use a different alloc_id +// per IO which will force cache invalidation on every IO. So no +// MR/mmap cache hits will occur, but all the normal MR handling will +// occur just as if there was a miss when running in normal mode +static int psm3_oneapi_ze_ignore_alloc_id; // PSM3_RV_GPU_IGNORE_ALLOC_ID +static uint64_t psm3_oneapi_ze_fake_alloc_id; // for when PSM3_RV_GPU_IGNORE_ALLOC_ID==2 +#endif + +/* function pointers from dlopen access to oneapi_Ze shared library */ +#define PSM3_ZE_SYM_FP(name) PSM3_CONCAT(psm3_oneapi_ze_, name) +static ze_result_t (*PSM3_ZE_SYM_FP(zeInit))(ze_init_flags_t flags); +static ze_result_t (*PSM3_ZE_SYM_FP(zeDriverGet))(uint32_t *pCount, ze_driver_handle_t *phDrivers); +static ze_result_t (*PSM3_ZE_SYM_FP(zeDeviceGet))(ze_driver_handle_t hDriver, uint32_t *pCount, ze_device_handle_t *phDevices); +static ze_result_t (*PSM3_ZE_SYM_FP(zeDevicePciGetPropertiesExt))(ze_device_handle_t hDevice, ze_pci_ext_properties_t *pPciProperties); +#ifndef PSM3_NO_ONEAPI_IMPORT +static ze_result_t (*PSM3_ZE_SYM_FP(zeDriverGetExtensionFunctionAddress))(ze_driver_handle_t hDriver, const char *name, void **ppFunctionAddress); +static ze_result_t (*PSM3_ZE_SYM_FP(zexDriverImportExternalPointer))(ze_driver_handle_t hDriver, void *ptr, size_t size); +static ze_result_t (*PSM3_ZE_SYM_FP(zexDriverReleaseImportedPointer))(ze_driver_handle_t hDriver, void *ptr); +#endif +static ze_result_t (*PSM3_ZE_SYM_FP(zeContextCreate))(ze_driver_handle_t hDriver, const ze_context_desc_t *desc, ze_context_handle_t *phContext); +static ze_result_t (*PSM3_ZE_SYM_FP(zeContextDestroy))(ze_context_handle_t hContext); +static ze_result_t (*PSM3_ZE_SYM_FP(zeCommandQueueCreate))(ze_context_handle_t hContext, ze_device_handle_t hDevice,const ze_command_queue_desc_t *desc, ze_command_queue_handle_t *phCommandQueue); +static ze_result_t (*PSM3_ZE_SYM_FP(zeCommandQueueDestroy))(ze_command_queue_handle_t hCommandQueue); +static ze_result_t (*PSM3_ZE_SYM_FP(zeCommandQueueExecuteCommandLists))(ze_command_queue_handle_t hCommandQueue, uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, ze_fence_handle_t hFence); +static ze_result_t (*PSM3_ZE_SYM_FP(zeCommandQueueSynchronize))(ze_command_queue_handle_t hCommandQueue, uint64_t timeout); +static ze_result_t (*PSM3_ZE_SYM_FP(zeCommandListCreate))(ze_context_handle_t hContext, ze_device_handle_t hDevice, const ze_command_list_desc_t *desc, ze_command_list_handle_t *phCommandList); +static ze_result_t (*PSM3_ZE_SYM_FP(zeCommandListDestroy))(ze_command_list_handle_t hCommandList); +static ze_result_t (*PSM3_ZE_SYM_FP(zeCommandListClose))(ze_command_list_handle_t hCommandList); +static ze_result_t (*PSM3_ZE_SYM_FP(zeCommandListReset))(ze_command_list_handle_t hCommandList); +static ze_result_t (*PSM3_ZE_SYM_FP(zeCommandListCreateImmediate))(ze_context_handle_t hContext, ze_device_handle_t hDevice, const ze_command_queue_desc_t *desc, ze_command_list_handle_t *phCommandList); +static ze_result_t (*PSM3_ZE_SYM_FP(zeCommandListAppendMemoryCopy))(ze_command_list_handle_t hCommandList, void *dstptr, const void *srcptr, size_t size, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t 
*phWaitEvents); +static ze_result_t (*PSM3_ZE_SYM_FP(zeCommandListAppendSignalEvent))(ze_command_list_handle_t hCommandList, ze_event_handle_t hEvent); +static ze_result_t (*PSM3_ZE_SYM_FP(zeDeviceCanAccessPeer))(ze_device_handle_t hDevice, ze_device_handle_t hPeerDevice, ze_bool_t *value); +static ze_result_t (*PSM3_ZE_SYM_FP(zeDeviceGetCommandQueueGroupProperties))(ze_device_handle_t hDevice, uint32_t *pCount, ze_command_queue_group_properties_t *pCommandQueueGroupProperties); +static ze_result_t (*PSM3_ZE_SYM_FP(zeMemAllocHost))(ze_context_handle_t hContext, const ze_host_mem_alloc_desc_t *host_desc, size_t size, size_t alignment, void **pptr); +static ze_result_t (*PSM3_ZE_SYM_FP(zeMemAllocDevice))(ze_context_handle_t hContext, const ze_device_mem_alloc_desc_t *device_desc, size_t size, size_t alignment, ze_device_handle_t hDevice, void **pptr); +static ze_result_t (*PSM3_ZE_SYM_FP(zeMemFree))(ze_context_handle_t hContext, void *ptr); +static ze_result_t (*PSM3_ZE_SYM_FP(zeMemGetIpcHandle))(ze_context_handle_t hContext, const void *ptr, ze_ipc_mem_handle_t *pIpcHandle); +#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE +static ze_result_t (*PSM3_ZE_SYM_FP(zeMemPutIpcHandle))(ze_context_handle_t hContext, ze_ipc_mem_handle_t handle); +#endif +static ze_result_t (*PSM3_ZE_SYM_FP(zeMemOpenIpcHandle))(ze_context_handle_t hContext,ze_device_handle_t hDevice, ze_ipc_mem_handle_t handle, ze_ipc_memory_flags_t flags, void **pptr); +static ze_result_t (*PSM3_ZE_SYM_FP(zeMemCloseIpcHandle))(ze_context_handle_t hContext, const void *ptr); +static ze_result_t (*PSM3_ZE_SYM_FP(zeMemGetAddressRange))(ze_context_handle_t hContext, const void *ptr, void **pBase, size_t *pSize); +static ze_result_t (*PSM3_ZE_SYM_FP(zeMemGetAllocProperties))(ze_context_handle_t hContext, const void *ptr, ze_memory_allocation_properties_t *pMemAllocProperties, ze_device_handle_t *phDevice); +static ze_result_t (*PSM3_ZE_SYM_FP(zeEventPoolCreate))(ze_context_handle_t hContext, const ze_event_pool_desc_t *desc, uint32_t numDevices, ze_device_handle_t *phDevices, ze_event_pool_handle_t *phEventPool); +static ze_result_t (*PSM3_ZE_SYM_FP(zeEventPoolDestroy))(ze_event_pool_handle_t hEventPool); +static ze_result_t (*PSM3_ZE_SYM_FP(zeEventCreate))(ze_event_pool_handle_t hEventPool, const ze_event_desc_t *desc, ze_event_handle_t *phEvent); +static ze_result_t (*PSM3_ZE_SYM_FP(zeEventDestroy))(ze_event_handle_t hEvent); +static ze_result_t (*PSM3_ZE_SYM_FP(zeEventQueryStatus))(ze_event_handle_t hEvent); +static ze_result_t (*PSM3_ZE_SYM_FP(zeEventHostSynchronize))(ze_event_handle_t hEvent, uint64_t timeout); +static ze_result_t (*PSM3_ZE_SYM_FP(zeEventHostReset))(ze_event_handle_t hEvent); +static ze_result_t (*PSM3_ZE_SYM_FP(zelLoaderGetVersions))(size_t *num_elems, zel_component_version_t *versions); + +/* statistics counting each oneapi_ze call PSM3 makes */ +#define PSM3_ZE_SYM_COUNT(name) PSM3_CONCAT(psm3_oneapi_ze_count_, name) +static uint64_t PSM3_ZE_SYM_COUNT(zeInit); +static uint64_t PSM3_ZE_SYM_COUNT(zeDriverGet); +static uint64_t PSM3_ZE_SYM_COUNT(zeDeviceGet); +static uint64_t PSM3_ZE_SYM_COUNT(zeDevicePciGetPropertiesExt); +#ifndef PSM3_NO_ONEAPI_IMPORT +static uint64_t PSM3_ZE_SYM_COUNT(zeDriverGetExtensionFunctionAddress); +static uint64_t PSM3_ZE_SYM_COUNT(zexDriverImportExternalPointer); +static uint64_t PSM3_ZE_SYM_COUNT(zexDriverReleaseImportedPointer); +#endif +static uint64_t PSM3_ZE_SYM_COUNT(zeContextCreate); +static uint64_t PSM3_ZE_SYM_COUNT(zeContextDestroy); +static uint64_t 
PSM3_ZE_SYM_COUNT(zeCommandQueueCreate); +static uint64_t PSM3_ZE_SYM_COUNT(zeCommandQueueDestroy); +static uint64_t PSM3_ZE_SYM_COUNT(zeCommandQueueExecuteCommandLists); +static uint64_t PSM3_ZE_SYM_COUNT(zeCommandQueueSynchronize); +static uint64_t PSM3_ZE_SYM_COUNT(zeCommandListCreate); +static uint64_t PSM3_ZE_SYM_COUNT(zeCommandListDestroy); +static uint64_t PSM3_ZE_SYM_COUNT(zeCommandListClose); +static uint64_t PSM3_ZE_SYM_COUNT(zeCommandListReset); +static uint64_t PSM3_ZE_SYM_COUNT(zeCommandListCreateImmediate); +static uint64_t PSM3_ZE_SYM_COUNT(zeCommandListAppendMemoryCopy); +static uint64_t PSM3_ZE_SYM_COUNT(zeCommandListAppendSignalEvent); +static uint64_t PSM3_ZE_SYM_COUNT(zeDeviceCanAccessPeer); +static uint64_t PSM3_ZE_SYM_COUNT(zeDeviceGetCommandQueueGroupProperties); +static uint64_t PSM3_ZE_SYM_COUNT(zeMemAllocHost); +static uint64_t PSM3_ZE_SYM_COUNT(zeMemAllocDevice); +static uint64_t PSM3_ZE_SYM_COUNT(zeMemFree); +static uint64_t PSM3_ZE_SYM_COUNT(zeMemGetIpcHandle); +#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE +static uint64_t PSM3_ZE_SYM_COUNT(zeMemPutIpcHandle); +#endif +static uint64_t PSM3_ZE_SYM_COUNT(zeMemOpenIpcHandle); +static uint64_t PSM3_ZE_SYM_COUNT(zeMemCloseIpcHandle); +static uint64_t PSM3_ZE_SYM_COUNT(zeMemGetAddressRange); +static uint64_t PSM3_ZE_SYM_COUNT(zeMemGetAllocProperties); +static uint64_t PSM3_ZE_SYM_COUNT(zeEventPoolCreate); +static uint64_t PSM3_ZE_SYM_COUNT(zeEventPoolDestroy); +static uint64_t PSM3_ZE_SYM_COUNT(zeEventCreate); +static uint64_t PSM3_ZE_SYM_COUNT(zeEventDestroy); +static uint64_t PSM3_ZE_SYM_COUNT(zeEventQueryStatus); +static uint64_t PSM3_ZE_SYM_COUNT(zeEventHostSynchronize); +static uint64_t PSM3_ZE_SYM_COUNT(zeEventHostReset); +static uint64_t PSM3_ZE_SYM_COUNT(zelLoaderGetVersions); + +static const char* psm3_oneapi_ze_result_to_string(const ze_result_t result) { +#define ZE_RESULT_CASE(RES) case ZE_RESULT_##RES: return STRINGIFY(RES) + + switch (result) { + ZE_RESULT_CASE(SUCCESS); + ZE_RESULT_CASE(NOT_READY); + ZE_RESULT_CASE(ERROR_UNINITIALIZED); + ZE_RESULT_CASE(ERROR_DEVICE_LOST); + ZE_RESULT_CASE(ERROR_INVALID_ARGUMENT); + ZE_RESULT_CASE(ERROR_OUT_OF_HOST_MEMORY); + ZE_RESULT_CASE(ERROR_OUT_OF_DEVICE_MEMORY); + ZE_RESULT_CASE(ERROR_MODULE_BUILD_FAILURE); + ZE_RESULT_CASE(ERROR_INSUFFICIENT_PERMISSIONS); + ZE_RESULT_CASE(ERROR_NOT_AVAILABLE); + ZE_RESULT_CASE(ERROR_UNSUPPORTED_VERSION); + ZE_RESULT_CASE(ERROR_UNSUPPORTED_FEATURE); + ZE_RESULT_CASE(ERROR_INVALID_NULL_HANDLE); + ZE_RESULT_CASE(ERROR_HANDLE_OBJECT_IN_USE); + ZE_RESULT_CASE(ERROR_INVALID_NULL_POINTER); + ZE_RESULT_CASE(ERROR_INVALID_SIZE); + ZE_RESULT_CASE(ERROR_UNSUPPORTED_SIZE); + ZE_RESULT_CASE(ERROR_UNSUPPORTED_ALIGNMENT); + ZE_RESULT_CASE(ERROR_INVALID_SYNCHRONIZATION_OBJECT); + ZE_RESULT_CASE(ERROR_INVALID_ENUMERATION); + ZE_RESULT_CASE(ERROR_UNSUPPORTED_ENUMERATION); + ZE_RESULT_CASE(ERROR_UNSUPPORTED_IMAGE_FORMAT); + ZE_RESULT_CASE(ERROR_INVALID_NATIVE_BINARY); + ZE_RESULT_CASE(ERROR_INVALID_GLOBAL_NAME); + ZE_RESULT_CASE(ERROR_INVALID_KERNEL_NAME); + ZE_RESULT_CASE(ERROR_INVALID_FUNCTION_NAME); + ZE_RESULT_CASE(ERROR_INVALID_GROUP_SIZE_DIMENSION); + ZE_RESULT_CASE(ERROR_INVALID_GLOBAL_WIDTH_DIMENSION); + ZE_RESULT_CASE(ERROR_INVALID_KERNEL_ARGUMENT_INDEX); + ZE_RESULT_CASE(ERROR_INVALID_KERNEL_ARGUMENT_SIZE); + ZE_RESULT_CASE(ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE); + ZE_RESULT_CASE(ERROR_INVALID_COMMAND_LIST_TYPE); + ZE_RESULT_CASE(ERROR_OVERLAPPING_REGIONS); + ZE_RESULT_CASE(ERROR_UNKNOWN); + default: + return "Unknown error"; + } + +#undef 
ZE_RESULT_CASE +} + +#define PSM3_ONEAPI_ZE_CALL(func, args...) do { \ + ze_result_t result; \ + PSM3_CONCAT(psm3_oneapi_ze_count_, func)++; \ + result = PSM3_CONCAT(psm3_oneapi_ze_, func)(args); \ + if(result != ZE_RESULT_SUCCESS) { \ + _HFI_ERROR( "OneAPI Level Zero failure: %s() (at %s:%d)" \ + " returned 0x%x: %s\n", \ + #func, __FILE__, __LINE__, result, \ + psm3_oneapi_ze_result_to_string(result)); \ + psm3_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ + "Error returned from OneAPI Level Zero function %s.\n", #func); \ + } \ +} while (0) + +// resolve a OneAPI Level Zero shared library symbol +#define PSM3_ONEAPI_ZE_DLSYM(lib_ptr, func) do { \ + PSM3_CONCAT(psm3_oneapi_ze_, func) = dlsym(lib_ptr, STRINGIFY(func)); \ + if (!PSM3_CONCAT(psm3_oneapi_ze_, func)) { \ + psm3_handle_error(PSMI_EP_NORETURN, \ + PSM2_INTERNAL_ERR, \ + "Unable to resolve %s symbol " \ + "in OneAPI Level Zero library.\n", STRINGIFY(func)); \ + } \ +} while (0) + +static int psm3_oneapi_ze_lib_load() +{ + psm2_error_t err = PSM2_OK; + char *dlerr; + + PSM2_LOG_MSG("entering"); + _HFI_VDBG("Loading OneAPI Level Zero library.\n"); + + psm3_oneapi_ze_lib = dlopen("libze_loader.so.1", RTLD_LAZY); + if (!psm3_oneapi_ze_lib) { + dlerr = dlerror(); + _HFI_ERROR( + "Unable to open libze_loader.so.1. Error %s\n", + dlerr ? dlerr : "no dlerror()"); + goto fail; + } + + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeInit); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeDriverGet); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeDeviceGet); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeDevicePciGetPropertiesExt); +#ifndef PSM3_NO_ONEAPI_IMPORT + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeDriverGetExtensionFunctionAddress); +#endif + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeContextCreate); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeContextDestroy); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeCommandQueueCreate); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeCommandQueueDestroy); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeCommandQueueExecuteCommandLists); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeCommandQueueSynchronize); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeCommandListCreate); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeCommandListDestroy); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeCommandListClose); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeCommandListReset); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeCommandListCreateImmediate); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeCommandListAppendMemoryCopy); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeCommandListAppendSignalEvent); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeDeviceCanAccessPeer); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeDeviceGetCommandQueueGroupProperties); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeMemAllocHost); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeMemAllocDevice); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeMemFree); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeMemGetIpcHandle); +#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeMemPutIpcHandle); +#endif + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeMemOpenIpcHandle); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeMemCloseIpcHandle); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeMemGetAddressRange); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeMemGetAllocProperties); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeEventPoolCreate); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeEventPoolDestroy); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, 
zeEventCreate); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeEventDestroy); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeEventQueryStatus); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeEventHostSynchronize); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeEventHostReset); + + /* ze loader API */ + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zelLoaderGetVersions); + + PSM2_LOG_MSG("leaving"); + return err; +fail: + if (psm3_oneapi_ze_lib) + dlclose(psm3_oneapi_ze_lib); + err = psm3_handle_error(PSMI_EP_NORETURN, + PSM2_INTERNAL_ERR, + "Unable to load OneAPI Level Zero library.\n"); + return err; +} + +static void psm3_oneapi_ze_stats_register() +{ +#define PSM3_ONEAPI_ZE_COUNT_DECLU64(func) \ + PSMI_STATS_DECLU64(#func, NULL, &PSM3_CONCAT(psm3_oneapi_ze_count_, func)) + + struct psmi_stats_entry ze_entries[] = { + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeInit), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeDriverGet), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeDeviceGet), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeDevicePciGetPropertiesExt), +#ifndef PSM3_NO_ONEAPI_IMPORT + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeDriverGetExtensionFunctionAddress), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zexDriverImportExternalPointer), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zexDriverReleaseImportedPointer), +#endif + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeContextCreate), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeContextDestroy), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeCommandQueueCreate), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeCommandQueueDestroy), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeCommandQueueExecuteCommandLists), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeCommandQueueSynchronize), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeCommandListCreate), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeCommandListDestroy), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeCommandListClose), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeCommandListReset), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeCommandListCreateImmediate), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeCommandListAppendMemoryCopy), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeCommandListAppendSignalEvent), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeDeviceCanAccessPeer), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeDeviceGetCommandQueueGroupProperties), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeMemAllocHost), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeMemAllocDevice), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeMemFree), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeMemGetIpcHandle), +#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeMemPutIpcHandle), +#endif + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeMemOpenIpcHandle), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeMemCloseIpcHandle), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeMemGetAddressRange), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeMemGetAllocProperties), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeEventPoolCreate), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeEventPoolDestroy), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeEventCreate), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeEventDestroy), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeEventQueryStatus), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeEventHostSynchronize), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeEventHostReset), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zelLoaderGetVersions) + }; +#undef PSM3_ONEAPI_ZE_COUNT_DECLU64 + + psm3_stats_register_type("PSM_OneAPI_ZE_call_statistics", + "Count of OneAPI Level Zero calls per API entry point for the whole process.\n" + "When using an Intel(r) GPU, PSM3 may call Level Zero " + "APIs to access or transfer application buffers in GPU memory.", + PSMI_STATSTYPE_GPU, + ze_entries, PSMI_HOWMANY(ze_entries), NULL, + &psm3_oneapi_ze_count_zeInit, NULL); /* context must != NULL */ +} + +static void psm3_oneapi_ze_find_copy_only_engine(ze_device_handle_t dev, + 
struct psm3_oneapi_ze_dev_ctxt *ctxt) +{ + uint32_t count = 0; + ze_command_queue_group_properties_t *props = NULL; + int i; + int done = 0; + + /* Set the default */ + ctxt->ordinal = 0; + ctxt->index = 0; + ctxt->num_queues = 1; + PSM3_ONEAPI_ZE_CALL(zeDeviceGetCommandQueueGroupProperties, dev, + &count, NULL); + props = psmi_calloc(PSMI_EP_NONE, UNDEFINED, count, sizeof(*props)); + if (!props) { + _HFI_ERROR("Failed to allocate mem for CmdQ Grp\n"); + return; + } + PSM3_ONEAPI_ZE_CALL(zeDeviceGetCommandQueueGroupProperties, dev, + &count, props); + + // pick the last command queue group which supports copy but not compute. + // For PVC this will be the xeLink copy engine which will also + // have numQueues >1 (TBD - perhaps only select if it has numQueues>1). + // This ordinal is then supplied to create Command Queues and Command Lists. + for (i = count - 1; i >= 0; i--) { + _HFI_DBG("GPU Queue Group %d: copy=%d Compute=%d num_queues=%d\n", i, + (props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) != 0, + (props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) != 0, + (int)props[i].numQueues); + if (! done && (props[i].flags & + ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) && + !(props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)) { + ctxt->ordinal = i; + ctxt->num_queues = props[i].numQueues; + done = 1; + if (_HFI_DBG_ON) { + _HFI_DBG_ALWAYS("Selected GPU copy engine %d\n", i); + } else { + break; + } + } + } + psmi_free(props); +} + +// for pipelined async GPU memcpy +// *p_cq is left as NULL when psm3_oneapi_ze_immed_async_copy enabled +static void psm3_oneapi_ze_async_cmd_create(struct psm3_oneapi_ze_dev_ctxt *ctxt, + ze_command_queue_handle_t *p_cq, ze_command_list_handle_t *p_cl) +{ + psmi_assert(! *p_cl); + if (psm3_oneapi_ze_immed_async_copy) { + ze_command_queue_desc_t cq_desc = { + .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, + .flags = 0, + .mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, + .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL + }; + cq_desc.ordinal = ctxt->ordinal; + cq_desc.index = ctxt->index++; + ctxt->index %= ctxt->num_queues; + PSM3_ONEAPI_ZE_CALL(zeCommandListCreateImmediate, + psm3_oneapi_ze_context, ctxt->dev, &cq_desc, p_cl); + } else { + if (! 
*p_cq) { + ze_command_queue_desc_t cq_desc = { + .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, + .flags = 0, + .mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, + .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL + }; + cq_desc.ordinal = ctxt->ordinal; + cq_desc.index = ctxt->index++; + ctxt->index %= ctxt->num_queues; + PSM3_ONEAPI_ZE_CALL(zeCommandQueueCreate, + psm3_oneapi_ze_context, ctxt->dev, &cq_desc, p_cq); + } + ze_command_list_desc_t cl_desc = { + .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, + .flags = 0 + }; + cl_desc.commandQueueGroupOrdinal = ctxt->ordinal; + PSM3_ONEAPI_ZE_CALL(zeCommandListCreate, + psm3_oneapi_ze_context, ctxt->dev, &cl_desc, p_cl); + } +} + +// create command queue for use in psm3_oneapi_ze_memcpy for sync memcpy +static void psm3_oneapi_ze_cmd_create(ze_device_handle_t dev, struct psm3_oneapi_ze_dev_ctxt *ctxt) +{ + ze_command_queue_desc_t ze_cq_desc = { + .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, + .flags = 0, + //.mode set below + .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL, + }; + + psm3_oneapi_ze_find_copy_only_engine(dev, ctxt); + ze_cq_desc.ordinal = ctxt->ordinal; + ze_cq_desc.index = ctxt->index; + + if (psm3_oneapi_ze_immed_sync_copy) { + ze_cq_desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS; + PSM3_ONEAPI_ZE_CALL(zeCommandListCreateImmediate, psm3_oneapi_ze_context, + dev, &ze_cq_desc, &ctxt->cl); + } else { + ze_command_list_desc_t ze_cl_desc = { + .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, + .flags = 0 + }; + ze_cq_desc.mode = ZE_COMMAND_QUEUE_MODE_DEFAULT; + + PSM3_ONEAPI_ZE_CALL(zeCommandQueueCreate, psm3_oneapi_ze_context, + dev, &ze_cq_desc, &ctxt->cq); + + ze_cl_desc.commandQueueGroupOrdinal = ctxt->ordinal; + PSM3_ONEAPI_ZE_CALL(zeCommandListCreate, psm3_oneapi_ze_context, + dev, &ze_cl_desc, &ctxt->cl); + } + ctxt->dev = dev; + + if (psm3_oneapi_parallel_dtod_copy_thresh < UINT_MAX) { + // create resources for dual copy mechanism + ze_event_pool_desc_t pool_desc = { + .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, + .flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE, + .count = 2 + }; + ze_event_desc_t event_desc = { + .stype = ZE_STRUCTURE_TYPE_EVENT_DESC, + .signal = ZE_EVENT_SCOPE_FLAG_HOST, + .wait = ZE_EVENT_SCOPE_FLAG_HOST, + }; + PSM3_ONEAPI_ZE_CALL(zeEventPoolCreate, + psm3_oneapi_ze_context, &pool_desc, 0, NULL, &ctxt->event_pool); + + event_desc.index = 0; + PSM3_ONEAPI_ZE_CALL(zeEventCreate, ctxt->event_pool, &event_desc, + &ctxt->copy_status0); + + event_desc.index = 1; + PSM3_ONEAPI_ZE_CALL(zeEventCreate, ctxt->event_pool, &event_desc, + &ctxt->copy_status1); + + psm3_oneapi_ze_async_cmd_create(ctxt, &ctxt->async_cq0, + &ctxt->async_cl0); + psm3_oneapi_ze_async_cmd_create(ctxt, &ctxt->async_cq1, + &ctxt->async_cl1); + } +} + +static void psm3_oneapi_ze_cmd_create_all(void) +{ + int i; + struct psm3_oneapi_ze_dev_ctxt *ctxt; + ze_context_desc_t ctxtDesc = { ZE_STRUCTURE_TYPE_CONTEXT_DESC, NULL, 0 }; + + if (!psm3_oneapi_ze_context) + PSM3_ONEAPI_ZE_CALL(zeContextCreate, psm3_oneapi_ze_driver, &ctxtDesc, + &psm3_oneapi_ze_context); + + for (i = 0; i < psm3_num_oneapi_ze_devices; i++) { + ctxt = &psm3_oneapi_ze_devices[i]; + + if (!ctxt->cl) { + psm3_oneapi_ze_cmd_create(ctxt->dev, ctxt); + _HFI_DBG("Initialized cmd queues for ze_device[%d] %p\n", + i, ctxt->dev); + } + } + if (psm3_num_oneapi_ze_devices > 0) + psm3_oneapi_ze_cur_dev = &psm3_oneapi_ze_devices[0]; +} + +static void psm3_oneapi_ze_cmd_destroy_all(void) +{ + int i; + struct psm3_oneapi_ze_dev_ctxt *ctxt; + + for (i = 0; i < psm3_num_oneapi_ze_devices; i++) { + ctxt = 
&psm3_oneapi_ze_devices[i]; + + if (ctxt->async_cl1 != NULL) { + PSM3_ONEAPI_ZE_CALL(zeCommandListDestroy, ctxt->async_cl1); + ctxt->async_cl1 = NULL; + } + if (ctxt->async_cq1 != NULL) { + PSM3_ONEAPI_ZE_CALL(zeCommandQueueDestroy, ctxt->async_cq1); + ctxt->async_cq1 = NULL; + } + if (ctxt->async_cl0 != NULL) { + PSM3_ONEAPI_ZE_CALL(zeCommandListDestroy, ctxt->async_cl0); + ctxt->async_cl0 = NULL; + } + if (ctxt->async_cq0 != NULL) { + PSM3_ONEAPI_ZE_CALL(zeCommandQueueDestroy, ctxt->async_cq0); + ctxt->async_cq0 = NULL; + } + if (ctxt->copy_status1 != NULL) { + PSM3_ONEAPI_ZE_CALL(zeEventDestroy, ctxt->copy_status1); + ctxt->copy_status1 = NULL; + } + if (ctxt->copy_status0 != NULL) { + PSM3_ONEAPI_ZE_CALL(zeEventDestroy, ctxt->copy_status0); + ctxt->copy_status0 = NULL; + } + if (ctxt->event_pool != NULL) { + PSM3_ONEAPI_ZE_CALL(zeEventPoolDestroy, ctxt->event_pool); + ctxt->event_pool = NULL; + } + if (ctxt->cl) { + PSM3_ONEAPI_ZE_CALL(zeCommandListDestroy, ctxt->cl); + ctxt->cl = NULL; + } + if (ctxt->cq) { + PSM3_ONEAPI_ZE_CALL(zeCommandQueueDestroy, ctxt->cq); + ctxt->cq = NULL; + } + } + psm3_oneapi_ze_cur_dev = NULL; + + /* Also destroy psm3_oneapi_ze_context */ + if (psm3_oneapi_ze_context) { + PSM3_ONEAPI_ZE_CALL(zeContextDestroy, psm3_oneapi_ze_context); + psm3_oneapi_ze_context = NULL; + } +} + +/* + * get OneAPI alloc_id for a GPU address + * + * The address should be part of a buffer allocated from an OneAPI + * library call (zeMemAllocDevice() or zeMemAllocHost()). + * The alloc_id changes on each OneAPI allocation call. PSM3/rv uses the + * alloc_id to determine if a cache hit is a potentially stale entry which + * should be invalidated. + */ +static uint64_t psm3_oneapi_ze_get_alloc_id(void *addr, uint8_t *type) +{ + ze_memory_allocation_properties_t mem_props = { + .stype = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES + }; + ze_device_handle_t device; + + PSM3_ONEAPI_ZE_CALL(zeMemGetAllocProperties, psm3_oneapi_ze_context, + addr, &mem_props, &device); + if (type) + *type = (uint8_t)mem_props.type; + /* + * id is unique across all allocates on all devices within a given + * process + */ + return mem_props.id; +} + +//*************************************************************************** +//OneAPI Level Zero support for IPC handles +//IPC Handles are used both for PSM3 shm intranode copies via xeLink +//as well as for dma_buf use during MR creation for GPU Direct DMA and RDMA +#ifndef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE +static int psm3_oneapi_ze_ipc_handle_cached(const void *buf, + ze_ipc_mem_handle_t ipc_handle) +{ + static int first = 1; + static int cached = 0; + ze_ipc_mem_handle_t tmp_ipc_handle; + int tmp_fd; + + /* Only detect the first time */ + if (!first) + return cached; + + PSM3_ONEAPI_ZE_CALL(zeMemGetIpcHandle, psm3_oneapi_ze_context, + buf, &tmp_ipc_handle); + tmp_fd = *(uint32_t *)tmp_ipc_handle.data; + if (tmp_fd == *(uint32_t *)ipc_handle.data) + cached = 1; + else + close(tmp_fd); + + first = 0; + _HFI_VDBG("fd %u tmp_fd %d cached %d\n", *(uint32_t *)ipc_handle.data, + tmp_fd, cached); + + return cached; +} +#endif + +#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE +#define ONEAPI_PUTQUEUE_SIZE -1 + +// queue for delayed Put to get better GetIpcHandle performance +// while having an upper bound on number of active Ipc Handles +// sized based on PSM3_ONEAPI_PUTQUEUE_SIZE +struct { + psmi_lock_t lock; + struct oneapi_handle_array { + uint8_t valid; + ze_ipc_mem_handle_t ze_ipc_handle; + } *array; + unsigned index; // where to add next entry and remove oldest 
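+	// Illustrative example (assumption: size == 3): handles are stored at
+	// index 0,1,2,0,... When the slot at 'index' is already valid, the
+	// oldest handle is returned to the driver via zeMemPutIpcHandle()
+	// before the new one takes its place, so at most 'size' IPC handles
+	// are held open at any time.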
+ int size; // number of slots in queue, -1 disables put +} psm3_oneapi_ze_putqueue; +#endif /* PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE */ + +static psm2_error_t psm3_oneapi_ze_putqueue_alloc(void) +{ +#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE + union psmi_envvar_val env; + psm3_getenv("PSM3_ONEAPI_PUTQUEUE_SIZE", + "How many Ipc Handle Puts to queue for shm send and nic Direct GPU Access [-1 disables Put, 0 disables queue]", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)ONEAPI_PUTQUEUE_SIZE, &env); + _HFI_DBG("OneApi PutQueue Size=%d\n", env.e_int); + psm3_oneapi_ze_putqueue.size = env.e_int; + if (env.e_int > 0) { + psm3_oneapi_ze_putqueue.array = (struct oneapi_handle_array *)psmi_calloc( + PSMI_EP_NONE, UNDEFINED, env.e_int, + sizeof(*psm3_oneapi_ze_putqueue.array)); + if (! psm3_oneapi_ze_putqueue.array) + return PSM2_NO_MEMORY; + psm3_oneapi_ze_putqueue.index = 0; + psmi_init_lock(&psm3_oneapi_ze_putqueue.lock); + } +#endif /* PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE */ + return PSM2_OK; +} + +#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE +static void psm3_oneapi_ze_get_dmabuf_fd(const void *buf, uint64_t *handle_fd) +{ + ze_memory_allocation_properties_t mem_props = {}; + ze_device_handle_t device_ptr; + ze_external_memory_export_fd_t export_fd = {}; + + export_fd.stype = ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_EXPORT_FD; + export_fd.flags = ZE_EXTERNAL_MEMORY_TYPE_FLAG_DMA_BUF; + + mem_props.stype = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES; + mem_props.pNext = &export_fd; + + PSM3_ONEAPI_ZE_CALL(zeMemGetAllocProperties, psm3_oneapi_ze_context, + buf, &mem_props, &device_ptr); + *handle_fd = export_fd.fd; +} +#endif + +#ifdef PSM_HAVE_RNDV_MOD +static void psm3_oneapi_ze_get_ipc_handle(const void *buf, ze_ipc_mem_handle_t *ipc_handle, uint64_t *handle_fd) +{ + PSM3_ONEAPI_ZE_CALL(zeMemGetIpcHandle, psm3_oneapi_ze_context, + (const void *)buf, ipc_handle); +#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE + psm3_oneapi_ze_get_dmabuf_fd(buf, handle_fd); +#else + *handle_fd = *(uint32_t *)ipc_handle->data; +#endif +} +#endif /* PSM_HAVE_RNDV_MOD */ + +static void psm3_oneapi_ze_put_ipc_handle(const void *buf, ze_ipc_mem_handle_t ipc_handle) +{ +#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE + if (! 
psm3_oneapi_ze_putqueue.array) { // queue disabled + if (psm3_oneapi_ze_putqueue.size >= 0) // negative size disables Put + PSM3_ONEAPI_ZE_CALL(zeMemPutIpcHandle, psm3_oneapi_ze_context, ipc_handle); + return; + } + PSMI_LOCK(psm3_oneapi_ze_putqueue.lock); + if (psm3_oneapi_ze_putqueue.array[psm3_oneapi_ze_putqueue.index].valid) { + // Put the oldest one to make room for new entry + ze_ipc_mem_handle_t tmp_ipc_handle = + psm3_oneapi_ze_putqueue.array[psm3_oneapi_ze_putqueue.index].ze_ipc_handle; + PSM3_ONEAPI_ZE_CALL(zeMemPutIpcHandle, psm3_oneapi_ze_context, tmp_ipc_handle); + } + // queue the new one + psm3_oneapi_ze_putqueue.array[psm3_oneapi_ze_putqueue.index].valid = 1; + psm3_oneapi_ze_putqueue.array[psm3_oneapi_ze_putqueue.index++].ze_ipc_handle = ipc_handle; + psm3_oneapi_ze_putqueue.index %= psm3_oneapi_ze_putqueue.size; + PSMI_UNLOCK(psm3_oneapi_ze_putqueue.lock); +#else /* PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE */ + // for older Agama with handle "cache" but no reference counting + // no way to put handle without affecting all IOs using that buffer + // on ATS w/o Agama handle cache, no benefit to holding onto fd so close + if (!psm3_oneapi_ze_ipc_handle_cached(buf, ipc_handle)) + close(*(uint32_t *)ipc_handle.data); +#endif /* PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE */ +} + +static void psm3_oneapi_ze_putqueue_free(void) +{ +#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE +#if 0 // we are shutting down, so don't worry about Putting the queued handles + int i; + + // no need for lock, destroying object, no more callers + for (i=0; i < psm3_oneapi_ze_putqueue.size; i++) { + if (psm3_oneapi_ze_putqueue.array[i].valid) { + ze_ipc_mem_handle_t ipc_handle = psm3_oneapi_ze_putqueue.array[i].ze_ipc_handle; + PSM3_ONEAPI_ZE_CALL(zeMemPutIpcHandle, psm3_oneapi_ze_context, ipc_handle); + } + } +#endif /* 0 */ + if (psm3_oneapi_ze_putqueue.array) { + psmi_free(psm3_oneapi_ze_putqueue.array); + psm3_oneapi_ze_putqueue.array = NULL; + psmi_destroy_lock(&psm3_oneapi_ze_putqueue.lock); + } +#endif /* PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE */ +} +//*************************************************************************** + +static psm2_error_t psm3_oneapi_ze_initialize(void) +{ + psm2_error_t err = PSM2_OK; + uint32_t ze_driver_count = 1; + uint32_t ze_device_count = 0; + ze_device_handle_t devices[MAX_ZE_DEVICES]; + zel_component_version_t *zel_comps = NULL; + size_t num_zel_comps; + int i; + union psmi_envvar_val env; + + PSM2_LOG_MSG("entering"); + _HFI_DBG("Init Level Zero library.\n"); + + psm3_oneapi_ze_stats_register(); + err = psm3_oneapi_ze_lib_load(); + if (err != PSM2_OK) + goto fail; + + psm3_getenv("PSM3_ONEAPI_IMMED_SYNC_COPY", + "Use Immediate CommandList for synchronous copy to/from GPU]", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)1, &env); + psm3_oneapi_ze_immed_sync_copy = env.e_int; + + psm3_getenv("PSM3_ONEAPI_IMMED_ASYNC_COPY", + "Use Immediate CommandList for asynchronous pipeline copy to/from GPU]", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)1, &env); + psm3_oneapi_ze_immed_async_copy = env.e_int; + + psm3_getenv("PSM3_ONEAPI_PARALLEL_DTOD_COPY_THRESH", + "Use parallel CommandLists for GPU to GPU copy larger than threshold", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)(256*1024-1), &env); + // no benefit below 128K-1, plus the copy is spilt at a 64K boundary + psm3_oneapi_parallel_dtod_copy_thresh = max(128*1024-1, env.e_uint); + + + PSM3_ONEAPI_ZE_CALL(zeInit, ZE_INIT_FLAG_GPU_ONLY); + + /* Need to 
query count before alloc array */ + PSM3_ONEAPI_ZE_CALL(zelLoaderGetVersions, &num_zel_comps, NULL); + if (num_zel_comps > 0) { + zel_comps = (zel_component_version_t *)psmi_calloc( + PSMI_EP_NONE, UNDEFINED, sizeof(zel_component_version_t), + num_zel_comps); + PSM3_ONEAPI_ZE_CALL(zelLoaderGetVersions, &num_zel_comps, zel_comps); + + /* Loop looking for "loader" name */ + for (i = 0; i < num_zel_comps; i++) { + if (!strncmp(zel_comps[i].component_name, "loader", sizeof("loader"))){ + psm3_oneapi_ze_lib_version = zel_comps[i].component_lib_version; + psm3_oneapi_ze_api_version = zel_comps[i].spec_version; + break; + } + } + psmi_free(zel_comps); + if (i == num_zel_comps) { + _HFI_DBG("WARNING: 'loader' not found among the %zd components reported" + " by zelLoaderGetVersions, unable to report Level-Zero version", + num_zel_comps); + } + } else { + _HFI_DBG("WARNING: no components reported by zelLoaderGetVersions," + " unable to report Level-Zero version"); + } + + PSM3_ONEAPI_ZE_CALL(zeDriverGet, &ze_driver_count, &psm3_oneapi_ze_driver); +#ifndef PSM3_NO_ONEAPI_IMPORT + PSM3_ONEAPI_ZE_CALL(zeDriverGetExtensionFunctionAddress, psm3_oneapi_ze_driver, "zexDriverImportExternalPointer", (void **)&psm3_oneapi_ze_zexDriverImportExternalPointer); + PSM3_ONEAPI_ZE_CALL(zeDriverGetExtensionFunctionAddress, psm3_oneapi_ze_driver, "zexDriverReleaseImportedPointer", (void **)&psm3_oneapi_ze_zexDriverReleaseImportedPointer); +#endif + + PSM3_ONEAPI_ZE_CALL(zeDeviceGet, psm3_oneapi_ze_driver, &ze_device_count, NULL); + if (ze_device_count > MAX_ZE_DEVICES) + ze_device_count = MAX_ZE_DEVICES; + PSM3_ONEAPI_ZE_CALL(zeDeviceGet, psm3_oneapi_ze_driver, &ze_device_count, devices); + + ze_context_desc_t ctxtDesc = { ZE_STRUCTURE_TYPE_CONTEXT_DESC, NULL, 0 }; + PSM3_ONEAPI_ZE_CALL(zeContextCreate, psm3_oneapi_ze_driver, &ctxtDesc, &psm3_oneapi_ze_context); + _HFI_DBG("ze_driver %p %u devices first device %p ze_context %p\n", + psm3_oneapi_ze_driver, ze_device_count, devices[0], psm3_oneapi_ze_context); + + for (i = 0; i < ze_device_count; i++) { + psm3_oneapi_ze_devices[i].dev_index = i; + psm3_oneapi_ze_cmd_create(devices[i], &psm3_oneapi_ze_devices[i]); + _HFI_DBG("Initialized cmd queues for ze_device[%d] %p\n", + i, psm3_oneapi_ze_devices[i].dev); + } + + psm3_num_oneapi_ze_devices = ze_device_count; + if (psm3_num_oneapi_ze_devices > 0) + psm3_oneapi_ze_cur_dev = &psm3_oneapi_ze_devices[0]; + + err = psm3_oneapi_ze_putqueue_alloc(); + if (err != PSM2_OK) + goto fail; + +#ifndef PSM_HAVE_PIDFD + psm3_num_ze_dev_fds = 0; +#endif + +#ifdef PSM_HAVE_RNDV_MOD + // these env only needed when rv being used, since hidden, always parse + { + union psmi_envvar_val env; + + psm3_getenv("PSM3_RV_GPU_IGNORE_ALLOC_ID", + "Disable use of alloc_id to identify GPU MRs to invalidate in RV GPU cache. 1=ignore, 2=use fake id to get 100% miss", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)0, &env); + psm3_oneapi_ze_ignore_alloc_id = env.e_int; + } +#endif /* PSM_HAVE_RNDV_MOD */ + + if (! 
psm3_gpu_thresh_rndv) // sockets HAL could set new default + psm3_gpu_thresh_rndv = PSM3_ONEAPI_ZE_GPU_THRESH_RNDV; + psm3_gpu_rndv_nic_window_default = PSM3_ONEAPI_ZE_RNDV_NIC_WINDOW_DEFAULT; + psm3_gpu_gpudirect_rdma_send_limit_default = PSM3_ONEAPI_ZE_GPUDIRECT_RDMA_SEND_LIMIT_DEFAULT; + psm3_gpu_gpudirect_rdma_recv_limit_default = PSM3_ONEAPI_ZE_GPUDIRECT_RDMA_RECV_LIMIT_DEFAULT; + psm3_gpu_mq_rndv_shm_gpu_thresh_default = PSM3_ONEAPI_ZE_MQ_RNDV_SHM_GPU_THRESH; + + PSM2_LOG_MSG("leaving"); + return err; +fail: + err = psm3_handle_error(PSMI_EP_NORETURN, + PSM2_INTERNAL_ERR, + "Unable to initialize PSM3 OneAPI Level Zero support.\n"); + return err; +} + +static void psm3_oneapi_ze_finalize(void) +{ + psm3_stats_deregister_type(PSMI_STATSTYPE_GPU, &psm3_oneapi_ze_count_zeInit); + /* + * Trying to destroy command list, queue, and context will result in + * segfaults here. + */ + /*psm3_oneapi_ze_putqueue_free(); + psm3_oneapi_ze_cmd_destroy(); + if (psm3_oneapi_ze_context) { + PSM3_ONEAPI_ZE_CALL(zeContextDestroy, psm3_oneapi_ze_context); + psm3_oneapi_ze_context = NULL; + } */ +} + +// called on every EP open +static void psm3_oneapi_ze_ep_open(void) +{ + /* Make sure ze_context and command queue/list are available. + * They could be destroyed when final EP is closed + * If another endpoint is opened after that, the code here can + * recreate the context, command queue and list. + */ + if (!psm3_oneapi_ze_cur_dev) + psm3_oneapi_ze_cmd_create_all(); +} + +// called on final EP close +static void psm3_oneapi_ze_ep_close(void) +{ + /* + * It would be ideal to destroy the global command list, queue, and + * context in psm3_finalize() (via psm3_oneapi_ze_finalize). + * Unfortunately, it will cause segfaults in Level-zero library. + */ + psm3_oneapi_ze_putqueue_free(); + psm3_oneapi_ze_cmd_destroy_all(); +} + +static void psm3_oneapi_ze_identify(char *accel_vers, size_t size) +{ + char ze_api_ver[64] = "unknown"; + char ze_loader_ver[64] = "unknown"; + + if (psm3_oneapi_ze_api_version) + snprintf(ze_api_ver, sizeof(ze_api_ver), "%d.%d", + ZE_MAJOR_VERSION(psm3_oneapi_ze_api_version), ZE_MINOR_VERSION(psm3_oneapi_ze_api_version)); + if (psm3_oneapi_ze_lib_version.major || psm3_oneapi_ze_lib_version.minor || psm3_oneapi_ze_lib_version.patch) + snprintf(ze_loader_ver, sizeof(ze_loader_ver), "v%d.%d.%d", + psm3_oneapi_ze_lib_version.major, psm3_oneapi_ze_lib_version.minor, psm3_oneapi_ze_lib_version.patch); + snprintf(accel_vers, size, "%s %s Level-Zero Runtime %s (%s) built against interface %d.%d\n", + psm3_get_mylabel(), psm3_ident_tag, + ze_api_ver, ze_loader_ver, + ZE_MAJOR_VERSION(ZE_API_VERSION_CURRENT), ZE_MINOR_VERSION(ZE_API_VERSION_CURRENT)); +} + +static void psm3_oneapi_ze_verify_GPU_capabilities(void) +{ + // nothing to do +} + +static int psm3_oneapi_ze_p2p_supported() +{ + static int p2p_supported = -1; // -1 indicates "unset" + uint32_t num_devices = 0; + uint32_t dev; + ze_device_handle_t devices[MAX_ZE_DEVICES]; + + if (likely(p2p_supported > -1)) return p2p_supported; + + p2p_supported = 0; + + PSM3_ONEAPI_ZE_CALL(zeDeviceGet, psm3_oneapi_ze_driver, &num_devices, NULL); + if (num_devices > MAX_ZE_DEVICES) + num_devices = MAX_ZE_DEVICES; + PSM3_ONEAPI_ZE_CALL(zeDeviceGet, psm3_oneapi_ze_driver, &num_devices, devices); + + for (dev = 0; dev < num_devices; dev++) { + ze_device_handle_t device; + device = devices[dev]; + + if (num_devices > 1 && device != psm3_oneapi_ze_cur_dev->dev) { + ze_bool_t canAccessPeer = 0; + + PSM3_ONEAPI_ZE_CALL(zeDeviceCanAccessPeer, 
psm3_oneapi_ze_cur_dev->dev, + device, &canAccessPeer); + if (canAccessPeer != 1) + _HFI_DBG("ONEAPI device %d does not support P2P from current device (Non-fatal error)\n", dev); + else + p2p_supported |= (1 << dev); + } else { + /* Always support p2p on the same GPU */ + psm3_my_gpu_device = dev; + p2p_supported |= (1 << dev); + } + } + + return p2p_supported; +} + +static int psm3_oneapi_ze_gpudirect_supported(void) +{ + /* Is there any OneAPI Level Zero device property that can indicate this? */ + return 1; +} + + +static void psm3_oneapi_ze_get_pci_addr(uint32_t *domain_p, uint32_t *bus_p, + uint32_t *dev_p, uint32_t *func_p) +{ + ze_pci_ext_properties_t PciProperties; + + _HFI_DBG("%d Level Zero GPUs found\n", psm3_num_oneapi_ze_devices); + if (! psm3_num_oneapi_ze_devices) + return; + + // caling middleware will have limited GPUs visible to process + PSM3_ONEAPI_ZE_CALL(zeDevicePciGetPropertiesExt, + psm3_oneapi_ze_devices[0].dev, &PciProperties); + *domain_p = PciProperties.address.domain; + *bus_p = PciProperties.address.bus; + *dev_p = PciProperties.address.device; + *func_p = PciProperties.address.function; +} + +#ifdef PSM_HAVE_RNDV_MOD +static uint64_t psm3_oneapi_ze_min_bar_size(void) +{ + // implement later + return 0; +} + +static psm2_error_t psm3_oneapi_ze_check_phys_addr(uint64_t phys_addr) +{ + return PSM2_OK; +} + +static void psm3_oneapi_ze_roundup_gdrcopy(unsigned long buf, size_t size, + uintptr_t *pageaddr_p, uint64_t *pagelen_p) +{ + PSM3_ONEAPI_ZE_CALL(zeMemGetAddressRange, psm3_oneapi_ze_context, + (const void *)buf, (void **)pageaddr_p, pagelen_p); +} + +#ifdef PSM_HAVE_REG_MR +static void psm3_oneapi_ze_roundup_rv_reg_mr(struct psm2_ep *ep, + void **addr_p, uint64_t *length_p, int access) +{ +#define MAX_USER_MR_SIZE (32 * 1024) + void *base; + size_t len; + uint64_t page_offset; + + PSM3_ONEAPI_ZE_CALL(zeMemGetAddressRange, psm3_oneapi_ze_context, + (const void *)*addr_p, &base, &len); + /* + * Need to register MR with base address and total length. + * However, for Mellanox cards, the max buffer size for a + * user MR registered through the rv module is 32k bytes. + * Otherwise, it will fail with IB_WC_MW_BIND_ERR. For fast + * registration MR through RV (kernel MR and GPU MR), there + * is also a upper limit (max_fast_reg_page_list_len) imposed + * by the underlying RDMA device (eg 256MB for mlx5). + */ + if (strncasecmp(ep->dev_name, "mlx5_0", 3) == 0 && + !(access & IBV_ACCESS_KERNEL)) { + if (len > MAX_USER_MR_SIZE) { + /* + * Register the first 32k if the buffer stays in the + * range. Otherwise, align the buffer to page boundary. + */ + if (((char *)*addr_p + *length_p) <= + ((char *)base + MAX_USER_MR_SIZE)) { + *addr_p = base; + *length_p = MAX_USER_MR_SIZE; + } else { + page_offset = ((uint64_t)*addr_p) & + GPU_PAGE_OFFSET_MASK; + *addr_p = (void *) + ROUNDDOWN64P2((uint64_t)*addr_p, + PSMI_GPU_PAGESIZE); + *length_p = *length_p + page_offset; + } + } else { + /* Register the entire buffer */ + *addr_p = base; + *length_p = len; + } + } else { + uint64_t start, end; + uint64_t mr_len; + uint64_t offset; + uint64_t limit = ep->verbs_ep.max_fmr_size; + + /* Buffer end + 1 */ + end = (uint64_t)base + len; + /* Offset of the requested buffer chunk */ + offset = (uint64_t)*addr_p - (uint64_t)base; + /* + * Start address of next MR. + * The idea is to avoid fragment the entire buffer as few times + * as possible to avoid overlapped MRs and increae cache hit + * rate. 
Therefore, we can't just start from page boundary of + * the requested buffer address: + * start = ROUNDDOWN64P2((uint64_t)*addr_p, PSMI_GPU_PAGESIZE); + */ + start = (uint64_t)base + (offset / limit) * limit; + mr_len = end - start; + if (mr_len > limit) + mr_len = limit; + /* + * If the chunk does not cross the (start + mr_len) boundary, + * register the max chunk size or the remainder of the entire + * buffer. Otherwise, align the buffer to page size and just + * register the requested chunk size plus the offset. + */ + if (((uint64_t)*addr_p + *length_p) <= (start + mr_len)) { + *addr_p = (void *)start; + *length_p = mr_len; + } else { + page_offset = ((uint64_t)*addr_p) & + GPU_PAGE_OFFSET_MASK; + *addr_p = (void *)ROUNDDOWN64P2((uint64_t)*addr_p, + PSMI_GPU_PAGESIZE); + *length_p = *length_p + page_offset; + } + } +} + +// add OneAPI Level Zero specific information to the mparams in prep for the +// RV_IOCTL_REG_MEM ioctl to rv +// for reg_mr the gpu_specific->ze_alloc_id is obtained in caller and +// retained in the psm2_verbs_mr_t for future cache hit checks +static int psm3_oneapi_ze_init_rv_reg_mr_params( + void *addr, uint64_t length, int access, + struct rv_mem_params *mparams, + union psm3_verbs_mr_gpu_specific *gpu_specific, + union psm3_gpu_rv_reg_mmap_mem_scratchpad *scratchpad) +{ + // need to save off ipc_handle here for cleanup below + psm3_oneapi_ze_get_ipc_handle(addr, &scratchpad->ze_ipc_handle, + &scratchpad->ze_handle_fd); + mparams->in.ipc_handle = (uint32_t)scratchpad->ze_handle_fd; + if (!mparams->in.ipc_handle) { + _HFI_ERROR("zeMemGetIpcHandle for %p returned empty handle: 0x%02x%02x%02x%02x %02x%02x%02x%02x\n", + addr, scratchpad->ze_ipc_handle.data[0], + scratchpad->ze_ipc_handle.data[1], + scratchpad->ze_ipc_handle.data[2], + scratchpad->ze_ipc_handle.data[3], + scratchpad->ze_ipc_handle.data[4], + scratchpad->ze_ipc_handle.data[5], + scratchpad->ze_ipc_handle.data[6], + scratchpad->ze_ipc_handle.data[7]); + // tends to mean out of fd's + return ENOSPC; + } + mparams->in.alloc_id = psm3_oneapi_ze_ignore_alloc_id? + (psm3_oneapi_ze_ignore_alloc_id==1? + 0:psm3_oneapi_ze_fake_alloc_id++) + :gpu_specific->ze_alloc_id; + mparams->in.base_addr = gpu_specific->ze_base_addr; + return 0; +} +#endif /* PSM_HAVE_REG_MR */ + +// add OneAPI Level Zero specific information to the params in prep for the +// RV_IOCTL_PIN_MMAP ioctl to rv +// for pin_mmap the alloc_id is obtained here and there is no caching in the +// caller. +static int psm3_oneapi_ze_init_rv_pin_mmap_params( + void *addr, uint64_t length, int access, + struct rv_gpu_mem_params *params, + union psm3_gpu_rv_reg_mmap_mem_scratchpad *scratchpad) +{ + uint64_t alloc_id; + + psm3_oneapi_ze_get_ipc_handle(addr, &scratchpad->ze_ipc_handle, &scratchpad->ze_handle_fd); + params->in.ipc_handle = (uint32_t)scratchpad->ze_handle_fd; + if (!params->in.ipc_handle) { + _HFI_ERROR("No ipc_handle: 0x%02x%02x%02x%02x %02x%02x%02x%02x\n", + scratchpad->ze_ipc_handle.data[0], + scratchpad->ze_ipc_handle.data[1], + scratchpad->ze_ipc_handle.data[2], + scratchpad->ze_ipc_handle.data[3], + scratchpad->ze_ipc_handle.data[4], + scratchpad->ze_ipc_handle.data[5], + scratchpad->ze_ipc_handle.data[6], + scratchpad->ze_ipc_handle.data[7]); + return EFAULT; + } + alloc_id = psm3_oneapi_ze_get_alloc_id(addr, NULL); + // id is unique across all allocs on all devices in a process + params->in.alloc_id = psm3_oneapi_ze_ignore_alloc_id? + (psm3_oneapi_ze_ignore_alloc_id==1? 
+ 0:psm3_oneapi_ze_fake_alloc_id++) + :alloc_id; + _HFI_VDBG("addr 0x%"PRIx64" length %"PRIu64" id %"PRIu64" access 0x%x\n", + (uint64_t)addr, length, alloc_id, access); + return 0; +} + +// cleanup OneAPI Level Zero specific scratchpad from +// psm3_oneapi_ze_init_rv_reg_mr_params or +// psm3_oneapi_ze_init_rv_pin_mmap_params +// called on success or error path, makes sure not to polute errno +// as it can reflect the earlier error for the error path in caller. +static void psm3_oneapi_ze_rv_reg_mmap_cleanup( + void *addr, uint64_t length, int access, + union psm3_gpu_rv_reg_mmap_mem_scratchpad *scratchpad) +{ + if (scratchpad->ze_handle_fd) { + int save_errno = errno; + psm3_oneapi_ze_put_ipc_handle((const void *)addr, scratchpad->ze_ipc_handle); + // no need to clear scratchpad + errno = save_errno; + } +} +#endif /* PSM_HAVE_RNDV_MOD */ + +#ifdef PSM_HAVE_REG_MR +// compare GPU specific fields in verbs MR cache entry +static int psm3_oneapi_ze_cmp_mr(const union psm3_verbs_mr_gpu_specific *a, + const union psm3_verbs_mr_gpu_specific *b) +{ + if (a->ze_alloc_id < b->ze_alloc_id) + return -1; + else if (a->ze_alloc_id > b->ze_alloc_id) + return 1; + else + return 0; +} + +// initialize GPU specific fields in verbs MR cache entry +static void psm3_oneapi_ze_init_mr(void *addr, uint64_t length, int access, + union psm3_verbs_mr_gpu_specific *gpu_specific) +{ + void *base = NULL; + size_t len; + + if (access & IBV_ACCESS_IS_GPU_ADDR) + PSM3_ONEAPI_ZE_CALL(zeMemGetAddressRange, + psm3_oneapi_ze_context, (const void *)addr, + &base, &len); + gpu_specific->ze_base_addr = (uint64_t)base; + gpu_specific->ze_alloc_id = (access & IBV_ACCESS_IS_GPU_ADDR)? + psm3_oneapi_ze_get_alloc_id(addr, NULL) : 0; +} +#endif /* PSM_HAVE_REG_MR */ + +static void psm3_oneapi_ze_fetch_ctxt(void) +{ + // nothing to do +} + +// ensure psm3_cu_ctxt reflects our most recent psm3_cu_ctxt +static void psm3_oneapi_ze_refresh_ctxt(void) +{ + // nothing to do +} + +static void psm3_oneapi_ze_register_hostmem(void *buf, uint32_t size) +{ +#ifndef PSM3_NO_ONEAPI_IMPORT + PSM3_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, psm3_oneapi_ze_driver, + buf, size); +#endif +} + +static void psm3_oneapi_ze_unregister_hostmem(void *buf) +{ +#ifndef PSM3_NO_ONEAPI_IMPORT + ze_result_t result; + //PSM3_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, psm3_oneapi_ze_driver, + // buf); + psm3_oneapi_ze_count_zexDriverReleaseImportedPointer++; + result = psm3_oneapi_ze_zexDriverReleaseImportedPointer(psm3_oneapi_ze_driver, + buf); + if (result != ZE_RESULT_SUCCESS) { + _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psm3_oneapi_ze_result_to_string(result)); + } +#endif +} + +/* + * Two usages: + * (1) ctxt == NULL: check if the buffer is allocated from Level-zero. + * In this case, change psm3_oneapi_ze_cur_dev if device has changed. + * (2) ctxt != NULL: try to get the device context. + * In this case, don't change psm3_oneapi_ze_cur_dev. 
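+ * In both usages, returns 1 if ptr was allocated through Level Zero
+ * (host or device allocation), 0 otherwise.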
+ */ +PSMI_ALWAYS_INLINE( +int +psm3_is_oneapi_ze_mem(const void *ptr, struct psm3_oneapi_ze_dev_ctxt **ctxt)) +{ + ze_memory_allocation_properties_t mem_props = { + ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES + }; + ze_device_handle_t dev; + ze_result_t result; + int ret = 0; + + psm3_oneapi_ze_count_zeMemGetAllocProperties++; + result = psm3_oneapi_ze_zeMemGetAllocProperties(psm3_oneapi_ze_context, ptr, &mem_props, + &dev); + if (result == ZE_RESULT_SUCCESS && + (mem_props.type != ZE_MEMORY_TYPE_UNKNOWN)) { + ret = 1; + _HFI_VDBG("ptr %p type %d dev %p oneapi_ze_cur_dev %p\n", + ptr, mem_props.type, dev, psm3_oneapi_ze_cur_dev->dev); + /* + * Check if the gpu device has changed. + * If we are trying to get the device context (!ctxt), + * don't change psm3_oneapi_ze_cur_dev. + * If the buffer is allocated through zeMemAllocHost, + * there will be no device associated with it (dev == NULL). + * In this case, use the current device context. + */ + if (!dev) { + if (ctxt) + *ctxt = psm3_oneapi_ze_cur_dev; + return ret; + } + if (ctxt || (!ctxt && dev != psm3_oneapi_ze_cur_dev->dev)) { + int i; + + for (i = 0; i < psm3_num_oneapi_ze_devices; i++) { + if (psm3_oneapi_ze_devices[i].dev == dev) { + if (ctxt) + *ctxt = &psm3_oneapi_ze_devices[i]; + else + psm3_oneapi_ze_cur_dev = &psm3_oneapi_ze_devices[i]; + break; + } + } + _HFI_VDBG("check ze_device[%d-%d] for dev %p: no match\n", 0, psm3_num_oneapi_ze_devices-1, dev); + } + } + + return ret; +} + +PSMI_ALWAYS_INLINE( +struct psm3_oneapi_ze_dev_ctxt * +psm3_oneapi_ze_dev_ctxt_get(const void *ptr)) +{ + struct psm3_oneapi_ze_dev_ctxt *ctxt = NULL; + + psm3_is_oneapi_ze_mem(ptr, &ctxt); + + return ctxt; +} + +static int psm3_oneapi_ze_is_gpu_mem(const void *ptr) +{ + return psm3_is_oneapi_ze_mem(ptr, NULL); +} + +static void psm3_oneapi_ze_prepare_HtoD_memcpys(struct ips_protoexp *protoexp) +{ + int i; + + for (i = 0; i < MAX_ZE_DEVICES; i++) + protoexp->gpu_specific.ze_cq_recvs[i] = NULL; +} + +static void psm3_oneapi_ze_prepare_DtoH_memcpys(struct ips_proto *proto) +{ + int i; + + for (i = 0; i < MAX_ZE_DEVICES; i++) + proto->gpu_specific.ze_cq_sends[i] = NULL; +} + +static void psm3_oneapi_ze_shutdown_HtoD_memcpys(struct ips_protoexp *protoexp) +{ + int i; + + for (i = 0; i < MAX_ZE_DEVICES; i++) { + if (protoexp->gpu_specific.ze_cq_recvs[i]) { + PSM3_ONEAPI_ZE_CALL(zeCommandQueueDestroy, protoexp->gpu_specific.ze_cq_recvs[i]); + protoexp->gpu_specific.ze_cq_recvs[i] = NULL; + } + } +} + +static void psm3_oneapi_ze_shutdown_DtoH_memcpys(struct ips_proto *proto) +{ + int i; + + for (i = 0; i < MAX_ZE_DEVICES; i++) { + if (proto->gpu_specific.ze_cq_sends[i]) { + PSM3_ONEAPI_ZE_CALL(zeCommandQueueDestroy, proto->gpu_specific.ze_cq_sends[i]); + proto->gpu_specific.ze_cq_sends[i] = NULL; + } + } +} + +static void psm3_oneapi_ze_memcpy_HtoD_start(struct ips_protoexp *protoexp, + struct ips_gpu_hostbuf *ghb, uint32_t len) +{ + ze_event_pool_desc_t pool_desc = { + .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, + .flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE, + .count = 1 + }; + ze_event_desc_t event_desc = { + .stype = ZE_STRUCTURE_TYPE_EVENT_DESC, + .signal = ZE_EVENT_SCOPE_FLAG_HOST, + .wait = ZE_EVENT_SCOPE_FLAG_HOST, + .index = 0 + }; + struct psm3_oneapi_ze_dev_ctxt *ctxt; + int inx; + + ctxt = psm3_oneapi_ze_dev_ctxt_get(ghb->gpu_buf); + if (!ctxt) { + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "%s HTOD: unknown GPU device for addr %p\n", + __FUNCTION__, ghb->gpu_buf); + return; /* NOT REACHED */ + } + if 
(ghb->gpu_specific.ze_event_pool == NULL) { + PSM3_ONEAPI_ZE_CALL(zeEventPoolCreate, + psm3_oneapi_ze_context, &pool_desc, 0, NULL, &ghb->gpu_specific.ze_event_pool); + } + if (ghb->gpu_specific.ze_copy_status == NULL) { + PSM3_ONEAPI_ZE_CALL(zeEventCreate, + ghb->gpu_specific.ze_event_pool, &event_desc, &ghb->gpu_specific.ze_copy_status); + } + inx = ctxt->dev_index; + if (! ghb->gpu_specific.ze_command_lists[inx]) { + psm3_oneapi_ze_async_cmd_create(ctxt, + &protoexp->gpu_specific.ze_cq_recvs[inx], &ghb->gpu_specific.ze_command_lists[inx]); + } + ghb->gpu_specific.ze_cur_dev_inx = inx; + PSM3_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ghb->gpu_specific.ze_command_lists[inx], + ghb->gpu_buf, ghb->host_buf, len, ghb->gpu_specific.ze_copy_status, 0, NULL); + if (! psm3_oneapi_ze_immed_async_copy) { + PSM3_ONEAPI_ZE_CALL(zeCommandListClose, ghb->gpu_specific.ze_command_lists[inx]); + PSM3_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, + protoexp->gpu_specific.ze_cq_recvs[inx], 1, &ghb->gpu_specific.ze_command_lists[inx], NULL); + } +} + +static void psm3_oneapi_ze_memcpy_DtoH_start(struct ips_proto *proto, + struct ips_gpu_hostbuf *ghb, uint32_t len) +{ + ze_event_pool_desc_t pool_desc = { + .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, + .flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE, + .count = 1 + }; + ze_event_desc_t event_desc = { + .stype = ZE_STRUCTURE_TYPE_EVENT_DESC, + .signal = ZE_EVENT_SCOPE_FLAG_HOST, + .wait = ZE_EVENT_SCOPE_FLAG_HOST, + .index = 0 + }; + struct psm3_oneapi_ze_dev_ctxt *ctxt; + int inx; + + ctxt = psm3_oneapi_ze_dev_ctxt_get(ghb->gpu_buf); + if (!ctxt) { + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "%s DTOH: unknown GPU device for addr %p\n", + __FUNCTION__, ghb->gpu_buf); + return; /* NOT REACHED */ + } + if (ghb->gpu_specific.ze_event_pool == NULL) { + PSM3_ONEAPI_ZE_CALL(zeEventPoolCreate, + psm3_oneapi_ze_context, &pool_desc, 0, NULL, &ghb->gpu_specific.ze_event_pool); + } + if (ghb->gpu_specific.ze_copy_status == NULL) { + PSM3_ONEAPI_ZE_CALL(zeEventCreate, ghb->gpu_specific.ze_event_pool, &event_desc, + &ghb->gpu_specific.ze_copy_status); + } + inx = ctxt->dev_index; + if (! ghb->gpu_specific.ze_command_lists[inx]) { + psm3_oneapi_ze_async_cmd_create(ctxt, &proto->gpu_specific.ze_cq_sends[inx], + &ghb->gpu_specific.ze_command_lists[inx]); + } + ghb->gpu_specific.ze_cur_dev_inx = inx; + PSM3_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ghb->gpu_specific.ze_command_lists[inx], + ghb->host_buf, ghb->gpu_buf, len, ghb->gpu_specific.ze_copy_status, 0, NULL); + if (! 
psm3_oneapi_ze_immed_async_copy) { + PSM3_ONEAPI_ZE_CALL(zeCommandListClose, ghb->gpu_specific.ze_command_lists[inx]); + PSM3_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, + proto->gpu_specific.ze_cq_sends[inx], 1, &ghb->gpu_specific.ze_command_lists[inx], NULL); + } +} + +static int psm3_oneapi_ze_memcpy_done(struct ips_gpu_hostbuf *ghb) +{ + ze_result_t result; + psm3_oneapi_ze_count_zeEventQueryStatus++; + + result = psm3_oneapi_ze_zeEventQueryStatus(ghb->gpu_specific.ze_copy_status); + if (result == ZE_RESULT_SUCCESS) { + return 1; + } else if (result == ZE_RESULT_NOT_READY) { + return 0; + } else { + _HFI_ERROR("OneAPI Level Zero failure: %s() (at %s:%d) returned 0x%x: %s\n", + "zeEventQueryStatus", __FILE__, __LINE__, result, + psm3_oneapi_ze_result_to_string(result)); + psm3_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Error returned from OneAPI Level Zero function %s.\n", + "zeEventQueryStatus"); + } + return 0; +} + +// when allocating bounce buffers either malloc w/Import or +// zeMemAllocHost can be used. zeMemAllocHost tends to perform +// better in the subsequent GPU copy's AppendMemoryCopy. However +// zeMemAllocHost results in a GPU-like address which requires dmabuf +// so we can't use zeMemAllocHost for DMA to/from the bounce buffer +// unless rv is available to handle GPU addresses (eg. PSM3_GPUDIRECT=1) + +static void *psm3_oneapi_ze_host_alloc_malloc(unsigned size) +{ + void *ret_ptr = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size); +#ifndef PSM3_NO_ONEAPI_IMPORT + PSM3_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, psm3_oneapi_ze_driver, ret_ptr, size); +#endif + return ret_ptr; +} + +static void psm3_oneapi_ze_host_free_malloc(void *ptr) +{ +#ifndef PSM3_NO_ONEAPI_IMPORT + PSM3_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, psm3_oneapi_ze_driver, ptr); +#endif + psmi_free(ptr); +} + +#ifndef PSM3_USE_ONEAPI_MALLOC +static void *psm3_oneapi_ze_host_alloc_zemem(unsigned size) +{ + void *ret_ptr; + ze_host_mem_alloc_desc_t host_desc = { + .stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, + .flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW + }; + PSM3_ONEAPI_ZE_CALL(zeMemAllocHost, psm3_oneapi_ze_context, + &host_desc, size, 8, &ret_ptr); + return ret_ptr; +} + +static void psm3_oneapi_ze_host_free_zemem(void *ptr) +{ + PSM3_ONEAPI_ZE_CALL(zeMemFree, psm3_oneapi_ze_context, ptr); +} + +static void *(*psm3_oneapi_ze_host_alloc_ptr)(unsigned size) = psm3_oneapi_ze_host_alloc_malloc; +static void (*psm3_oneapi_ze_host_free_ptr)(void *ptr) = psm3_oneapi_ze_host_free_malloc; +static int psm3_oneapi_ze_using_zemem_alloc = 0; +#endif /* PSM3_USE_ONEAPI_MALLOC */ + +// this is only called if GPU Direct is enabled in rv such that +// GDR Copy and/or RDMA MRs can provide GPU-like addresses to rv +static void psm3_oneapi_ze_using_rv_for_mrs(void) +{ +#ifndef PSM3_USE_ONEAPI_MALLOC + psm3_oneapi_ze_host_alloc_ptr = psm3_oneapi_ze_host_alloc_zemem; + psm3_oneapi_ze_host_free_ptr = psm3_oneapi_ze_host_free_zemem; + psm3_oneapi_ze_using_zemem_alloc = 1; +#endif +} + +static void psm3_oneapi_ze_host_alloc(void **ret_ptr, uint32_t size) +{ +#ifdef PSM3_USE_ONEAPI_MALLOC + *ret_ptr = psm3_oneapi_ze_host_alloc_malloc(size); +#else + *ret_ptr = (*psm3_oneapi_ze_host_alloc_ptr)(size); +#endif +} + +static void psm3_oneapi_ze_host_free(void *ptr) +{ +#ifdef PSM3_USE_ONEAPI_MALLOC + psm3_oneapi_ze_host_free_malloc(ptr); +#else + (*psm3_oneapi_ze_host_free_ptr)(ptr); +#endif +} + +static void psm3_oneapi_ze_hostbuf_lazy_init(struct ips_gpu_hostbuf *ghb) +{ + int i; + + 
ghb->gpu_specific.ze_event_pool = NULL; + ghb->gpu_specific.ze_copy_status = NULL; + for (i = 0; i < MAX_ZE_DEVICES; i++) + ghb->gpu_specific.ze_command_lists[i] = NULL; +} + +static void psm3_oneapi_ze_hostbuf_reset(struct ips_gpu_hostbuf *ghb) +{ + if (! psm3_oneapi_ze_immed_async_copy) { + PSM3_ONEAPI_ZE_CALL(zeCommandListReset, + ghb->gpu_specific.ze_command_lists[ghb->gpu_specific.ze_cur_dev_inx]); + } + PSM3_ONEAPI_ZE_CALL(zeEventHostReset, ghb->gpu_specific.ze_copy_status); +} + +static void psm3_oneapi_ze_hostbuf_destroy(struct ips_gpu_hostbuf *ghb) +{ + int i; + + if (ghb->gpu_specific.ze_copy_status != NULL) { + PSM3_ONEAPI_ZE_CALL(zeEventDestroy, ghb->gpu_specific.ze_copy_status); + } + if (ghb->host_buf != NULL) { + psm3_oneapi_ze_host_free(ghb->host_buf); + } + if (ghb->gpu_specific.ze_event_pool != NULL) { + PSM3_ONEAPI_ZE_CALL(zeEventPoolDestroy, ghb->gpu_specific.ze_event_pool); + } + for (i = 0; i < MAX_ZE_DEVICES; i++) { + if (ghb->gpu_specific.ze_command_lists[i]) { + PSM3_ONEAPI_ZE_CALL( zeCommandListDestroy, ghb->gpu_specific.ze_command_lists[i]); + ghb->gpu_specific.ze_command_lists[i] = NULL; + } + } +} + +// synchronous GPU memcpy +static void psm3_oneapi_ze_memcpy_internal(void *dstptr, const void *srcptr, size_t size) +{ + struct psm3_oneapi_ze_dev_ctxt *ctxt; + + psmi_assert(size > 0); + ctxt = psm3_oneapi_ze_dev_ctxt_get(dstptr); + if (!ctxt) { + ctxt = psm3_oneapi_ze_dev_ctxt_get(srcptr); + if (!ctxt) { + _HFI_ERROR("dst %p src %p not GPU buf for copying\n", + dstptr, srcptr); + return; + } + } + if (psm3_oneapi_ze_immed_sync_copy) { + PSM3_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl, + dstptr, srcptr, size, NULL, 0, NULL); + } else { + PSM3_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->cl); + PSM3_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl, + dstptr, srcptr, size, NULL, 0, NULL); + PSM3_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->cl); + PSM3_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->cq, + 1, &ctxt->cl, NULL); + PSM3_ONEAPI_ZE_CALL(zeCommandQueueSynchronize, ctxt->cq, UINT32_MAX); + } +} + +// synchronous GPU memcpy DTOD (xeLink) +static void psm3_oneapi_ze_memcpy_DTOD(void *dstptr, const void *srcptr, size_t size) +{ + struct psm3_oneapi_ze_dev_ctxt *ctxt; + + psmi_assert(size > 0); + ctxt = psm3_oneapi_ze_dev_ctxt_get(dstptr); + if (!ctxt) { + _HFI_ERROR("dst %p src %p not GPU buf for copying\n", + dstptr, srcptr); + return; + } + if (size <= psm3_oneapi_parallel_dtod_copy_thresh) { + if (psm3_oneapi_ze_immed_sync_copy) { + PSM3_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl, + dstptr, srcptr, size, NULL, 0, NULL); + } else { + PSM3_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->cl); + PSM3_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl, + dstptr, srcptr, size, NULL, 0, NULL); + PSM3_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->cl); + PSM3_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->cq, + 1, &ctxt->cl, NULL); + PSM3_ONEAPI_ZE_CALL(zeCommandQueueSynchronize, ctxt->cq, UINT32_MAX); + } + } else { + // for large DTOD copies, start 2 parallel commands + // then wait for both + size_t size0 = ROUNDUP64P2(size/2, 64*1024); + size_t size1 = size - size0; + + if (psm3_oneapi_ze_immed_sync_copy) { + PSM3_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl0, + dstptr, srcptr, size0, ctxt->copy_status0, 0, NULL); + + PSM3_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl1, + (void*)((uintptr_t)dstptr+size0), + (void*)((uintptr_t)srcptr+size0), size1, ctxt->copy_status1, + 0, NULL); + } else { + 
PSM3_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->async_cl0);
+			PSM3_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl0,
+				dstptr, srcptr, size0, ctxt->copy_status0, 0, NULL);
+			PSM3_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->async_cl0);
+			PSM3_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->async_cq0,
+				1, &ctxt->async_cl0, NULL);
+
+			PSM3_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->async_cl1);
+			PSM3_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl1,
+				(void*)((uintptr_t)dstptr+size0),
+				(void*)((uintptr_t)srcptr+size0), size1, ctxt->copy_status1,
+				0, NULL);
+			PSM3_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->async_cl1);
+			PSM3_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->async_cq1,
+				1, &ctxt->async_cl1, NULL);
+		}
+		// 2nd copy may be slightly smaller so wait for it first so we
+		// can potentially hide its Reset latency while 1st copy completes
+		PSM3_ONEAPI_ZE_CALL(zeEventHostSynchronize, ctxt->copy_status1, UINT32_MAX);
+		PSM3_ONEAPI_ZE_CALL(zeEventHostReset, ctxt->copy_status1);
+
+		PSM3_ONEAPI_ZE_CALL(zeEventHostSynchronize, ctxt->copy_status0, UINT32_MAX);
+		PSM3_ONEAPI_ZE_CALL(zeEventHostReset, ctxt->copy_status0);
+	}
+}
+
+static void psm3_oneapi_ze_memcpy_DtoD(void *dstptr, const void *srcptr, uint32_t len)
+{
+	psm3_oneapi_ze_memcpy_DTOD(dstptr, srcptr, len);
+}
+
+static void psm3_oneapi_ze_memcpy_HtoD(void *dstptr, const void *srcptr, uint32_t len)
+{
+	psm3_oneapi_ze_memcpy_internal(dstptr, srcptr, len);
+}
+
+static void psm3_oneapi_ze_memcpy_DtoH(void *dstptr, const void *srcptr, uint32_t len)
+{
+	psm3_oneapi_ze_memcpy_internal(dstptr, srcptr, len);
+}
+
+static void psm3_oneapi_ze_memcpy(void *dstptr, const void *srcptr, uint32_t len)
+{
+	psm3_oneapi_ze_memcpy_internal(dstptr, srcptr, len);
+}
+
+static void psm3_oneapi_ze_synchronize_memcpy(void)
+{
+	/* Not needed for OneAPI Level Zero */
+}
+
+static void psm3_oneapi_ze_mark_buf_synchronous(const void *buf)
+{
+	/* not needed for OneAPI ZE */
+}
+
+static int psm3_oneapi_ze_gpu_addr_send_mr(struct psm2_mq_req *mqreq)
+{
+#ifdef PSM3_USE_ONEAPI_MALLOC
+	// HOST_ALLOC memory treated as CPU memory for Verbs MRs
+	return (mqreq->is_buf_gpu_mem && ! mqreq->gpu_hostbuf_used);
+#else
+	// HOST_ALLOC memory treated as GPU memory for Verbs MRs
+	// Note: gpu_hostbuf_used only set if is_buf_gpu_mem
+	return mqreq->is_buf_gpu_mem &&
+		(! mqreq->gpu_hostbuf_used || psm3_oneapi_ze_using_zemem_alloc );
+#endif
+}
+
+static int psm3_oneapi_ze_gpu_addr_recv_mr(struct ips_tid_recv_desc *tidrecvc,
+					int gpu_hostbuf_used)
+{
+#ifdef PSM3_USE_ONEAPI_MALLOC
+	// HOST_ALLOC memory treated as CPU memory for Verbs MRs
+	return tidrecvc->is_ptr_gpu_backed;
+#else
+	// HOST_ALLOC memory treated as GPU memory for Verbs MRs
+	// Note: gpu_hostbuf_used only set if is_buf_gpu_mem
+	return tidrecvc->is_ptr_gpu_backed
+		|| (gpu_hostbuf_used && psm3_oneapi_ze_using_zemem_alloc);
+#endif
+}
+
+//***************************************************************************
+//OneAPI Level Zero support for PSM3_DEVICES "shm", via an IPC handle cache and
+//OneAPI Level Zero IPC
+//In platforms with xeLink between GPUs, OneAPI Level Zero IPC will use xeLink.
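As an orientation aid for the cache that follows: entries are keyed on the sender's normalized allocation start address plus its epid, a hit is re-validated against the Level Zero alloc_id (and handle) so stale mappings are dropped, and the least recently used entry is evicted when the cache is full. The sketch below models only that policy under stated assumptions; it is a self-contained simplification using a hypothetical flat array (toy_cache, toy_cache_get, toy_cache_put) instead of the rbtree and idle queue the code below builds, and only the alloc_id check of the validation step is modeled.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical, simplified model of the (epid, start, alloc_id) cache policy. */
struct toy_entry {
	uint64_t epid;      /* sender endpoint id (the real code uses psm2_epid_t) */
	uintptr_t start;    /* sender's normalized allocation start address */
	uint64_t alloc_id;  /* Level Zero allocation id, used to detect stale entries */
	void *mapped;       /* locally imported pointer */
	uint64_t last_use;  /* monotonic counter for LRU eviction */
	int valid;
};

#define TOY_CACHE_SIZE 64
static struct toy_entry toy_cache[TOY_CACHE_SIZE];
static uint64_t toy_clock;

/* Return the cached mapping, or NULL if the caller must import the handle and
 * then call toy_cache_put(). A hit whose alloc_id no longer matches is treated
 * as stale and dropped, mirroring the validate step in the real cache. */
void *toy_cache_get(uint64_t epid, uintptr_t start, uint64_t alloc_id)
{
	for (int i = 0; i < TOY_CACHE_SIZE; i++) {
		struct toy_entry *e = &toy_cache[i];
		if (!e->valid || e->epid != epid || e->start != start)
			continue;
		if (e->alloc_id != alloc_id) {	/* same address, new allocation */
			e->valid = 0;
			return NULL;
		}
		e->last_use = ++toy_clock;	/* refresh LRU position */
		return e->mapped;
	}
	return NULL;
}

/* Record a newly imported mapping, evicting the least recently used entry
 * when every slot is occupied. */
void toy_cache_put(uint64_t epid, uintptr_t start, uint64_t alloc_id, void *mapped)
{
	struct toy_entry *victim = &toy_cache[0];
	for (int i = 0; i < TOY_CACHE_SIZE; i++) {
		if (!toy_cache[i].valid) { victim = &toy_cache[i]; break; }
		if (toy_cache[i].last_use < victim->last_use)
			victim = &toy_cache[i];	/* least recently used so far */
	}
	victim->epid = epid;
	victim->start = start;
	victim->alloc_id = alloc_id;
	victim->mapped = mapped;
	victim->last_use = ++toy_clock;
	victim->valid = 1;
}

The real implementation additionally orders entries by epid ahead of the start address in its comparator and releases the imported mapping (zeMemFree via psm3_oneapi_ze_memhandle_delete) whenever an entry is evicted or invalidated.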
+ +#define ONEAPI_MEMHANDLE_CACHE_SIZE 64 + +#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) +/* + * rbtree cruft + */ +struct _cl_map_item; + +struct psm3_oneapi_ze_memhandle_cache; +typedef struct psm3_oneapi_ze_memhandle_cache *psm3_oneapi_ze_memhandle_cache_t; + +typedef struct +{ + unsigned long start; /* start(base) virtual address + in peer process */ + uint32_t ze_handle; /* Sender's GEM handle or fd */ + uint64_t alloc_id; /* ze alloc_id */ + void *buf_ptr; /* buffer pointer in this + process */ + psm2_epid_t epid; + struct _cl_map_item* i_prev; /* idle queue previous */ + struct _cl_map_item* i_next; /* idle queue next */ + psm3_oneapi_ze_memhandle_cache_t cache; /* only for gem_handle close */ +}__attribute__ ((aligned (128))) psm3_rbtree_oneapi_ze_memhandle_cache_mapitem_pl_t; + +typedef struct { + uint32_t nelems; /* number of elements in the cache */ +} psm3_rbtree_oneapi_ze_memhandle_cache_map_pl_t; + +static psm2_error_t psm3_oneapi_ze_memhandle_mpool_alloc( + psm3_oneapi_ze_memhandle_cache_t cache, uint32_t memcache_size); +static void psm3_oneapi_ze_memhandle_delete(void *buf_ptr); + +/* + * Custom comparator + */ +typedef psm3_rbtree_oneapi_ze_memhandle_cache_mapitem_pl_t psm3_oneapi_ze_cache_item; + +static int psm3_oneapi_ze_cache_key_cmp(const psm3_oneapi_ze_cache_item *a, + const psm3_oneapi_ze_cache_item *b) +{ + // we use epid as part of cache key so multi-ep and multi-process jobs + // can have a better cache hit rate. In some cases we may end up with + // cache entries for the same buffer with different epid's all within the + // same multi-ep rank, but this does no harm other than to waste some + // cache space. By including epid in key_cmp we have a chance to have + // separate cache entries for the same sbuf address in different + // sender's GPU virtual address space. + switch (psm3_epid_cmp_internal(a->epid, b->epid)) { + case -1: return -1; + case 1: return 1; + default: + break; + } + + // The sender has used zeMemGetAddressRange to normalize the address + // so we can simply compare the start address of the allocation. 
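+	// The -1/0/1 result is what RBTREE_CMP below expects, so entries end up
+	// ordered first by epid and then by start address.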
+ // Note zeMemOpenIpcHandle only needs the start address as well, so we + // ignore length + if (a->start < b->start) + return -1; + if (b->start < a->start) + return 1; + + return 0; +} + + +/* + * Necessary rbtree cruft + */ +#define RBTREE_MI_PL psm3_rbtree_oneapi_ze_memhandle_cache_mapitem_pl_t +#define RBTREE_MAP_PL psm3_rbtree_oneapi_ze_memhandle_cache_map_pl_t +#define RBTREE_CMP(a,b) psm3_oneapi_ze_cache_key_cmp((a), (b)) +#define RBTREE_ASSERT psmi_assert +#define RBTREE_MAP_COUNT(PAYLOAD_PTR) ((PAYLOAD_PTR)->nelems) +#define RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR + +#include "psm3_rbtree.h" +#include "psm3_rbtree.c" + +/* + * Convenience rbtree cruft + */ +#define NELEMS(cache) ((cache)->map.payload.nelems) + +#define IHEAD(cache) ((cache)->map.root) +#define LAST(cache) (IHEAD(cache)->payload.i_prev) +#define FIRST(cache) (IHEAD(cache)->payload.i_next) +#define INEXT(x) ((x)->payload.i_next) +#define IPREV(x) ((x)->payload.i_prev) + +/* + * Actual module data + */ +struct psm3_oneapi_ze_memhandle_cache { + cl_qmap_t map; + mpool_t mpool; + uint32_t size; + psm2_mq_stats_t *stats; +}; + +static void psm3_print_oneapi_ze_memhandle_cache_stats(psm2_mq_stats_t *stats) +{ + _HFI_DBG("limit=%lu,maxelems=%lu,hit=%lu,miss=%lu,evict=%lu,remove=%lu,clear=%lu\n", + stats->gpu_ipc_cache_limit, stats->gpu_ipc_cache_max_nelems, + stats->gpu_ipc_cache_hit, stats->gpu_ipc_cache_miss, + stats->gpu_ipc_cache_evict, stats->gpu_ipc_cache_remove, + stats->gpu_ipc_cache_clear); +} + +/* + * This is the callback function when mempool are resized or destroyed. + * Upon calling cache free mpool is destroyed which in turn calls this callback + * which helps in closing all memhandles. + * TBD - only called for !is_alloc when destroying so could avoid keeping + * cache pointer in memcache_item. But when GEM_CLOSE is not needed + * memhandle_delete won't need destroyng flag and can remove cache pointer then + */ +static void +psm3_oneapi_ze_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj) +{ + cl_map_item_t* memcache_item = (cl_map_item_t*)obj; + if (!is_alloc) { + if(memcache_item->payload.start) + psm3_oneapi_ze_memhandle_delete(memcache_item->payload.buf_ptr); + } +} + +/* + * Creating mempool for ze memhandle cache nodes. + */ +static psm2_error_t +psm3_oneapi_ze_memhandle_mpool_alloc(psm3_oneapi_ze_memhandle_cache_t cache, + uint32_t memcache_size) +{ + psm2_error_t err; + if (memcache_size < 1) + return PSM2_PARAM_ERR; + + cache->size = memcache_size; + /* Creating a memory pool of size PSM3_ONEAPI_MEMCACHE_SIZE + * which includes the Root and NIL items + */ + cache->mpool = psm3_mpool_create_for_gpu(sizeof(cl_map_item_t), + cache->size, + cache->size, 0, + UNDEFINED, NULL, NULL, + psm3_oneapi_ze_memhandle_cache_alloc_func, + NULL); + if (cache->mpool == NULL) { + err = psm3_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, + "Couldn't allocate ONEAPI host receive buffer pool"); + return err; + } + return PSM2_OK; +} + +/* + * allocate and initialize memhandle cache + * including rbtree. + */ +static psm2_error_t psm3_oneapi_ze_memhandle_cache_alloc(psm3_oneapi_ze_memhandle_cache_t *cachep, + uint32_t memcache_size, + psm2_mq_stats_t *stats) +{ + cl_map_item_t *root = NULL, *nil_item = NULL; + + *cachep = (psm3_oneapi_ze_memhandle_cache_t)psmi_calloc( + NULL, UNDEFINED, 1, sizeof(**cachep)); + if (! 
*cachep) + return PSM2_NO_MEMORY; + + psm2_error_t err = psm3_oneapi_ze_memhandle_mpool_alloc( + *cachep, memcache_size); + if (err != PSM2_OK) + return err; + + root = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); + if (root == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + nil_item = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); + if (nil_item == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + + nil_item->payload.start = 0; + nil_item->payload.epid = psm3_epid_zeroed_internal(); + ips_cl_qmap_init(&(*cachep)->map,root,nil_item); + NELEMS(*cachep) = 0; + + (*cachep)->stats = stats; + + stats->gpu_ipc_cache_limit = memcache_size; + stats->gpu_ipc_cache_nelems = 0; + stats->gpu_ipc_cache_max_nelems = 0; + stats->gpu_ipc_cache_hit = 0; + stats->gpu_ipc_cache_miss = 0; + stats->gpu_ipc_cache_evict = 0; + stats->gpu_ipc_cache_remove = 0; + stats->gpu_ipc_cache_clear = 0; + + return PSM2_OK; + +fail: + if (nil_item) + psmi_free(nil_item); + if (root) + psmi_free(root); + if ((*cachep)->mpool) + psm3_mpool_destroy((*cachep)->mpool); + psmi_free(*cachep); + return err; +} + +static void psm3_oneapi_ze_memhandle_cache_free(psm3_oneapi_ze_memhandle_cache_t cache) +{ + psm3_print_oneapi_ze_memhandle_cache_stats(cache->stats); + + if (cache->map.nil_item) + psmi_free(cache->map.nil_item); + if (cache->map.root) + psmi_free(cache->map.root); + if (cache->mpool) + psm3_mpool_destroy(cache->mpool); + psmi_free(cache); +} + +/* + * Insert at the head of Idleq. + */ +static void +psm3_oneapi_ze_idleq_insert(psm3_oneapi_ze_memhandle_cache_t cache, cl_map_item_t* memcache_item) +{ + if (FIRST(cache) == NULL) { + FIRST(cache) = memcache_item; + LAST(cache) = memcache_item; + return; + } + INEXT(FIRST(cache)) = memcache_item; + IPREV(memcache_item) = FIRST(cache); + FIRST(cache) = memcache_item; + INEXT(FIRST(cache)) = NULL; + return; +} + +/* + * Remove least recent used element. + */ +static void +psm3_oneapi_ze_idleq_remove_last(psm3_oneapi_ze_memhandle_cache_t cache, cl_map_item_t* memcache_item) +{ + if (!INEXT(memcache_item)) { + LAST(cache) = NULL; + FIRST(cache) = NULL; + } else { + LAST(cache) = INEXT(memcache_item); + IPREV(LAST(cache)) = NULL; + } + // Null-out now-removed memcache_item's next and prev pointers out of + // an abundance of caution + INEXT(memcache_item) = IPREV(memcache_item) = NULL; +} + +static void +psm3_oneapi_ze_idleq_remove(psm3_oneapi_ze_memhandle_cache_t cache, cl_map_item_t* memcache_item) +{ + if (LAST(cache) == memcache_item) { + psm3_oneapi_ze_idleq_remove_last(cache, memcache_item); + } else if (FIRST(cache) == memcache_item) { + FIRST(cache) = IPREV(memcache_item); + INEXT(FIRST(cache)) = NULL; + } else { + INEXT(IPREV(memcache_item)) = INEXT(memcache_item); + IPREV(INEXT(memcache_item)) = IPREV(memcache_item); + } + // Null-out now-removed memcache_item's next and prev pointers out of + // an abundance of caution + INEXT(memcache_item) = IPREV(memcache_item) = NULL; +} + +static void +psm3_oneapi_ze_idleq_reorder(psm3_oneapi_ze_memhandle_cache_t cache, cl_map_item_t* memcache_item) +{ + if (FIRST(cache) == memcache_item && LAST(cache) == memcache_item ) { + return; + } + psm3_oneapi_ze_idleq_remove(cache, memcache_item); + psm3_oneapi_ze_idleq_insert(cache, memcache_item); + return; +} + +/* + * After a successful cache hit, item is validated by doing a + * memcmp on the handle stored and the handle we receive from the + * sender. 
If the validation fails the item is removed from the idleq, + * the rbtree, is put back into the mpool and ZeMemCloseIpcHandle function + * is called. + * Level Zero's alloc_id will be unique per allocation, even if the allocation + * was at the same address. In some cases, but not always, the ipc_handle + * will also be different. So we validate both, although just checking alloc_id + * would be sufficient. + */ + +static psm2_error_t +psm3_oneapi_ze_memhandle_cache_validate(psm3_oneapi_ze_memhandle_cache_t cache, + cl_map_item_t* memcache_item, + uintptr_t sbuf, uint32_t handle, + psm2_epid_t epid, uint64_t alloc_id) +{ + psmi_assert(!psm3_epid_cmp_internal(epid, memcache_item->payload.epid)); + psmi_assert(sbuf == memcache_item->payload.start); + if (handle == memcache_item->payload.ze_handle && + alloc_id == memcache_item->payload.alloc_id) { + return PSM2_OK; + } + _HFI_DBG("cache remove stale entry: new start=%lu,handle=%u,alloc_id=%lu\n", + sbuf, handle, alloc_id); + + cache->stats->gpu_ipc_cache_remove++; + ips_cl_qmap_remove_item(&cache->map, memcache_item); + cache->stats->gpu_ipc_cache_nelems--; + psm3_oneapi_ze_memhandle_delete(memcache_item->payload.buf_ptr); + psm3_oneapi_ze_idleq_remove(cache, memcache_item); + memset(memcache_item, 0, sizeof(*memcache_item)); + psm3_mpool_put(memcache_item); + return PSM2_OK_NO_PROGRESS; +} + +/* + * Current eviction policy: Least Recently Used. + */ +static void +psm3_oneapi_ze_memhandle_cache_evict(psm3_oneapi_ze_memhandle_cache_t cache) +{ + cache->stats->gpu_ipc_cache_evict++; + cl_map_item_t *p_item = LAST(cache); + _HFI_VDBG("Removing (epid=%s,start=%lu,dev_ptr=%p,it=%p) from ze_memhandle_cachemap.\n", + psm3_epid_fmt_internal(p_item->payload.epid, 0), p_item->payload.start, + p_item->payload.buf_ptr, p_item); + ips_cl_qmap_remove_item(&cache->map, p_item); + cache->stats->gpu_ipc_cache_nelems--; + psm3_oneapi_ze_memhandle_delete(p_item->payload.buf_ptr); + psm3_oneapi_ze_idleq_remove_last(cache, p_item); + memset(p_item, 0, sizeof(*p_item)); + psm3_mpool_put(p_item); +} + +static psm2_error_t +psm3_oneapi_ze_memhandle_cache_register(psm3_oneapi_ze_memhandle_cache_t cache, + uintptr_t sbuf, uint32_t handle, + psm2_epid_t epid, + void *buf_ptr, uint64_t alloc_id) +{ + if (NELEMS(cache) == cache->size) + psm3_oneapi_ze_memhandle_cache_evict(cache); + + cl_map_item_t* memcache_item = psm3_mpool_get(cache->mpool); + /* memcache_item cannot be NULL as we evict + * before the call to mpool_get. Check has + * been fixed to help with klockwork analysis. 
+ */ + if (memcache_item == NULL) + return PSM2_NO_MEMORY; + memcache_item->payload.start = sbuf; + memcache_item->payload.ze_handle = handle; + memcache_item->payload.buf_ptr = buf_ptr; + memcache_item->payload.alloc_id = alloc_id; + memcache_item->payload.epid = epid; + memcache_item->payload.cache = cache; + ips_cl_qmap_insert_item(&cache->map, memcache_item); + cache->stats->gpu_ipc_cache_nelems++; + if (cache->stats->gpu_ipc_cache_nelems > cache->stats->gpu_ipc_cache_max_nelems) + cache->stats->gpu_ipc_cache_max_nelems = cache->stats->gpu_ipc_cache_nelems; + psm3_oneapi_ze_idleq_insert(cache, memcache_item); + _HFI_VDBG("registered: handle %u sbuf 0x%lx ptr %p alloc_id %lu\n", + handle, sbuf, buf_ptr, alloc_id); + return PSM2_OK; +} + +#ifndef PSM_HAVE_PIDFD +static inline psm2_error_t psm3_oneapi_ze_prepare_fds_for_ipc_import( + uint32_t gem_handle, int device_index, int *ipc_fd, + psm2_epaddr_t epaddr) +{ + am_epaddr_t *am_epaddr = (am_epaddr_t*)epaddr; + int fd; + struct drm_prime_handle open_fd = {0, 0, -1}; + + if (device_index >= psm3_num_oneapi_ze_devices || device_index < 0) { + _HFI_ERROR("psm3_oneapi_ze_memhandle_acquire received invalid device_index from peer: %d\n", + device_index); + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "device_index " + "invalid - received from peer: %d", + device_index); + return PSM2_INTERNAL_ERR; + } + fd = am_epaddr->gpu_specific.ze_peer_fds[device_index]; + psm3_oneapi_ze_cur_dev = &psm3_oneapi_ze_devices[device_index]; + open_fd.flags = DRM_CLOEXEC | DRM_RDWR; + open_fd.handle = gem_handle; + if (ioctl(fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &open_fd) < 0) { + _HFI_ERROR("ioctl failed for DRM_IOCTL_PRIME_HANDLE_TO_FD: %s\n", strerror(errno)); + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "ioctl " + "failed for DRM_IOCTL_PRIME_HANDLE_TO_FD errno=%d", + errno); + return PSM2_INTERNAL_ERR; + } + *ipc_fd = open_fd.fd; + + return PSM2_OK; +} +#else +static inline psm2_error_t psm3_oneapi_ze_prepare_fds_for_ipc_import( + uint32_t handle, int device_index, int *ipc_fd, + psm2_epaddr_t epaddr) +{ + int fd; + am_epaddr_t *am_epaddr = (am_epaddr_t *)epaddr; + + fd = syscall(__NR_pidfd_getfd, am_epaddr->gpu_specific.ze_pidfd, handle, 0); + if (fd < 0) { + _HFI_ERROR("pidfd_getfd failed %d: %s\n", fd, strerror(errno)); + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "pidfd_getfd failed errno=%d (%s)", + errno, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + *ipc_fd = fd; + + return PSM2_OK; +} +#endif /* PSM_HAVE_PIDFD */ +#endif /* defined(HAVE_DRM) || defined(HAVE_LIBDRM) */ + +#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) +static void *psm3_oneapi_ze_import_ipc_buf(uint32_t fd, uint8_t alloc_type) +{ + ze_external_memory_import_fd_t import_desc = {}; + void *ze_ipc_buf = NULL; + + import_desc.stype = ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMPORT_FD; + import_desc.flags = ZE_EXTERNAL_MEMORY_TYPE_FLAG_DMA_BUF; + import_desc.fd = fd; + + switch(alloc_type) { + case ZE_MEMORY_TYPE_HOST: + { + ze_host_mem_alloc_desc_t host_desc = {}; + + host_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC; + host_desc.pNext = &import_desc; + /* size & alignment are not used since this is an import.*/ + PSM3_ONEAPI_ZE_CALL(zeMemAllocHost, psm3_oneapi_ze_context, &host_desc, + 0, 0, &ze_ipc_buf); + } + break; + case ZE_MEMORY_TYPE_DEVICE: + { + ze_device_mem_alloc_desc_t dev_desc = {}; + + dev_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; + dev_desc.pNext = &import_desc; + /* size & alignment are not used since this is an import. 
*/ + PSM3_ONEAPI_ZE_CALL(zeMemAllocDevice, psm3_oneapi_ze_context, &dev_desc, + 0, 0, psm3_oneapi_ze_cur_dev->dev, &ze_ipc_buf); + } + break; + default: + _HFI_ERROR("Invalid alloc_type %u for fd %u\n", + alloc_type, fd); + return NULL; + } + + return ze_ipc_buf; +} +#endif /* defined(HAVE_DRM) || defined(HAVE_LIBDRM) */ + +/* + * The key used to search the cache is the senders buf address pointer and + * epid. The sender will have used zeMemGetAddressRange + * to find the start of the memory containing the buffer (supplied as sbuf) + * Upon match, we must validate the entry we find and may need to replace it. + */ +static void * +psm3_oneapi_ze_memhandle_acquire(psm3_oneapi_ze_memhandle_cache_t cache, + uintptr_t sbuf, uint32_t handle, + psm2_epaddr_t epaddr, int device_index, + uint64_t alloc_id, uint8_t alloc_type) +{ + void *buf_ptr = NULL; + psm2_epid_t epid = epaddr->epid; +#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) + int ipc_fd = -1; +#endif + _HFI_VDBG("sbuf=%lu,handle=%u,epid=%s\n", + sbuf, handle, psm3_epid_fmt_internal(epid, 0)); +#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) + + if (!cache) { + if (psm3_oneapi_ze_prepare_fds_for_ipc_import(handle, device_index, &ipc_fd, + epaddr) == PSM2_OK) { + buf_ptr = psm3_oneapi_ze_import_ipc_buf(ipc_fd, alloc_type); + if (ipc_fd >= 0) { + if (close(ipc_fd) < 0) { + _HFI_ERROR("close failed for ipc_fd: %s\n", strerror(errno)); + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "close " + "failed for ipc_fd %d errno=%d", + ipc_fd, errno); + return NULL; + } + } + } + return buf_ptr; + } + + psm3_oneapi_ze_cache_item key = { + .start = (unsigned long) sbuf, + .epid = epid + }; + + /* + * preconditions: + * 1) buffer [start,epid) may or may not be in cache->map already + * 2) there are no duplicate entries in cache->map + * postconditions: + * 1) buffer is in cache->map with same handle, epid, alloc_id + * 2) there are no duplicate entries in cache->map + * + * The key used to search the cache is the senders buf address pointer + * and epid. + * Upon a succesful hit in the cache, additional validation is required + * as the handle or alloc_id could be stale. + */ + cl_map_item_t *p_item = ips_cl_qmap_searchv(&cache->map, &key); + if (p_item->payload.start) { + // confirm the entry for sbuf matches the handle and is not stale + if (psm3_oneapi_ze_memhandle_cache_validate(cache, p_item, sbuf, handle, + epid, alloc_id) == PSM2_OK) { + cache->stats->gpu_ipc_cache_hit++; + psm3_oneapi_ze_idleq_reorder(cache, p_item); + return p_item->payload.buf_ptr; + } + + // buffer found was stale am_oneapi_memhandle_cache_validate() + // closed and removed existing entry. + // Should find no more duplicates +#ifdef PSM_DEBUG + p_item = ips_cl_qmap_searchv(&cache->map, &key); + psmi_assert(! 
p_item->payload.start); +#endif + } + cache->stats->gpu_ipc_cache_miss++; + + if (psm3_oneapi_ze_prepare_fds_for_ipc_import(handle, device_index, &ipc_fd, + epaddr) == PSM2_OK) { + buf_ptr = psm3_oneapi_ze_import_ipc_buf(ipc_fd, alloc_type); + if (ipc_fd >= 0) { + if (close(ipc_fd) < 0) { + _HFI_ERROR("close failed for ipc_fd: %s\n", strerror(errno)); + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "close " + "failed for ipc_fd %d errno=%d", + ipc_fd, errno); + return NULL; + } + } + if (!buf_ptr) + return NULL; + } else { + return NULL; + } + + psm3_oneapi_ze_memhandle_cache_register(cache, sbuf, handle, epid, buf_ptr, + alloc_id); + return buf_ptr; +#else // if no drm, set up to return NULL as oneapi ipc handles don't work without drm +// TBD - caller will assert when we return NULL, so should it be a build error +// if DRM not available? What works without DRM? + buf_ptr = NULL; + return buf_ptr; +#endif // defined(HAVE_DRM) || defined(HAVE_LIBDRM) + +} + +#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) +static void psm3_oneapi_ze_memhandle_delete(void *buf_ptr) +{ + /* Release the reference to the buffer */ + PSM3_ONEAPI_ZE_CALL(zeMemFree, psm3_oneapi_ze_context, buf_ptr); + +#ifndef PSM_HAVE_PIDFD + /* + * If pidfd is not used, we need to call GEM_CLOSE ioctl to remove the + * GEM handle from the handle cache of the peer device file's + * private file data in the kernel to avoid handle leak. However, we + * will have a potential risk condition that will fail a later request: + * (1) 3 requests with buf1, buf2, and buf1 are sent from sender side. + * Requests 1 and 3 uses the same buffer and therefore have the + * same gem_handle1. + * (2) buf1 is received and put into cache; + * (3) buf2 is received and buf1 is evicted from cache due to some + * condition (small cache size). As a result, gem_handle1 is closed + * through GEM_CLOSE ioctl. buf2 is put into cache. + * (4) Request 3 (with buf1) is received and HANDLE_TO_FD ioctl will + * fail because the gem_handle has been removed from peer device + * file's handle cache. + * For this reason, we prefer to leak the GEM handle over calling + * GEM_CLOSE. + */ +#endif +} +#endif /* HAVE_DRM or HAVE_LIBDRM */ + +static void +psm3_oneapi_ze_memhandle_release(psm3_oneapi_ze_memhandle_cache_t cache, + void *buf_ptr) +{ +#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) + if (!cache) + psm3_oneapi_ze_memhandle_delete(buf_ptr); +#endif // defined(HAVE_DRM) || defined(HAVE_LIBDRM) + return; +} + +// end of oneAPI Level Zero IPC MemHandle Cache +//*************************************************************************** + +// IPC Handle management for OneAPI Level Zero + +#ifndef PSM_HAVE_PIDFD +/* + * psm3_onapi_ze_init_fds - initialize the file descriptors (ze_dev_fds) + * + * Open the file descriptors for our GPUs (psm3_ze_dev_fds[]) + * + * The file descriptors are used in intra-node communication to pass to peers + * via socket with sendmsg/recvmsg SCM_RIGHTS message type. + * + */ + +static psm2_error_t psm3_onapi_ze_init_fds(void) +{ + const char *dev_dir = "/dev/dri/by-path/"; + const char *suffix = "-render"; + DIR *dir; + struct dirent *ent = NULL; + char dev_name[NAME_MAX]; + int i = 0, ret; + + if (psm3_num_ze_dev_fds) + return PSM2_OK; + + dir = opendir(dev_dir); + if (dir == NULL) + return PSM2_INTERNAL_ERR; + + while ((ent = readdir(dir)) != NULL) { + if (ent->d_name[0] == '.' 
|| + strstr(ent->d_name, suffix) == NULL) + continue; + + memset(dev_name, 0, sizeof(dev_name)); + ret = snprintf(dev_name, NAME_MAX, "%s%s", dev_dir, ent->d_name); + if (ret < 0 || ret >= NAME_MAX) { + _HFI_INFO("GPU dev name too long: %s%s\n", dev_dir, ent->d_name); + goto err; + } + + psm3_ze_dev_fds[i] = open(dev_name, O_RDWR); + if (psm3_ze_dev_fds[i] == -1) { + _HFI_INFO("Failed to open %s GPU dev FD: %s\n", dev_name, + strerror(errno)); + goto err; + } + _HFI_DBG("Opened %s GPU dev FD: %d\n", dev_name, + psm3_ze_dev_fds[i]); + i++; + psm3_num_ze_dev_fds++; + } + (void) closedir(dir); + _HFI_DBG("Opened %d GPU dev FDs\n", psm3_num_ze_dev_fds); + return PSM2_OK; + +err: + (void) closedir(dir); + return PSM2_INTERNAL_ERR; +} + +/* + * psm3_oneapi_ze_get_dev_fds - fetch device file descriptors + * + * Returns a pointer to ze_dev_fds while putting the number + * of fds into the in/out nfds parameter + * + */ + +static int *psm3_oneapi_ze_get_dev_fds(int *nfds) +{ + *nfds = psm3_num_ze_dev_fds; + return psm3_ze_dev_fds; +} + +/* + * psm3_oneapi_ze_sendmsg_fds - send device file descriptors over socket w/ sendmsg + * + * Prepares message of type SCM_RIGHTS, copies file descriptors as payload, + * and sends over socket via sendmsg while creating appropriate fd numbers + * for dest (effectively a dup(2) of our file descriptor) + * + * returns -errno on error or number of bytes sent (>0) on success + */ + +static int psm3_oneapi_ze_sendmsg_fds(int sock, int *fds, int nfds, psm2_epid_t epid) +{ + struct msghdr msg; + struct cmsghdr *cmsg; + struct iovec iov; + int64_t peer_id = *(int64_t *)&epid; + char *ctrl_buf; + size_t ctrl_size; + int ret; + + ctrl_size = sizeof(*fds) * nfds; + ctrl_buf = (char *)psmi_calloc(NULL, UNDEFINED, 1, CMSG_SPACE(ctrl_size)); + if (!ctrl_buf) + return -ENOMEM; + + iov.iov_base = &peer_id; + iov.iov_len = sizeof(peer_id); + + memset(&msg, 0, sizeof(msg)); + msg.msg_control = ctrl_buf; + msg.msg_controllen = CMSG_SPACE(ctrl_size); + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(ctrl_size); + memcpy(CMSG_DATA(cmsg), fds, ctrl_size); + + ret = sendmsg(sock, &msg, 0); + if (ret < 0) + ret = -errno; + else if (! ret) + ret = -EAGAIN; + + psmi_free(ctrl_buf); + return ret; +} + +/* + * psm3_oneapi_ze_recvmsg_fds - receive device file descriptors from socket w/ recvmsg + * + * Prepares message buffer of type SCM_RIGHTS, receives message from socket + * via recvmsg, and copies device file descriptors to in/out parameter. 
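+ * The caller passes nfds equal to the count the peer sent; the SCM_RIGHTS
+ * control message is expected to be exactly CMSG_LEN(nfds * sizeof(int)) long.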
+ * The received file descriptors are usable in our process and need to + * be closed when done being used + * + * returns -errno on error or number of bytes received (>0) on success + */ + +static int psm3_oneapi_ze_recvmsg_fd(int sock, int *fds, int nfds, psm2_epid_t epid) +{ + struct msghdr msg; + struct cmsghdr *cmsg; + struct iovec iov; + int64_t peer_id = *(int64_t *)&epid; + char *ctrl_buf; + size_t ctrl_size; + int ret; + + ctrl_size = sizeof(*fds) * nfds; + ctrl_buf = (char *)psmi_calloc(NULL, UNDEFINED, 1, CMSG_SPACE(ctrl_size)); + if (!ctrl_buf) + return -ENOMEM; + + iov.iov_base = &peer_id; + iov.iov_len = sizeof(peer_id); + + memset(&msg, 0, sizeof(msg)); + msg.msg_control = ctrl_buf; + msg.msg_controllen = CMSG_SPACE(ctrl_size); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + ret = recvmsg(sock, &msg, 0); + if (ret < 0) { + ret = -errno; + } else if (ret != sizeof(peer_id)) { + _HFI_CONNDBG("recvmsg from: %s returns %d expect %u\n", + psm3_epid_fmt_addr(epid, 0), ret, + (unsigned)sizeof(peer_id) ); + ret = -EAGAIN; + goto out; + } + + psmi_assert(!(msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))); + cmsg = CMSG_FIRSTHDR(&msg); + psmi_assert(cmsg && cmsg->cmsg_len == CMSG_LEN(ctrl_size) && + cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_RIGHTS && CMSG_DATA(cmsg)); + memcpy(fds, CMSG_DATA(cmsg), ctrl_size); +out: + psmi_free(ctrl_buf); + return ret; +} + +/* + * psm3_onapi_ze_init_ipc_socket - initialize ipc socket in ep + * + * Set up the AF_UNIX ipc socket in the ep for listen mode. Name it + * using our epid, and bind it. + * + */ + +static psm2_error_t psm3_onapi_ze_init_ipc_socket(struct ptl_am *ptl) +{ + psm2_error_t err = PSM2_OK; + int ret; + struct sockaddr_un sockaddr = {0}; + socklen_t len = sizeof(sockaddr); + + if ((ptl->gpu_specific.ze_ipc_socket = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { + _HFI_ERROR("error creating GPU dev FDs AF_UNIX sock: %s\n", + strerror(errno)); + err = PSM2_INTERNAL_ERR; + goto fail; + } + + sockaddr.sun_family = AF_UNIX; + snprintf(sockaddr.sun_path, 108, "/dev/shm/psm3_shm.ze_sock2.%ld.%s", + (long int) getuid(), psm3_epid_fmt_internal(ptl->epid, 0)); + ptl->gpu_specific.ze_listen_sockname = psmi_strdup(NULL, sockaddr.sun_path); + if (ptl->gpu_specific.ze_listen_sockname == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + + if ((ret = bind(ptl->gpu_specific.ze_ipc_socket, (struct sockaddr *) &sockaddr, len)) < 0) { + _HFI_ERROR("error binding GPU dev FDs AF_UNIX sock to %s: %s\n", + sockaddr.sun_path, strerror(errno)); + err = PSM2_INTERNAL_ERR; + goto fail; + } + + if ((ret = listen(ptl->gpu_specific.ze_ipc_socket, 256)) < 0) { + _HFI_ERROR("error listening on GPU dev FDs AF_UNIX sock %s: %s\n", + sockaddr.sun_path, strerror(errno)); + err = PSM2_INTERNAL_ERR; + goto fail; + } + return PSM2_OK; + +fail: + if (ptl->gpu_specific.ze_ipc_socket >= 0) + close(ptl->gpu_specific.ze_ipc_socket); + ptl->gpu_specific.ze_ipc_socket = -1; + if (ptl->gpu_specific.ze_listen_sockname) + psmi_free(ptl->gpu_specific.ze_listen_sockname); + ptl->gpu_specific.ze_listen_sockname = NULL; + return err; +} + +/* + * psm3_oneapi_ze_receive_dev_fds - receive the dev fds on the listen socket + * + * Set up the listen socket to be polled for POLLIN. When the event is + * received, accept for the new socket and then read the peer epid, + * and locate the epaddr for it. Then receive the dev fds to be stored + * in the am_epaddr. 
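+ * The poll() below uses a zero timeout so repeated calls do not block when
+ * no incoming connection is pending.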
+ * + * returns: + * PSM2_OK - GPU dev FDs received from a peer + * PSM2_OK_NO_PROGRESS - nothing received + * other - error + */ + +static psm2_error_t psm3_oneapi_ze_receive_dev_fds(struct ptl_am *ptl) +{ + psm2_error_t err = PSM2_OK; + struct pollfd fdset; + int newsock = -1; + + fdset.fd = ptl->gpu_specific.ze_ipc_socket; + fdset.events = POLLIN; + + if (poll(&fdset, 1, 0) <= 0) + return PSM2_OK_NO_PROGRESS; + + { + struct sockaddr_un sockaddr = {0}; + socklen_t len = sizeof(sockaddr); + int nfds = psm3_num_ze_dev_fds; + int nread; + psm2_epid_t epid; + psm2_epaddr_t epaddr; + am_epaddr_t *am_epaddr; + + newsock = accept(ptl->gpu_specific.ze_ipc_socket, (struct sockaddr *)&sockaddr, &len); + if (newsock < 0) { + _HFI_ERROR("GPU dev FDs AF_UNIX accept failed: %s\n", + strerror(errno)); + err = PSM2_INTERNAL_ERR; + goto fail; + } else { + int ret; + // technically we could get less than we asked for and need to + // call recv again in future but our transfers are small enough + // we should get it all + if ((nread = recv(newsock, &epid, sizeof(epid), 0)) < 0) { + _HFI_ERROR("GPU dev FDs AF_UNIX recv failed: %s\n", + strerror(errno)); + err = PSM2_INTERNAL_ERR; + goto fail; + } + if (nread != sizeof(epid)) { + _HFI_ERROR("GPU dev FDs AF_UNIX recv incomplete: %d\n", nread); + err = PSM2_INTERNAL_ERR; + goto fail; + } + // we only poll for recv FDs after processing a am_shm connect + // so the epid should always be known + if ((epaddr = psm3_epid_lookup(ptl->ep, epid)) == NULL) { + _HFI_ERROR("Peer Unknown, unable to receive GPU dev FDs from: %s\n", + psm3_epid_fmt_addr(epid, 0)); + err = PSM2_INTERNAL_ERR; + goto fail; + } + am_epaddr = (am_epaddr_t *)epaddr; + am_epaddr->gpu_specific.ze_num_peer_fds = nfds; + ret = psm3_oneapi_ze_recvmsg_fd(newsock, am_epaddr->gpu_specific.ze_peer_fds, nfds, ptl->epid); + if (ret <= 0) { + _HFI_ERROR("Unable to recvmsg %d GPU dev FDs from: %s: %s\n", + nfds, psm3_epid_fmt_addr(epid, 0), + strerror(-ret)); + err = PSM2_INTERNAL_ERR; + goto fail; + } + _HFI_CONNDBG("%d GPU dev FDs Received from: %s\n", + nfds, psm3_epid_fmt_addr(epid, 0)); + } + } + +fail: + if (newsock >= 0) + close(newsock); + return err; +} + +/* + * psm3_oneapi_ze_send_dev_fds - do next step to send the dev fds to the peer's + * listen socket + * + * Check the connected state and proceed accordingly: + * - ZE_SOCK_NOT_CONNECTED + * We have not done anything yet, so connect and send our epid, + * followed by the dev fds. Set state to ZE_SOCK_DEV_FDS_SENT + * - ZE_SOCK_DEV_FDS_SENT + * The dev fds have been sent. Issue ioctl to see if the output + * queue has been emptied indicating that the peer has read the data. + * If so, set state to ZE_SOCK_DEV_FDS_SENT_AND_RECD. + * - ZE_SOCK_DEV_FDS_SENT_AND_RECD + * We are done, just return. 
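Editor's note: the ZE_SOCK_DEV_FDS_SENT step described above decides completion by asking the kernel whether the peer has drained our send queue. A small sketch of that SIOCOUTQ check in isolation (the helper name is illustrative):

#include <linux/sockios.h>  /* SIOCOUTQ */
#include <sys/ioctl.h>

/* Sketch of the "has the peer read our data yet?" test used for the
 * ZE_SOCK_DEV_FDS_SENT state: SIOCOUTQ reports how many bytes are still
 * queued in the socket's send buffer, so 0 means the peer has consumed
 * everything we sent and the fd exchange can be marked complete. */
static int send_queue_drained(int sock)
{
	int pending = 0;

	if (ioctl(sock, SIOCOUTQ, &pending) != 0)
		return -1;                      /* unknown; caller keeps polling */
	return pending == 0;
}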
+ * + * returns: + * PSM2_OK - next step completed + * PSM2_OK_NO_PROGRESS - nothing to do + * other - error + */ + +static psm2_error_t psm3_oneapi_ze_send_dev_fds(struct ptl_am *ptl, struct am_epaddr *am_epaddr) +{ + switch (am_epaddr->gpu_specific.ze_sock_connected_state) { + case ZE_SOCK_DEV_FDS_SENT_AND_RECD: + return PSM2_OK_NO_PROGRESS; + break; + + case ZE_SOCK_DEV_FDS_SENT: + { + int pending; + + psmi_assert(am_epaddr->gpu_specific.ze_sock >= 0); + if_pf (ioctl(am_epaddr->gpu_specific.ze_sock, SIOCOUTQ, &pending) != 0) { + return psm3_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "error sending dev FDs: %s\n", strerror(errno)); + } + if (pending == 0) { + am_epaddr->gpu_specific.ze_sock_connected_state = ZE_SOCK_DEV_FDS_SENT_AND_RECD; + _HFI_CONNDBG("GPU dev FDs Send Completed to: %s\n", + psm3_epid_fmt_addr(am_epaddr->epaddr.epid, 0)); + close(am_epaddr->gpu_specific.ze_sock); + am_epaddr->gpu_specific.ze_sock = -1; + return PSM2_OK; + } + // be paranoid just in case 1st call to send_dev_fds for given + // epaddr gets here + if (! ptl->gpu_specific.ze_need_dev_fds_poll) + _HFI_CONNDBG("restart GPU dev FDs poll\n"); + ptl->gpu_specific.ze_need_dev_fds_poll = 1; + return PSM2_OK_NO_PROGRESS; + break; + } + + case ZE_SOCK_NOT_CONNECTED: + { + struct sockaddr_un sockaddr = {0}; + socklen_t len = sizeof(sockaddr); + psm2_epid_t peer_epid = am_epaddr->epaddr.epid; + int *fds, nfds; + + if (!ptl->gpu_specific.ze_need_dev_fds_poll) + _HFI_CONNDBG("restart GPU dev FDs poll\n"); + ptl->gpu_specific.ze_need_dev_fds_poll = 1; + + fds = psm3_oneapi_ze_get_dev_fds(&nfds); + + if ((am_epaddr->gpu_specific.ze_sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { + _HFI_ERROR("error creating GPU dev FDs AF_UNIX sock: %s\n", + strerror(errno)); + goto fail; + } + + sockaddr.sun_family = AF_UNIX; + snprintf(sockaddr.sun_path, 108, "/dev/shm/psm3_shm.ze_sock2.%ld.%s", + (long int) getuid(), psm3_epid_fmt_internal(peer_epid, 0)); + + if (connect(am_epaddr->gpu_specific.ze_sock, (struct sockaddr *) &sockaddr, len) < 0) { + _HFI_ERROR("GPU dev FDs connect to %s (via %s) failed: %s\n", + psm3_epid_fmt_addr(am_epaddr->epaddr.epid, 0), + sockaddr.sun_path, strerror(errno)); + goto fail; + } else { + int ret; + ret = send(am_epaddr->gpu_specific.ze_sock, &ptl->epid, sizeof(ptl->epid), 0); + if (ret < 0) { + _HFI_ERROR("GPU dev FDs send to %s (via %s) failed: %s\n", + psm3_epid_fmt_addr(am_epaddr->epaddr.epid, 0), + sockaddr.sun_path, strerror(errno)); + goto fail; + } + + ret = psm3_oneapi_ze_sendmsg_fds(am_epaddr->gpu_specific.ze_sock, fds, nfds, peer_epid); + if (ret <= 0) { + /* ret is -errno */ + _HFI_ERROR("GPU dev FDs sendmsg to %s (via %s) failed: %s\n", + psm3_epid_fmt_addr(am_epaddr->epaddr.epid, 0), + sockaddr.sun_path, strerror(-ret)); + goto fail; + } + am_epaddr->gpu_specific.ze_sock_connected_state = ZE_SOCK_DEV_FDS_SENT; + _HFI_CONNDBG("%d GPU dev FDs Posted Send to: %s (via %s)\n", + nfds, psm3_epid_fmt_addr(am_epaddr->epaddr.epid, 0), + sockaddr.sun_path); + return PSM2_OK; + } + /* NOTREACHED */ + break; + } + + default: + return PSM2_INTERNAL_ERR; + break; + } + /* NOTREACHED */ + return PSM2_INTERNAL_ERR; + +fail: + if (am_epaddr->gpu_specific.ze_sock >= 0) + close(am_epaddr->gpu_specific.ze_sock); + am_epaddr->gpu_specific.ze_sock = -1; + return PSM2_INTERNAL_ERR; +} + +// simple test if dev_fds bi-dir exchange completed for given epaddr +// 1 = yes, 0 = no +static int psm3_oneapi_ze_dev_fds_exchanged(struct am_epaddr *am_epaddr) +{ + return 
(am_epaddr->gpu_specific.ze_sock_connected_state == ZE_SOCK_DEV_FDS_SENT_AND_RECD + && am_epaddr->gpu_specific.ze_num_peer_fds) ; +} + +/* + * psm3_oneapi_ze_check_dev_fds_exchanged - check that dev fds have been bi-dir exchanged + * with given peer. Poll to try and move forward as needed. + * + * connect state ZE_SOCK_DEV_FDS_SENT_AND_RECD indicates peer has received + * our send of dev_fds + * + * num_peer_fds indicates if we received peer's fds. + * + * if both are satisfied, exchange is complete, return PSM2_OK + * + *Returns: + * PSM2_OK - both are done + * PSM2_OK_NO_PROGRESS - more work needed + * other - error + */ +static psm2_error_t psm3_oneapi_ze_check_dev_fds_exchanged(struct ptl_am *ptl, struct am_epaddr *am_epaddr) +{ + psm2_error_t err; + psm2_error_t ret; + + psmi_assert(am_epaddr); + psmi_assert(! psm3_epid_zero_internal(am_epaddr->epaddr.epid)); + + if (psm3_oneapi_ze_dev_fds_exchanged(am_epaddr)) + return PSM2_OK; + + if (am_epaddr->cstate_outgoing != AMSH_CSTATE_OUTGOING_ESTABLISHED + && am_epaddr->cstate_incoming != AMSH_CSTATE_INCOMING_ESTABLISHED) + return PSM2_OK_NO_PROGRESS; + + // try to move forward 1 step + err = psm3_oneapi_ze_send_dev_fds(ptl, am_epaddr); + if (am_epaddr->gpu_specific.ze_sock_connected_state == ZE_SOCK_DEV_FDS_SENT_AND_RECD) + err = PSM2_OK; + else /* err will be NO_PROGRESS or worse */ + err = psm3_error_cmp(err, PSM2_OK_NO_PROGRESS); + + // only poll recv if we need to + ret = PSM2_OK_NO_PROGRESS; // keep KW happy + if (am_epaddr->gpu_specific.ze_num_peer_fds == 0) + ret = psm3_oneapi_ze_receive_dev_fds(ptl); + if (am_epaddr->gpu_specific.ze_num_peer_fds) + ret = PSM2_OK; + + /* worst err, NO_PROGRESS is worse than PSM2_OK */ + return psm3_error_cmp(ret, err); +} + +// check if all successful epid/epaddr in req have exchanged GPU dev FDs +// when called it assumes all the good epid have completed so it does not +// check failed epid and just treats them as done for this phase +// return: +// PSM2_OK - all that can be done are done +// PSM2_OK_NO_PROGRESS - more to be done +static psm2_error_t +psm3_oneapi_ze_shm_ep_connreq_poll_dev_fds(struct ptl_am *ptl, struct am_ptl_connection_req *req) +{ + int num_left = 0; + int i; + + for (i = 0; i < req->numep; i++) { + if (req->epid_mask[i] == AMSH_CMASK_NONE) + continue; + if (req->epid_mask[i] != AMSH_CMASK_DONE || req->errors[i]) + continue; + psmi_assert(req->epaddr[i]); + psmi_assert(! 
psm3_epid_zero_internal(req->epaddr[i]->epid)); + if (PSM2_OK != psm3_oneapi_ze_check_dev_fds_exchanged(ptl, (struct am_epaddr *)(req->epaddr[i]))) + num_left++; + } + if (num_left == 0) + return PSM2_OK; + else + return PSM2_OK_NO_PROGRESS; // not done everyone yet +} + +/* + * psm3_oneapi_ze_poll_dev_fds_exchanged - poll to make forward progress on + * GPU dev FDs exchange + * + * Loop through the epaddrs in am_ep and check_dev_fds_exchanged + * + * Returns: + * PSM2_OK - we found some work to do and made progress + * PSM2_OK_NO_PROGRESS - didn't find anything to do + * other - error + */ + +static psm2_error_t psm3_oneapi_ze_poll_dev_fds_exchange(struct ptl_am *ptl) +{ + psm2_error_t err = PSM2_OK_NO_PROGRESS; + psm2_error_t ret; + int i; + int num_left = 0; + + err = psm3_oneapi_ze_receive_dev_fds(ptl); + + for (i = 0; i <= ptl->max_ep_idx; i++) { + am_epaddr_t *am_epaddr = (am_epaddr_t *)ptl->am_ep[i].epaddr; + + if (!am_epaddr || psm3_epid_zero_internal(ptl->am_ep[i].epid)) + continue; + + if (psm3_oneapi_ze_dev_fds_exchanged(am_epaddr)) + continue; + + num_left++; // causes one extra poll if complete now below, but no harm + + // don't try if uni-dir REQ/REP is incomplete + if (am_epaddr->cstate_outgoing != AMSH_CSTATE_OUTGOING_ESTABLISHED + && am_epaddr->cstate_incoming != AMSH_CSTATE_INCOMING_ESTABLISHED) + continue; + + // try to move forward 1 step + ret = psm3_oneapi_ze_send_dev_fds(ptl, am_epaddr); + if (ret > PSM2_OK_NO_PROGRESS) + err = psm3_error_cmp(ret, err); + else if (ret == PSM2_OK && err == PSM2_OK_NO_PROGRESS) + err = ret; + } + if (num_left == 0 && ptl->gpu_specific.ze_need_dev_fds_poll) + _HFI_CONNDBG("stop GPU dev FDs poll\n"); + ptl->gpu_specific.ze_need_dev_fds_poll = (num_left != 0); + + return err; +} + +static void psm3_oneapi_ze_sock_detach(struct ptl_am *ptl) +{ + if (ptl->gpu_specific.ze_ipc_socket >= 0) + close(ptl->gpu_specific.ze_ipc_socket); + ptl->gpu_specific.ze_ipc_socket = -1; + if (ptl->gpu_specific.ze_listen_sockname) { + unlink(ptl->gpu_specific.ze_listen_sockname); + psmi_free(ptl->gpu_specific.ze_listen_sockname); + } + ptl->gpu_specific.ze_listen_sockname = NULL; +} +#endif /* not PSM_HAVE_PIDFD */ + +static psm2_error_t psm3_oneapi_ze_shm_init(struct ptl_am *ptl, + psm2_mq_stats_t *stats) +{ +#ifndef PSM_HAVE_PIDFD + psm2_error_t err; + + ptl->gpu_specific.ze_ipc_socket = -1; + if ((err = psm3_onapi_ze_init_ipc_socket(ptl)) != PSM2_OK) + return err; + if ((err = psm3_onapi_ze_init_fds()) != PSM2_OK) + return err; +#endif + +#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) + // TBD - should we have generic names for these env variables + // PSM3_GPU_MEMCACHE_ENABLED, PSM3_GPU_MEMCACHE_SIZE? 
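Editor's note: all of the AF_UNIX and SCM_RIGHTS machinery above is compiled only when PSM_HAVE_PIDFD is not defined; otherwise psm3_oneapi_ze_shm_epaddr_add (below) simply opens a pidfd for the peer process. A hedged sketch of the two raw syscalls involved: pidfd_open matches the call in the code below, while pidfd_getfd is shown only as an assumption about how a peer's dmabuf fd could later be duplicated without any socket exchange (that path is outside this hunk):

#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

/* pidfd sketch (Linux >= 5.6).  pidfd_open mirrors the syscall used by
 * shm_epaddr_add below; pidfd_getfd is an assumption about how a remote
 * fd could be duplicated directly.  Raw syscalls are used here to match
 * the style of the call in the hunk below. */
static int open_peer_pidfd(pid_t pid)
{
	return (int)syscall(SYS_pidfd_open, pid, 0);
}

static int dup_peer_fd(int pidfd, int remote_fd)
{
	return (int)syscall(SYS_pidfd_getfd, pidfd, remote_fd, 0);
}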
+ union psmi_envvar_val env_memcache_enabled; + psm3_getenv("PSM3_ONEAPI_MEMCACHE_ENABLED", + "PSM oneapi ipc memhandle cache enabled (default is enabled)", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)1, &env_memcache_enabled); + if (env_memcache_enabled.e_uint) { + union psmi_envvar_val env_memcache_size; + psm3_getenv("PSM3_ONEAPI_MEMCACHE_SIZE", + "Size of the oneapi ipc memhandle cache ", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)ONEAPI_MEMHANDLE_CACHE_SIZE, + &env_memcache_size); + return psm3_oneapi_ze_memhandle_cache_alloc( + (psm3_oneapi_ze_memhandle_cache_t *)&ptl->memhandle_cache, + env_memcache_size.e_uint, stats); +#endif + } + return PSM2_OK; +} + +static void psm3_oneapi_ze_shm_finalize(struct ptl_am *ptl) +{ +#ifndef PSM_HAVE_PIDFD + psm3_oneapi_ze_sock_detach(ptl); +#endif + if (ptl->memhandle_cache) + psm3_oneapi_ze_memhandle_cache_free(ptl->memhandle_cache); + ptl->memhandle_cache = NULL; + return; +} + +static psm2_error_t psm3_oneapi_ze_shm_epaddr_add(struct ptl_am *ptl, struct am_epaddr *am_epaddr) +{ +#ifdef PSM_HAVE_PIDFD + am_epaddr->gpu_specific.ze_pidfd = syscall(SYS_pidfd_open, ptl->am_ep[am_epaddr->shmidx].pid, 0); + if (am_epaddr->gpu_specific.ze_pidfd < 0) { + _HFI_ERROR("pidfd_open failed: pid %u, ret %d (%s)\n", + ptl->am_ep[am_epaddr->shmidx].pid, + am_epaddr->gpu_specific.ze_pidfd, + strerror(errno)); + return PSM2_NO_MEMORY; + } +#else + am_epaddr->gpu_specific.ze_num_peer_fds = 0; + { + int i; + for (i=0; i < MAX_ZE_DEVICES; i++) + am_epaddr->gpu_specific.ze_peer_fds[i] = -1; + } + am_epaddr->gpu_specific.ze_sock_connected_state = ZE_SOCK_NOT_CONNECTED; + am_epaddr->gpu_specific.ze_sock = -1; +#endif + return PSM2_OK; +} + +static void psm3_oneapi_ze_shm_epaddr_free(struct am_epaddr *am_epaddr) +{ +#ifdef PSM_HAVE_PIDFD + if (am_epaddr->gpu_specific.ze_pidfd >= 0) + close(am_epaddr->gpu_specific.ze_pidfd); +#else + { + int i; + for (i=0; i < MAX_ZE_DEVICES; i++) + if (am_epaddr->gpu_specific.ze_peer_fds[i] >= 0) + close(am_epaddr->gpu_specific.ze_peer_fds[i]); + } + if (am_epaddr->gpu_specific.ze_sock >= 0) + close(am_epaddr->gpu_specific.ze_sock); +#endif +} + +static int psm3_oneapi_ze_shm_dev_fds_needed() +{ +#ifndef PSM_HAVE_PIDFD + return 1; +#else + return 0; +#endif +} + +static void psm3_oneapi_ze_shm_dev_fds_send(struct ptl_am *ptl, struct am_epaddr *am_epaddr) +{ +#ifndef PSM_HAVE_PIDFD + psm3_oneapi_ze_send_dev_fds(ptl, am_epaddr); +#endif +} + +static psm2_error_t psm3_oneapi_ze_shm_dev_fds_connreq_poll(struct ptl_am *ptl, struct am_ptl_connection_req *req) +{ +#ifndef PSM_HAVE_PIDFD + return psm3_oneapi_ze_shm_ep_connreq_poll_dev_fds(ptl, req); +#else + return PSM2_OK; +#endif +} + +static psm2_error_t psm3_oneapi_ze_shm_dev_fds_check_exchanged(struct ptl_am *ptl, struct am_ptl_connection_req *req, int index) +{ +#ifndef PSM_HAVE_PIDFD + // late connect establish, check once to + // see if have GPU dev fds, if not, this one + // missed the timelimit and timesout + if (req->op == AM_PTL_OP_CONNECT) + _HFI_CONNDBG("late established, special GPU dev FDs poll\n"); + if (req->op == AM_PTL_OP_CONNECT && + PSM2_OK != psm3_oneapi_ze_check_dev_fds_exchanged(ptl, (struct am_epaddr *)(req->epaddr[index]))) + return PSM2_OK_NO_PROGRESS; + else +#endif + return PSM2_OK; +} + +static psm2_error_t psm3_oneapi_ze_shm_dev_fds_poll(struct ptl_am *ptl, psm2_error_t res) +{ +#ifndef PSM_HAVE_PIDFD + // play err safe, callers ignore errors or expect just OK or NO_PROGRESS + if 
(ptl->gpu_specific.ze_need_dev_fds_poll + && psm3_oneapi_ze_poll_dev_fds_exchange(ptl) != PSM2_OK_NO_PROGRESS) + return PSM2_OK; +#endif + return res; +} + +// On Sender, place the IPC handle in the RTS +// We put offset in the basic "args" parameters and the actual +// IPC handle as payload due to it's size +// Callers expect payload_size >0 when using GPU IPC and key off non-zero +// payload size in RTS to identify a GPU IPC RTS +// Save in the req the needed information about IPC resources allocated here +// so psm3_oneapi_ze_process_cts and release them. +static psm2_error_t psm3_oneapi_ze_shm_build_rts(struct ptl_am *ptl, + psm2_mq_req_t req, int *narg_p, + psm2_amarg_t *args, void **payload_p, size_t *payload_size_p, + union am_gpu_rts_payload *info) +{ +#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) +#ifndef PSM_HAVE_PIDFD + int fd; + int *devfds; + int numfds; + int device_index = 0; + struct drm_prime_handle open_fd = {0, 0, 0}; +#endif + uint64_t handle_fd = 0; + size_t total; + void *buf_base_ptr; + uint64_t alloc_id; + void *buf = req->req_data.buf; + +#ifndef PSM_HAVE_PIDFD + devfds = psm3_oneapi_ze_get_dev_fds(&numfds); + device_index = psm3_oneapi_ze_cur_dev - psm3_oneapi_ze_devices; /* index (offset) in table */ + args[5].u32w0 = device_index; + fd = devfds[device_index]; +#endif + PSM3_ONEAPI_ZE_CALL(zeMemGetAddressRange, psm3_oneapi_ze_context, buf, &buf_base_ptr, &total); + + /* Offset in GPU buffer from which we copy data, we have to + * send it separetly because this offset is lost + * when zeMemGetIpcHandle is called */ + req->gpu_specific.ze_ipc_offset = (uint32_t)((uintptr_t)buf - (uintptr_t)buf_base_ptr); + args[2].u32w0 = (uint32_t)req->gpu_specific.ze_ipc_offset; + alloc_id = psm3_oneapi_ze_get_alloc_id(buf_base_ptr, &info->ze.ze_alloc_type); +#ifndef PSM_HAVE_PIDFD + args[5].u32w1 = (uint32_t)alloc_id; /* 32-bit for now */ +#else + args[5].u64w0 = alloc_id; +#endif + + PSM3_ONEAPI_ZE_CALL(zeMemGetIpcHandle, psm3_oneapi_ze_context, + (const void *)buf_base_ptr, &req->gpu_specific.ze_ipc_handle); +#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE + psm3_oneapi_ze_get_dmabuf_fd((const void *)buf, &handle_fd); +#else + memcpy(&handle_fd, &req->gpu_specific.ze_ipc_handle, sizeof(uint32_t)); +#endif + req->gpu_specific.ze_handle_attached = 1; +#ifndef PSM_HAVE_PIDFD + open_fd.fd = (uint32_t)handle_fd; + if (ioctl(fd, DRM_IOCTL_PRIME_FD_TO_HANDLE, &open_fd) < 0) { + _HFI_ERROR("ioctl failed for DRM_IOCTL_PRIME_FD_TO_HANDLE: for fd %d: %s", open_fd.fd, strerror(errno)); + psm3_handle_error(ptl->ep, PSM2_INTERNAL_ERR, + "ioctl failed for DRM_IOCTL_PRIME_FD_TO_HANDLE for fd %d: errno=%d", + open_fd.fd, errno); + return PSM2_INTERNAL_ERR; + } + _HFI_VDBG("FD_TO_HANDLE: buf %p total 0x%lx base %p alloc_id %lu gem_handle %u\n", + buf, total, buf_base_ptr, alloc_id, open_fd.handle); + info->ze.ze_handle = open_fd.handle; + *narg_p = 6; + *payload_p = (void *)info; + *payload_size_p = sizeof(struct am_oneapi_ze_rts_payload); + // for DRM approach once we have the open_fd we could + // PutIpcHandle(ipc_handle) since open_fd has a reference + // however since that is a legacy mode, we focus on the + // prefered mode and have both delay the Put until CTS received +#else + info->ze.ze_handle = (uint32_t)handle_fd; + *narg_p = 6; + *payload_p = (void *)info; + *payload_size_p = sizeof(struct am_oneapi_ze_rts_payload); +#endif /* PSM_HAVE_PIDFD */ + return PSM2_OK; +#else // if no drm, error out as oneapi ipc handles don't work without drm + return PSM2_INTERNAL_ERR; +#endif // 
defined(HAVE_DRM) || defined(HAVE_LIBDRM) +} + +// On receiver, pull IPC information out of the RTS which our peer build using +// psm3_oneapi_ze_shm_build_rts. Information is saved to the req for subsequent +// processing after tag matching via psm3_oneapi_ze_shm_rtsmatch +static void psm3_oneapi_ze_shm_process_rts(psm2_mq_req_t req, void *buf, + size_t len, int narg, psm2_amarg_t *args) +{ + struct am_oneapi_ze_rts_payload *info; + + psmi_assert(narg == 6); + info = (struct am_oneapi_ze_rts_payload *)buf; + psmi_assert(len == sizeof(struct am_oneapi_ze_rts_payload)); + req->gpu_specific.ze_handle = info->ze_handle; + req->gpu_specific.ze_alloc_type = info->ze_alloc_type; + req->gpu_specific.ze_handle_attached = 1; + req->gpu_specific.ze_ipc_offset = args[2].u32w0; +#ifndef PSM_HAVE_PIDFD + req->gpu_specific.ze_device_index = args[5].u32w0; + req->gpu_specific.ze_alloc_id = args[5].u32w1; +#else + req->gpu_specific.ze_alloc_id = args[5].u64w0; +#endif +} + +// On receiver, use GPU IPC to copy data from the sender to this process +// This is called on the receiver after psm3_oneapi_ze_process_rts has parsed +// the incoming RTS and tag matching has matched the RTS with a receive buffer +// and populated the req with information about the matched receiver buffer +static int psm3_oneapi_ze_shm_rtsmatch(struct ptl_am *ptl, psm2_mq_req_t req) +{ + if (req->gpu_specific.ze_handle_attached) { + void *buf_ptr = psm3_oneapi_ze_memhandle_acquire( + ptl->memhandle_cache, + req->rts_sbuf - req->gpu_specific.ze_ipc_offset, req->gpu_specific.ze_handle, + req->rts_peer, +#ifndef PSM_HAVE_PIDFD + req->gpu_specific.ze_device_index, req->gpu_specific.ze_alloc_id, +#else + 0, req->gpu_specific.ze_alloc_id, +#endif + req->gpu_specific.ze_alloc_type); + psmi_assert_always(buf_ptr != NULL); + buf_ptr = (uint8_t *)buf_ptr + req->gpu_specific.ze_ipc_offset; + /* zeMemcpy into the receive side buffer + * based on its location */ + _HFI_VDBG("Copying src %p (offset 0x%x) dst %p msg_len %u\n", + buf_ptr, req->gpu_specific.ze_ipc_offset, + req->req_data.buf, req->req_data.recv_msglen); + if (req->is_buf_gpu_mem) { + /*PSM3_GPU_MEMCPY_DTOD*/ + psm3_oneapi_ze_memcpy_DtoD(req->req_data.buf, buf_ptr, + req->req_data.recv_msglen); + // can skip sychronize, it's a noop for oneapi_ze + //PSM3_GPU_SYNCHRONIZE_MEMCPY(); + //psm3_oneapi_ze_synchronize_memcpy(); + } else { + /*PSM3_GPU_MEMCPY_DTOH*/ + psm3_oneapi_ze_memcpy_DtoH(req->req_data.buf, buf_ptr, + req->req_data.recv_msglen); + } + psm3_oneapi_ze_memhandle_release(ptl->memhandle_cache, + (uint8_t *)buf_ptr - req->gpu_specific.ze_ipc_offset); + req->gpu_specific.ze_handle_attached = 0; + return 1; + } + return 0; +} + +// On sender, we have now received the CTS corresponding to an RTS +// we may have built in psm3_oneapi_ze_build_rts. All we need to do here is release +// the resources we allocated in psm3_oneapi_ze_build_rts. We saved the necessary +// information tracking those resources in the send req. 
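Editor's note: in the non-pidfd path of psm3_oneapi_ze_shm_build_rts above, the dmabuf fd obtained from the Level Zero IPC handle is converted into a GEM handle against the sender's DRM device fd, so only a 32-bit handle has to ride in the RTS payload; since the peer holds a duplicate of that same device fd from the earlier exchange, the handle appears to be meaningful on the receiving side as well. A self-contained sketch of the conversion (the function name is illustrative):

#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/drm.h>   /* struct drm_prime_handle, DRM_IOCTL_PRIME_FD_TO_HANDLE */

/* Sketch of the dmabuf-fd to GEM-handle conversion done in build_rts
 * above.  The kernel fills in req.handle, which is valid for any holder
 * of this DRM device file description. */
static int dmabuf_fd_to_gem_handle(int device_fd, int dmabuf_fd, uint32_t *gem_handle)
{
	struct drm_prime_handle req = { .fd = dmabuf_fd };

	if (ioctl(device_fd, DRM_IOCTL_PRIME_FD_TO_HANDLE, &req) < 0)
		return -1;
	*gem_handle = req.handle;               /* filled in by the kernel */
	return 0;
}

As the hunk's own comment notes, the DRM path could release the IPC handle as soon as the GEM handle exists, but both paths defer the put until the CTS arrives, keeping the legacy and pidfd flows symmetric.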
+// Returns: +// 0 - the req was not for a GPU IO +// 1 - the req was for a GPU IO and we have released the resources +static int psm3_oneapi_ze_shm_process_cts(psm2_mq_req_t req) +{ + if (req->gpu_specific.ze_handle_attached) { + psm3_oneapi_ze_put_ipc_handle(req->req_data.buf - req->gpu_specific.ze_ipc_offset, + req->gpu_specific.ze_ipc_handle); + req->gpu_specific.ze_handle_attached = 0; + return 1; + } + return 0; +} +// end of RTS and CTS processing functions for PSM3_DEVICES "shm" +//*************************************************************************** + +static psm2_error_t psm3_oneapi_ze_get_cuda_permitted(struct psm2_ep *ep, bool *enable) +{ + *enable = true; + return PSM2_OK; +} + +static psm2_error_t psm3_oneapi_ze_set_cuda_permitted(struct psm2_ep *ep, bool enable) +{ + return PSM2_OK; +} + +static bool psm3_oneapi_ze_is_memcpy_permitted(struct psm2_ep *ep) +{ + return true; +} + +struct psm3_gpu_hal psm3_oneapi_ze_hal = { + .type = "oneapi-ze", +#ifdef PSM_HAVE_RNDV_MOD +#if defined(RV_GPU_ABI_VER_MINOR_0) && defined(RV_GPU_ABI_VER_MAJOR_1) && defined(RV_GPU_ABI_VER_MINOR_1) + // RV GPU API <= 1.0 does not have track GPU alloc_id + // RV GPU API <= 1.1 requires munmap_unpin + // so if RV GPU API <= 1.1, do not allow GPUDirect + .rv_major_rev_fail = RV_GPU_ABI_VER_MAJOR_1, + .rv_minor_rev_fail = RV_GPU_ABI_VER_MINOR_1, +#else + /* not defined if compile against older RV header */ +#error "Intel GPU Support requires version 1.1 or newer rv_user_ioctls.h header" +#endif + + .rv_capability_expected = RV_CAP_INTEL_GPU, + .hal_cap_expected = PSM_HAL_CAP_INTEL_GPU, +#endif /* PSM_HAVE_RNDV_MOD */ + + .ghfp_initialize = psm3_oneapi_ze_initialize, + .ghfp_finalize = psm3_oneapi_ze_finalize, + .ghfp_ep_open = psm3_oneapi_ze_ep_open, + .ghfp_ep_close = psm3_oneapi_ze_ep_close, + .ghfp_identify = psm3_oneapi_ze_identify, + .ghfp_verify_GPU_capabilities = psm3_oneapi_ze_verify_GPU_capabilities, + .ghfp_p2p_supported = psm3_oneapi_ze_p2p_supported, + .ghfp_gpudirect_supported = psm3_oneapi_ze_gpudirect_supported, + .ghfp_using_rv_for_mrs = psm3_oneapi_ze_using_rv_for_mrs, + .ghfp_get_pci_addr = psm3_oneapi_ze_get_pci_addr, +#ifdef PSM_HAVE_RNDV_MOD + .ghfp_min_bar_size = psm3_oneapi_ze_min_bar_size, + .ghfp_check_phys_addr = psm3_oneapi_ze_check_phys_addr, + .ghfp_roundup_gdrcopy = psm3_oneapi_ze_roundup_gdrcopy, +#ifdef PSM_HAVE_REG_MR + .ghfp_roundup_rv_reg_mr = psm3_oneapi_ze_roundup_rv_reg_mr, + .ghfp_init_rv_reg_mr_params = psm3_oneapi_ze_init_rv_reg_mr_params, +#endif + .ghfp_init_rv_pin_mmap_params = psm3_oneapi_ze_init_rv_pin_mmap_params, + .ghfp_rv_reg_mmap_cleanup = psm3_oneapi_ze_rv_reg_mmap_cleanup, +#endif /* PSM_HAVE_RNDV_MOD */ +#ifdef PSM_HAVE_REG_MR + .ghfp_cmp_mr = psm3_oneapi_ze_cmp_mr, + .ghfp_init_mr = psm3_oneapi_ze_init_mr, +#endif + .ghfp_fetch_ctxt = psm3_oneapi_ze_fetch_ctxt, + .ghfp_refresh_ctxt = psm3_oneapi_ze_refresh_ctxt, + .ghfp_register_hostmem = psm3_oneapi_ze_register_hostmem, + .ghfp_unregister_hostmem = psm3_oneapi_ze_unregister_hostmem, + .ghfp_is_gpu_mem = psm3_oneapi_ze_is_gpu_mem, + .ghfp_prepare_HtoD_memcpys = psm3_oneapi_ze_prepare_HtoD_memcpys, + .ghfp_prepare_DtoH_memcpys = psm3_oneapi_ze_prepare_DtoH_memcpys, + .ghfp_shutdown_HtoD_memcpys = psm3_oneapi_ze_shutdown_HtoD_memcpys, + .ghfp_shutdown_DtoH_memcpys = psm3_oneapi_ze_shutdown_DtoH_memcpys, + .ghfp_memcpy_HtoD_start = psm3_oneapi_ze_memcpy_HtoD_start, + .ghfp_memcpy_DtoH_start = psm3_oneapi_ze_memcpy_DtoH_start, + .ghfp_memcpy_done = psm3_oneapi_ze_memcpy_done, + 
.ghfp_hostbuf_lazy_init = psm3_oneapi_ze_hostbuf_lazy_init, + .ghfp_hostbuf_reset = psm3_oneapi_ze_hostbuf_reset, + .ghfp_hostbuf_destroy = psm3_oneapi_ze_hostbuf_destroy, + .ghfp_memcpy_DtoD = psm3_oneapi_ze_memcpy_DtoD, + .ghfp_memcpy_HtoD = psm3_oneapi_ze_memcpy_HtoD, + .ghfp_memcpy_DtoH = psm3_oneapi_ze_memcpy_DtoH, + .ghfp_memcpy = psm3_oneapi_ze_memcpy, + .ghfp_synchronize_memcpy = psm3_oneapi_ze_synchronize_memcpy, + .ghfp_mark_buf_synchronous = psm3_oneapi_ze_mark_buf_synchronous, + .ghfp_host_alloc = psm3_oneapi_ze_host_alloc, + .ghfp_host_free = psm3_oneapi_ze_host_free, + .ghfp_gpu_addr_send_mr = psm3_oneapi_ze_gpu_addr_send_mr, + .ghfp_gpu_addr_recv_mr = psm3_oneapi_ze_gpu_addr_recv_mr, + // functions for PSM3_DEVICES "shm" RTS/CTS processing to enable + // use of GPU specific scale-up transfers within the given server + .ghfp_shm_init = psm3_oneapi_ze_shm_init, + .ghfp_shm_finalize = psm3_oneapi_ze_shm_finalize, + .ghfp_shm_epaddr_add = psm3_oneapi_ze_shm_epaddr_add, + .ghfp_shm_epaddr_free = psm3_oneapi_ze_shm_epaddr_free, + .ghfp_shm_dev_fds_needed = psm3_oneapi_ze_shm_dev_fds_needed, + .ghfp_shm_dev_fds_send = psm3_oneapi_ze_shm_dev_fds_send, + .ghfp_shm_dev_fds_connreq_poll = psm3_oneapi_ze_shm_dev_fds_connreq_poll, + .ghfp_shm_dev_fds_check_exchanged = psm3_oneapi_ze_shm_dev_fds_check_exchanged, + .ghfp_shm_dev_fds_poll = psm3_oneapi_ze_shm_dev_fds_poll, + .ghfp_shm_build_rts = psm3_oneapi_ze_shm_build_rts, + .ghfp_shm_process_rts = psm3_oneapi_ze_shm_process_rts, + .ghfp_shm_rtsmatch = psm3_oneapi_ze_shm_rtsmatch, + .ghfp_shm_process_cts = psm3_oneapi_ze_shm_process_cts, + .ghfp_get_cuda_permitted = psm3_oneapi_ze_get_cuda_permitted, + .ghfp_set_cuda_permitted = psm3_oneapi_ze_set_cuda_permitted, + .ghfp_is_memcpy_permitted = psm3_oneapi_ze_is_memcpy_permitted, +}; + +#endif /* PSM_ONEAPI */ diff --git a/prov/psm3/psm3/hal_sockets/sockets_ep.c b/prov/psm3/psm3/hal_sockets/sockets_ep.c old mode 100755 new mode 100644 index ce7ddb61bc3..8943cbf511d --- a/prov/psm3/psm3/hal_sockets/sockets_ep.c +++ b/prov/psm3/psm3/hal_sockets/sockets_ep.c @@ -893,7 +893,7 @@ psm3_ep_open_tcp_internal(psm2_ep_t ep, int unit, int port, } #ifdef RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* This function is only called when GPUDirect is enabled */ static psm2_error_t open_rv(psm2_ep_t ep, psm2_uuid_t const job_key) { @@ -908,9 +908,8 @@ static psm2_error_t open_rv(psm2_ep_t ep, psm2_uuid_t const job_key) // GPU Direct is enabled and we need a GPU Cache loc_info.rdma_mode = RV_RDMA_MODE_GPU_ONLY; -#ifdef PSM_ONEAPI - psm3_oneapi_ze_can_use_zemem(); -#endif + + PSM3_GPU_USING_RV_FOR_MRS(); // need portnum for rdma_mode KERNEL or (USER|GPU) loc_info.port_num = ep->portnum; @@ -932,17 +931,14 @@ static psm2_error_t open_rv(psm2_ep_t ep, psm2_uuid_t const job_key) #endif if (loc_info.capability & RV_CAP_GPU_DIRECT) psmi_hal_add_cap(PSM_HAL_CAP_GPUDIRECT); - if (loc_info.capability & RV_CAP_NVIDIA_GPU) - psmi_hal_add_cap(PSM_HAL_CAP_NVIDIA_GPU); - if (loc_info.capability & RV_CAP_INTEL_GPU) - psmi_hal_add_cap(PSM_HAL_CAP_INTEL_GPU); + PSM3_GPU_RV_SET_HAL_CAP(loc_info.capability); // sockets does not support PSM_HAL_CAP_GPUDIRECT_SDMA nor RDMA ep->rv_mr_cache_size = loc_info.mr_cache_size; ep->rv_gpu_cache_size = loc_info.gpu_cache_size; return PSM2_OK; } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #endif /* RNDV_MOD */ psm2_error_t @@ -954,7 +950,7 @@ psm3_ep_open_sockets(psm2_ep_t ep, int unit, int port, int addr_index, psm2_uuid ep->rdmamode = 
0; // no rendezvous RDMA for sockets // no MR cache, leave ep->mr_cache_mode as set by caller (NONE) #ifdef RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU ep->rv_gpu_cache_size = psmi_parse_gpudirect_rv_gpu_cache_size(0); #endif #endif @@ -997,9 +993,9 @@ psm3_ep_open_sockets(psm2_ep_t ep, int unit, int port, int addr_index, psm2_uuid _HFI_PRDBG("Using unit_id[%d] %s.\n", ep->unit_id, ep->dev_name); #ifdef RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* Open rv only when GPUDirect is enabled */ - if (PSMI_IS_GPU_ENABLED && psmi_parse_gpudirect() && + if (PSM3_GPU_IS_ENABLED && psmi_parse_gpudirect() && open_rv(ep, job_key) != PSM2_OK) { _HFI_ERROR("Unable to open rv for port %d of %s.\n", port, ep->dev_name); @@ -1007,7 +1003,7 @@ psm3_ep_open_sockets(psm2_ep_t ep, int unit, int port, int addr_index, psm2_uuid ep->dev_name = NULL; return PSM2_INTERNAL_ERR; } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #endif /* RNDV_MOD */ ep->wiremode = 0; // TCP vs UDP are separate EPID protocols ep->addr_index = addr_index; @@ -1041,7 +1037,7 @@ psm3_sockets_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) // defaults for SDMA thresholds. // sockets does not support Send DMA, so set large to disable. proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = ~0U; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU proto->iovec_gpu_thresh_eager = proto->iovec_gpu_thresh_eager_blocking = ~0U; #endif #endif @@ -1360,7 +1356,7 @@ void psm3_ep_free_sockets(psm2_ep_t ep) } } #ifdef RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (ep->rv) { psm3_rv_close(ep->rv); ep->rv = NULL; diff --git a/prov/psm3/psm3/hal_sockets/sockets_ep.h b/prov/psm3/psm3/hal_sockets/sockets_ep.h index 51fcd06f792..2cd4b6b467a 100644 --- a/prov/psm3/psm3/hal_sockets/sockets_ep.h +++ b/prov/psm3/psm3/hal_sockets/sockets_ep.h @@ -65,7 +65,7 @@ #include #ifdef RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #include #include #endif diff --git a/prov/psm3/psm3/hal_sockets/sockets_gdrcpy.c b/prov/psm3/psm3/hal_sockets/sockets_gdrcpy.c index 645dfd3ebd2..3dbe1fdb2f4 100644 --- a/prov/psm3/psm3/hal_sockets/sockets_gdrcpy.c +++ b/prov/psm3/psm3/hal_sockets/sockets_gdrcpy.c @@ -51,8 +51,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) #include "psm_user.h" + +#ifdef PSM_HAVE_GPU #include "psm2_hal.h" #include #include @@ -66,21 +67,14 @@ psm3_sockets_gdr_convert_gpu_to_host_addr(unsigned long buf, size_t size, int flags, psm2_ep_t ep) { +#ifdef RNDV_MOD void *host_addr_buf; uintptr_t pageaddr; uint64_t pagelen; -#ifdef PSM_ONEAPI - PSMI_ONEAPI_ZE_CALL(zeMemGetAddressRange, ze_context, - (const void *)buf, (void **)&pageaddr, &pagelen); -#else - pageaddr = buf & GPU_PAGE_MASK; - pagelen = (uint64_t) (PSMI_GPU_PAGESIZE + - ((buf + size - 1) & GPU_PAGE_MASK) - pageaddr); -#endif + PSM3_GPU_ROUNDUP_GDRCOPY(buf, size, &pageaddr, &pagelen); _HFI_VDBG("buf=%p size=%zu pageaddr=%p pagelen=%"PRIu64" flags=0x%x ep=%p\n", (void *)buf, size, (void *)pageaddr, pagelen, flags, ep); -#ifdef RNDV_MOD ep = ep->mctxt_master; host_addr_buf = psm3_rv_pin_and_mmap(ep->rv, pageaddr, pagelen, IBV_ACCESS_IS_GPU_ADDR); if_pf (! 
host_addr_buf) { @@ -92,16 +86,12 @@ psm3_sockets_gdr_convert_gpu_to_host_addr(unsigned long buf, return NULL; } //_HFI_ERROR("pinned buf=%p size=%zu pageaddr=%p pagelen=%u flags=0x%x ep=%p, @ %p\n", (void *)buf, size, (void *)pageaddr, pagelen, flags, ep, host_addr_buf); + return (void *)((uintptr_t)host_addr_buf + (buf - pageaddr)); #else psmi_assert_always(0); // unimplemented, should not get here - host_addr_buf = NULL; + return NULL; #endif /* RNDV_MOD */ -#ifdef PSM_ONEAPI - return (void *)((uintptr_t)host_addr_buf + (buf - pageaddr)); -#else - return (void *)((uintptr_t)host_addr_buf + (buf & GPU_PAGE_OFFSET_MASK)); -#endif } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #endif /* PSM_SOCKETS */ diff --git a/prov/psm3/psm3/hal_sockets/sockets_hal.c b/prov/psm3/psm3/hal_sockets/sockets_hal.c index dd9ec3735dc..3251784a34f 100644 --- a/prov/psm3/psm3/hal_sockets/sockets_hal.c +++ b/prov/psm3/psm3/hal_sockets/sockets_hal.c @@ -63,17 +63,13 @@ #include "sockets_hal_inline_i.h" #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) -#define SOCKET_GPU_THRESH_RNDV (~(uint32_t)0) -#endif - static int psm3_hfp_sockets_initialize(psmi_hal_instance_t *phi, int devid_enabled[PTL_MAX_INIT]) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // testing on HED-2629 suggests turning off RNDV can help // latency for messages in size 8-256 KB - psm3_gpu_thresh_rndv = SOCKET_GPU_THRESH_RNDV; + psm3_gpu_thresh_rndv = (~(uint32_t)0); #endif /* we initialize a few HAL software specific capabilities which * are known before context_open can open RV or parse HAL specific @@ -100,40 +96,19 @@ static const char* psm3_hfp_sockets_identify(void) { static char buf[100]; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) /* rv module only applicable to sockets for CUDA builds */ -#ifdef RNDV_MOD -/* we test NVIDIA_GPU_DIRECT since that define - * controls the rv module ioctl header file interface - */ -#if defined(NVIDIA_GPU_DIRECT) || defined(INTEL_GPU_DIRECT) -#ifdef NVIDIA_GPU_DIRECT - snprintf(buf, sizeof(buf), "HAL: %s (%s) built against rv interface v%d.%d gpu v%d.%d cuda", -#else - snprintf(buf, sizeof(buf), "HAL: %s (%s) built against rv interface v%d.%d gpu v%d.%d oneapi-ze", -#endif - psmi_hal_get_hal_instance_name(), - psmi_hal_get_hal_instance_description(), - psm3_rv_get_user_major_bldtime_version(), - psm3_rv_get_user_minor_bldtime_version(), - psm3_rv_get_gpu_user_major_bldtime_version(), - psm3_rv_get_gpu_user_minor_bldtime_version()); -#else /* NVIDIA_GPU_DIRECT || INTEL_GPU_DIRECT */ - snprintf(buf, sizeof(buf), "HAL: %s (%s) built against rv interface v%d.%d", +/* rv module only applicable to sockets for GPU builds */ +#if defined(RNDV_MOD) && defined(PSM_HAVE_GPU) + snprintf(buf, sizeof(buf), "HAL: %s (%s) built against rv interface v%u.%u" PSM3_GPU_FMT_RV_GPU_VER, psmi_hal_get_hal_instance_name(), psmi_hal_get_hal_instance_description(), psm3_rv_get_user_major_bldtime_version(), - psm3_rv_get_user_minor_bldtime_version()); -#endif /* NVIDIA_GPU_DIRECT || INTEL_GPU_DIRECT */ -#else /* RNDV_MOD */ - snprintf(buf, sizeof(buf), "HAL: %s (%s)", - psmi_hal_get_hal_instance_name(), - psmi_hal_get_hal_instance_description()); -#endif /* RNDV_MOD */ -#else /* PSM_CUDA || PSM_ONEAPI */ + psm3_rv_get_user_minor_bldtime_version() + PSM3_GPU_OUT_RV_GPU_VER); +#else snprintf(buf, sizeof(buf), "HAL: %s (%s)", psmi_hal_get_hal_instance_name(), psmi_hal_get_hal_instance_description()); -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif return buf; } @@ -181,15 +156,14 @@ static void 
psm3_hfp_sockets_mq_init_defaults(struct psm2_mq *mq) // even without RDMA, the receiver controlled pacing helps scalability mq->rndv_nic_thresh = (~(uint32_t)0); // disable rendezvous mq->hfi_thresh_tiny = PSM_MQ_NIC_MAX_TINY; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (PSMI_IS_GPU_ENABLED) - mq->ips_gpu_window_rv_str = PSM_GPU_NIC_RNDV_WINDOW_STR; +#ifdef PSM_HAVE_GPU + mq->ips_gpu_window_rv_str = psm3_gpu_rndv_nic_window_default; #endif // we parse inet and rv_gpu_cache_size here so we can cache it // once per EP open, even if multi-rail or multi-QP (void) psm3_sockets_parse_inet(1); #ifdef RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU (void)psmi_parse_gpudirect_rv_gpu_cache_size(1); #endif #endif @@ -203,7 +177,7 @@ static void psm3_hfp_sockets_ep_open_opts_get_defaults(struct psm3_ep_open_opts opts->imm_size = 128; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static void psm3_hfp_sockets_gdr_open(void) { } @@ -257,16 +231,10 @@ static hfp_sockets_t psm3_sockets_hi = { .phi = { .hal_index = PSM_HAL_INDEX_SOCKETS, #ifdef USE_UDP - .description = "Sockets" + .description = "Sockets" PSM3_GPU_TYPES, #else - .description = "TCP Sockets" -#endif -#ifdef PSM_CUDA - " (cuda)" -#elif defined(PSM_ONEAPI) - " (oneapi-ze)" + .description = "TCP Sockets" PSM3_GPU_TYPES, #endif - , .nic_sys_class_path = "/sys/class/net", .nic_sys_port_path_fmt = PSM3_PORT_PATH_TYPE_NO_PORT, .params = {0}, @@ -286,7 +254,7 @@ static hfp_sockets_t psm3_sockets_hi = { .hfp_mq_init_defaults = psm3_hfp_sockets_mq_init_defaults, .hfp_ep_open_opts_get_defaults = psm3_hfp_sockets_ep_open_opts_get_defaults, .hfp_context_initstats = psm3_hfp_sockets_context_initstats, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU .hfp_gdr_open = psm3_hfp_sockets_gdr_open, #endif @@ -328,10 +296,10 @@ static hfp_sockets_t psm3_sockets_hi = { .hfp_ips_ibta_init = psm3_hfp_sockets_ips_ibta_init, .hfp_ips_path_rec_init = psm3_hfp_sockets_ips_path_rec_init, .hfp_ips_ptl_pollintr = psm3_hfp_sockets_ips_ptl_pollintr, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU .hfp_gdr_close = psm3_hfp_sockets_gdr_close, .hfp_gdr_convert_gpu_to_host_addr = psm3_hfp_sockets_gdr_convert_gpu_to_host_addr, -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ .hfp_get_port_index2pkey = psm3_hfp_sockets_get_port_index2pkey, .hfp_poll_type = psm3_hfp_sockets_poll_type, .hfp_spio_transfer_frame = psm3_hfp_sockets_spio_transfer_frame, diff --git a/prov/psm3/psm3/hal_sockets/sockets_hal.h b/prov/psm3/psm3/hal_sockets/sockets_hal.h old mode 100755 new mode 100644 index 6b8f260cb29..de825ffab47 --- a/prov/psm3/psm3/hal_sockets/sockets_hal.h +++ b/prov/psm3/psm3/hal_sockets/sockets_hal.h @@ -100,11 +100,11 @@ psm3_sockets_recvhdrq_init(const struct ips_epstate *epstate, psm2_error_t psm3_sockets_udp_recvhdrq_progress(struct ips_recvhdrq *recvq, bool force); psm2_error_t psm3_sockets_tcp_recvhdrq_progress(struct ips_recvhdrq *recvq, bool force); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU void* psm3_sockets_gdr_convert_gpu_to_host_addr(unsigned long buf, size_t size, int flags, psm2_ep_t ep); -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #define FD_STATE_NONE 0 #define FD_STATE_READY 1 diff --git a/prov/psm3/psm3/hal_sockets/sockets_hal_inline_i.h b/prov/psm3/psm3/hal_sockets/sockets_hal_inline_i.h index 9b703674147..7ee5798b547 100644 --- a/prov/psm3/psm3/hal_sockets/sockets_hal_inline_i.h +++ 
b/prov/psm3/psm3/hal_sockets/sockets_hal_inline_i.h @@ -449,7 +449,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_sockets_ips_ptl_pollintr( next_timeout, pollok, pollcyc, pollintr); } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static PSMI_HAL_INLINE void psm3_hfp_sockets_gdr_close(void) { } @@ -460,7 +460,7 @@ static PSMI_HAL_INLINE void* psm3_hfp_sockets_gdr_convert_gpu_to_host_addr(unsig return psm3_sockets_gdr_convert_gpu_to_host_addr(buf, size, flags, ep); } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #include "sockets_spio.c" @@ -469,7 +469,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_sockets_spio_transfer_frame(struct uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ) @@ -490,7 +490,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_sockets_spio_transfer_frame(struct return psm3_sockets_udp_spio_transfer_frame(proto, flow, scb, payload, length, isCtrlMsg, cksum_valid, cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , is_gpu_payload #endif ); @@ -499,7 +499,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_sockets_spio_transfer_frame(struct return psm3_sockets_tcp_spio_transfer_frame(proto, flow, scb, payload, length, isCtrlMsg, cksum_valid, cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , is_gpu_payload #endif ); @@ -510,7 +510,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_sockets_transfer_frame(struct ips_p uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ) @@ -518,7 +518,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_sockets_transfer_frame(struct ips_p return psm3_hfp_sockets_spio_transfer_frame(proto, flow, scb, payload, length, isCtrlMsg, cksum_valid, cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , is_gpu_payload #endif ); diff --git a/prov/psm3/psm3/hal_sockets/sockets_recvhdrq.c b/prov/psm3/psm3/hal_sockets/sockets_recvhdrq.c old mode 100755 new mode 100644 index de893265802..8091d5196da --- a/prov/psm3/psm3/hal_sockets/sockets_recvhdrq.c +++ b/prov/psm3/psm3/hal_sockets/sockets_recvhdrq.c @@ -236,7 +236,7 @@ psm3_sockets_tcp_preprocess_packet(psm2_ep_t ep, int fd, struct ips_recvhdrq_eve goto out; } -#if !defined(PSM_CUDA) && !defined(PSM_ONEAPI) +#ifndef PSM_HAVE_GPU psm2_mq_req_t req = psm3_mq_req_match(rcv_ev->proto->mq, (psm2_epaddr_t) &epstaddr->ipsaddr->msgctl->master_epaddr, (psm2_mq_tag_t *) rcv_ev->p_hdr->tag, 0); diff --git a/prov/psm3/psm3/hal_sockets/sockets_spio.c b/prov/psm3/psm3/hal_sockets/sockets_spio.c index 8d4fe1d65c7..64fe044b2fb 100644 --- a/prov/psm3/psm3/hal_sockets/sockets_spio.c +++ b/prov/psm3/psm3/hal_sockets/sockets_spio.c @@ -71,7 +71,7 @@ /*---------------------------------------------------------------------------*/ /* TCP specific code */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // set iov for remaining GPU payload data. It copies device memory to sockets_ep.sbuf // in word boundary and then set iov to use the sockets_ep.sbuf with proper offset. 
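Editor's note: the PAYLOAD_IOV macro defined just below stages GPU payloads through the per-endpoint host bounce buffer before sockets can transmit them. Stripped of the word-alignment and partial-send ("remaining") handling the real macro performs, the pattern is roughly the following sketch (set_payload_iov is illustrative; PSM3_GPU_MEMCPY_DTOH is the provider's own device-to-host copy macro):

#include <stdint.h>
#include <sys/uio.h>

/* Plain-C sketch of what PAYLOAD_IOV arranges for a GPU payload: device
 * memory cannot be handed to sendmsg() directly, so it is first staged
 * into the endpoint's host bounce buffer and the iovec is then pointed
 * at that staging copy. */
static void set_payload_iov(struct iovec *iov, void *payload, uint32_t len,
			    uint8_t *staging_buf, int is_gpu_payload)
{
	if (is_gpu_payload) {
		PSM3_GPU_MEMCPY_DTOH(staging_buf, payload, len);
		iov->iov_base = staging_buf;
	} else {
		iov->iov_base = payload;
	}
	iov->iov_len = len;
}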
#define PAYLOAD_IOV(iov, payload, payload_len, remaining, buf, is_gpu_payload) \ @@ -101,7 +101,7 @@ #endif // prepare msghdr for a message -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #define MSG_IOV(msg, header, payload, payload_len, remaining, buf, is_gpu_payload) \ if (likely(remaining > payload_len)) { \ msg.msg_iov[msg.msg_iovlen].iov_len = remaining - payload_len; \ @@ -323,7 +323,7 @@ psm3_sockets_tcp_sendpacing(struct ips_proto *proto, struct ips_flow *flow) static __inline__ psm2_error_t psm3_sockets_tcp_aux_send(psm2_ep_t ep, struct ips_flow *flow, struct ips_message_header *header, uint32_t *payload, uint32_t payload_len -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ) @@ -344,7 +344,7 @@ psm3_sockets_tcp_aux_send(psm2_ep_t ep, struct ips_flow *flow, msg.msg_iov[0].iov_len = sizeof(*header); if (payload_len) { PAYLOAD_IOV(msg.msg_iov[1], payload, payload_len, payload_len -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , ep->sockets_ep.sbuf, is_gpu_payload #endif ); @@ -383,7 +383,7 @@ psm3_sockets_tcp_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *f struct ips_scb *scb, uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ) @@ -477,7 +477,7 @@ psm3_sockets_tcp_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *f _HFI_VDBG("Send DISCONN msg opcode=%x via aux_socket\n", opcode); flow->send_remaining = 0; return psm3_sockets_tcp_aux_send(ep, flow, ips_lrh, payload, length -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , is_gpu_payload #endif ); @@ -490,7 +490,7 @@ psm3_sockets_tcp_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *f len = flow->send_remaining ? flow->send_remaining : sizeof(*ips_lrh) + length; msg.msg_iovlen = 0; MSG_IOV(msg, ips_lrh, payload, length, len -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , ep->sockets_ep.sbuf, is_gpu_payload #endif ); @@ -552,7 +552,7 @@ psm3_sockets_tcp_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *f psm3_sockaddr_fmt((struct sockaddr *)&flow->ipsaddr->sockets.remote_pri_addr, 0), length); _HFI_PDBG_DUMP_ALWAYS((uint8_t*)ips_lrh, sizeof(*ips_lrh)); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (is_gpu_payload) { PSM3_GPU_MEMCPY_DTOH(ep->sockets_ep.sbuf, payload, length); @@ -566,7 +566,7 @@ psm3_sockets_tcp_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *f } if_pf (opcode == OPCODE_DISCONNECT_REPLY) { return psm3_sockets_tcp_aux_send(ep, flow, ips_lrh, payload, length -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , is_gpu_payload #endif ); @@ -707,7 +707,7 @@ psm3_sockets_tcp_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *f // (UDP), it will fill receiver buffer from beginning flow->send_remaining = 0; ret = psm3_sockets_tcp_aux_send(ep, flow, ips_lrh, payload, length -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , is_gpu_payload #endif ); @@ -739,7 +739,7 @@ psm3_sockets_tcp_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *f _HFI_VDBG("Invalid tcp_fd on %s! 
Try to use aux socket.\n", ep->dev_name); flow->send_remaining = 0; ret = psm3_sockets_tcp_aux_send(ep, flow, ips_lrh, payload, length -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , is_gpu_payload #endif ); @@ -751,9 +751,9 @@ psm3_sockets_tcp_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *f } #ifndef PSM_TCP_ACK - // return PSM2_OK for ctrl msg and PSM2_TCP_DATA_SENT for data msg + // return PSM2_OK for ctrl msg and PSM2_RELIABLE_DATA_SENT for data msg if (ret == PSM2_OK && !isCtrlMsg) { - return PSM2_TCP_DATA_SENT; + return PSM2_RELIABLE_DATA_SENT; } #endif return ret; @@ -774,7 +774,7 @@ psm3_sockets_tcp_spio_transfer_frames(struct ips_proto *proto, struct ips_flow * struct msghdr msg = ep->sockets_ep.snd_msg; msg.msg_iovlen = 0; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // this is used for GPU support. It maintains the position in sbuf // to which we copy data from device uint8_t *buf = ep->sockets_ep.sbuf; @@ -796,7 +796,7 @@ psm3_sockets_tcp_spio_transfer_frames(struct ips_proto *proto, struct ips_flow * ret = psm3_sockets_tcp_spio_transfer_frame(proto, flow, scb, ips_scb_buffer(scb), scb->payload_size, PSMI_TRUE, scb->ips_lrh.flags & IPS_SEND_FLAG_PKTCKSUM, scb->cksum[0] -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , IS_TRANSFER_BUF_GPU_MEM(scb) #endif ); @@ -839,7 +839,7 @@ psm3_sockets_tcp_spio_transfer_frames(struct ips_proto *proto, struct ips_flow * len = sizeof(*ips_lrh) + scb->payload_size; } MSG_IOV(msg, ips_lrh, ips_scb_buffer(scb), scb->payload_size, len -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , buf, IS_TRANSFER_BUF_GPU_MEM(scb) #endif ); @@ -864,7 +864,7 @@ psm3_sockets_tcp_spio_transfer_frames(struct ips_proto *proto, struct ips_flow * if (likely(scb->payload_size > 0)) { PAYLOAD_IOV(iovs[msg.msg_iovlen], ips_scb_buffer(scb), scb->payload_size, scb->payload_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , buf, IS_TRANSFER_BUF_GPU_MEM(scb) #endif ); @@ -983,7 +983,7 @@ psm3_sockets_udp_gso_send(int fd, struct ips_proto *proto, psm3_sockaddr_in_t *addr, struct ips_scb *scb, uint8_t *payload, uint32_t length, uint32_t frag_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ) @@ -1027,7 +1027,7 @@ psm3_sockets_udp_gso_send(int fd, struct ips_proto *proto, len + sizeof(*ips_lrh) + HFI_CRC_SIZE_IN_BYTES); _HFI_VDBG("copy payload %p %u\n", payload, len); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (is_gpu_payload) { PSM3_GPU_MEMCPY_DTOH(sbuf_gso + sizeof(*ips_lrh), payload, len); @@ -1099,7 +1099,7 @@ psm3_sockets_udp_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *f struct ips_scb *scb, uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ) @@ -1143,7 +1143,7 @@ psm3_sockets_udp_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *f &flow->ipsaddr->sockets.remote_pri_addr, scb, (uint8_t*)payload, scb->chunk_size_remaining, scb->frag_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU ,is_gpu_payload #endif )) { @@ -1159,7 +1159,7 @@ psm3_sockets_udp_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *f memcpy(sbuf, ips_lrh, sizeof(*ips_lrh)); // copy payload to send buffer, length could be zero, be safe _HFI_VDBG("copy payload %p %u\n", payload, length); -#if 
defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (is_gpu_payload) { PSM3_GPU_MEMCPY_DTOH(sbuf + sizeof(*ips_lrh), payload, length); diff --git a/prov/psm3/psm3/hal_verbs/verbs_ep.c b/prov/psm3/psm3/hal_verbs/verbs_ep.c index f4e30d6c5e9..9a070f797d3 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_ep.c +++ b/prov/psm3/psm3/hal_verbs/verbs_ep.c @@ -231,19 +231,38 @@ psm3_ep_open_verbs(psm2_ep_t ep, int unit, int port, int addr_index, psm2_uuid_t // HFI_TF_NFLOWS (32) limits receiver side concurrent tidflows (aka inbound // RDMA w/immed). // For USER RC Eager without SRQ we can have num_recv_wqes/FRACTION per - // QP in which case theoretical need could be huge. We add 4000 as a - // swag to cover most cases and user can always tune higher as needed + // QP, and we calculate the total size based on the total QPs required. + // The CQ size for the UD QP is covered by hfi_num_recv_wqes. // For USER RC Eager with SRQ worse case is num_recv_wqes so we // add that to allow up to num_recv_wqes on UD QP and SRQ each and keep // the HFI_TF_NFLOWS+1000 as headroom. if (! ep->verbs_ep.hfi_num_recv_cqes) { ep->verbs_ep.hfi_num_recv_cqes = ep->verbs_ep.hfi_num_recv_wqes+HFI_TF_NFLOWS+1000; +#ifdef USE_RC if ((ep->rdmamode&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) { - if (ep->verbs_ep.srq) + if (ep->verbs_ep.srq) { ep->verbs_ep.hfi_num_recv_cqes += ep->verbs_ep.hfi_num_recv_wqes; - else - ep->verbs_ep.hfi_num_recv_cqes += 4000; + } else { + int tot_cnt = psm3_get_myrank_count(); + int loc_cnt = psm3_get_mylocalrank_count(); + uint32_t rem_cnt; + uint32_t cqes_per_qp; + + /* + * Check to see if MPI is used. If yes, we will calculate the total + * number of RC QPs. Otherwise, we use a arbitrary large number to + * accomodate up to 128 remote connections + */ + if (tot_cnt > 0 && loc_cnt > 0) + rem_cnt = (uint32_t)(tot_cnt - loc_cnt); + else + rem_cnt = 128; + + cqes_per_qp = ep->verbs_ep.hfi_num_recv_wqes / VERBS_RECV_QP_FRACTION; + ep->verbs_ep.hfi_num_recv_cqes += rem_cnt * cqes_per_qp; + } } +#endif } ep->verbs_ep.recv_cq = ibv_create_cq(ep->verbs_ep.context, ep->verbs_ep.hfi_num_recv_cqes, @@ -354,7 +373,7 @@ psm3_verbs_parse_params(psm2_ep_t ep) "Number of recv CQEs to allocate\n" "(0 will calculate as PSM3_NUM_RECV_WQES+1032 for PSM3_RDMA=0-2\n" "for PSM3_RDMA=3 with SRQ, allow an additional PSM3_NUM_RECV_WQES\n" - "for PSM3_RDMA=3 without SRQ, allow an additional 4000) [0]", + "for PSM3_RDMA=3 without SRQ, calculate based on total QPs) [0]", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)0, &envvar_val); @@ -408,7 +427,7 @@ psm3_verbs_parse_params(psm2_ep_t ep) // (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) // * psm3_mq_max_window_rv(mq, 0) // and automatically increase with warning if not? -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU ep->rv_gpu_cache_size = psmi_parse_gpudirect_rv_gpu_cache_size(0); // TBD - we could check gpu_cache_size >= minimum based on: // (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) @@ -458,7 +477,7 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) // PSM3_* env for SDMA are parsed later in psm3_ips_proto_init. 
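Editor's note: earlier in this verbs_ep.c hunk, the recv-CQ sizing replaces the old flat 4000-entry headroom with an estimate based on the number of remote RC QPs. A worked sketch of that arithmetic with illustrative values (4096 recv WQEs, 64 ranks of which 8 are local, and VERBS_RECV_QP_FRACTION assumed to be 4):

#include <stdint.h>

/* Illustrative numbers only; the formula mirrors the calculation in the
 * hunk above for PSM3_RDMA=3 without SRQ. */
static uint32_t recv_cqes_example(void)
{
	uint32_t num_recv_wqes = 4096;
	uint32_t base = num_recv_wqes + 32 /* HFI_TF_NFLOWS */ + 1000;    /* 5128 */
	uint32_t rem_cnt = 64 - 8;                /* total ranks minus local ranks */
	uint32_t cqes_per_qp = num_recv_wqes / 4; /* VERBS_RECV_QP_FRACTION, assumed 4 */

	return base + rem_cnt * cqes_per_qp;      /* 5128 + 56 * 1024 = 62472 */
}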
proto->iovec_thresh_eager = 8192; proto->iovec_thresh_eager_blocking = 8192; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU proto->iovec_gpu_thresh_eager = 128; proto->iovec_gpu_thresh_eager_blocking = 128; #endif @@ -469,37 +488,64 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) // at this point ep->mtu is our HW capability found during open // and adjusted to allow for PSM headers so ep->mtu reflects maximum // PSM payload (not yet adjusted for optional cksum_sz) - /* See if user specifies a lower MTU to use */ - if (!psm3_getenv("PSM3_MTU", - "Upper bound on packet MTU (<=0 uses port MTU): 1-5,256,512,1024,2048,4096]", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)-1, &env_mtu)) { + char help[128]; + + if ((ep->rdmamode & IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) { + snprintf(help, sizeof(help), "Upper bound on PSM3 payload (<=0 uses port MTU): 1-7, 1024-PSM3_MQ_RNDV_NIC_THRESH(%u)", ep->mq->rndv_nic_thresh); + } else { + snprintf(help, sizeof(help), "Upper bound on packet MTU (<=0 uses port MTU): 1-5,256,512,1024,2048,4096,8192"); + } + /* See if user specifies a MTU to use */ + if (!psm3_getenv("PSM3_MTU", help, + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)-1, &env_mtu)) { + uint32_t mtu; // in bytes // use OPA_MTU_MAX so we don't round down to min MTU when // OPA enum values mistakenly used here. - if (env_mtu.e_int >= IBTA_MTU_MIN && env_mtu.e_int <= OPA_MTU_MAX) //enum - env_mtu.e_int = opa_mtu_enum_to_int((enum opa_mtu)env_mtu.e_int); - else if (env_mtu.e_int < IBTA_MTU_MIN) // pick default - env_mtu.e_int = 8192; // default high, will use wire MTU - else // wash through enum to force round up to next valid MTU - env_mtu.e_int = opa_mtu_enum_to_int(opa_mtu_int_to_enum(env_mtu.e_int)); + if (env_mtu.e_int >= IBTA_MTU_MIN && env_mtu.e_int <= OPA_MTU_MAX) { //enum + mtu = opa_mtu_enum_to_int((enum opa_mtu)env_mtu.e_int); + } else if (env_mtu.e_int < IBTA_MTU_MIN) { // pick default + mtu = ep->mtu + MAX_PSM_HEADER; // use wire MTU + } else if ((ep->rdmamode & IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) { // use as local PSM3 MTU + // Under RDMA3 mode, UD is used for ctr msg only that shall be smaller than + // wire MTU. It's safe to increase PSM3 MTU beyond wire MTU because RC will be + // used, and the NIC driver will segment a msg into multiple packets to ensure + // each pkt size is within wire MTU. 
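Editor's note: the code that follows applies PSM3_MTU for RDMA mode 3 exactly as the comment above describes. As a compact, self-contained restatement of those clamping rules (clamp_psm3_mtu and the example numbers are illustrative; ROUNDDOWNP2 is expressed as a mask, and the requested value is assumed positive):

#include <stdint.h>

/* Sketch: clamp the requested PSM3_MTU into [IBTA min MTU, rendezvous
 * threshold], round down to a 64-byte multiple, then reserve room for
 * the PSM header.  Example: PSM3_MTU=60000 with a 64 KB rendezvous
 * threshold rounds to 59968, so the eager payload is 59968 minus
 * MAX_PSM_HEADER. */
static uint32_t clamp_psm3_mtu(uint32_t requested, uint32_t rndv_thresh,
                               uint32_t ibta_min_mtu, uint32_t max_psm_header)
{
	uint32_t mtu = requested;

	if (mtu > rndv_thresh)
		mtu = rndv_thresh;          /* PSM3_MTU only applies to eager messages */
	if (mtu < ibta_min_mtu)
		mtu = ibta_min_mtu;
	mtu &= ~63u;                        /* round down to a multiple of 64 */
	return mtu - max_psm_header;        /* usable PSM payload */
}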
+ + mtu = env_mtu.e_int; + // only apply PSM3_MTU on eager messages + if (env_mtu.e_int > ep->mq->rndv_nic_thresh) + mtu = ep->mq->rndv_nic_thresh; + if (env_mtu.e_int < opa_mtu_enum_to_int(IBTA_MTU_MIN)) + mtu = opa_mtu_enum_to_int(IBTA_MTU_MIN); + // round down to nearest multiple of 64 + mtu = ROUNDDOWNP2(mtu, 64); + proto->epinfo.ep_mtu = mtu - MAX_PSM_HEADER; + } else { // walk through enum to force round up to next valid MTU + mtu = opa_mtu_enum_to_int(opa_mtu_int_to_enum(env_mtu.e_int)); + } + // only allow MTU decrease // PSM3_MTU specified ends up being used as max verbs payload // so decrease by PSM HEADER size (and cksum below) - if (ep->mtu > env_mtu.e_int - MAX_PSM_HEADER) - ep->mtu = env_mtu.e_int - MAX_PSM_HEADER; + if (ep->mtu > mtu - MAX_PSM_HEADER) + ep->mtu = mtu - MAX_PSM_HEADER; } + /* allow space for optional software managed checksum (for debug) */ ep->mtu -= cksum_sz; - // ep->mtu is our final choice of local PSM payload we can support - proto->epinfo.ep_mtu = ep->mtu; + // if proto->epinfo.ep_mtu is not set, use ep->mtu as our final choice + // of local PSM payload we can support + if (!proto->epinfo.ep_mtu) + proto->epinfo.ep_mtu = ep->mtu; if (PSM2_OK != psm_verbs_alloc_send_pool(ep, ep->verbs_ep.pd, &ep->verbs_ep.send_pool, // save 1 send WQE just to be paranoid (should be unnecessary) min(ep->verbs_ep.hfi_num_send_wqes, ep->verbs_ep.qp_cap.max_send_wr-1), // want to end up with multiple of cache line (64) - // ep->mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU + // proto->epinfo.ep_mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU // be conservative (+BUFFER_HEADROOM) - ep->mtu + MAX_PSM_HEADER + BUFFER_HEADROOM + proto->epinfo.ep_mtu + MAX_PSM_HEADER + BUFFER_HEADROOM )) { _HFI_ERROR( "Unable to allocate UD send buffer pool\n"); goto fail; @@ -516,9 +562,9 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) if (PSM2_OK != psm_verbs_alloc_recv_pool(ep, 0, ep->verbs_ep.qp, &ep->verbs_ep.recv_pool, min(ep->verbs_ep.hfi_num_recv_wqes, ep->verbs_ep.qp_cap.max_recv_wr), // want to end up with multiple of cache line (64) - // ep->mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU + // proto->epinfo.ep_mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU // be conservative (+BUFFER_HEADROOM) - ep->mtu + MAX_PSM_HEADER + BUFFER_HEADROOM + proto->epinfo.ep_mtu + MAX_PSM_HEADER + BUFFER_HEADROOM )) { _HFI_ERROR( "Unable to allocate UD recv buffer pool\n"); goto fail; @@ -529,9 +575,9 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) ep->verbs_ep.hfi_num_recv_wqes, (proto->ep->rdmamode == IPS_PROTOEXP_FLAG_RDMA_USER)? 
0 // want to end up with multiple of cache line (64) - // ep->mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU + // proto->epinfo.ep_mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU // be conservative (+BUFFER_HEADROOM) - : (ep->mtu + MAX_PSM_HEADER + BUFFER_HEADROOM) + : (proto->epinfo.ep_mtu + MAX_PSM_HEADER + BUFFER_HEADROOM) )) { _HFI_ERROR( "Unable to allocate SRQ recv buffer pool\n"); goto fail; @@ -545,10 +591,10 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) // no send segmentation, max_segs will constrain ep->chunk_max_segs = 1; - ep->chunk_max_size = ep->mtu; + ep->chunk_max_size = proto->epinfo.ep_mtu; #ifdef PSM_BYTE_FLOW_CREDITS // let flow_credits be the control - proto->flow_credit_bytes = ep->mtu * proto->max_credits; + proto->flow_credit_bytes = proto->epinfo.ep_mtu * proto->max_credits; _HFI_DBG("initial flow_credits %d bytes %d\n", proto->flow_credits, proto->flow_credit_bytes); #else @@ -874,25 +920,9 @@ psm2_error_t psm_verbs_alloc_send_pool(psm2_ep_t ep, struct ibv_pd *pd, _HFI_ERROR( "can't alloc send buffers"); goto fail; } -#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) - // By registering memory with Cuda, we make - // cuMemcpy run faster for copies from - // GPU to the send buffer. - if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt()) - PSMI_CUDA_CALL(cuMemHostRegister, - pool->send_buffers, - pool->send_total*pool->send_buffer_size, - CU_MEMHOSTALLOC_PORTABLE); -#endif -#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - // By registering memory with Level Zero, we make - // zeCommandListAppendMemoryCopy run faster for copies from - // GPU to the send buffer. - if (PSMI_IS_GPU_ENABLED) - PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, - ze_driver, pool->send_buffers, + // This can allows faster copies from GPU to the send buffer + PSM3_GPU_REGISTER_HOSTMEM( pool->send_buffers, pool->send_total*pool->send_buffer_size); -#endif _HFI_PRDBG("send pool: buffers: %p size %u\n", pool->send_buffers, pool->send_buffer_size); pool->send_bufs = (struct verbs_sbuf *)psmi_calloc(ep, NETWORK_BUFFERS, @@ -993,25 +1023,9 @@ psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, uint32_t for_srq, _HFI_ERROR( "can't alloc recv buffers"); goto fail; } -#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) - // By registering memory with Cuda, we make - // cuMemcpy run faster for copies from - // recv buffer to GPU - if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt()) - PSMI_CUDA_CALL(cuMemHostRegister, - pool->recv_buffers, - pool->recv_total*pool->recv_buffer_size, - CU_MEMHOSTALLOC_PORTABLE); -#endif -#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - // By registering memory with Level Zero, we make - // zeCommandListAppendMemoryCopy run faster for copies from - // recv buffer to GPU - if (PSMI_IS_GPU_ENABLED) - PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, - ze_driver, pool->recv_buffers, - pool->recv_total*pool->recv_buffer_size); -#endif + // This can allow faster copies from recv buffer to GPU + PSM3_GPU_REGISTER_HOSTMEM(pool->recv_buffers, + pool->recv_total*pool->recv_buffer_size); //printf("recv pool: buffers: %p size %u\n", pool->recv_buffers, pool->recv_buffer_size); #ifdef USE_RC pool->recv_bufs = (struct verbs_rbuf *)psmi_calloc(ep, NETWORK_BUFFERS, @@ -1104,38 +1118,7 @@ void psm_verbs_free_send_pool(psm3_verbs_send_pool_t pool) pool->send_bufs = NULL; } if (pool->send_buffers) { -#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) - if (PSMI_IS_GPU_ENABLED && cu_ctxt) { - /* ignore NOT_REGISTERED 
in case cuda initialized late */ - /* ignore other errors as context could be destroyed before this */ - CUresult cudaerr; - //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, - // cuMemHostUnregister, pool->send_buffers); - psmi_count_cuMemHostUnregister++; - cudaerr = psmi_cuMemHostUnregister(pool->send_buffers); - if (cudaerr) { - const char *pStr = NULL; - psmi_count_cuGetErrorString++; - psmi_cuGetErrorString(cudaerr, &pStr); - _HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n", - cudaerr, pStr?pStr:"Unknown"); - } - - } -#endif -#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - if (PSMI_IS_GPU_ENABLED) { - ze_result_t result; - //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, - // ze_driver, pool->send_buffers); - psmi_count_zexDriverReleaseImportedPointer++; - result = psmi_zexDriverReleaseImportedPointer(ze_driver, - pool->send_buffers); - if (result != ZE_RESULT_SUCCESS) { - _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); - } - } -#endif + PSM3_GPU_UNREGISTER_HOSTMEM(pool->send_buffers); psmi_free(pool->send_buffers); pool->send_buffers = NULL; } @@ -1156,37 +1139,7 @@ void psm_verbs_free_recv_pool(psm3_verbs_recv_pool_t pool) } #endif if (pool->recv_buffers) { -#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) - if (PSMI_IS_GPU_ENABLED && cu_ctxt) { - /* ignore NOT_REGISTERED in case cuda initialized late */ - /* ignore other errors as context could be destroyed before this */ - CUresult cudaerr; - //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, - // cuMemHostUnregister, pool->recv_buffers); - psmi_count_cuMemHostUnregister++; - cudaerr = psmi_cuMemHostUnregister(pool->recv_buffers); - if (cudaerr) { - const char *pStr = NULL; - psmi_count_cuGetErrorString++; - psmi_cuGetErrorString(cudaerr, &pStr); - _HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n", - cudaerr, pStr?pStr:"Unknown"); - } - } -#endif -#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - if (PSMI_IS_GPU_ENABLED) { - ze_result_t result; - //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, - // ze_driver, pool->recv_buffers); - psmi_count_zexDriverReleaseImportedPointer++; - result = psmi_zexDriverReleaseImportedPointer(ze_driver, - pool->recv_buffers); - if (result != ZE_RESULT_SUCCESS) { - _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); - } - } -#endif + PSM3_GPU_UNREGISTER_HOSTMEM(pool->recv_buffers); psmi_free(pool->recv_buffers); pool->recv_buffers = NULL; } @@ -1667,7 +1620,7 @@ extern int ips_protoexp_rdma_write_completion( uint64_t wr_id); psm2_error_t psm3_verbs_completion_update(psm2_ep_t ep, int drain) { - #define CQE_BATCH 10 // reap a few at a time, hopefully faster this way + #define CQE_BATCH 32 // reap a few at a time, hopefully faster this way //#define CQE_BATCH 8 or 18 // reap a few at a time, hopefully faster this way // 18*COALLESE > default reap threshold so we // should get away with one poll_q @@ -1677,6 +1630,9 @@ psm3_verbs_completion_update(psm2_ep_t ep, int drain) // alloca(sizeof(ibv_wc) & batch) struct ibv_wc wc[CQE_BATCH]; int ne; +#ifdef USE_RC + struct ips_epaddr *ipsaddr; +#endif PSMI_LOCK_ASSERT(ep->mq->progress_lock); // TBD - when coallescing completions we'll tend to fall through to poll_cq @@ -1738,6 +1694,12 @@ psm3_verbs_completion_update(psm2_ep_t ep, int drain) ips_protoexp_rdma_write_completion( wc[i].wr_id & 
~VERBS_SQ_WR_ID_MASK); break; + case IBV_WC_RDMA_READ: + ipsaddr = (struct ips_epaddr *)wc[i].wr_id; + + ipsaddr->verbs.remote_seq_outstanding = 0; + _HFI_VDBG("Got remote_recv_psn=%d\n", ipsaddr->verbs.remote_recv_psn); + break; #endif default: _HFI_ERROR("unexpected send completion on %s port %u opcode %d QP %u\n", @@ -2197,15 +2159,15 @@ static psm2_error_t open_rv(psm2_ep_t ep, psm2_uuid_t const job_key) // we always fill in everything we might need in loc_info // in some modes, some of the fields are not used by RV loc_info.mr_cache_size = ep->rv_mr_cache_size; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* gpu_cache_size ignored unless RV_RDMA_MODE_GPU */ loc_info.gpu_cache_size = ep->rv_gpu_cache_size; #endif loc_info.rdma_mode = IPS_PROTOEXP_FLAG_KERNEL_QP(ep->rdmamode)? RV_RDMA_MODE_KERNEL: RV_RDMA_MODE_USER; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (PSMI_IS_GPU_ENABLED) { - // when Cuda is enabled we will have larger window_sz and +#ifdef PSM_HAVE_GPU + if (PSM3_GPU_IS_ENABLED) { + // when GPU is enabled we will have larger window_sz and // need to upsize the caches we will use for priority MRs if (ep->rdmamode & IPS_PROTOEXP_FLAG_ENABLED) { // priority window_sz reg_mr for CPU @@ -2214,9 +2176,9 @@ static psm2_error_t open_rv(psm2_ep_t ep, psm2_uuid_t const job_key) if (psmi_parse_gpudirect()) { // When GPU Direct is enabled we need a GPU Cache loc_info.rdma_mode |= RV_RDMA_MODE_GPU; -#ifdef PSM_ONEAPI - psm3_oneapi_ze_can_use_zemem(); -#endif + + PSM3_GPU_USING_RV_FOR_MRS(); + if ((ep->rdmamode & IPS_PROTOEXP_FLAG_ENABLED) && (psmi_parse_gpudirect_rdma_send_limit(1) || psmi_parse_gpudirect_rdma_recv_limit(1))) { @@ -2267,7 +2229,7 @@ static psm2_error_t open_rv(psm2_ep_t ep, psm2_uuid_t const job_key) } // parallel hal_gen1/gen1_hal_inline_i.h handling HFI1_CAP_GPUDIRECT_OT #ifndef RV_CAP_GPU_DIRECT -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #error "Inconsistent build. RV_CAP_GPU_DIRECT must be defined for GPU builds. 
Must use GPU enabled rv headers" #else // lifted from rv_user_ioctls.h @@ -2281,15 +2243,12 @@ static psm2_error_t open_rv(psm2_ep_t ep, psm2_uuid_t const job_key) psmi_hal_add_cap(PSM_HAL_CAP_GPUDIRECT_SDMA); psmi_hal_add_cap(PSM_HAL_CAP_GPUDIRECT_RDMA); } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (loc_info.capability & RV_CAP_NVIDIA_GPU) - psmi_hal_add_cap(PSM_HAL_CAP_NVIDIA_GPU); - if (loc_info.capability & RV_CAP_INTEL_GPU) - psmi_hal_add_cap(PSM_HAL_CAP_INTEL_GPU); +#ifdef PSM_HAVE_GPU + PSM3_GPU_RV_SET_HAL_CAP(loc_info.capability); #endif ep->verbs_ep.rv_index = loc_info.rv_index; ep->rv_mr_cache_size = loc_info.mr_cache_size; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU ep->rv_gpu_cache_size = loc_info.gpu_cache_size; #endif ep->verbs_ep.rv_q_depth = loc_info.q_depth; @@ -2442,6 +2401,25 @@ static psm2_error_t verbs_open_dev(psm2_ep_t ep, int unit, int port, int addr_in psm3_gid128_fmt(ep->gid, 2)); } +#if defined(USE_RDMA_READ) +#if defined(USE_RC) + { + struct ibv_device_attr dev_attr; + // get RDMA capabilities of device + if (ibv_query_device(ep->verbs_ep.context, &dev_attr)) { + _HFI_ERROR("Unable to query device %s: %s\n", ep->dev_name, + strerror(errno)); + err = PSM2_INTERNAL_ERR; + goto fail; + } + ep->verbs_ep.max_qp_rd_atom = dev_attr.max_qp_rd_atom; + ep->verbs_ep.max_qp_init_rd_atom = dev_attr.max_qp_init_rd_atom; + _HFI_PRDBG("got device attr: rd_atom %u init_rd_atom %u\n", + dev_attr.max_qp_rd_atom, dev_attr.max_qp_init_rd_atom); + // TBD could have an env variable to reduce requested values + } +#endif // USE_RC +#endif #ifdef RNDV_MOD if (IPS_PROTOEXP_FLAG_KERNEL_QP(ep->rdmamode) || ep->mr_cache_mode == MR_CACHE_MODE_KERNEL ) { @@ -2774,6 +2752,9 @@ psm2_error_t modify_rc_qp_to_init(psm2_ep_t ep, struct ibv_qp *qp) //attr.qkey = ep->verbs_ep.qkey; //flags |= IBV_QP_QKEY; // only allowed for UD attr.qp_access_flags = 0; +#ifdef USE_RDMA_READ + attr.qp_access_flags |= IBV_ACCESS_REMOTE_READ; +#endif attr.qp_access_flags |= IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE; //attr.qp_access_flags |= IBV_ACCESS_REMOTE_ATOMIC; flags |= IBV_QP_ACCESS_FLAGS; @@ -2804,11 +2785,15 @@ psm2_error_t modify_rc_qp_to_rtr(psm2_ep_t ep, struct ibv_qp *qp, // TBD - we already factored in req vs pr to update pr no need // for modify_cq_qp_to_rtr to repeat it // pr_mtu is max PSM payload in bytes and req_attr_mtu is IB enum - attr.path_mtu = MIN(ibv_mtu_int_to_enum(path_rec->pr_mtu), req_attr->mtu); + attr.path_mtu = MIN(ibv_mtu_int_to_enum(ep->mtu), req_attr->mtu); attr.dest_qp_num = req_attr->qpn; attr.rq_psn = initpsn; flags |= (IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN); +#ifdef USE_RDMA_READ + attr.max_dest_rd_atomic = min(ep->verbs_ep.max_qp_rd_atom, + req_attr->initiator_depth); +#endif _HFI_PRDBG("set max_dest_rd_atomic to %u\n", attr.max_dest_rd_atomic); attr.min_rnr_timer = 12; // TBD well known flags |= (IBV_QP_MIN_RNR_TIMER | IBV_QP_MAX_DEST_RD_ATOMIC); @@ -2818,7 +2803,7 @@ psm2_error_t modify_rc_qp_to_rtr(psm2_ep_t ep, struct ibv_qp *qp, ep->dev_name, strerror(errno)); return PSM2_INTERNAL_ERR; } - _HFI_PRDBG("moved %d to RTR\n", qp->qp_num); + _HFI_PRDBG("moved %d to RTR with MTU=%d\n", qp->qp_num, attr.path_mtu); return PSM2_OK; } @@ -2836,6 +2821,10 @@ psm2_error_t modify_rc_qp_to_rts(psm2_ep_t ep, struct ibv_qp *qp, attr.sq_psn = initpsn; // value we told other side flags |= IBV_QP_SQ_PSN; +#ifdef USE_RDMA_READ + attr.max_rd_atomic = min(ep->verbs_ep.max_qp_init_rd_atom, + req_attr->responder_resources); +#endif _HFI_PRDBG("set 
max_rd_atomic to %u\n", attr.max_rd_atomic); flags |= IBV_QP_MAX_QP_RD_ATOMIC; @@ -2886,9 +2875,9 @@ unsigned psm3_verbs_parse_rdmamode(int reload) if (psm3_rv_available()) { default_value = IPS_PROTOEXP_FLAG_RDMA_KERNEL; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // GPUDIRECT causes default_value of RDMA=1 - if (PSMI_IS_GPU_ENABLED && psmi_parse_gpudirect()) + if (PSM3_GPU_IS_ENABLED && psmi_parse_gpudirect()) default_value = IPS_PROTOEXP_FLAG_RDMA_KERNEL; #endif #endif @@ -2950,8 +2939,8 @@ unsigned psm3_verbs_parse_mr_cache_mode(unsigned rdmamode, int reload) // PSM_HAL_CAP_GPUDIRECT_* flags not known until after HAL device open, // so we test SDMA and RDMA here as prereqs for GPUDIRECT_SDMA and RDMA. if (! (rdmamode & IPS_PROTOEXP_FLAG_ENABLED) -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - && (PSMI_IS_GPU_DISABLED || ! psmi_parse_gpudirect() +#ifdef PSM_HAVE_GPU + && (! PSM3_GPU_IS_ENABLED || ! psmi_parse_gpudirect() //verbs always has these HAL capabilities set //|| (!psmi_hal_has_cap(PSM_HAL_CAP_SDMA) // && !psmi_hal_has_cap(PSM_HAL_CAP_RDMA))) @@ -2962,9 +2951,9 @@ unsigned psm3_verbs_parse_mr_cache_mode(unsigned rdmamode, int reload) } else if (IPS_PROTOEXP_FLAG_KERNEL_QP(rdmamode)) { // RDMA enabled in kernel mode. Must use rv MR cache envval.e_uint = MR_CACHE_MODE_RV; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #ifdef PSM_HAVE_RNDV_MOD - } else if (PSMI_IS_GPU_ENABLED && psmi_parse_gpudirect()) { + } else if (PSM3_GPU_IS_ENABLED && psmi_parse_gpudirect()) { // GPU Direct (RDMA, send DMA and/or gdrcopy) must // use kernel MR cache in RV envval.e_uint = MR_CACHE_MODE_KERNEL; diff --git a/prov/psm3/psm3/hal_verbs/verbs_ep.h b/prov/psm3/psm3/hal_verbs/verbs_ep.h index c1da6b73e53..e85e5776f36 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_ep.h +++ b/prov/psm3/psm3/hal_verbs/verbs_ep.h @@ -107,7 +107,7 @@ // if 1, post as we recv them #define VERBS_SEND_CQ_REAP 256 // check for completions when this many unreaped #define VERBS_PORT 1 // default port if not specified -#define VERBS_RECV_CQE_BATCH 1 // how many CQEs to ask for at a time +#define VERBS_RECV_CQE_BATCH 32 // how many CQEs to ask for at a time #define UD_ADDITION (40) // extra bytes at start of UD recv buffer // defined in verbs API to accomidate IB GRH #define BUFFER_HEADROOM 0 // how much extra to allocate in buffers @@ -310,19 +310,25 @@ struct psm3_verbs_ep { uint32_t qkey; //uint8_t link_layer; // IBV_LINK_LAYER_ETHERNET or other uint8_t active_rate; +#if defined(USE_RDMA_READ) +#if defined(USE_RC) + uint8_t max_qp_rd_atom; + uint8_t max_qp_init_rd_atom; +#endif // USE_RC +#endif struct psm3_verbs_send_pool send_pool; struct psm3_verbs_send_allocator send_allocator; uint32_t send_rdma_outstanding; // number of outstanding RDMAs uint32_t send_reap_thresh; // TBD if should be here or in pool struct psm3_verbs_recv_pool recv_pool; +#ifdef USE_RC + struct psm3_verbs_recv_pool srq_recv_pool; +#endif #if VERBS_RECV_CQE_BATCH > 1 struct ibv_wc recv_wc_list[VERBS_RECV_CQE_BATCH]; int recv_wc_count; // number left in recv_wc_list int recv_wc_next; // next index #else -#ifdef USE_RC - struct psm3_verbs_recv_pool srq_recv_pool; -#endif // if asked to revisit a packet we save it here rbuf_t revisit_buf; uint32_t revisit_payload_size; diff --git a/prov/psm3/psm3/hal_verbs/verbs_gdrcpy.c b/prov/psm3/psm3/hal_verbs/verbs_gdrcpy.c index ab0942e5497..38a8dfce702 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_gdrcpy.c +++ b/prov/psm3/psm3/hal_verbs/verbs_gdrcpy.c @@ -51,8 +51,9 @@ OF THIS 
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) #include "psm_user.h" + +#ifdef PSM_HAVE_GPU #include "psm2_hal.h" #include #include @@ -66,10 +67,11 @@ psm3_verbs_gdr_convert_gpu_to_host_addr(unsigned long buf, size_t size, int flags, psm2_ep_t ep) { +#ifdef RNDV_MOD void *host_addr_buf; uintptr_t pageaddr; uint64_t pagelen; -#ifdef RNDV_MOD + // when PSM3_MR_ACCESS is enabled, we use the same access flags for // gdrcopy as we use for user space GPU MRs. This can improve MR cache // hit rate. Note the actual mmap is always for CPU read/write access. @@ -79,19 +81,10 @@ psm3_verbs_gdr_convert_gpu_to_host_addr(unsigned long buf, // both tend to be smaller buffers, this may provide a better hit rate. int access = IBV_ACCESS_IS_GPU_ADDR |(ep->mr_access?IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE:0); -#endif -#ifdef PSM_ONEAPI - PSMI_ONEAPI_ZE_CALL(zeMemGetAddressRange, ze_context, - (const void *)buf, (void **)&pageaddr, &pagelen); -#else - pageaddr = buf & GPU_PAGE_MASK; - pagelen = (uint64_t) (PSMI_GPU_PAGESIZE + - ((buf + size - 1) & GPU_PAGE_MASK) - pageaddr); -#endif + PSM3_GPU_ROUNDUP_GDRCOPY(buf, size, &pageaddr, &pagelen); _HFI_VDBG("buf=%p size=%zu pageaddr=%p pagelen=%"PRIu64" flags=0x%x ep=%p\n", (void *)buf, size, (void *)pageaddr, pagelen, flags, ep); -#ifdef RNDV_MOD ep = ep->mctxt_master; host_addr_buf = psm3_rv_pin_and_mmap(ep->rv, pageaddr, pagelen, access); if_pf (! host_addr_buf) { @@ -104,16 +97,12 @@ psm3_verbs_gdr_convert_gpu_to_host_addr(unsigned long buf, return NULL; } //_HFI_ERROR("pinned buf=%p size=%zu pageaddr=%p pagelen=%u access=0x%x flags=0x%x ep=%p, @ %p\n", (void *)buf, size, (void *)pageaddr, pagelen, access, flags, ep, host_addr_buf); + return (void *)((uintptr_t)host_addr_buf + (buf - pageaddr)); #else psmi_assert_always(0); // unimplemented, should not get here - host_addr_buf = NULL; + return NULL; #endif /* RNDV_MOD */ -#ifdef PSM_ONEAPI - return (void *)((uintptr_t)host_addr_buf + (buf - pageaddr)); -#else - return (void *)((uintptr_t)host_addr_buf + (buf & GPU_PAGE_OFFSET_MASK)); -#endif } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #endif /* PSM_VERBS */ diff --git a/prov/psm3/psm3/hal_verbs/verbs_hal.c b/prov/psm3/psm3/hal_verbs/verbs_hal.c index 69d27478b48..4cc441d4402 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_hal.c +++ b/prov/psm3/psm3/hal_verbs/verbs_hal.c @@ -96,32 +96,12 @@ static const char* psm3_hfp_verbs_identify(void) static char buf[100]; #ifdef RNDV_MOD -/* we test NVIDIA_GPU_DIRECT here instead of PSM_CUDA since that define - * controls the rv module ioctl header file interface - */ -#ifdef NVIDIA_GPU_DIRECT - snprintf(buf, sizeof(buf), "HAL: %s (%s) built against rv interface v%d.%d gpu v%d.%d cuda", + snprintf(buf, sizeof(buf), "HAL: %s (%s) built against rv interface v%u.%u" PSM3_GPU_FMT_RV_GPU_VER, psmi_hal_get_hal_instance_name(), psmi_hal_get_hal_instance_description(), psm3_rv_get_user_major_bldtime_version(), - psm3_rv_get_user_minor_bldtime_version(), - psm3_rv_get_gpu_user_major_bldtime_version(), - psm3_rv_get_gpu_user_minor_bldtime_version()); -#elif defined(INTEL_GPU_DIRECT) - snprintf(buf, sizeof(buf), "HAL: %s (%s) built against rv interface v%d.%d gpu v%d.%d oneapi-ze", - psmi_hal_get_hal_instance_name(), - psmi_hal_get_hal_instance_description(), - psm3_rv_get_user_major_bldtime_version(), - psm3_rv_get_user_minor_bldtime_version(), - psm3_rv_get_gpu_user_major_bldtime_version(), - psm3_rv_get_gpu_user_minor_bldtime_version()); -#else 
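
The USE_RDMA_READ changes above size the RC QPs' RDMA Read depth from both ends: ibv_query_device() supplies the local max_qp_rd_atom / max_qp_init_rd_atom, the connect request carries them as responder_resources / initiator_depth, and the RTR/RTS transitions take the minimum of the local limit and the peer's advertised value. A minimal sketch of that negotiation, assuming the peer's values have already arrived; the peer_* parameters and the helper name are placeholders, not PSM3 symbols, and in PSM3 the results are applied as part of the full INIT->RTR and RTR->RTS ibv_modify_qp() transitions rather than standalone.

    #include <infiniband/verbs.h>

    static int example_negotiate_rd_atomic(struct ibv_context *ctx,
                                           uint8_t peer_responder_resources,
                                           uint8_t peer_initiator_depth,
                                           uint8_t *max_dest_rd_atomic, /* for RTR */
                                           uint8_t *max_rd_atomic)      /* for RTS */
    {
            struct ibv_device_attr dev_attr;

            if (ibv_query_device(ctx, &dev_attr))
                    return -1;

            /* Reads we can serve as responder: bounded by our device limit and
             * by how many the peer may have outstanding toward us. */
            *max_dest_rd_atomic = dev_attr.max_qp_rd_atom < peer_initiator_depth ?
                                  dev_attr.max_qp_rd_atom : peer_initiator_depth;

            /* Reads we may issue as initiator: bounded by our limit and by the
             * peer's advertised responder resources. */
            *max_rd_atomic = dev_attr.max_qp_init_rd_atom < peer_responder_resources ?
                             dev_attr.max_qp_init_rd_atom : peer_responder_resources;
            return 0;
    }
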
- snprintf(buf, sizeof(buf), "HAL: %s (%s) built against rv interface v%d.%d", - psmi_hal_get_hal_instance_name(), - psmi_hal_get_hal_instance_description(), - psm3_rv_get_user_major_bldtime_version(), - psm3_rv_get_user_minor_bldtime_version()); -#endif + psm3_rv_get_user_minor_bldtime_version() + PSM3_GPU_OUT_RV_GPU_VER); #else /* RNDV_MOD */ snprintf(buf, sizeof(buf), "HAL: %s (%s)", psmi_hal_get_hal_instance_name(), @@ -174,15 +154,14 @@ static void psm3_hfp_verbs_mq_init_defaults(struct psm2_mq *mq) mq->rndv_nic_thresh = (~(uint32_t)0); // disable rendezvous } mq->hfi_thresh_tiny = PSM_MQ_NIC_MAX_TINY; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (PSMI_IS_GPU_ENABLED) - mq->ips_gpu_window_rv_str = PSM_GPU_NIC_RNDV_WINDOW_STR; +#ifdef PSM_HAVE_GPU + mq->ips_gpu_window_rv_str = psm3_gpu_rndv_nic_window_default; #endif // we parse mr_cache_mode and rv_gpu_cache_size here so we can cache it // once per EP open, even if multi-rail or multi-QP (void)psm3_verbs_parse_mr_cache_mode(rdmamode, 1); #ifdef RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU (void)psmi_parse_gpudirect_rv_gpu_cache_size(1); #endif #endif @@ -196,7 +175,7 @@ static void psm3_hfp_verbs_ep_open_opts_get_defaults(struct psm3_ep_open_opts *o opts->imm_size = VERBS_SEND_MAX_INLINE; // PSM header size is 56 } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static void psm3_hfp_verbs_gdr_open(void) { } @@ -249,13 +228,7 @@ static hfp_verbs_t psm3_verbs_hi = { /* start of public psmi_hal_instance_t data */ .phi = { .hal_index = PSM_HAL_INDEX_VERBS, - .description = "RDMA Verbs" -#ifdef PSM_CUDA - " (cuda)" -#elif defined(PSM_ONEAPI) - " (oneapi-ze)" -#endif - , + .description = "RDMA Verbs" PSM3_GPU_TYPES, .nic_sys_class_path = "/sys/class/infiniband", .nic_sys_port_path_fmt = PSM3_PORT_PATH_TYPE_IB, .params = {0}, @@ -274,7 +247,7 @@ static hfp_verbs_t psm3_verbs_hi = { .hfp_mq_init_defaults = psm3_hfp_verbs_mq_init_defaults, .hfp_ep_open_opts_get_defaults = psm3_hfp_verbs_ep_open_opts_get_defaults, .hfp_context_initstats = psm3_hfp_verbs_context_initstats, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU .hfp_gdr_open = psm3_hfp_verbs_gdr_open, #endif @@ -316,10 +289,10 @@ static hfp_verbs_t psm3_verbs_hi = { .hfp_ips_ibta_init = psm3_hfp_verbs_ips_ibta_init, .hfp_ips_path_rec_init = psm3_hfp_verbs_ips_path_rec_init, .hfp_ips_ptl_pollintr = psm3_hfp_verbs_ips_ptl_pollintr, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU .hfp_gdr_close = psm3_hfp_verbs_gdr_close, .hfp_gdr_convert_gpu_to_host_addr = psm3_hfp_verbs_gdr_convert_gpu_to_host_addr, -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ .hfp_get_port_index2pkey = psm3_hfp_verbs_get_port_index2pkey, .hfp_poll_type = psm3_hfp_verbs_poll_type, .hfp_spio_transfer_frame = psm3_hfp_verbs_spio_transfer_frame, diff --git a/prov/psm3/psm3/hal_verbs/verbs_hal.h b/prov/psm3/psm3/hal_verbs/verbs_hal.h index ae18c675a28..1c5d6f75cab 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_hal.h +++ b/prov/psm3/psm3/hal_verbs/verbs_hal.h @@ -87,11 +87,11 @@ psm3_verbs_recvhdrq_init(const struct ips_epstate *epstate, psm2_error_t psm3_verbs_recvhdrq_progress(struct ips_recvhdrq *recvq); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU void* psm3_verbs_gdr_convert_gpu_to_host_addr(unsigned long buf, size_t size, int flags, psm2_ep_t ep); -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #endif /* _PSM_HAL_VERBS_HAL_H */ #endif /* PSM_VERBS */ diff --git 
a/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h b/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h index 8ef06d9ae97..7cf2a25a707 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h +++ b/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h @@ -238,12 +238,12 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_ips_ipsaddr_set_req_params( const struct ips_connect_reqrep *req) { #ifdef RNDV_MOD - ipsaddr->verbs.remote_gid = req->verbs.gid; - ipsaddr->verbs.remote_rv_index = req->verbs.rv_index; + ipsaddr->verbs.remote_gid = req->verbs.rv.gid; + ipsaddr->verbs.remote_rv_index = req->verbs.rv.rv_index; if (ipsaddr->verbs.rv_conn) { psmi_assert(IPS_PROTOEXP_FLAG_KERNEL_QP(proto->ep->rdmamode)); psmi_assert(proto->ep->rv); - if (! psm3_nonzero_gid(&req->verbs.gid)) { + if (! psm3_nonzero_gid(&req->verbs.rv.gid)) { _HFI_ERROR("mismatched PSM3_RDMA config, remote end not in mode 1\n"); return PSM2_INTERNAL_ERR; // TBD - if we wanted to allow mismatched config to run in UD mode @@ -266,7 +266,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_ips_ipsaddr_set_req_params( ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY]->verbs.pr_connecting = 1; } } - // } else if (psm3_nonzero_gid(&req->verbs.gid)) { + // } else if (psm3_nonzero_gid(&req->verbs.rv.gid)) { // We could fail here, but we just let remote end decide // _HFI_ERROR("mismatched PSM3_RDMA config, remote end in mode 1\n"); // return PSM2_INTERNAL_ERR; @@ -305,6 +305,9 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_ips_ipsaddr_set_req_params( } } + ipsaddr->verbs.remote_recv_seq_addr = req->verbs.urc.recv_addr; + ipsaddr->verbs.remote_recv_seq_rkey = req->verbs.urc.recv_rkey; + if (modify_rc_qp_to_init(proto->ep, ipsaddr->verbs.rc_qp)) { _HFI_ERROR("qp_to_init failed\n"); return PSM2_INTERNAL_ERR; @@ -383,27 +386,41 @@ static PSMI_HAL_INLINE void psm3_hfp_verbs_ips_proto_build_connect_message( // only supply gid if we want to use kernel rv if (IPS_PROTOEXP_FLAG_KERNEL_QP(proto->ep->rdmamode) && proto->ep->rv) { - req->verbs.gid = proto->ep->verbs_ep.lgid; - req->verbs.rv_index = proto->ep->verbs_ep.rv_index; + req->verbs.rv.gid = proto->ep->verbs_ep.lgid; + req->verbs.rv.rv_index = proto->ep->verbs_ep.rv_index; } else #endif { - memset(&req->verbs.gid, 0, sizeof(req->verbs.gid)); - req->verbs.rv_index = 0; + memset(&req->verbs.rv.gid, 0, sizeof(req->verbs.rv.gid)); + req->verbs.rv.rv_index = 0; } #if defined(USE_RC) if (ipsaddr->verbs.rc_qp) { psmi_assert(IPS_PROTOEXP_FLAG_USER_RC_QP(proto->ep->rdmamode)); req->initpsn = proto->runid_key;// pid, not ideal, better than const req->verbs.qp_attr.qpn = ipsaddr->verbs.rc_qp->qp_num; - req->verbs.qp_attr.mtu = opa_mtu_int_to_enum(req->mtu); + req->verbs.qp_attr.mtu = opa_mtu_int_to_enum(proto->ep->mtu); req->verbs.qp_attr.srq = 0; req->verbs.qp_attr.resv = 0; req->verbs.qp_attr.target_ack_delay = 0; // TBD; - from local device req->verbs.qp_attr.resv2 = 0; +#ifdef USE_RDMA_READ + // Send our RDMA Read capabilities + req->verbs.qp_attr.responder_resources = proto->ep->verbs_ep.max_qp_rd_atom; + req->verbs.qp_attr.initiator_depth = proto->ep->verbs_ep.max_qp_init_rd_atom; +#else req->verbs.qp_attr.responder_resources = 0; req->verbs.qp_attr.initiator_depth = 0; +#endif memset(&req->verbs.qp_attr.resv3, 0, sizeof(req->verbs.qp_attr.resv3)); + + if (IPS_PROTOEXP_FLAG_RDMA_QP(proto->ep->rdmamode) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) { + req->verbs.urc.recv_addr = (uintptr_t)ipsaddr->verbs.recv_seq_mr->addr; + req->verbs.urc.recv_rkey = ipsaddr->verbs.recv_seq_mr->rkey; + } else { + 
req->verbs.urc.recv_addr = 0; + req->verbs.urc.recv_rkey = 0; + } } else #endif // USE_RC memset(&req->verbs.qp_attr, 0, sizeof(req->verbs.qp_attr)); @@ -489,6 +506,28 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_ips_ipsaddr_init_connections( ipsaddr->verbs.use_allocator = &proto->ep->verbs_ep.send_allocator; ipsaddr->verbs.use_qp = proto->ep->verbs_ep.qp; ipsaddr->verbs.use_max_inline_data = proto->ep->verbs_ep.qp_cap.max_inline_data; + + if (IPS_PROTOEXP_FLAG_RDMA_QP(proto->ep->rdmamode) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) { + struct ips_flow *flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO]; + + ipsaddr->verbs.recv_seq_mr = ibv_reg_mr(proto->ep->verbs_ep.pd, + &flow->recv_seq_num, sizeof(flow->recv_seq_num), + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ); + if (!ipsaddr->verbs.recv_seq_mr) { + _HFI_ERROR("Unable register recv_seq_num MR on %s: %s\n", + proto->ep->dev_name, strerror(errno)); + goto fail; + } + + ipsaddr->verbs.remote_recv_psn_mr = ibv_reg_mr(proto->ep->verbs_ep.pd, + &ipsaddr->verbs.remote_recv_psn, sizeof(ipsaddr->verbs.remote_recv_psn), + IBV_ACCESS_LOCAL_WRITE); + if (!ipsaddr->verbs.remote_recv_psn_mr) { + _HFI_ERROR("Unable register remote_recv_psn MR on %s: %s\n", + proto->ep->dev_name, strerror(errno)); + goto fail; + } + } #endif #ifdef RNDV_MOD @@ -542,6 +581,14 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_ips_ipsaddr_init_connections( rc_qp_destroy(ipsaddr->verbs.rc_qp); ipsaddr->verbs.rc_qp = NULL; } + if (ipsaddr->verbs.recv_seq_mr) { + ibv_dereg_mr(ipsaddr->verbs.recv_seq_mr); + ipsaddr->verbs.recv_seq_mr = NULL; + } + if (ipsaddr->verbs.remote_recv_psn_mr) { + ibv_dereg_mr(ipsaddr->verbs.remote_recv_psn_mr); + ipsaddr->verbs.remote_recv_psn_mr = NULL; + } #endif return err; } @@ -650,7 +697,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_ips_ptl_pollintr( next_timeout, pollok, pollcyc, pollintr); } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static PSMI_HAL_INLINE void psm3_hfp_verbs_gdr_close(void) { } @@ -661,7 +708,7 @@ static PSMI_HAL_INLINE void* psm3_hfp_verbs_gdr_convert_gpu_to_host_addr(unsigne return psm3_verbs_gdr_convert_gpu_to_host_addr(buf, size, flags, ep); } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #include "verbs_spio.c" @@ -670,7 +717,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_spio_transfer_frame(struct ip uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ) @@ -678,7 +725,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_spio_transfer_frame(struct ip return psm3_verbs_spio_transfer_frame(proto, flow, scb, payload, length, isCtrlMsg, cksum_valid, cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , is_gpu_payload #endif ); @@ -689,7 +736,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_transfer_frame(struct ips_pro uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ) @@ -697,7 +744,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_transfer_frame(struct ips_pro return psm3_verbs_spio_transfer_frame(proto, flow, scb, payload, length, isCtrlMsg, cksum_valid, cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , is_gpu_payload #endif ); diff --git a/prov/psm3/psm3/hal_verbs/verbs_recvhdrq.c 
b/prov/psm3/psm3/hal_verbs/verbs_recvhdrq.c index f38aa505fc8..9f5d867ba54 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_recvhdrq.c +++ b/prov/psm3/psm3/hal_verbs/verbs_recvhdrq.c @@ -222,7 +222,7 @@ psm2_error_t psm3_verbs_recvhdrq_progress(struct ips_recvhdrq *recvq) break; else if_pf (err < 0) { if (errno == EAGAIN || errno == EWOULDBLOCK - || errno == EBUSY || errno = EINTR) + || errno == EBUSY || errno == EINTR) break; _HFI_ERROR("failed ibv_poll_cq '%s' (%d) on %s port %u epid %s\n", strerror(errno), errno, ep->dev_name, ep->portnum, psm3_epid_fmt_internal(ep->epid, 0)); @@ -360,7 +360,7 @@ psm2_error_t psm3_verbs_recvhdrq_progress(struct ips_recvhdrq *recvq) break; } #if VERBS_RECV_CQE_BATCH > 1 - } while(! done); + } while(ep->verbs_ep.recv_wc_count || !done); #else } #endif diff --git a/prov/psm3/psm3/hal_verbs/verbs_spio.c b/prov/psm3/psm3/hal_verbs/verbs_spio.c index f12478ef70c..d1cf283b162 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_spio.c +++ b/prov/psm3/psm3/hal_verbs/verbs_spio.c @@ -67,6 +67,44 @@ #include "ips_proto_internal.h" #include "ips_proto_params.h" +#ifdef USE_RC +static inline psm2_error_t +psm3_verbs_get_remote_psn(psm2_ep_t ep, struct ips_epaddr *ipsaddr) { + psm2_error_t ret = PSM2_OK; + + struct ibv_send_wr wr; + struct ibv_send_wr *bad_wr; + struct ibv_sge list; + + // set local location to store received data + list.addr = (uintptr_t)ipsaddr->verbs.remote_recv_psn_mr->addr; + list.length = sizeof(ipsaddr->verbs.remote_recv_psn); + list.lkey = ipsaddr->verbs.remote_recv_psn_mr->lkey; + + wr.next = NULL; // just post 1 + wr.wr_id = (uintptr_t)ipsaddr; + wr.sg_list = &list; + wr.num_sge = 1; // size of sg_list + wr.opcode = IBV_WR_RDMA_READ; + + // set remote location where to read data from + wr.wr.rdma.remote_addr = ipsaddr->verbs.remote_recv_seq_addr; + wr.wr.rdma.rkey = ipsaddr->verbs.remote_recv_seq_rkey; + wr.send_flags = IBV_SEND_SIGNALED; + + if_pf (ibv_post_send(ipsaddr->verbs.rc_qp, &wr, &bad_wr)) { + if (errno != EBUSY && errno != EAGAIN && errno != ENOMEM) + _HFI_ERROR("failed to get remote psn num on %s port %u: %s\n", + ep->dev_name, ep->portnum, strerror(errno)); + return PSM2_EP_NO_RESOURCES; + } + ipsaddr->verbs.remote_seq_outstanding = 1; + _HFI_VDBG("posted remote_recv_psn RDMA READ: from 0x%"PRIx64" to 0x%"PRIx64" len %u rkey 0x%x\n", + wr.wr.rdma.remote_addr, list.addr, list.length, wr.wr.rdma.rkey); + return ret; +} +#endif + // TBD we could get also get scb->cksum out of scb // when called: // scb->ips_lrh has fixed size PSM header including OPA LRH @@ -100,7 +138,7 @@ psm3_verbs_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, struct ips_scb *scb, uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ) @@ -148,6 +186,36 @@ psm3_verbs_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, #endif // PSM_FI PSMI_LOCK_ASSERT(proto->mq->progress_lock); psmi_assert_always(! 
cksum_valid); // no software checksum yet + +#ifdef USE_RC + if (!isCtrlMsg && flow->ipsaddr->verbs.use_qp->qp_type == IBV_QPT_RC && proto->max_credits < IPS_PROTO_FLOW_CREDITS_RC_MAX) { + if (flow->ipsaddr->verbs.remote_seq_outstanding) { + psm3_verbs_completion_update(proto->ep, 1); + if (flow->ipsaddr->verbs.remote_seq_outstanding) + return PSM2_EP_NO_RESOURCES; + } + + // NOTE: the remote_recv_psn is the actual received pkt psn + 1 (see ips_proto_is_expected_or_nak()) + // and the scb psn_num is the pkt we are going to send out. So we have below diff calculation + int diff = scb->seq_num.psn_num - flow->ipsaddr->verbs.remote_recv_psn; + + _HFI_VDBG("pkt psn=%d remote recv psn=%d diff=%d cc_count=%d\n", + scb->seq_num.psn_num, flow->ipsaddr->verbs.remote_recv_psn, diff, + flow->ipsaddr->verbs.cc_count); + if (diff < 0) + diff += proto->psn_mask + 1; + if (diff >= proto->max_credits || (flow->ipsaddr->verbs.cc_count && diff >= proto->min_credits)) { + psm3_verbs_get_remote_psn(proto->ep, flow->ipsaddr); + // cc_count is congestion control count. right now we use it to indicate whether is + // under congestion control. The count can potentially used in dynamic CC adjustment + // in the future + flow->ipsaddr->verbs.cc_count += 1; + return PSM2_EP_NO_RESOURCES; + } + + flow->ipsaddr->verbs.cc_count = 0; + } +#endif // allocate a send buffer // if we have no buffers, we can return PSM2_EP_NO_RESOURCES and caller // will try again later @@ -161,9 +229,17 @@ psm3_verbs_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, } if_pf (! sbuf) { _HFI_VDBG("out of send buffers\n"); + // try to poll send completion and see if we can free some sbuf + psm3_verbs_completion_update(proto->ep, 1); return PSM2_EP_NO_RESOURCES; } _HFI_VDBG("got sbuf %p index %lu\n", sbuf_to_buffer(sbuf), send_buffer_index(sbuf_pool(ep, sbuf), sbuf_to_buffer(sbuf))); + + uint8_t is_reliable = USE_QP->qp_type == IBV_QPT_RC && scb == STAILQ_FIRST(&flow->scb_unacked); + if (is_reliable) { + // no explicit ack for RC because RC already has its own ack + ips_lrh->bth[2] &= __cpu_to_be32(~IPS_SEND_FLAG_ACKREQ); + } // TBD - we should be able to skip sending some headers such as OPA lrh and // perhaps bth (does PSM use bth to hold PSNs?) // copy scb->ips_lrh to send buffer @@ -171,7 +247,7 @@ psm3_verbs_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, memcpy(sbuf_to_buffer(sbuf), ips_lrh, sizeof(*ips_lrh)); if (!send_dma) { // copy payload to send buffer, length could be zero, be safe -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (is_gpu_payload) { _HFI_VDBG("copy gpu payload %p %u\n", payload, length); PSM3_GPU_MEMCPY_DTOH(sbuf_to_buffer(sbuf) + sizeof(*ips_lrh), @@ -287,7 +363,7 @@ psm3_verbs_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, psm3_ep_verbs_unalloc_sbuf(USE_ALLOCATOR, sbuf, prev_sbuf); ret = PSM2_EP_NO_RESOURCES; } - _HFI_VDBG("done ud_transfer_frame: len %u, remote qpn %u\n", + _HFI_VDBG("done spio_transfer_frame: len %u, remote qpn %u\n", list[0].length +list[1].length, #ifdef USE_RC (USE_QP->qp_type != IBV_QPT_UD)? flow->ipsaddr->verbs.remote_qpn : @@ -297,7 +373,7 @@ psm3_verbs_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, err = psm3_verbs_completion_update(proto->ep, 0); if_pf (err != PSM2_OK) return err; - return ret; + return is_reliable ? 
PSM2_RELIABLE_DATA_SENT : ret; #undef USE_ALLOCATOR #undef USE_QP #undef USE_MAX_INLINE diff --git a/prov/psm3/psm3/include/linux-i386/sysdep.h b/prov/psm3/psm3/include/linux-i386/sysdep.h index 3d5d944964b..f8e2046f8c6 100644 --- a/prov/psm3/psm3/include/linux-i386/sysdep.h +++ b/prov/psm3/psm3/include/linux-i386/sysdep.h @@ -56,34 +56,6 @@ #ifndef _HFI_i386_SYSDEP_H #define _HFI_i386_SYSDEP_H -typedef struct cpuid { - unsigned eax, ebx, ecx, edx; -} cpuid_t; - -static __inline__ void -get_cpuid(const unsigned func, const unsigned subfunc, cpuid_t *id) -{ - unsigned a, b, c, d; - - asm (" \ - mov %4, %%eax \n\ - mov %5, %%ecx \n\ - cpuid \n\ - mov %%eax, %0 \n\ - mov %%ebx, %1 \n\ - mov %%ecx, %2 \n\ - mov %%edx, %3 \n\ - " : "=g" (a), "=g" (b), "=g" (c), "=g" (d) - : "g" (func), "g" (subfunc) - : "%eax", "%ebx", "%ecx", "%edx" - ); - - id->eax = a; - id->ebx = b; - id->ecx = c; - id->edx = d; -} - static __inline__ uint64_t get_cycles(void) { uint64_t v; diff --git a/prov/psm3/psm3/include/utils_debug.h b/prov/psm3/psm3/include/utils_debug.h index b7b6655f2e6..aba4b020fb5 100644 --- a/prov/psm3/psm3/include/utils_debug.h +++ b/prov/psm3/psm3/include/utils_debug.h @@ -172,7 +172,7 @@ extern char psm3_mylabel[]; void psm3_set_mylabel(char *); extern FILE *psm3_dbgout; extern void psm3_dump_buf(uint8_t *buf, uint32_t len); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU extern void psm3_dump_gpu_buf(uint8_t *buf, uint32_t len); #endif @@ -268,7 +268,7 @@ extern void psm3_dump_gpu_buf(uint8_t *buf, uint32_t len); #define _HFI_PDBG_ON unlikely(psm3_dbgmask & __HFI_PKTDBG) #define _HFI_PDBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__) #define _HFI_PDBG_DUMP_ALWAYS(buf, len) psm3_dump_buf(buf, len) -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #define _HFI_PDBG_DUMP_GPU_ALWAYS(buf, len) psm3_dump_gpu_buf(buf, len) #endif @@ -321,7 +321,7 @@ extern void psm3_dump_gpu_buf(uint8_t *buf, uint32_t len); #define _HFI_MMDBG_ON 0 #define _HFI_MMDBG_ALWAYS(fmt, ...) #define _HFI_PDBG_DUMP_ALWAYS(buf, len) -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #define _HFI_PDBG_DUMP_GPU_ALWAYS(buf, len) #endif #define _HFI_INFO_ON 0 diff --git a/prov/psm3/psm3/include/utils_env.h b/prov/psm3/psm3/include/utils_env.h index 770f04cc44a..9a83a8f2472 100644 --- a/prov/psm3/psm3/include/utils_env.h +++ b/prov/psm3/psm3/include/utils_env.h @@ -146,6 +146,11 @@ MOCKABLE(psm3_getenv)(const char *name, const char *descr, int level, union psmi_envvar_val *newval); MOCK_DCL_EPILOGUE(psm3_getenv); +// NOTE: This function writes the entire output union pointed to by newval. as a +// result, the backing storage for the pointer must be at least the size of the +// full union type, not simply the size of the type indicated by the type +// parameter. 
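
A short usage sketch of the contract described in the NOTE above: the out-parameter must be a full union psmi_envvar_val, never a narrower object, because the whole union is written regardless of the requested type. The variable name and bounds below are invented for illustration; the call shape mirrors the existing psm3_getenv_range() callers and assumes the PSM3 internal headers.

    #include "psm_user.h"   /* psm3_getenv_range() and union psmi_envvar_val */

    static uint32_t example_read_knob(void)
    {
            union psmi_envvar_val env_example;   /* full union, as the NOTE requires */

            psm3_getenv_range("PSM3_EXAMPLE_KNOB",        /* hypothetical variable */
                    "Example tunable (illustration only)", NULL,
                    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
                    (union psmi_envvar_val)16,
                    (union psmi_envvar_val)1, (union psmi_envvar_val)UINT32_MAX,
                    NULL, NULL, &env_example);

            /* Passing the address of a bare uint32_t instead would be undersized
             * storage: the callee writes the whole union regardless of
             * PSMI_ENVVAR_TYPE_UINT, so wider members would land past the end
             * of the object. */
            return env_example.e_uint;
    }
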
+// int MOCKABLE(psm3_getenv_range)(const char *name, const char *descr, const char *help, unsigned level_flags, int type, union psmi_envvar_val defval, union psmi_envvar_val min, diff --git a/prov/psm3/psm3/include/utils_user.h b/prov/psm3/psm3/include/utils_user.h index e40800aedba..8e225d5e94f 100644 --- a/prov/psm3/psm3/include/utils_user.h +++ b/prov/psm3/psm3/include/utils_user.h @@ -159,14 +159,6 @@ static __inline__ uint32_t psm3_next_power2(uint64_t x) #define HFI_KHDR_TINYLEN_MASK 0xf #define HFI_KHDR_TINYLEN_SHIFT 16 - -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) -extern int is_driver_gpudirect_enabled; - -#define PSMI_IS_DRIVER_GPUDIRECT_ENABLED likely(is_driver_gpudirect_enabled) -#define PSMI_IS_DRIVER_GPUDIRECT_DISABLED unlikely(!is_driver_gpudirect_enabled) -#endif - /* hfi kdeth header format */ struct hfi_kdeth { __le32 kdeth0; @@ -268,4 +260,15 @@ static __inline__ uint64_t nanosecs_to_cycles(uint64_t ns) return (ns * 1000ULL) / psm3_pico_per_cycle; } +/* concatenate two symbols, giving the caller the opportunity to do macro + * expansion of either argument. in particular, this is required for CUDA, + * which #define-maps legacy functions to alternate versions (by appending + * _v2 suffixes). + * + * without this, macro authors will get different results depending on whether + * they immediately use a passed symbol in a concatenation (will not expand), + * or pass it to a nested macro (will expand). + */ +#define PSM3_CONCAT(a, b) a##b + #endif /* UTILS_USER_H */ diff --git a/prov/psm3/psm3/psm.c b/prov/psm3/psm3/psm.c index e46f868f054..06d0a7a11c5 100644 --- a/prov/psm3/psm3/psm.c +++ b/prov/psm3/psm3/psm.c @@ -53,7 +53,6 @@ /* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ -#include #include #include "psm_user.h" #include "psm2_hal.h" @@ -101,509 +100,6 @@ char *psm3_affinity_shm_name; uint64_t *psm3_shared_affinity_ptr; uint64_t *psm3_shared_affinity_nic_refcount_ptr; -uint32_t psm3_cpu_model; - -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) -int is_gdr_copy_enabled; -uint32_t gdr_copy_limit_send; -uint32_t gdr_copy_limit_recv; -int is_gpudirect_enabled = 0; -int _device_support_gpudirect = -1; // -1 indicates "unset". See device_support_gpudirect(). -int is_driver_gpudirect_enabled; -uint32_t psm3_gpu_thresh_rndv = PSM3_GPU_THRESH_RNDV; -uint64_t psm3_gpu_cache_evict; // in bytes -#endif - -#ifdef PSM_CUDA -int is_cuda_enabled; -int my_gpu_device = 0; -int _gpu_p2p_supported = -1; // -1 indicates "unset". see gpu_p2p_supported(). -int _device_support_unified_addr = -1; // -1 indicates "unchecked". See verify_device_support_unified_addr(). - -/* CUDA Driver Library */ -void *psmi_cuda_lib; -int cuda_lib_version; -/* CUDA Runtime (cudart) Library */ -void *psmi_cudart_lib; -int cuda_runtime_ver; -#endif - -#ifdef PSM_ONEAPI -int is_oneapi_ze_enabled; -int my_gpu_device = 0; -int _gpu_p2p_supported = -1; // -1 indicates "unset". see gpu_p2p_supported(). 
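
The PSM3_CONCAT() comment added to utils_user.h above is the classic two-level paste issue: '##' does not macro-expand its operands, so a macro that pastes its argument directly and one that routes the argument through PSM3_CONCAT() produce different tokens once a name has been #define-mapped to a _v2 variant, as CUDA does for some entry points. A tiny illustration; cuFoo, DIRECT_PASTE and EXPANDED_PASTE are invented for the example and are not CUDA or PSM3 names.

    #define cuFoo cuFoo_v2                        /* stand-in for a CUDA-style remap */

    #define DIRECT_PASTE(fn)   psmi_count_##fn              /* fn NOT expanded first */
    #define EXPANDED_PASTE(fn) PSM3_CONCAT(psmi_count_, fn) /* fn expanded first */

    /* DIRECT_PASTE(cuFoo)   expands to psmi_count_cuFoo
     * EXPANDED_PASTE(cuFoo) expands to psmi_count_cuFoo_v2 */
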
- -ze_context_handle_t ze_context = NULL; -ze_driver_handle_t ze_driver = NULL; -struct ze_dev_ctxt ze_devices[MAX_ZE_DEVICES]; -int num_ze_devices = 0; -struct ze_dev_ctxt *cur_ze_dev = NULL; - -/* ZE Loader(zel) And Runtime(ze) Library */ -void *psmi_oneapi_ze_lib; -ze_api_version_t zel_api_version = 0; -zel_version_t zel_lib_version = { }; -#endif // PSM_ONEAPI - -#ifdef PSM_CUDA -CUresult (*psmi_cuInit)(unsigned int Flags ); -CUresult (*psmi_cuCtxDetach)(CUcontext c); -CUresult (*psmi_cuCtxGetCurrent)(CUcontext *c); -CUresult (*psmi_cuCtxSetCurrent)(CUcontext c); -CUresult (*psmi_cuPointerGetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); -CUresult (*psmi_cuPointerSetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); -CUresult (*psmi_cuDeviceCanAccessPeer)(int *canAccessPeer, CUdevice dev, CUdevice peerDev); -CUresult (*psmi_cuDeviceGet)(CUdevice* device, int ordinal); -CUresult (*psmi_cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev); -CUresult (*psmi_cuDriverGetVersion)(int* driverVersion); -CUresult (*psmi_cuDeviceGetCount)(int* count); -CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags); -CUresult (*psmi_cuStreamDestroy)(CUstream phStream); -CUresult (*psmi_cuStreamSynchronize)(CUstream phStream); -CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags); -CUresult (*psmi_cuEventDestroy)(CUevent hEvent); -CUresult (*psmi_cuEventQuery)(CUevent hEvent); -CUresult (*psmi_cuEventRecord)(CUevent hEvent, CUstream hStream); -CUresult (*psmi_cuEventSynchronize)(CUevent hEvent); -CUresult (*psmi_cuMemHostAlloc)(void** pp, size_t bytesize, unsigned int Flags); -CUresult (*psmi_cuMemFreeHost)(void* p); -CUresult (*psmi_cuMemHostRegister)(void* p, size_t bytesize, unsigned int Flags); -CUresult (*psmi_cuMemHostUnregister)(void* p); -CUresult (*psmi_cuMemcpy)(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); -CUresult (*psmi_cuMemcpyDtoD)(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); -CUresult (*psmi_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); -CUresult (*psmi_cuMemcpyHtoD)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount); -CUresult (*psmi_cuMemcpyDtoHAsync)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); -CUresult (*psmi_cuMemcpyHtoDAsync)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount, CUstream hStream); -CUresult (*psmi_cuIpcGetMemHandle)(CUipcMemHandle* pHandle, CUdeviceptr dptr); -CUresult (*psmi_cuIpcOpenMemHandle)(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags); -CUresult (*psmi_cuIpcCloseMemHandle)(CUdeviceptr dptr); -CUresult (*psmi_cuMemGetAddressRange)(CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr); -CUresult (*psmi_cuDevicePrimaryCtxGetState)(CUdevice dev, unsigned int* flags, int* active); -CUresult (*psmi_cuDevicePrimaryCtxRetain)(CUcontext* pctx, CUdevice dev); -CUresult (*psmi_cuCtxGetDevice)(CUdevice* device); -CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device); -CUresult (*psmi_cuGetErrorString)(CUresult error, const char **pStr); -cudaError_t (*psmi_cudaRuntimeGetVersion)(int* runtimeVersion); - -uint64_t psmi_count_cuInit; -uint64_t psmi_count_cuCtxDetach; -uint64_t psmi_count_cuCtxGetCurrent; -uint64_t psmi_count_cuCtxSetCurrent; -uint64_t psmi_count_cuPointerGetAttribute; -uint64_t psmi_count_cuPointerSetAttribute; -uint64_t psmi_count_cuDeviceCanAccessPeer; -uint64_t psmi_count_cuDeviceGet; -uint64_t psmi_count_cuDeviceGetAttribute; -uint64_t 
psmi_count_cuDriverGetVersion; -uint64_t psmi_count_cuDeviceGetCount; -uint64_t psmi_count_cuStreamCreate; -uint64_t psmi_count_cuStreamDestroy; -uint64_t psmi_count_cuStreamSynchronize; -uint64_t psmi_count_cuEventCreate; -uint64_t psmi_count_cuEventDestroy; -uint64_t psmi_count_cuEventQuery; -uint64_t psmi_count_cuEventRecord; -uint64_t psmi_count_cuEventSynchronize; -uint64_t psmi_count_cuMemHostAlloc; -uint64_t psmi_count_cuMemFreeHost; -uint64_t psmi_count_cuMemHostRegister; -uint64_t psmi_count_cuMemHostUnregister; -uint64_t psmi_count_cuMemcpy; -uint64_t psmi_count_cuMemcpyDtoD; -uint64_t psmi_count_cuMemcpyDtoH; -uint64_t psmi_count_cuMemcpyHtoD; -uint64_t psmi_count_cuMemcpyDtoHAsync; -uint64_t psmi_count_cuMemcpyHtoDAsync; -uint64_t psmi_count_cuIpcGetMemHandle; -uint64_t psmi_count_cuIpcOpenMemHandle; -uint64_t psmi_count_cuIpcCloseMemHandle; -uint64_t psmi_count_cuMemGetAddressRange; -uint64_t psmi_count_cuDevicePrimaryCtxGetState; -uint64_t psmi_count_cuDevicePrimaryCtxRetain; -uint64_t psmi_count_cuCtxGetDevice; -uint64_t psmi_count_cuDevicePrimaryCtxRelease; -uint64_t psmi_count_cuGetErrorString; -uint64_t psmi_count_cudaRuntimeGetVersion; - -int psmi_cuda_lib_load() -{ - psm2_error_t err = PSM2_OK; - char *dlerr; - - PSM2_LOG_MSG("entering"); - _HFI_DBG("Loading CUDA library.\n"); - - psmi_cuda_lib = dlopen("libcuda.so.1", RTLD_LAZY); - if (!psmi_cuda_lib) { - dlerr = dlerror(); - _HFI_ERROR("Unable to open libcuda.so.1. Error %s\n", - dlerr ? dlerr : "no dlerror()"); - goto fail; - } - - psmi_cuDriverGetVersion = dlsym(psmi_cuda_lib, "cuDriverGetVersion"); - - if (!psmi_cuDriverGetVersion) { - _HFI_ERROR - ("Unable to resolve symbols in CUDA libraries.\n"); - goto fail; - } - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuGetErrorString);// for PSMI_CUDA_CALL - - PSMI_CUDA_CALL(cuDriverGetVersion, &cuda_lib_version); - if (cuda_lib_version < 7000) { - _HFI_ERROR("Please update CUDA driver, required minimum version is 7.0\n"); - goto fail; - } - - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuInit); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxGetCurrent); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxDetach); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxSetCurrent); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuPointerGetAttribute); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuPointerSetAttribute); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceCanAccessPeer); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGetAttribute); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGet); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGetCount); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamCreate); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamDestroy); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamSynchronize); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventCreate); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventDestroy); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventQuery); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventRecord); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventSynchronize); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemHostAlloc); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemFreeHost); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemHostRegister); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemHostUnregister); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpy); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyDtoD); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyDtoH); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyHtoD); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyDtoHAsync); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyHtoDAsync); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuIpcGetMemHandle); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuIpcOpenMemHandle); - PSMI_CUDA_DLSYM(psmi_cuda_lib, 
cuIpcCloseMemHandle); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemGetAddressRange); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDevicePrimaryCtxGetState); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDevicePrimaryCtxRetain); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDevicePrimaryCtxRelease); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxGetDevice); - - /* CUDA Runtime */ - psmi_cudart_lib = dlopen("libcudart.so", RTLD_LAZY); - if (!psmi_cudart_lib) { - dlerr = dlerror(); - _HFI_ERROR("Unable to open libcudart.so. Error %s\n", - dlerr ? dlerr : "no dlerror()"); - goto fail; - } - PSMI_CUDA_DLSYM(psmi_cudart_lib, cudaRuntimeGetVersion); - - PSM2_LOG_MSG("leaving"); - return err; -fail: - if (psmi_cuda_lib) - dlclose(psmi_cuda_lib); - if (psmi_cudart_lib) - dlclose(psmi_cudart_lib); - err = psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unable to load CUDA library.\n"); - return err; -} - -static void psmi_cuda_stats_register() -{ -#define PSMI_CUDA_COUNT_DECLU64(func) \ - PSMI_STATS_DECLU64(#func, NULL, &psmi_count_##func) - - struct psmi_stats_entry entries[] = { - PSMI_CUDA_COUNT_DECLU64(cuInit), - PSMI_CUDA_COUNT_DECLU64(cuCtxDetach), - PSMI_CUDA_COUNT_DECLU64(cuCtxGetCurrent), - PSMI_CUDA_COUNT_DECLU64(cuCtxSetCurrent), - PSMI_CUDA_COUNT_DECLU64(cuPointerGetAttribute), - PSMI_CUDA_COUNT_DECLU64(cuPointerSetAttribute), - PSMI_CUDA_COUNT_DECLU64(cuDeviceCanAccessPeer), - PSMI_CUDA_COUNT_DECLU64(cuDeviceGet), - PSMI_CUDA_COUNT_DECLU64(cuDeviceGetAttribute), - PSMI_CUDA_COUNT_DECLU64(cuDriverGetVersion), - PSMI_CUDA_COUNT_DECLU64(cuDeviceGetCount), - PSMI_CUDA_COUNT_DECLU64(cuStreamCreate), - PSMI_CUDA_COUNT_DECLU64(cuStreamDestroy), - PSMI_CUDA_COUNT_DECLU64(cuStreamSynchronize), - PSMI_CUDA_COUNT_DECLU64(cuEventCreate), - PSMI_CUDA_COUNT_DECLU64(cuEventDestroy), - PSMI_CUDA_COUNT_DECLU64(cuEventQuery), - PSMI_CUDA_COUNT_DECLU64(cuEventRecord), - PSMI_CUDA_COUNT_DECLU64(cuEventSynchronize), - PSMI_CUDA_COUNT_DECLU64(cuMemHostAlloc), - PSMI_CUDA_COUNT_DECLU64(cuMemFreeHost), - PSMI_CUDA_COUNT_DECLU64(cuMemHostRegister), - PSMI_CUDA_COUNT_DECLU64(cuMemHostUnregister), - PSMI_CUDA_COUNT_DECLU64(cuMemcpy), - PSMI_CUDA_COUNT_DECLU64(cuMemcpyDtoD), - PSMI_CUDA_COUNT_DECLU64(cuMemcpyDtoH), - PSMI_CUDA_COUNT_DECLU64(cuMemcpyHtoD), - PSMI_CUDA_COUNT_DECLU64(cuMemcpyDtoHAsync), - PSMI_CUDA_COUNT_DECLU64(cuMemcpyHtoDAsync), - PSMI_CUDA_COUNT_DECLU64(cuIpcGetMemHandle), - PSMI_CUDA_COUNT_DECLU64(cuIpcOpenMemHandle), - PSMI_CUDA_COUNT_DECLU64(cuIpcCloseMemHandle), - PSMI_CUDA_COUNT_DECLU64(cuMemGetAddressRange), - PSMI_CUDA_COUNT_DECLU64(cuDevicePrimaryCtxGetState), - PSMI_CUDA_COUNT_DECLU64(cuDevicePrimaryCtxRetain), - PSMI_CUDA_COUNT_DECLU64(cuCtxGetDevice), - PSMI_CUDA_COUNT_DECLU64(cuDevicePrimaryCtxRelease), - PSMI_CUDA_COUNT_DECLU64(cuGetErrorString), - PSMI_CUDA_COUNT_DECLU64(cudaRuntimeGetVersion), - }; -#undef PSMI_CUDA_COUNT_DECLU64 - - psm3_stats_register_type("PSM_Cuda_call_statistics", - "Count of CUDA calls per API entry point for the whole process.\n" - "When using an NVIDIA GPU, PSM3 may call lower level CUDA " - "APIs to access or transfer application buffers in GPU memory.", - PSMI_STATSTYPE_GPU, - entries, PSMI_HOWMANY(entries), NULL, - &psmi_count_cuInit, NULL); /* context must != NULL */ -} -#endif // PSM_CUDA - -#ifdef PSM_ONEAPI -ze_result_t (*psmi_zeInit)(ze_init_flags_t flags); -ze_result_t (*psmi_zeDriverGet)(uint32_t *pCount, ze_driver_handle_t *phDrivers); -ze_result_t (*psmi_zeDeviceGet)(ze_driver_handle_t hDriver, uint32_t *pCount, ze_device_handle_t *phDevices); -ze_result_t 
(*psmi_zeDevicePciGetPropertiesExt)(ze_device_handle_t hDevice, ze_pci_ext_properties_t *pPciProperties); -#ifndef PSM3_NO_ONEAPI_IMPORT -ze_result_t (*psmi_zeDriverGetExtensionFunctionAddress)(ze_driver_handle_t hDriver, const char *name, void **ppFunctionAddress); -ze_result_t (*psmi_zexDriverImportExternalPointer)(ze_driver_handle_t hDriver, void *ptr, size_t size); -ze_result_t (*psmi_zexDriverReleaseImportedPointer)(ze_driver_handle_t hDriver, void *ptr); -#endif -ze_result_t (*psmi_zeContextCreate)(ze_driver_handle_t hDriver, const ze_context_desc_t *desc, ze_context_handle_t *phContext); -ze_result_t (*psmi_zeContextDestroy)(ze_context_handle_t hContext); -ze_result_t (*psmi_zeCommandQueueCreate)(ze_context_handle_t hContext, ze_device_handle_t hDevice,const ze_command_queue_desc_t *desc, ze_command_queue_handle_t *phCommandQueue); -ze_result_t (*psmi_zeCommandQueueDestroy)(ze_command_queue_handle_t hCommandQueue); -ze_result_t (*psmi_zeCommandQueueExecuteCommandLists)(ze_command_queue_handle_t hCommandQueue, uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, ze_fence_handle_t hFence); -ze_result_t (*psmi_zeCommandQueueSynchronize)(ze_command_queue_handle_t hCommandQueue, uint64_t timeout); -ze_result_t (*psmi_zeCommandListCreate)(ze_context_handle_t hContext, ze_device_handle_t hDevice, const ze_command_list_desc_t *desc, ze_command_list_handle_t *phCommandList); -ze_result_t (*psmi_zeCommandListDestroy)(ze_command_list_handle_t hCommandList); -ze_result_t (*psmi_zeCommandListClose)(ze_command_list_handle_t hCommandList); -ze_result_t (*psmi_zeCommandListReset)(ze_command_list_handle_t hCommandList); -ze_result_t (*psmi_zeCommandListCreateImmediate)(ze_context_handle_t hContext, ze_device_handle_t hDevice, const ze_command_queue_desc_t *desc, ze_command_list_handle_t *phCommandList); -ze_result_t (*psmi_zeCommandListAppendMemoryCopy)(ze_command_list_handle_t hCommandList, void *dstptr, const void *srcptr, size_t size, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents); -ze_result_t (*psmi_zeCommandListAppendSignalEvent)(ze_command_list_handle_t hCommandList, ze_event_handle_t hEvent); -ze_result_t (*psmi_zeDeviceCanAccessPeer)(ze_device_handle_t hDevice, ze_device_handle_t hPeerDevice, ze_bool_t *value); -ze_result_t (*psmi_zeDeviceGetCommandQueueGroupProperties)(ze_device_handle_t hDevice, uint32_t *pCount, ze_command_queue_group_properties_t *pCommandQueueGroupProperties); -ze_result_t (*psmi_zeMemAllocHost)(ze_context_handle_t hContext, const ze_host_mem_alloc_desc_t *host_desc, size_t size, size_t alignment, void **pptr); -ze_result_t (*psmi_zeMemAllocDevice)(ze_context_handle_t hContext, const ze_device_mem_alloc_desc_t *device_desc, size_t size, size_t alignment, ze_device_handle_t hDevice, void **pptr); -ze_result_t (*psmi_zeMemFree)(ze_context_handle_t hContext, void *ptr); -ze_result_t (*psmi_zeMemGetIpcHandle)(ze_context_handle_t hContext, const void *ptr, ze_ipc_mem_handle_t *pIpcHandle); -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE -ze_result_t (*psmi_zeMemGetIpcHandleFromFileDescriptorExp)(ze_context_handle_t hContext, uint64_t handle, ze_ipc_mem_handle_t *pIpcHandle); -ze_result_t (*psmi_zeMemGetFileDescriptorFromIpcHandleExp)(ze_context_handle_t hContext, ze_ipc_mem_handle_t ipcHandle, uint64_t *pHandle); -ze_result_t (*psmi_zeMemPutIpcHandle)(ze_context_handle_t hContext, ze_ipc_mem_handle_t handle); -#endif -ze_result_t (*psmi_zeMemOpenIpcHandle)(ze_context_handle_t hContext,ze_device_handle_t hDevice, 
ze_ipc_mem_handle_t handle, ze_ipc_memory_flags_t flags, void **pptr); -ze_result_t (*psmi_zeMemCloseIpcHandle)(ze_context_handle_t hContext, const void *ptr); -ze_result_t (*psmi_zeMemGetAddressRange)(ze_context_handle_t hContext, const void *ptr, void **pBase, size_t *pSize); -ze_result_t (*psmi_zeMemGetAllocProperties)(ze_context_handle_t hContext, const void *ptr, ze_memory_allocation_properties_t *pMemAllocProperties, ze_device_handle_t *phDevice); -ze_result_t (*psmi_zeEventPoolCreate)(ze_context_handle_t hContext, const ze_event_pool_desc_t *desc, uint32_t numDevices, ze_device_handle_t *phDevices, ze_event_pool_handle_t *phEventPool); -ze_result_t (*psmi_zeEventPoolDestroy)(ze_event_pool_handle_t hEventPool); -ze_result_t (*psmi_zeEventCreate)(ze_event_pool_handle_t hEventPool, const ze_event_desc_t *desc, ze_event_handle_t *phEvent); -ze_result_t (*psmi_zeEventDestroy)(ze_event_handle_t hEvent); -ze_result_t (*psmi_zeEventQueryStatus)(ze_event_handle_t hEvent); -ze_result_t (*psmi_zeEventHostSynchronize)(ze_event_handle_t hEvent, uint64_t timeout); -ze_result_t (*psmi_zeEventHostReset)(ze_event_handle_t hEvent); -ze_result_t (*psmi_zelLoaderGetVersions)(size_t *num_elems, zel_component_version_t *versions); - -uint64_t psmi_count_zeInit; -uint64_t psmi_count_zeDriverGet; -uint64_t psmi_count_zeDeviceGet; -uint64_t psmi_count_zeDevicePciGetPropertiesExt; -#ifndef PSM3_NO_ONEAPI_IMPORT -uint64_t psmi_count_zeDriverGetExtensionFunctionAddress; -uint64_t psmi_count_zexDriverImportExternalPointer; -uint64_t psmi_count_zexDriverReleaseImportedPointer; -#endif -uint64_t psmi_count_zeContextCreate; -uint64_t psmi_count_zeContextDestroy; -uint64_t psmi_count_zeCommandQueueCreate; -uint64_t psmi_count_zeCommandQueueDestroy; -uint64_t psmi_count_zeCommandQueueExecuteCommandLists; -uint64_t psmi_count_zeCommandQueueSynchronize; -uint64_t psmi_count_zeCommandListCreate; -uint64_t psmi_count_zeCommandListDestroy; -uint64_t psmi_count_zeCommandListClose; -uint64_t psmi_count_zeCommandListReset; -uint64_t psmi_count_zeCommandListCreateImmediate; -uint64_t psmi_count_zeCommandListAppendMemoryCopy; -uint64_t psmi_count_zeCommandListAppendSignalEvent; -uint64_t psmi_count_zeDeviceCanAccessPeer; -uint64_t psmi_count_zeDeviceGetCommandQueueGroupProperties; -uint64_t psmi_count_zeMemAllocHost; -uint64_t psmi_count_zeMemAllocDevice; -uint64_t psmi_count_zeMemFree; -uint64_t psmi_count_zeMemGetIpcHandle; -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE -uint64_t psmi_count_zeMemGetIpcHandleFromFileDescriptorExp; -uint64_t psmi_count_zeMemGetFileDescriptorFromIpcHandleExp; -uint64_t psmi_count_zeMemPutIpcHandle; -#endif -uint64_t psmi_count_zeMemOpenIpcHandle; -uint64_t psmi_count_zeMemCloseIpcHandle; -uint64_t psmi_count_zeMemGetAddressRange; -uint64_t psmi_count_zeMemGetAllocProperties; -uint64_t psmi_count_zeEventPoolCreate; -uint64_t psmi_count_zeEventPoolDestroy; -uint64_t psmi_count_zeEventCreate; -uint64_t psmi_count_zeEventDestroy; -uint64_t psmi_count_zeEventQueryStatus; -uint64_t psmi_count_zeEventHostSynchronize; -uint64_t psmi_count_zeEventHostReset; -uint64_t psmi_count_zelLoaderGetVersions; - -int psmi_oneapi_ze_load() -{ - psm2_error_t err = PSM2_OK; - char *dlerr; - - PSM2_LOG_MSG("entering"); - _HFI_VDBG("Loading OneAPI Level Zero library.\n"); - - psmi_oneapi_ze_lib = dlopen("libze_loader.so", RTLD_LAZY); - if (!psmi_oneapi_ze_lib) { - dlerr = dlerror(); - _HFI_ERROR( - "Unable to open libze_loader.so. Error %s\n", - dlerr ? 
dlerr : "no dlerror()"); - goto fail; - } - - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeInit); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDriverGet); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDeviceGet); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDevicePciGetPropertiesExt); -#ifndef PSM3_NO_ONEAPI_IMPORT - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDriverGetExtensionFunctionAddress); -#endif - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeContextCreate); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeContextDestroy); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeCommandQueueCreate); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeCommandQueueDestroy); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeCommandQueueExecuteCommandLists); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeCommandQueueSynchronize); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeCommandListCreate); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeCommandListDestroy); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeCommandListClose); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeCommandListReset); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeCommandListCreateImmediate); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeCommandListAppendMemoryCopy); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeCommandListAppendSignalEvent); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDeviceCanAccessPeer); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDeviceGetCommandQueueGroupProperties); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeMemAllocHost); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeMemAllocDevice); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeMemFree); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeMemGetIpcHandle); -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeMemGetIpcHandleFromFileDescriptorExp); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeMemGetFileDescriptorFromIpcHandleExp); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeMemPutIpcHandle); -#endif - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeMemOpenIpcHandle); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeMemCloseIpcHandle); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeMemGetAddressRange); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeMemGetAllocProperties); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeEventPoolCreate); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeEventPoolDestroy); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeEventCreate); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeEventDestroy); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeEventQueryStatus); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeEventHostSynchronize); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeEventHostReset); - - /* ze loader API */ - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zelLoaderGetVersions); - - PSM2_LOG_MSG("leaving"); - return err; -fail: - if (psmi_oneapi_ze_lib) - dlclose(psmi_oneapi_ze_lib); - err = psm3_handle_error(PSMI_EP_NORETURN, - PSM2_INTERNAL_ERR, - "Unable to load OneAPI Level Zero library.\n"); - return err; -} - -static void psmi_oneapi_ze_stats_register() -{ -#define PSMI_ONEAPI_ZE_COUNT_DECLU64(func) \ - PSMI_STATS_DECLU64(#func, NULL, &psmi_count_##func) - - struct psmi_stats_entry ze_entries[] = { - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeInit), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDriverGet), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDeviceGet), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDevicePciGetPropertiesExt), -#ifndef PSM3_NO_ONEAPI_IMPORT - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDriverGetExtensionFunctionAddress), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zexDriverImportExternalPointer), - 
PSMI_ONEAPI_ZE_COUNT_DECLU64(zexDriverReleaseImportedPointer), -#endif - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeContextCreate), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeContextDestroy), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeCommandQueueCreate), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeCommandQueueDestroy), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeCommandQueueExecuteCommandLists), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeCommandQueueSynchronize), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeCommandListCreate), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeCommandListDestroy), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeCommandListClose), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeCommandListReset), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeCommandListCreateImmediate), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeCommandListAppendMemoryCopy), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeCommandListAppendSignalEvent), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDeviceCanAccessPeer), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDeviceGetCommandQueueGroupProperties), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeMemAllocHost), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeMemAllocDevice), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeMemFree), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeMemGetIpcHandle), -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeMemGetIpcHandleFromFileDescriptorExp), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeMemGetFileDescriptorFromIpcHandleExp), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeMemPutIpcHandle), -#endif - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeMemOpenIpcHandle), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeMemCloseIpcHandle), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeMemGetAddressRange), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeMemGetAllocProperties), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeEventPoolCreate), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeEventPoolDestroy), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeEventCreate), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeEventDestroy), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeEventQueryStatus), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeEventHostSynchronize), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeEventHostReset), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zelLoaderGetVersions) - }; -#undef PSMI_ONEAPI_ZE_COUNT_DECLU64 - - psm3_stats_register_type("PSM_OneAPI_ZE_call_statistics", - "Count of OneAPI Level Zero calls per API entry point for the whole process.\n" - "When using an Intel(r) GPU, PSM3 may call Level Zero " - "APIs to access or transfer application buffers in GPU memory.", - PSMI_STATSTYPE_GPU, - ze_entries, PSMI_HOWMANY(ze_entries), NULL, - &psmi_count_zeInit, NULL); /* context must != NULL */ -} -#endif // PSM_ONEAPI - /* * Bit field that contains capability set. * Each bit represents different capability. @@ -639,410 +135,6 @@ int MOCKABLE(psm3_isinitialized)() } MOCK_DEF_EPILOGUE(psm3_isinitialized); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) -static void psmi_gpu_init(void) -{ - int ret; - - union psmi_envvar_val env_enable_gdr_copy; - psm3_getenv("PSM3_GDRCOPY", - "Enable (set envvar to 1) for gdr copy support in PSM (Enabled by default)", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)1, &env_enable_gdr_copy); - is_gdr_copy_enabled = env_enable_gdr_copy.e_int; - - union psmi_envvar_val env_gpu_thresh_rndv; - ret = psm3_getenv_range("PSM3_GPU_THRESH_RNDV", - "RNDV protocol is used for GPU send message sizes greater than the threshold", - NULL, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)psm3_gpu_thresh_rndv, - (union psmi_envvar_val)0, (union psmi_envvar_val)UINT32_MAX, - NULL, NULL, &env_gpu_thresh_rndv); - if (ret > 0) - /* - * For backward compatibility, check if the old variable name is set. 
- * Priority order: New name > old name > default value. - */ - psm3_getenv("PSM3_CUDA_THRESH_RNDV", - "[Deprecated, use PSM3_GPU_THRESH_RNDV]" - " RNDV protocol is used for GPU send message sizes greater than the threshold", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)psm3_gpu_thresh_rndv, - &env_gpu_thresh_rndv); - - psm3_gpu_thresh_rndv = env_gpu_thresh_rndv.e_uint; - - - union psmi_envvar_val env_gdr_copy_limit_send; - psm3_getenv("PSM3_GDRCOPY_LIMIT_SEND", - "GDR Copy is turned off on the send side" - " for message sizes greater than the limit" -#ifndef OPA - " or larger than 1 MTU\n", -#else - "\n", -#endif - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)GDR_COPY_LIMIT_SEND, &env_gdr_copy_limit_send); - gdr_copy_limit_send = env_gdr_copy_limit_send.e_int; - - if (gdr_copy_limit_send < 8 || gdr_copy_limit_send > psm3_gpu_thresh_rndv) - gdr_copy_limit_send = max(GDR_COPY_LIMIT_SEND, psm3_gpu_thresh_rndv); - - union psmi_envvar_val env_gdr_copy_limit_recv; - psm3_getenv("PSM3_GDRCOPY_LIMIT_RECV", - "GDR Copy is turned off on the recv side" - " for message sizes greater than the limit\n", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)GDR_COPY_LIMIT_RECV, &env_gdr_copy_limit_recv); - gdr_copy_limit_recv = env_gdr_copy_limit_recv.e_int; - - if (gdr_copy_limit_recv < 8) - gdr_copy_limit_recv = GDR_COPY_LIMIT_RECV; - - if (!is_gdr_copy_enabled) - gdr_copy_limit_send = gdr_copy_limit_recv = 0; -} -#endif /* PSM_CUDA || PSM_ONEAPI */ - -#ifdef PSM_CUDA -int psmi_cuda_initialize() -{ - psm2_error_t err = PSM2_OK; - - PSM2_LOG_MSG("entering"); - _HFI_DBG("Enabling CUDA support.\n"); - - psmi_cuda_stats_register(); - - err = psmi_cuda_lib_load(); - if (err != PSM2_OK) - goto fail; - - PSMI_CUDA_CALL(cuInit, 0); - - PSMI_CUDA_CALL(cudaRuntimeGetVersion, &cuda_runtime_ver); - -#ifdef PSM_HAVE_RNDV_MOD - psm2_get_gpu_bars(); -#endif - - psmi_gpu_init(); - - PSM2_LOG_MSG("leaving"); - return err; -fail: - err = psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unable to initialize PSM3 CUDA support.\n"); - return err; -} -#endif // PSM_CUDA - -#ifdef PSM_ONEAPI - -static void psmi_oneapi_find_copy_only_engine(ze_device_handle_t dev, - struct ze_dev_ctxt *ctxt) -{ - uint32_t count = 0; - ze_command_queue_group_properties_t *props = NULL; - int i; - int done = 0; - - /* Set the default */ - ctxt->ordinal = 0; - ctxt->index = 0; - ctxt->num_queues = 1; - PSMI_ONEAPI_ZE_CALL(zeDeviceGetCommandQueueGroupProperties, dev, - &count, NULL); - props = psmi_calloc(PSMI_EP_NONE, UNDEFINED, count, sizeof(*props)); - if (!props) { - _HFI_ERROR("Failed to allocate mem for CmdQ Grp\n"); - return; - } - PSMI_ONEAPI_ZE_CALL(zeDeviceGetCommandQueueGroupProperties, dev, - &count, props); - - // pick the last command queue group which supports copy but not compute. - // For PVC this will be the xeLink copy engine which will also - // have numQueues >1 (TBD - perhaps only select if it has numQueues>1). - // This ordinal is then supplied to create Command Queues and Command Lists. - for (i = count - 1; i >= 0; i--) { - _HFI_DBG("GPU Queue Group %d: copy=%d Compute=%d num_queues=%d\n", i, - (props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) != 0, - (props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) != 0, - (int)props[i].numQueues); - if (! 
done && (props[i].flags & - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) && - !(props[i].flags & - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)) { - ctxt->ordinal = i; - ctxt->num_queues = props[i].numQueues; - done = 1; - if (_HFI_DBG_ON) { - _HFI_DBG_ALWAYS("Selected GPU copy engine %d\n", i); - } else { - break; - } - } - } - psmi_free(props); -} - -// create command queue for use in psmi_oneapi_ze_memcpy for sync memcpy -static void psmi_oneapi_cmd_create(ze_device_handle_t dev, struct ze_dev_ctxt *ctxt) -{ - ze_command_queue_desc_t ze_cq_desc = { - .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, - .flags = 0, - //.mode set below - .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL, - }; - - psmi_oneapi_find_copy_only_engine(dev, ctxt); - ze_cq_desc.ordinal = ctxt->ordinal; - ze_cq_desc.index = ctxt->index; - - if (psm3_oneapi_immed_sync_copy) { - ze_cq_desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS; - PSMI_ONEAPI_ZE_CALL(zeCommandListCreateImmediate, ze_context, - dev, &ze_cq_desc, &ctxt->cl); - } else { - ze_command_list_desc_t ze_cl_desc = { - .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, - .flags = 0 - }; - ze_cq_desc.mode = ZE_COMMAND_QUEUE_MODE_DEFAULT; - - PSMI_ONEAPI_ZE_CALL(zeCommandQueueCreate, ze_context, - dev, &ze_cq_desc, &ctxt->cq); - - ze_cl_desc.commandQueueGroupOrdinal = ctxt->ordinal; - PSMI_ONEAPI_ZE_CALL(zeCommandListCreate, ze_context, - dev, &ze_cl_desc, &ctxt->cl); - } - ctxt->dev = dev; - - if (psm3_oneapi_parallel_dtod_copy_thresh < UINT_MAX) { - // create resources for dual copy mechanism - ze_event_pool_desc_t pool_desc = { - .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, - .flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE, - .count = 2 - }; - ze_event_desc_t event_desc = { - .stype = ZE_STRUCTURE_TYPE_EVENT_DESC, - .signal = ZE_EVENT_SCOPE_FLAG_HOST, - .wait = ZE_EVENT_SCOPE_FLAG_HOST, - }; - PSMI_ONEAPI_ZE_CALL(zeEventPoolCreate, - ze_context, &pool_desc, 0, NULL, &ctxt->event_pool); - - event_desc.index = 0; - PSMI_ONEAPI_ZE_CALL(zeEventCreate, ctxt->event_pool, &event_desc, - &ctxt->copy_status0); - - event_desc.index = 1; - PSMI_ONEAPI_ZE_CALL(zeEventCreate, ctxt->event_pool, &event_desc, - &ctxt->copy_status1); - - psmi_oneapi_async_cmd_create(ctxt, &ctxt->async_cq0, - &ctxt->async_cl0); - psmi_oneapi_async_cmd_create(ctxt, &ctxt->async_cq1, - &ctxt->async_cl1); - } -} - -void psmi_oneapi_cmd_create_all(void) -{ - int i; - struct ze_dev_ctxt *ctxt; - ze_context_desc_t ctxtDesc = { ZE_STRUCTURE_TYPE_CONTEXT_DESC, NULL, 0 }; - - if (!ze_context) - PSMI_ONEAPI_ZE_CALL(zeContextCreate, ze_driver, &ctxtDesc, - &ze_context); - - for (i = 0; i < num_ze_devices; i++) { - ctxt = &ze_devices[i]; - - if (!ctxt->cl) { - psmi_oneapi_cmd_create(ctxt->dev, ctxt); - _HFI_DBG("Initialized cmd queues for ze_device[%d] %p\n", - i, ctxt->dev); - } - } - if (num_ze_devices > 0) - cur_ze_dev = &ze_devices[0]; -} - -void psmi_oneapi_cmd_destroy_all(void) -{ - int i; - struct ze_dev_ctxt *ctxt; - - for (i = 0; i < num_ze_devices; i++) { - ctxt = &ze_devices[i]; - - if (ctxt->async_cl1 != NULL) { - PSMI_ONEAPI_ZE_CALL(zeCommandListDestroy, ctxt->async_cl1); - ctxt->async_cl1 = NULL; - } - if (ctxt->async_cq1 != NULL) { - PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, ctxt->async_cq1); - ctxt->async_cq1 = NULL; - } - if (ctxt->async_cl0 != NULL) { - PSMI_ONEAPI_ZE_CALL(zeCommandListDestroy, ctxt->async_cl0); - ctxt->async_cl0 = NULL; - } - if (ctxt->async_cq0 != NULL) { - PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, ctxt->async_cq0); - ctxt->async_cq0 = NULL; - } - if (ctxt->copy_status1 != NULL) { - 
PSMI_ONEAPI_ZE_CALL(zeEventDestroy, ctxt->copy_status1); - ctxt->copy_status1 = NULL; - } - if (ctxt->copy_status0 != NULL) { - PSMI_ONEAPI_ZE_CALL(zeEventDestroy, ctxt->copy_status0); - ctxt->copy_status0 = NULL; - } - if (ctxt->event_pool != NULL) { - PSMI_ONEAPI_ZE_CALL(zeEventPoolDestroy, ctxt->event_pool); - ctxt->event_pool = NULL; - } - if (ctxt->cl) { - PSMI_ONEAPI_ZE_CALL(zeCommandListDestroy, ctxt->cl); - ctxt->cl = NULL; - } - if (ctxt->cq) { - PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, ctxt->cq); - ctxt->cq = NULL; - } - } - cur_ze_dev = NULL; - - /* Also destroy ze_context */ - if (ze_context) { - PSMI_ONEAPI_ZE_CALL(zeContextDestroy, ze_context); - ze_context = NULL; - } -} - -int psmi_oneapi_ze_initialize() -{ - psm2_error_t err = PSM2_OK; - uint32_t ze_driver_count = 1; - uint32_t ze_device_count = 0; - ze_device_handle_t devices[MAX_ZE_DEVICES]; - zel_component_version_t *zel_comps = NULL; - size_t num_zel_comps; - int i; - union psmi_envvar_val env; - - PSM2_LOG_MSG("entering"); - _HFI_DBG("Init Level Zero library.\n"); - - psmi_oneapi_ze_stats_register(); - err = psmi_oneapi_ze_load(); - if (err != PSM2_OK) - goto fail; - - psm3_getenv("PSM3_ONEAPI_IMMED_SYNC_COPY", - "Use Immediate CommandList for synchronous copy to/from GPU]", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)1, &env); - psm3_oneapi_immed_sync_copy = env.e_int; - - psm3_getenv("PSM3_ONEAPI_IMMED_ASYNC_COPY", - "Use Immediate CommandList for asynchronous pipeline copy to/from GPU]", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)1, &env); - psm3_oneapi_immed_async_copy = env.e_int; - - psm3_getenv("PSM3_ONEAPI_PARALLEL_DTOD_COPY_THRESH", - "Use parallel CommandLists for GPU to GPU copy larger than threshold", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)(256*1024-1), &env); - // no benefit below 128K-1, plus the copy is spilt at a 64K boundary - psm3_oneapi_parallel_dtod_copy_thresh = max(128*1024-1, env.e_uint); - - - PSMI_ONEAPI_ZE_CALL(zeInit, ZE_INIT_FLAG_GPU_ONLY); - - /* Need to query count before alloc array */ - PSMI_ONEAPI_ZE_CALL(zelLoaderGetVersions, &num_zel_comps, NULL); - if (num_zel_comps > 0) { - zel_comps = (zel_component_version_t *)psmi_calloc( - PSMI_EP_NONE, UNDEFINED, sizeof(zel_component_version_t), - num_zel_comps); - PSMI_ONEAPI_ZE_CALL(zelLoaderGetVersions, &num_zel_comps, zel_comps); - - /* Loop looking for "loader" name */ - for (i = 0; i < num_zel_comps; i++) { - if (!strncmp(zel_comps[i].component_name, "loader", sizeof("loader"))){ - zel_lib_version = zel_comps[i].component_lib_version; - zel_api_version = zel_comps[i].spec_version; - break; - } - } - psmi_free(zel_comps); - if (i == num_zel_comps) { - _HFI_DBG("WARNING: 'loader' not found among the %zd components reported" - " by zelLoaderGetVersions, unable to report Level-Zero version", - num_zel_comps); - } - } else { - _HFI_DBG("WARNING: no components reported by zelLoaderGetVersions," - " unable to report Level-Zero version"); - } - - PSMI_ONEAPI_ZE_CALL(zeDriverGet, &ze_driver_count, &ze_driver); -#ifndef PSM3_NO_ONEAPI_IMPORT - PSMI_ONEAPI_ZE_CALL(zeDriverGetExtensionFunctionAddress, ze_driver, "zexDriverImportExternalPointer", (void **)&psmi_zexDriverImportExternalPointer); - PSMI_ONEAPI_ZE_CALL(zeDriverGetExtensionFunctionAddress, ze_driver, "zexDriverReleaseImportedPointer", (void **)&psmi_zexDriverReleaseImportedPointer); -#endif - - PSMI_ONEAPI_ZE_CALL(zeDeviceGet, ze_driver, &ze_device_count, NULL); - if (ze_device_count > 
MAX_ZE_DEVICES) - ze_device_count = MAX_ZE_DEVICES; - PSMI_ONEAPI_ZE_CALL(zeDeviceGet, ze_driver, &ze_device_count, devices); - - ze_context_desc_t ctxtDesc = { ZE_STRUCTURE_TYPE_CONTEXT_DESC, NULL, 0 }; - PSMI_ONEAPI_ZE_CALL(zeContextCreate, ze_driver, &ctxtDesc, &ze_context); - _HFI_DBG("ze_driver %p %u devices first device %p ze_context %p\n", - ze_driver, ze_device_count, devices[0], ze_context); - - for (i = 0; i < ze_device_count; i++) { - ze_devices[i].dev_index = i; - psmi_oneapi_cmd_create(devices[i], &ze_devices[i]); - _HFI_DBG("Initialized cmd queues for ze_device[%d] %p\n", - i, ze_devices[i].dev); - } - - num_ze_devices = ze_device_count; - if (num_ze_devices > 0) - cur_ze_dev = &ze_devices[0]; - - err = psmi_oneapi_putqueue_alloc(); - if (err != PSM2_OK) - goto fail; - - psmi_gpu_init(); - -#ifndef PSM_HAVE_PIDFD - psm3_num_ze_dev_fds = 0; -#endif - - PSM2_LOG_MSG("leaving"); - return err; -fail: - err = psm3_handle_error(PSMI_EP_NORETURN, - PSM2_INTERNAL_ERR, - "Unable to initialize PSM3 OneAPI Level Zero support.\n"); - return err; -} -#endif // PSM_ONEAPI - static void psmi_free_subnets(void) @@ -1260,29 +352,6 @@ psm2_error_t psm3_init(int *major, int *minor) psm3_verno_client_val = min(PSMI_VERNO_MAKE(*major, *minor), psm3_verno); - /* Check to see if we need to set Architecture flags to something - * besides big core Xeons */ - cpuid_t id; - psm3_cpu_model = CPUID_MODEL_UNDEFINED; - - /* First check to ensure Genuine Intel */ - get_cpuid(0x0, 0, &id); - if(id.ebx == CPUID_GENUINE_INTEL_EBX - && id.ecx == CPUID_GENUINE_INTEL_ECX - && id.edx == CPUID_GENUINE_INTEL_EDX) - { - /* Use cpuid with EAX=1 to get processor info */ - get_cpuid(0x1, 0, &id); - psm3_cpu_model = CPUID_GENUINE_INTEL; - } - - if( (psm3_cpu_model == CPUID_GENUINE_INTEL) && - (id.eax & CPUID_FAMILY_MASK) == CPUID_FAMILY_XEON) - { - psm3_cpu_model = ((id.eax & CPUID_MODEL_MASK) >> 4) | - ((id.eax & CPUID_EXMODEL_MASK) >> 12); - } - psmi_refcount++; /* psm3_dbgmask lives in libhfi.so */ psm3_getenv("PSM3_TRACEMASK", @@ -1450,90 +519,16 @@ psm2_error_t psm3_init(int *major, int *minor) if (psm3_device_is_enabled(devid_enabled, PTL_DEVID_AMSH)) { if (psm3_dsa_init()) { err = PSM2_INTERNAL_ERR; - goto fail_hal; + goto fail_dsa; } } #endif -#ifdef PSM_CUDA - union psmi_envvar_val env_enable_cuda; - psm3_getenv("PSM3_CUDA", - "Enable (set envvar to 1) for cuda support in PSM (Disabled by default)", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)0, &env_enable_cuda); - // order important, always parse gpudirect - is_cuda_enabled = psmi_parse_gpudirect() || env_enable_cuda.e_int; - - if (PSMI_IS_GPU_ENABLED) { - err = psmi_cuda_initialize(); - if (err != PSM2_OK) -#ifdef PSM_DSA - goto fail_undsa; -#else - goto fail_hal; -#endif - } -#else /* PSM_CUDA */ - /* PSM3_CUDA is not allowed for this build, so we check it's - * presence but don't want to use psm3_getenv since we don't - * want it to appear in PSM3_VERBOSE_ENV help text - */ - int enable_cuda = 0; - if (psm3_parse_str_int(psm3_env_get("PSM3_CUDA"), &enable_cuda, - INT_MIN, INT_MAX) == -2 - || enable_cuda) { - _HFI_INFO("WARNING: PSM built without CUDA enabled, PSM3_CUDA unavailable\n"); +#ifdef PSM_HAVE_GPU + if ( (err = PSM3_GPU_INITIALIZE()) != PSM2_OK) { + goto fail_gpu; } -#endif /* PSM_CUDA */ - -#ifdef PSM_ONEAPI - union psmi_envvar_val env_enable_oneapi; - psm3_getenv("PSM3_ONEAPI_ZE", - "Enable (set envvar to 1) for OneAPI Level Zero (ZE) support in PSM (Disabled by default)", - PSMI_ENVVAR_LEVEL_USER, 
PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)0, &env_enable_oneapi); - is_oneapi_ze_enabled = psmi_parse_gpudirect() || env_enable_oneapi.e_int; - - if (PSMI_IS_GPU_ENABLED) { - err = psmi_oneapi_ze_initialize(); - if (err != PSM2_OK) { -#ifdef PSM_DSA - goto fail_undsa; -#else - goto fail_hal; #endif - } - } -#else /* PSM_ONEAPI */ - /* PSM3_ONEAPI_ZE is not allowed for this build, so we check it's - * presence but don't want to use psm3_getenv since we don't - * want it to appear in PSM3_VERBOSE_ENV help text - */ - int enable_oneapi = 0; - if (psm3_parse_str_int(psm3_env_get("PSM3_ONEAPI_ZE"), &enable_oneapi, - INT_MIN, INT_MAX) == -2 - || enable_oneapi) { - _HFI_INFO("WARNING: PSM built without ONEAPI_ZE enabled, PSM3_ONEAPI_ZE unavailable\n"); - } -#endif /* PSM_ONEAPI */ - -#if !defined(PSM_CUDA) && ! defined(PSM_ONEAPI) - /* PSM3_GPUDIRECT is not allowed for this build, so we check it's - * presence but don't want to use psm3_getenv since we don't - * want it to appear in PSM3_VERBOSE_ENV help text - * Note we check here, rather than in ips_proto_init, because - * PSM3_GPUDIERECT can enable GPU for ptl_am (shm) as well as ips, - * so if a user attempted a non-GPU build single node run with - * PSM3_GPUDIRECT=1 and expected GPU handling in shm, they would not - * get the behavior they expected - */ - unsigned int gpudirect = 0; - if (psm3_parse_str_uint(psm3_env_get("PSM3_GPUDIRECT"), &gpudirect, - 0, UINT_MAX) == -2 - || gpudirect) { - _HFI_INFO("WARNING: PSM built with neither ONEAPI_ZE nor CUDA enabled, PSM3_GPUDIRECT unavailable\n"); - } -#endif /* !defined(PSM_CUDA) && ! defined(PSM_ONEAPI) */ update: *major = (int)psm3_verno_major; @@ -1544,15 +539,17 @@ psm2_error_t psm3_init(int *major, int *minor) PSM2_LOG_MSG("leaving"); return err; -#if defined(PSM_DSA) && (defined(PSM_CUDA) || defined(PSM_ONEAPI)) -fail_undsa: +#ifdef PSM_HAVE_GPU +fail_gpu: +#if defined(PSM_DSA) psm3_dsa_fini(); #endif -#if defined(PSM_DSA) || defined(PSM_CUDA) || defined(PSM_ONEAPI) -fail_hal: +#endif +#if defined(PSM_DSA) +fail_dsa: +#endif psm3_hwloc_topology_destroy(); // always safe to call psm3_hal_finalize(); -#endif fail_epid: psm3_epid_fini(); fail_unref: @@ -1611,6 +608,8 @@ psm2_error_t psm3_info_query(psm2_info_query_t q, void *out, 2, /* PSM2_INFO_QUERY_PORT_SPEED */ 0, /* PSM2_INFO_QUERY_NUM_ADDR_PER_UNIT */ 4, /* PSM2_INFO_QUERY_UNIT_ADDR_NAME */ + 0, /* PSM2_INFO_QUERY_GPU_THRESH_RNDV */ + 0, /* PSM2_INFO_QUERY_MQ_RNDV_SHM_GPU_THRESH_DEFAULT */ }; psm2_error_t rv = PSM2_INTERNAL_ERR; @@ -1637,15 +636,7 @@ psm2_error_t psm3_info_query(psm2_info_query_t q, void *out, rv = PSM2_OK; break; case PSM2_INFO_QUERY_FEATURE_MASK: - { -#ifdef PSM_CUDA - *((uint32_t*)out) = PSM2_INFO_QUERY_FEATURE_CUDA; -#elif defined(PSM_ONEAPI) - *((uint32_t*)out) = PSM2_INFO_QUERY_FEATURE_ONEAPI; -#else - *((uint32_t*)out) = 0; -#endif /* PSM_CUDA */ - } + *((uint32_t*)out) = PSM3_GPU_QUERY_FEATURE_MASK(); rv = PSM2_OK; break; case PSM2_INFO_QUERY_UNIT_NAME: @@ -1776,6 +767,18 @@ psm2_error_t psm3_info_query(psm2_info_query_t q, void *out, rv = PSM2_OK; } break; + case PSM2_INFO_QUERY_GPU_THRESH_RNDV: +#ifdef PSM_HAVE_GPU + *((uint32_t*)out) = psm3_gpu_thresh_rndv; + rv = PSM2_OK; +#endif + break; + case PSM2_INFO_QUERY_MQ_RNDV_SHM_GPU_THRESH_DEFAULT: +#ifdef PSM_HAVE_GPU + *((uint32_t*)out) = psm3_gpu_mq_rndv_shm_gpu_thresh_default; + rv = PSM2_OK; +#endif + break; default: return PSM2_IQ_INVALID_QUERY; } @@ -1921,24 +924,8 @@ psm2_error_t psm3_finalize(void) psm3_hwloc_topology_destroy(); // always safe to 
call psm3_hal_finalize(); -#ifdef PSM_CUDA - if (PSMI_IS_GPU_ENABLED) - psm3_stats_deregister_type(PSMI_STATSTYPE_GPU, &psmi_count_cuInit); -#elif defined(PSM_ONEAPI) - if (PSMI_IS_GPU_ENABLED) { - psm3_stats_deregister_type(PSMI_STATSTYPE_GPU, &psmi_count_zeInit); - /* - * Trying to destroy command list, queue, and context will result in - * segfaults here. - */ - /*psmi_oneapi_putqueue_free(); - psmi_oneapi_cmd_destroy(); - if (ze_context) { - PSMI_ONEAPI_ZE_CALL(zeContextDestroy, ze_context); - ze_context = NULL; - } */ - } -#endif // PSM_CUDA + + PSM3_GPU_FINALIZE(); psmi_refcount = PSMI_FINALIZED; PSM2_LOG_MSG("leaving"); diff --git a/prov/psm3/psm3/psm2.h b/prov/psm3/psm3/psm2.h index cadb561dbd4..ce007280daf 100644 --- a/prov/psm3/psm3/psm2.h +++ b/prov/psm3/psm3/psm2.h @@ -314,8 +314,8 @@ enum psm2_error { /*! PSM2 is finalized */ PSM2_IS_FINALIZED = 13, - /*! TCP data send is successful */ - PSM2_TCP_DATA_SENT = 14, + /*! data was sent reliably */ + PSM2_RELIABLE_DATA_SENT = 14, /*! Endpoint was closed */ PSM2_EP_WAS_CLOSED = 20, @@ -1325,6 +1325,15 @@ void *psm3_epaddr_getctxt(psm2_epaddr_t epaddr); * option value: Context associated with PSM2 endpoint address. */ +/* PSM2 endpoint CUDA_PERMITTED flag */ +#define PSM2_CORE_OPT_EP_CUDA_PERMITTED 0x103 + /**< [@b uint32_t ] Set/Get the CUDA_PERMITTED flag associated with a PSM2 + * endpoint (psm2_ep_t). + * + * component object: PSM2 endpoint (@ref psm2_ep_t). + * option value: Boolean flag. + */ + /* PSM2_COMPONENT_IB options */ /* Default service level to use to communicate with remote endpoints */ #define PSM2_IB_OPT_DF_SL 0x201 @@ -1717,6 +1726,14 @@ typedef enum psm2_info_query_et Output parameter: char*, description: name of the device's address. */ PSM2_INFO_QUERY_UNIT_ADDR_NAME, +/*! Required input arguments 0 + Output parameter: uint32_t*, description: configured PSM3_GPU_THRESH_RNDV */ + PSM2_INFO_QUERY_GPU_THRESH_RNDV, + +/*! Required input arguments 0 + Output parameter: uint32_t*, description: default for PSM3_MQ_RNDV_SHM_GPU_THRESH */ + PSM2_INFO_QUERY_MQ_RNDV_SHM_GPU_THRESH_DEFAULT, + PSM2_INFO_QUERY_LAST, /* must appear last, and the info query constants are used as an index. */ } psm2_info_query_t; @@ -1772,14 +1789,14 @@ psm2_error_t psm3_info_query(psm2_info_query_t, void *out, * Used to support interrupt driven progress with CPU release when * >1 process per core * - * @param[in] int timeout timeout in milliseconds. <0 is infinite timeout + * @param[in] int timeout_ms timeout in milliseconds. <0 is infinite timeout * * @returns PSM2_OK if wait completed and some progress may have been made * @returns PSM2_TIMEOUT if wait timeout exceeded with no progress made * @returns PSM2_INTERNAL_ERR if wait mode not allowed for given HAL * @returns PSM2_PARAM_ERR if not allowed for use with current PSM settings/mode */ -psm2_error_t psm3_wait(int timeout); +psm2_error_t psm3_wait(int timeout_ms); /** @brief PSM2 env initialization * @@ -1905,6 +1922,7 @@ int psm3_getenv_str(const char *name, const char *descr, int visible, * @param[in] unint32_t parameter copy length */ void psm3_memcpy(void *dest, const void *src, uint32_t len); +void psm3_ep_memcpy(psm2_ep_t ep, void *dest, const void *src, uint32_t len); /*! 
@} */ diff --git a/prov/psm3/psm3/psm2_hal.c b/prov/psm3/psm3/psm2_hal.c index 31a1cf67ecf..7b17fe757ba 100644 --- a/prov/psm3/psm3/psm2_hal.c +++ b/prov/psm3/psm3/psm2_hal.c @@ -97,7 +97,7 @@ void psm3_hal_register_instance(psmi_hal_instance_t *psm_hi) REJECT_IMPROPER_HI(hfp_mq_init_defaults); REJECT_IMPROPER_HI(hfp_ep_open_opts_get_defaults); REJECT_IMPROPER_HI(hfp_context_initstats); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU REJECT_IMPROPER_HI(hfp_gdr_open); #endif @@ -147,10 +147,10 @@ void psm3_hal_register_instance(psmi_hal_instance_t *psm_hi) REJECT_IMPROPER_HI(hfp_ips_ibta_init); REJECT_IMPROPER_HI(hfp_ips_path_rec_init); REJECT_IMPROPER_HI(hfp_ips_ptl_pollintr); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU REJECT_IMPROPER_HI(hfp_gdr_close); REJECT_IMPROPER_HI(hfp_gdr_convert_gpu_to_host_addr); -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ REJECT_IMPROPER_HI(hfp_get_port_index2pkey); REJECT_IMPROPER_HI(hfp_poll_type); diff --git a/prov/psm3/psm3/psm2_hal.h b/prov/psm3/psm3/psm2_hal.h index 91d187dcd56..a1f83899ad4 100644 --- a/prov/psm3/psm3/psm2_hal.h +++ b/prov/psm3/psm3/psm2_hal.h @@ -253,16 +253,6 @@ typedef struct _psmi_hal_params char **unit_driver; } psmi_hal_params_t; - -#define PSM_HAL_ALG_ACROSS 0 -#define PSM_HAL_ALG_WITHIN 1 -#define PSM_HAL_ALG_ACROSS_ALL 2 -#define PSM_HAL_ALG_CPU_CENTRIC 3 -#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY -#define PSM_HAL_ALG_GPU_CENTRIC 4 -#endif - - typedef enum { PSMI_HAL_POLL_TYPE_NONE = 0, PSMI_HAL_POLL_TYPE_URGENT = 1, @@ -314,7 +304,7 @@ struct _psmi_hal_instance /* Initialize PSM3_PRINT_STATS stats for given ep */ void (*hfp_context_initstats)(psm2_ep_t ep); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU void (*hfp_gdr_open)(void); #endif @@ -414,12 +404,12 @@ struct _psmi_hal_instance int next_timeout, uint64_t *pollok, uint64_t *pollcyc, uint64_t *pollintr); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* Direct GPU Copy */ void (*hfp_gdr_close)(void); void* (*hfp_gdr_convert_gpu_to_host_addr)(unsigned long buf, size_t size, int flags, psm2_ep_t ep); -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ /* Given an open context and index, return an error, or the * corresponding pkey for the index as programmed by the SM */ /* Returns an int, so -1 indicates an error. */ @@ -432,7 +422,7 @@ struct _psmi_hal_instance uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ); @@ -441,7 +431,7 @@ struct _psmi_hal_instance uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ); @@ -556,7 +546,7 @@ int psm3_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...) #define psmi_hal_mq_init_defaults(...) PSMI_HAL_DISPATCH_FUNC(mq_init_defaults,__VA_ARGS__) #define psmi_hal_ep_open_opts_get_defaults(...) PSMI_HAL_DISPATCH_FUNC(ep_open_opts_get_defaults,__VA_ARGS__) #define psmi_hal_context_initstats(...) PSMI_HAL_DISPATCH_FUNC(context_initstats,__VA_ARGS__) -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #define psmi_hal_gdr_open(...) PSMI_HAL_DISPATCH_FUNC(gdr_open,__VA_ARGS__) #endif #define psmi_hal_finalize_(...) 
PSMI_HAL_DISPATCH_FUNC(finalize_,__VA_ARGS__) @@ -603,10 +593,10 @@ int psm3_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...) #define psmi_hal_ips_ibta_init(...) PSMI_HAL_DISPATCH(ips_ibta_init,__VA_ARGS__) #define psmi_hal_ips_path_rec_init(...) PSMI_HAL_DISPATCH(ips_path_rec_init,__VA_ARGS__) #define psmi_hal_ips_ptl_pollintr(...) PSMI_HAL_DISPATCH(ips_ptl_pollintr,__VA_ARGS__) -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #define psmi_hal_gdr_close(...) PSMI_HAL_DISPATCH(gdr_close,__VA_ARGS__) #define psmi_hal_gdr_convert_gpu_to_host_addr(...) PSMI_HAL_DISPATCH(gdr_convert_gpu_to_host_addr,__VA_ARGS__) -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #define psmi_hal_get_port_index2pkey(...) PSMI_HAL_DISPATCH(get_port_index2pkey,__VA_ARGS__) #define psmi_hal_poll_type(...) PSMI_HAL_DISPATCH(poll_type,__VA_ARGS__) diff --git a/prov/psm3/psm3/psm2_hal_inline_t.h b/prov/psm3/psm3/psm2_hal_inline_t.h index 68e7276f425..ad86a97e9b9 100644 --- a/prov/psm3/psm3/psm2_hal_inline_t.h +++ b/prov/psm3/psm3/psm2_hal_inline_t.h @@ -125,13 +125,13 @@ static PSMI_HAL_INLINE psm2_error_t PSMI_HAL_CAT_INL_SYM(ips_ptl_pollintr) (psm2_ep_t ep, struct ips_recvhdrq *recvq, int fd_pipe, int next_timeout, uint64_t *pollok, uint64_t *pollcyc, uint64_t *pollintr); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static PSMI_HAL_INLINE void PSMI_HAL_CAT_INL_SYM(gdr_close) (void); static PSMI_HAL_INLINE void* PSMI_HAL_CAT_INL_SYM(gdr_convert_gpu_to_host_addr) (unsigned long buf, size_t size, int flags, psm2_ep_t ep); -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_index2pkey) (psm2_ep_t ep, int index); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(poll_type) @@ -143,7 +143,7 @@ static PSMI_HAL_INLINE psm2_error_t PSMI_HAL_CAT_INL_SYM(spio_transfer_frame) uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ); @@ -153,7 +153,7 @@ static PSMI_HAL_INLINE psm2_error_t PSMI_HAL_CAT_INL_SYM(transfer_frame) uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ); diff --git a/prov/psm3/psm3/psm2_hal_loopback.c b/prov/psm3/psm3/psm2_hal_loopback.c index 6789ad18f59..5612a95fd90 100644 --- a/prov/psm3/psm3/psm2_hal_loopback.c +++ b/prov/psm3/psm3/psm2_hal_loopback.c @@ -200,7 +200,7 @@ static int psm3_hfp_loopback_get_port_lid(int unit, int port, int addr_index) static void psm3_hfp_loopback_mq_init_defaults(struct psm2_mq *mq) { mq->ips_cpu_window_rv_str = NULL; // no rendezvous -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU mq->ips_gpu_window_rv_str = NULL; // no rendezvous #endif mq->rndv_nic_thresh = (~(uint32_t)0); // disable rendezvous @@ -222,11 +222,11 @@ static int psm3_hfp_loopback_get_default_pkey(void) return 0x8001; // not used (only used in ptl_ips), pick a safe value } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static void psm3_hfp_loopback_gdr_open(void) { /* disable GPU Direct copy, no driver to help us */ - is_gdr_copy_enabled = gdr_copy_limit_send = gdr_copy_limit_recv = 0; + psm3_gpu_is_gdr_copy_enabled = psm3_gpu_gdr_copy_limit_send = psm3_gpu_gdr_copy_limit_recv = 0; } #endif @@ -257,7 +257,7 @@ hfp_loopback_t psm3_loopback_hi = { 
.hfp_mq_init_defaults = psm3_hfp_loopback_mq_init_defaults, .hfp_ep_open_opts_get_defaults = psm3_hfp_loopback_ep_open_opts_get_defaults, .hfp_context_initstats = NULL, // ptl_ips only -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU .hfp_gdr_open = psm3_hfp_loopback_gdr_open, #endif @@ -302,10 +302,10 @@ hfp_loopback_t psm3_loopback_hi = { .hfp_ips_ibta_init = NULL, .hfp_ips_path_rec_init = NULL, .hfp_ips_ptl_pollintr = NULL, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU .hfp_gdr_close = NULL, .hfp_gdr_convert_gpu_to_host_addr = NULL, -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ .hfp_get_port_index2pkey = NULL, .hfp_poll_type = NULL, .hfp_spio_transfer_frame = NULL, diff --git a/prov/psm3/psm3/psm2_mq.h b/prov/psm3/psm3/psm2_mq.h index 517b4802d5b..3f7f128e3ad 100644 --- a/prov/psm3/psm3/psm2_mq.h +++ b/prov/psm3/psm3/psm2_mq.h @@ -1639,7 +1639,7 @@ struct psm2_mq_stats { #else uint64_t dsa_stats[DSA_STATS_SZ*2]; /* same size as dsa_stats[2] */ #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /** maximum handles allowed in cache **/ uint64_t gpu_ipc_cache_limit; /** current handles in cache **/ @@ -1656,9 +1656,9 @@ struct psm2_mq_stats { uint64_t gpu_ipc_cache_remove; /** cache cleared due to error opening new Ipc Handle **/ uint64_t gpu_ipc_cache_clear; -#else /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#else /* PSM_HAVE_GPU */ uint64_t _reserved_gpu[8]; -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ /** sysbufs are used for unexpected eager receive (and RTS payload) */ /** Number of messages using system buffers (not used for 0 byte msg) */ @@ -1669,7 +1669,7 @@ struct psm2_mq_stats { /** rank in MPI_COMM_WORLD, while unchanging, easiest to put here */ uint64_t comm_world_rank; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /** Messages transmitted eagerly from CPU buffer */ uint64_t tx_eager_cpu_num; /** Bytes transmitted eagerly from CPU buffer */ diff --git a/prov/psm3/psm3/psm_config.h b/prov/psm3/psm3/psm_config.h index 9bd59690005..0c391320df7 100644 --- a/prov/psm3/psm3/psm_config.h +++ b/prov/psm3/psm3/psm_config.h @@ -81,6 +81,11 @@ /* #define INTEL_GPU_DIRECT */ #endif +// define here so pxmx3 and psm_user.h can use this define +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#define PSM_HAVE_GPU +#endif + #ifndef PSM3_BRAKE_DEBUG /* #define PSM3_BRAKE_DEBUG */ #endif @@ -164,32 +169,23 @@ #endif // PSM_CUDA -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #define GPU_WINDOW_PREFETCH_DEFAULT 2 #define GPU_SMALLHOSTBUF_SZ (256*1024) #define GPU_PAGE_OFFSET_MASK (PSMI_GPU_PAGESIZE -1) #define GPU_PAGE_MASK ~GPU_PAGE_OFFSET_MASK -/* All GPU transfers beyond this threshold use - * RNDV protocol. It is mostly a send side knob. 
- */ -#define PSM3_GPU_THRESH_RNDV 8000 #define GPUDIRECT_THRESH_RV 3 #define GDR_COPY_LIMIT_SEND 128 #define GDR_COPY_LIMIT_RECV 64000 -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #define PSM_MQ_NIC_MAX_TINY 8 /* max TINY payload allowed */ #define PSM3_MQ_RNDV_NIC_THRESH 64000 #define PSM_CPU_NIC_RNDV_WINDOW_STR "131072" -#ifdef PSM_CUDA -#define PSM_GPU_NIC_RNDV_WINDOW_STR "2097152" -#elif defined(PSM_ONEAPI) -#define PSM_GPU_NIC_RNDV_WINDOW_STR "131072:524287,262144:1048575,524288" -#endif #define PSM3_MQ_RNDV_NIC_WINDOW_MAX (4 * 1024 * 1024) /* max rndv window */ /* @@ -197,14 +193,6 @@ */ #define PSM3_MQ_RNDV_SHM_THRESH 16000 -#if defined(PSM_CUDA) -/* Threshold for GPU rendezvous (aka scale-up transfer vs via CPU shared mem */ -#define PSM3_MQ_RNDV_SHM_GPU_THRESH 63 -#elif defined(PSM_ONEAPI) -/* Threshold for GPU rendezvous (aka scale-up transfer vs via CPU shared mem */ -#define PSM3_MQ_RNDV_SHM_GPU_THRESH 127 -#endif - // LEARN_HASH_SELECTOR has PSM3 dynamically learn the combinations // of src_addr presence and tagsel used by a given middleware. This // allows PSM3 to self-optimize for use with varied middleware uses @@ -245,7 +233,11 @@ #define PSMI_DEVICES_DEFAULT "self,shm,nic" /* Lock */ +#if defined(__x86_64__) || defined(__i386__) #define PSMI_USE_PTHREAD_SPINLOCKS 0 +#else /* non-Intel arch */ +#define PSMI_USE_PTHREAD_SPINLOCKS 1 +#endif /* Utils */ #define PSMI_EPID_TABSIZE_CHUNK 128 diff --git a/prov/psm3/psm3/psm_context.c b/prov/psm3/psm3/psm_context.c index 678b394d71e..e82a701e74a 100644 --- a/prov/psm3/psm3/psm_context.c +++ b/prov/psm3/psm3/psm_context.c @@ -110,18 +110,12 @@ psm3_context_open(const psm2_ep_t ep, long unit_param, long port, long addr_inde psm2_uuid_t const job_key, uint16_t network_pkey, int64_t timeout_ns) { - long open_timeout = 0, unit_start, unit_end, unit_id, unit_id_prev; + long open_timeout = 0; psm2_error_t err = PSM2_OK; - int nunits = psmi_hal_get_num_units(), nunitsactive=0; + int nunits = psmi_hal_get_num_units(); union psmi_envvar_val env_rcvthread; static int norcvthread; /* only for first rail */ - /* - * If shared contexts are enabled, try our best to schedule processes - * across one or many devices - */ - - /* if no units, then no joy. */ if (nunits <= 0) { err = psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, @@ -129,75 +123,41 @@ psm3_context_open(const psm2_ep_t ep, long unit_param, long port, long addr_inde goto ret; } - /* Calculate the number of active units: */ - for (unit_id=0;unit_id < nunits;unit_id++) - { - if (psmi_hal_get_unit_active(unit_id) > 0) - nunitsactive++; - } - /* if no active units, then no joy. */ - if (nunitsactive == 0) - { - err = psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3 no nic units are active"); - goto ret; - } if (timeout_ns > 0) open_timeout = (long)(timeout_ns / MSEC_ULL); - - unit_start = 0; unit_end = nunits - 1; - err = psm3_compute_start_and_end_unit(unit_param, addr_index, - nunitsactive, nunits, job_key, - &unit_start, &unit_end); - if (err != PSM2_OK) - goto ret; - - /* Loop from unit_start to unit_end inclusive and pick 1st active found - * As needed wrap, so it's valid for unit_start >= unit_end - */ - int success = 0; - unit_id_prev = unit_id = unit_start; - do - { - /* if the unit_id is not active, go to next one. 
*/ - if (psmi_hal_get_unit_active(unit_id) <= 0) { - unit_id_prev = unit_id; - unit_id = (unit_id + 1) % nunits; - continue; + if (unit_param == PSM3_NIC_ANY) { + /* user did not set PSM3_NIC and not PSM3_MULTIRAIL */ + unit_param = psm3_autoselect_one(addr_index, nunits, job_key); + if (unit_param < 0) { + err = psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "PSM3 no nic units are active"); + goto ret; } + } else { + _HFI_DBG("Caller selected NIC %ld.\n", unit_param); + psmi_assert(unit_param >= 0); // caller checked valid + } - /* open this unit. */ - if (psmi_hal_context_open(unit_id, port, - psmi_hash_addr_index(unit_id, port, addr_index), - open_timeout, - ep, job_key, HAL_CONTEXT_OPEN_RETRY_MAX)) { - // in modes where we refcount NIC use, - // psm3_compute_start_and_end_unit will have returned exactly - // 1 NIC and refcount'ed it, so we dec refcount here - psm3_dec_nic_refcount(unit_id); - /* go to next unit if failed to open. */ - unit_id_prev = unit_id; - unit_id = (unit_id + 1) % nunits; - continue; - } - // HAL context_open has initialized: - // ep->unit_id, ep->portnum, ep->addr_index, - // ep->dev_name, ep->subnet, ep->addr, ep->gid, ep->wiremode, - // ep->epid and - // HAL specific ep fields (context, verbs_ep or sockets_ep) - psmi_assert_always(! psm3_epid_zero_internal(ep->epid)); - success = 1; - break; - - } while (unit_id_prev != unit_end); - - if (!success) - { + /* open this unit. */ + if (psmi_hal_get_unit_active(unit_param) <= 0 + || psmi_hal_context_open(unit_param, port, + psmi_hash_addr_index(unit_param, port, addr_index), + open_timeout, + ep, job_key, HAL_CONTEXT_OPEN_RETRY_MAX)) { + // in modes where we refcount NIC use, + // psm3_autoselect_one refcount'ed it, so we dec refcount here + psm3_dec_nic_refcount(unit_param); err = psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3 can't open nic unit: %ld",unit_param); + "PSM3 can't open nic unit: %ld",unit_param); goto bail; } + // HAL context_open has initialized: + // ep->unit_id, ep->portnum, ep->addr_index, + // ep->dev_name, ep->subnet, ep->addr, ep->gid, ep->wiremode, + // ep->epid and + // HAL specific ep fields (context, verbs_ep or sockets_ep) + psmi_assert_always(! 
psm3_epid_zero_internal(ep->epid)); _HFI_VDBG("hal_context_open() passed.\n"); @@ -233,7 +193,7 @@ psm3_context_open(const psm2_ep_t ep, long unit_param, long port, long addr_inde "with RCVTHREAD on"); #endif } - _HFI_PRDBG("Opened unit %ld port %ld: EPID=%s %s\n", unit_id, port, + _HFI_PRDBG("Opened unit %ld port %ld: EPID=%s %s\n", unit_param, port, psm3_epid_fmt_internal(ep->epid, 0), psm3_epid_fmt_addr(ep->epid, 1)); goto ret; @@ -242,7 +202,7 @@ psm3_context_open(const psm2_ep_t ep, long unit_param, long port, long addr_inde psmi_hal_close_context(ep); psm3_dec_nic_refcount(ep->unit_id); bail: - _HFI_PRDBG("open failed: unit_id: %ld, err: %d (%s)\n", unit_id, err, strerror(errno)); + _HFI_PRDBG("open failed: unit_id: %ld, err: %d (%s)\n", unit_param, err, strerror(errno)); ret: _HFI_VDBG("psm3_context_open() return %d\n", err); diff --git a/prov/psm3/psm3/psm_ep.c b/prov/psm3/psm3/psm_ep.c index 86dfa9a88d0..055d8ca11d2 100644 --- a/prov/psm3/psm3/psm_ep.c +++ b/prov/psm3/psm3/psm_ep.c @@ -678,9 +678,9 @@ psm3_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, ep->hfi_num_send_rdma = 0; #endif #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU ep->rv_gpu_cache_size = 0; -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #endif /* PSM_HAVE_RNDV_MOD */ /* See how many iterations we want to spin before yielding */ @@ -747,10 +747,7 @@ psm3_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, if (! mq->ep) // only call on 1st EP within MQ psm3_mq_initstats(mq, ep->epid); -#ifdef PSM_CUDA - if (PSMI_IS_GPU_ENABLED) - verify_device_support_unified_addr(); -#endif + PSM3_GPU_VERIFY_CAPABILITIES(); _HFI_VDBG("start ptl device init...\n"); if (psm3_ep_device_is_enabled(ep, PTL_DEVID_SELF)) { @@ -827,15 +824,7 @@ psm3_ep_open(psm2_uuid_t const unique_job_key, return PSM2_TOO_MANY_ENDPOINTS; } -#if defined(PSM_ONEAPI) - /* Make sure ze_context and command queue/list are available. - * They could be destroyed when there is no more endpoints. - * If another endpoint is created after that, the code here can - * recreate the context, command queue and list. - */ - if (PSMI_IS_GPU_ENABLED && !cur_ze_dev) - psmi_oneapi_cmd_create_all(); -#endif //PSM_ONEAPI + PSM3_GPU_EP_OPEN(); /* Matched Queue initialization. We do this early because we have to * make sure ep->mq exists and is valid before calling ips_do_work. 
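
The PSM3_GPU_INITIALIZE(), PSM3_GPU_VERIFY_CAPABILITIES(), PSM3_GPU_EP_OPEN() and PSM3_GPU_EP_CLOSE() calls introduced above replace the per-vendor PSM_CUDA/PSM_ONEAPI blocks with a single GPU HAL entry point. The sketch below shows one plausible shape for that dispatch layer; the struct and variable names (psm3_gpu_hal_ops, psm3_gpu_hal) are illustrative assumptions only, not the provider's actual definitions, which live in psm3/gpu/psm_gpu_hal.h and are not part of this excerpt.

    /* sketch only: a vtable selected at psm3_init() time (CUDA, Level Zero,
     * or none), so call sites need no PSM_CUDA/PSM_ONEAPI conditionals */
    #include "psm2.h"    /* psm2_error_t, PSM2_OK */

    struct psm3_gpu_hal_ops {
        psm2_error_t (*initialize)(void);
        void (*finalize)(void);
        void (*ep_open)(void);
        void (*ep_close)(void);
        void (*verify_capabilities)(void);
    };

    /* NULL (or a no-op table) when no GPU stack is enabled */
    extern struct psm3_gpu_hal_ops *psm3_gpu_hal;

    #define PSM3_GPU_INITIALIZE() \
        (psm3_gpu_hal ? psm3_gpu_hal->initialize() : PSM2_OK)
    #define PSM3_GPU_EP_OPEN() \
        do { if (psm3_gpu_hal) psm3_gpu_hal->ep_open(); } while (0)
    #define PSM3_GPU_EP_CLOSE() \
        do { if (psm3_gpu_hal) psm3_gpu_hal->ep_close(); } while (0)
    #define PSM3_GPU_VERIFY_CAPABILITIES() \
        do { if (psm3_gpu_hal) psm3_gpu_hal->verify_capabilities(); } while (0)

With this shape, the ep_open/ep_close paths shown in the hunks above stay identical for CUDA, Level Zero, and non-GPU builds; only the table behind psm3_gpu_hal differs.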
@@ -869,11 +858,12 @@ psm3_ep_open(psm2_uuid_t const unique_job_key, opts.addr_index = multirail_config.addr_indexes[0]; } } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // if HAL doesn't support GDR Copy, it may disable Gdr Copy - // by zeroing is_gdr_copy_enabled, gdr_copy_limit_send, and - // gdr_copy_limit_recv during gdr_open - if (PSMI_IS_GDR_COPY_ENABLED) + // by zeroing psm3_gpu_is_gdr_copy_enabled, + // psm3_gpu_gdr_copy_limit_send, and + // psm3_gpu_gdr_copy_limit_recv during gdr_open + if (PSM3_GPU_IS_GDR_COPY_ENABLED) psmi_hal_gdr_open(); #endif @@ -982,12 +972,8 @@ psm3_ep_open(psm2_uuid_t const unique_job_key, fail: fflush(stdout); PSMI_UNLOCK(psm3_creation_lock); -#if defined(PSM_ONEAPI) - if (PSMI_IS_GPU_ENABLED && psm3_opened_endpoint_count == 0) { - psmi_oneapi_putqueue_free(); - psmi_oneapi_cmd_destroy_all(); - } -#endif //PSM_ONEAPI + if (psm3_opened_endpoint_count == 0) + PSM3_GPU_EP_CLOSE(); PSM2_LOG_MSG("leaving"); return err; } @@ -1005,14 +991,14 @@ psm2_error_t psm3_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in) } #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* * The close on the gdr fd needs to be called before the * close on the hfi fd as the the gdr device will hold * reference count on the hfi device which will make the close * on the hfi fd return without actually closing the fd. */ - if (PSMI_IS_GDR_COPY_ENABLED) + if (PSM3_GPU_IS_GDR_COPY_ENABLED) psmi_hal_gdr_close(); #endif union psmi_envvar_val timeout_intval; @@ -1202,17 +1188,8 @@ psm2_error_t psm3_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in) (double)cycles_to_nanosecs(get_cycles() - t_start) / SEC_ULL); } -#if defined(PSM_ONEAPI) - /* - * It would be ideal to destroy the global command list, queue, and - * context in psm3_finalize(). Unfortunately, it will cause segfaults - * in Level-zero library. - */ - if (PSMI_IS_GPU_ENABLED && psm3_opened_endpoint_count == 0) { - psmi_oneapi_putqueue_free(); - psmi_oneapi_cmd_destroy_all(); - } -#endif //PSM_ONEAPI + if (psm3_opened_endpoint_count == 0) + PSM3_GPU_EP_CLOSE(); PSM2_LOG_MSG("leaving"); return err; } @@ -1376,7 +1353,7 @@ int psm3_ep_device_is_enabled(const psm2_ep_t ep, int devid) } #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // used for GdrCopy // given an ep this returns the "next one". @@ -1517,5 +1494,5 @@ int64_t psm3_gpu_evict_some(psm2_ep_t ep, uint64_t length, int access) } return evicted; } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #endif /* PSM_HAVE_RNDV_MOD */ diff --git a/prov/psm3/psm3/psm_ep.h b/prov/psm3/psm3/psm_ep.h index f8376331e32..76b05b6fb80 100644 --- a/prov/psm3/psm3/psm_ep.h +++ b/prov/psm3/psm3/psm_ep.h @@ -96,6 +96,11 @@ node->mctxt_next = node->mctxt_prev = node; \ node->mctxt_master = NULL +#define PSM_EP_FOR_EACH_MCTXT(root, iter) \ + for ( struct psm2_ep *iter = (root)->mctxt_master \ + ; iter \ + ; iter = iter->mctxt_next == iter->mctxt_master ? 
NULL : iter->mctxt_next) + struct psm2_ep { psm2_epid_t epid; /**> This endpoint's Endpoint ID */ psm2_epaddr_t epaddr; /**> This ep's ep address */ @@ -108,6 +113,9 @@ struct psm2_ep { struct psm3_sockets_ep sockets_ep; #endif }; +#ifdef PSM_HAVE_GPU + union psm2_ep_gpu_specific gpu_specific; +#endif /* unit_id and portnum are set to 0 when ptl_ips not enabled */ int unit_id; @@ -136,7 +144,7 @@ struct psm2_ep { #ifdef PSM_HAVE_RNDV_MOD psm3_rv_t rv; // rendezvous module open handle uint32_t rv_mr_cache_size; /** PSM3_RV_MR_CACHE_SIZE */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint32_t rv_gpu_cache_size; /** PSM3_RV_GPU_CACHE_SIZE */ #endif #endif /* PSM_HAVE_RNDV_MOD */ @@ -144,14 +152,6 @@ struct psm2_ep { uint32_t hfi_num_descriptors;/** Number of allocated scb descriptors*/ #ifdef PSM_HAVE_REG_MR uint32_t hfi_num_send_rdma;/** Number of concurrent RDMA*/ -#endif -#ifdef PSM_ONEAPI -#ifndef PSM_HAVE_PIDFD - // TBD - move to ptl_am - int ze_ipc_socket; // AF_UNIX listener sock to recv GPU Dev FDs - char *listen_sockname; // /dev/shm filename for ze_ipc_socket - int need_dev_fds_poll; // are there outstanding dev_fds to be polled -#endif #endif uint8_t wiremode; /* EPID protocol specific basic modes * For RoCE/IB reflects @@ -275,7 +275,7 @@ struct psm2_epaddr { int psm3_ep_device_is_enabled(const psm2_ep_t ep, int devid); #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU extern int64_t psm3_gpu_evict_some(psm2_ep_t ep, uint64_t length, int access); #endif #endif diff --git a/prov/psm3/psm3/psm_help.h b/prov/psm3/psm3/psm_help.h index a3908ba2563..5c738b95e2d 100644 --- a/prov/psm3/psm3/psm_help.h +++ b/prov/psm3/psm3/psm_help.h @@ -150,6 +150,12 @@ (((uint32_t)(val)) & (~((uint32_t)(align)-1))) #endif +/* round down 64-bit value to align, align must be a power of 2 */ +#ifndef ROUNDDOWN64P2 +#define ROUNDDOWN64P2(val, align) \ + (((uint64_t)(val)) & (~((uint64_t)(align)-1))) +#endif + /* round down value to align, align can be any value, less efficient than ROUNDDOWNP2 */ #ifndef ROUNDDOWN #define ROUNDDOWN(val, align) \ @@ -165,12 +171,20 @@ /* how many entries are in a statically allocated table */ #define PSMI_HOWMANY(table) (sizeof(table)/sizeof(table[0])) - +// cycles (e.g. 
rdtsc) to time conversions #define SEC_ULL 1000000000ULL #define MSEC_ULL 1000000ULL #define USEC_ULL 1000ULL #define NSEC_ULL 1ULL +// time units conversions +#define NSEC_PER_SEC 1000000000 +#define NSEC_PER_MSEC 1000000 +#define NSEC_PER_USEC 1000 +#define USEC_PER_SEC 1000000 +#define USEC_PER_MSEC 1000 +#define MSEC_PER_SEC 1000 + #define PSMI_TRUE 1 #define PSMI_FALSE 0 diff --git a/prov/psm3/psm3/psm_mpool.c b/prov/psm3/psm3/psm_mpool.c index 6bf33b7d74a..6472752f5b7 100644 --- a/prov/psm3/psm3/psm_mpool.c +++ b/prov/psm3/psm3/psm_mpool.c @@ -99,7 +99,7 @@ struct mpool { non_empty_callback_fn_t mp_non_empty_cb; void *mp_non_empty_cb_context; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU alloc_dealloc_callback_fn_t mp_alloc_dealloc_cb; void *mp_alloc_dealloc_cb_context; #endif @@ -232,7 +232,7 @@ MOCKABLE(psm3_mpool_create)(size_t obj_size, uint32_t num_obj_per_chunk, } MOCK_DEF_EPILOGUE(psm3_mpool_create); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU mpool_t psm3_mpool_create_for_gpu(size_t obj_size, uint32_t num_obj_per_chunk, uint32_t num_obj_max_total, int flags, @@ -259,7 +259,7 @@ psm3_mpool_create_for_gpu(size_t obj_size, uint32_t num_obj_per_chunk, return mp; } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ /** * psm3_mpool_get() @@ -413,7 +413,7 @@ void *psm3_mpool_find_obj_by_index(mpool_t mp, int index) return (void *)((uintptr_t) me + sizeof(struct mpool_element)); } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /** * psmi_mpool_chunk_dealloc() * memory pool @@ -430,7 +430,7 @@ void psmi_mpool_chunk_dealloc(mpool_t mp, int idx) j * mp->mp_elm_size + sizeof(struct mpool_element))); } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ /** * psm3_mpool_destroy() @@ -447,7 +447,7 @@ void psm3_mpool_destroy(mpool_t mp) for (i = 0; i < mp->mp_elm_vector_size; i++) { if (mp->mp_elm_vector[i]) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (mp->mp_alloc_dealloc_cb) psmi_mpool_chunk_dealloc(mp, i); #endif @@ -494,7 +494,7 @@ static int psmi_mpool_allocate_chunk(mpool_t mp) if (num_to_allocate == 0) return PSM2_NO_MEMORY; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (mp->mp_alloc_dealloc_cb) chunk = psmi_calloc(PSMI_EP_NONE, mp->mp_memtype, num_to_allocate, mp->mp_elm_size); @@ -504,7 +504,7 @@ static int psmi_mpool_allocate_chunk(mpool_t mp) #else chunk = psmi_malloc(PSMI_EP_NONE, mp->mp_memtype, num_to_allocate * mp->mp_elm_size); -#endif /* PSM_CUDA || PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ if (chunk == NULL) { fprintf(stderr, "Failed to allocate memory for memory pool chunk: %s\n", @@ -513,13 +513,13 @@ static int psmi_mpool_allocate_chunk(mpool_t mp) } for (i = 0; i < num_to_allocate; i++) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (mp->mp_alloc_dealloc_cb) mp->mp_alloc_dealloc_cb(1 /* is alloc */, mp->mp_alloc_dealloc_cb_context, (void *)((uintptr_t)chunk + i * mp->mp_elm_size + sizeof(struct mpool_element))); -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ elm = (struct mpool_element *)((uintptr_t) chunk + i * mp->mp_elm_size + mp->mp_elm_offset); diff --git a/prov/psm3/psm3/psm_mpool.h b/prov/psm3/psm3/psm_mpool.h index 81655e81dc1..69038fff930 100644 --- a/prov/psm3/psm3/psm_mpool.h +++ b/prov/psm3/psm3/psm_mpool.h @@ -80,7 +80,7 @@ MOCKABLE(psm3_mpool_create)(size_t obj_size, uint32_t num_obj_per_chunk, non_empty_callback_fn_t cb, void *context); MOCK_DCL_EPILOGUE(psm3_mpool_create); 
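
The psm_mpool.c hunks above route every element of a GPU-enabled pool through mp_alloc_dealloc_cb when a chunk is allocated or torn down. A caller-side sketch of that hook follows; the callback shape (is_alloc flag, opaque context, pointer to the element payload) is inferred from the calls visible in psmi_mpool_allocate_chunk() and psmi_mpool_chunk_dealloc(), while the body and the void return type are illustrative assumptions; the real typedef (alloc_dealloc_callback_fn_t) is in psm_mpool.h, outside this excerpt.

    /* sketch: a per-element hook passed to psm3_mpool_create_for_gpu(),
     * typically used to register/unregister the element's bounce buffer
     * with the GPU runtime so later copies can take the fast path */
    static void example_gpu_elem_cb(int is_alloc, void *context, void *obj)
    {
        (void)context;
        if (is_alloc) {
            /* called once per element right after the chunk is
             * allocated; obj points just past struct mpool_element */
        } else {
            /* called per element from psmi_mpool_chunk_dealloc()
             * before the chunk memory is freed */
        }
        (void)obj;
    }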
-#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU mpool_t psm3_mpool_create_for_gpu(size_t obj_size, uint32_t num_obj_per_chunk, uint32_t num_obj_max_total, int flags, psmi_memtype_t statstype, diff --git a/prov/psm3/psm3/psm_mq.c b/prov/psm3/psm3/psm_mq.c index 4248ff7d28d..e617e44ab52 100644 --- a/prov/psm3/psm3/psm_mq.c +++ b/prov/psm3/psm3/psm_mq.c @@ -968,12 +968,12 @@ psm3_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len) if (req->req_data.buf != NULL) { /* 0-byte messages don't alloc a sysbuf */ msglen = mq_set_msglen(req, len, req->req_data.send_msglen); psm3_mq_recv_copy(mq, req, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU req->is_buf_gpu_mem, #endif buf, len, msglen); psm3_mq_sysbuf_free(mq, req->req_data.buf); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU } else { mq->stats.rx_sysbuf_cpu_num++; #endif @@ -990,7 +990,7 @@ psm3_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len) */ req->recv_msgoff = min(req->recv_msgoff, msglen); psm3_mq_recv_copy(mq, req, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU req->is_buf_gpu_mem, #endif buf, len, req->recv_msgoff); @@ -1009,7 +1009,7 @@ psm3_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len) req->recv_msgoff = min(req->recv_msgoff, msglen); if (req->send_msgoff) { // only have sysbuf if RTS w/payload psm3_mq_recv_copy(mq, req, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU req->is_buf_gpu_mem, #endif buf, len, req->recv_msgoff); @@ -1061,12 +1061,12 @@ psm3_mq_fp_msg(psm2_ep_t ep, psm2_mq_t mq, psm2_epaddr_t addr, psm2_mq_tag_t *ta psm2_mq_req_t recv_req; int table; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU int gpu_mem = 0; void *gpu_user_buffer = NULL; - if (len && PSMI_IS_GPU_ENABLED && PSMI_IS_GPU_MEM(buf)) { - PSM3_MARK_BUF_SYNCHRONOUS(buf); + if (len && PSM3_IS_GPU_MEM(buf)) { + PSM3_GPU_MARK_BUF_SYNCHRONOUS(buf); gpu_mem = 1; gpu_user_buffer = buf; @@ -1094,7 +1094,7 @@ psm3_mq_fp_msg(psm2_ep_t ep, psm2_mq_t mq, psm2_epaddr_t addr, psm2_mq_tag_t *ta recv_req->recv_msgoff = 0; recv_req->req_data.context = context; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU recv_req->is_buf_gpu_mem = gpu_mem; recv_req->user_gpu_buffer = gpu_user_buffer; #endif @@ -1110,7 +1110,7 @@ psm3_mq_fp_msg(psm2_ep_t ep, psm2_mq_t mq, psm2_epaddr_t addr, psm2_mq_tag_t *ta tag->tag[0], tag->tag[1], tag->tag[2], tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], recv_req); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU recv_req->is_buf_gpu_mem = gpu_mem; recv_req->user_gpu_buffer = gpu_user_buffer; #endif @@ -1141,11 +1141,11 @@ psm3_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_req_t req; int table; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU int gpu_mem = 0; - if (len && PSMI_IS_GPU_ENABLED && PSMI_IS_GPU_MEM(buf)) { - PSM3_MARK_BUF_SYNCHRONOUS(buf); + if (len && PSM3_IS_GPU_MEM(buf)) { + PSM3_GPU_MARK_BUF_SYNCHRONOUS(buf); gpu_mem = 1; } @@ -1177,7 +1177,7 @@ psm3_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src, req->recv_msgoff = 0; req->req_data.context = context; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU req->is_buf_gpu_mem = gpu_mem; if (gpu_mem) req->user_gpu_buffer = buf; @@ -1195,7 +1195,7 @@ psm3_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src, " tagsel=%08x.%08x.%08x req=%p\n", buf, len, tag->tag[0], tag->tag[1], tag->tag[2], tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], req); 
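
In the psm_mq.c hunks above, the removed call sites test PSMI_IS_GPU_ENABLED && PSMI_IS_GPU_MEM(buf) while the replacements test only PSM3_IS_GPU_MEM(buf), so the new macro must fold the "GPU enabled" check into the pointer query. A minimal sketch of that idea follows, reusing the assumed psm3_gpu_hal table from the earlier note (extended here with hypothetical is_gpu_mem and mark_buf_synchronous entries); the real definitions are in psm3/gpu/psm_gpu_hal.h, not shown in this patch excerpt.

    /* sketch: one query per call site, safely false when no GPU is enabled */
    #define PSM3_IS_GPU_MEM(buf) \
        (psm3_gpu_hal && psm3_gpu_hal->is_gpu_mem(buf))
    #define PSM3_GPU_MARK_BUF_SYNCHRONOUS(buf) \
        do { if (psm3_gpu_hal) psm3_gpu_hal->mark_buf_synchronous(buf); } while (0)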
-#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU req->is_buf_gpu_mem = gpu_mem; if (gpu_mem) req->user_gpu_buffer = buf; @@ -1262,9 +1262,9 @@ psm3_mq_imrecv(psm2_mq_t mq, uint32_t flags, void *buf, uint32_t len, user's buffer. */ req->req_data.context = context; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (len && PSMI_IS_GPU_ENABLED && PSMI_IS_GPU_MEM(buf)) { - PSM3_MARK_BUF_SYNCHRONOUS(buf); +#ifdef PSM_HAVE_GPU + if (len && PSM3_IS_GPU_MEM(buf)) { + PSM3_GPU_MARK_BUF_SYNCHRONOUS(buf); req->is_buf_gpu_mem = 1; req->user_gpu_buffer = buf; } else { @@ -1445,7 +1445,7 @@ psm2_error_t psm3_mqopt_ctl(psm2_mq_t mq, uint32_t key, void *value, int get) _HFI_VDBG("RNDV_SHM_SZ = %d (%s)\n", mq->shm_thresh_rv, get ? "GET" : "SET"); break; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU case PSM2_MQ_GPU_RNDV_SHM_SZ: if (get) *((uint32_t *) value) = mq->shm_gpu_thresh_rv; @@ -1735,7 +1735,7 @@ uint32_t psm3_mq_max_window_rv(psm2_mq_t mq, int gpu) // must do search since window_rv may not be increasing (but usually is) uint32_t ret = 0; struct psm3_mq_window_rv_entry *e; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (gpu) e = mq->ips_gpu_window_rv; else @@ -1750,16 +1750,16 @@ uint32_t psm3_mq_max_window_rv(psm2_mq_t mq, int gpu) uint32_t psm3_mq_get_window_rv(psm2_mq_req_t req) { if (! req->window_rv) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (req->is_buf_gpu_mem) { req->window_rv = search_window( req->mq->ips_gpu_window_rv, req->req_data.send_msglen); } else -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ req->window_rv = search_window(req->mq->ips_cpu_window_rv, req->req_data.send_msglen); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU _HFI_VDBG("Selected Window of %u for %u byte %s msg\n", req->window_rv, req->req_data.send_msglen, @@ -2053,7 +2053,7 @@ static uint64_t shm_dsa_avg_copy_size_recv(void *context) } #endif /* PSM_DSA */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static uint64_t gpu_ipc_hit_rate(void *context) { psm2_mq_t mq = (psm2_mq_t)context; @@ -2071,7 +2071,7 @@ static uint64_t gpu_ipc_miss_rate(void *context) else return 0; } -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ static uint64_t self_avg_msg_size_sent(void *context) @@ -2083,7 +2083,7 @@ static uint64_t self_avg_msg_size_sent(void *context) return 0; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static uint64_t eager_cpu_avg_msg_size_sent(void *context) { psm2_mq_t mq = (psm2_mq_t)context; @@ -2133,7 +2133,7 @@ static uint64_t sysbuf_cuCopy_avg_size_recv(void *context) else return 0; } -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ psm2_error_t psm3_mq_initstats(psm2_mq_t mq, psm2_epid_t epid) { @@ -2393,7 +2393,7 @@ psm2_error_t psm3_mq_initstats(psm2_mq_t mq, psm2_epid_t epid) "Total DSA receive copiess which failured for non-page fault error", &mq->stats.dsa_stats[1].dsa_error), #endif /* PSM_DSA */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // ------------------------------------------------------------ PSMI_STATS_DECL_HELP("Intra-node GPU messages may use GPU IPC Handles " "to perform GPU to GPU rendezvous messages directly to and from " @@ -2438,7 +2438,7 @@ psm2_error_t psm3_mq_initstats(psm2_mq_t mq, psm2_epid_t epid) PSMI_STATS_DECLU64("gpu_ipc_clear", "Number of times entire cache was cleared and reset due to error", 
&mq->stats.gpu_ipc_cache_clear), -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ // ------------------------------------------------------------ PSMI_STATS_DECL_HELP("The PSM3 self protocol is used in the " @@ -2454,7 +2454,7 @@ psm2_error_t psm3_mq_initstats(psm2_mq_t mq, psm2_epid_t epid) "Average message size sent using PSM3 self protocol", self_avg_msg_size_sent), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // ------------------------------------------------------------ PSMI_STATS_DECL_HELP("Eager messages may be sent from GPU or " "CPU application buffers.\n" @@ -2514,7 +2514,7 @@ psm2_error_t psm3_mq_initstats(psm2_mq_t mq, psm2_epid_t epid) PSMI_STATS_DECL_FUNC("sysbuf_cuCopy_avg_size_recv", "Average gpuCopy size from a receive bounce buffer to a GPU buffer", sysbuf_cuCopy_avg_size_recv), -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ }; return psm3_stats_register_type("MPI_Statistics_Summary", @@ -2577,8 +2577,8 @@ psm2_error_t psm3_mq_malloc(psm2_mq_t *mqo) // shm_thresh_rv is N/A to NIC and HAL, so we set this here and let // HAL set the rest of the defaults mq->shm_thresh_rv = PSM3_MQ_RNDV_SHM_THRESH; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - mq->shm_gpu_thresh_rv = PSM3_MQ_RNDV_SHM_GPU_THRESH; +#ifdef PSM_HAVE_GPU + mq->shm_gpu_thresh_rv = psm3_gpu_mq_rndv_shm_gpu_thresh_default; #endif psmi_hal_mq_init_defaults(mq); @@ -2604,7 +2604,7 @@ psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq) { union psmi_envvar_val env_hfitiny, env_rvwin, env_hfirv, env_shmrv, env_hash, env_stats; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU union psmi_envvar_val env_shmgpurv; #endif @@ -2651,8 +2651,8 @@ psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq) // already checked, shouldn't get parse errors nor empty strings psmi_assert(0); } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (PSMI_IS_GPU_ENABLED && mq->ips_gpu_window_rv_str) { +#ifdef PSM_HAVE_GPU + if (mq->ips_gpu_window_rv_str) { union psmi_envvar_val env_gpurvwin; char *env; @@ -2679,7 +2679,7 @@ psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq) } #else (void)got_depwin; // keep compiler happy -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ } psm3_getenv("PSM3_MQ_RNDV_SHM_THRESH", @@ -2688,8 +2688,8 @@ psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq) (union psmi_envvar_val)mq->shm_thresh_rv, &env_shmrv); mq->shm_thresh_rv = env_shmrv.e_uint; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (PSMI_IS_GPU_ENABLED) { +#ifdef PSM_HAVE_GPU + if (PSM3_GPU_IS_ENABLED) { psm3_getenv("PSM3_MQ_RNDV_SHM_GPU_THRESH", "shm eager-to-rendezvous switchover for GPU send", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, @@ -2729,7 +2729,7 @@ psm2_error_t MOCKABLE(psm3_mq_free)(psm2_mq_t mq) psm3_mq_req_fini(mq); psm3_mq_sysbuf_fini(mq); psm3_stats_deregister_type(PSMI_STATSTYPE_MQ, mq); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU psmi_free(mq->ips_gpu_window_rv); #endif psmi_free(mq->ips_cpu_window_rv); diff --git a/prov/psm3/psm3/psm_mq_internal.h b/prov/psm3/psm3/psm_mq_internal.h index 824dc1ad60a..64bc05a2288 100644 --- a/prov/psm3/psm3/psm_mq_internal.h +++ b/prov/psm3/psm3/psm_mq_internal.h @@ -180,12 +180,12 @@ struct psm2_mq { uint32_t hfi_thresh_tiny; uint32_t rndv_nic_thresh; uint32_t shm_thresh_rv; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint32_t shm_gpu_thresh_rv; #endif const char *ips_cpu_window_rv_str; // default input to parser struct psm3_mq_window_rv_entry 
*ips_cpu_window_rv; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU const char *ips_gpu_window_rv_str; // default input to parser struct psm3_mq_window_rv_entry *ips_gpu_window_rv; #endif @@ -330,31 +330,10 @@ struct psm2_mq_req { psm3_verbs_mr_t mr; // local registered memory for app buffer #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint8_t* user_gpu_buffer; /* for recv */ STAILQ_HEAD(sendreq_spec_, ips_gpu_hostbuf) sendreq_prefetch; uint32_t prefetch_send_msgoff; -#endif -#ifdef PSM_CUDA - CUipcMemHandle cuda_ipc_handle; - uint8_t cuda_ipc_handle_attached; - uint32_t cuda_ipc_offset; -#endif -#ifdef PSM_ONEAPI - union { - ze_ipc_mem_handle_t ipc_handle; // for sender req - uint32_t ze_handle; // receiver req pidfd or gem_handle - }; - uint8_t ze_handle_attached; - uint8_t ze_alloc_type; - uint32_t ze_ipc_offset; -#ifndef PSM_HAVE_PIDFD - uint32_t ze_device_index; -#endif - uint64_t ze_alloc_id; -#endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - int gpu_hostbuf_used; /* * is_sendbuf_gpu_mem - Used to always select TID path on the receiver * when send is on a device buffer @@ -365,6 +344,10 @@ struct psm2_mq_req { * on a device/host buffer. */ uint8_t is_buf_gpu_mem; + uint16_t pad; // ensure fields below are 64 bit aligned + // GPU specific fields for use in PSM3 shm GPU IPC + union psm2_mq_req_gpu_specific gpu_specific; + int gpu_hostbuf_used; #endif /* PTLs get to store their own per-request data. MQ manages the allocation @@ -547,8 +530,8 @@ PSMI_ALWAYS_INLINE( void mq_copy_tiny(uint32_t *dest, uint32_t *src, uint8_t len)) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (len && PSMI_IS_GPU_ENABLED && (PSMI_IS_GPU_MEM(dest) || PSMI_IS_GPU_MEM(src))) { +#ifdef PSM_HAVE_GPU + if (len && (PSM3_IS_GPU_MEM(dest) || PSM3_IS_GPU_MEM(src))) { PSM3_GPU_MEMCPY(dest, src, len); return; } @@ -587,7 +570,7 @@ mq_copy_tiny(uint32_t *dest, uint32_t *src, uint8_t len)) typedef void (*psmi_mtucpy_fn_t)(void *dest, const void *src, uint32_t len); typedef void (*psmi_copy_tiny_fn_t)(uint32_t *dest, uint32_t *src, uint8_t len); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_ALWAYS_INLINE( void @@ -781,7 +764,7 @@ MOCK_DCL_EPILOGUE(psm3_mq_free); void psm3_mq_handle_rts_complete(psm2_mq_req_t req); int psm3_mq_handle_data(psm2_mq_t mq, psm2_mq_req_t req, uint32_t offset, const void *payload, uint32_t paylen -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , int use_gdrcopy, psm2_ep_t ep #endif ); @@ -804,7 +787,7 @@ int psm3_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t req); // can get future cache hits on other size messages in same buffer // not needed - msglen - negotiated total message size // copysz - actual amount to copy (<= msglen) -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU void psm3_mq_recv_copy(psm2_mq_t mq, psm2_mq_req_t req, uint8_t is_buf_gpu_mem, void *buf, uint32_t len, uint32_t copysz); #else diff --git a/prov/psm3/psm3/psm_mq_recv.c b/prov/psm3/psm3/psm_mq_recv.c index 181d4dd5ba7..f8ea86a5fa6 100644 --- a/prov/psm3/psm3/psm_mq_recv.c +++ b/prov/psm3/psm3/psm_mq_recv.c @@ -98,7 +98,7 @@ void psm3_mq_handle_rts_complete(psm2_mq_req_t req) return; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* * Copy a packet from host buffer to a gpu buffer. 
* @@ -170,12 +170,12 @@ psm3_mq_req_gpu_copy(uint64_t gpu_buf_start, uint32_t gpu_buf_len, pkt_len = len; } } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ static void psm3_mq_req_copy(psm2_mq_req_t req, uint32_t offset, const void *buf, uint32_t nbytes -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , int use_gdrcopy, psm2_ep_t ep #endif ) @@ -198,7 +198,7 @@ psm3_mq_req_copy(psm2_mq_req_t req, msglen_this = nbytes; } if (msgptr != buf) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // for loopback HAL, invalid to call psm3_mq_get_window_rv() // however, for loopback HAL, gdr copy is disabled if (use_gdrcopy) @@ -227,7 +227,7 @@ psm3_mq_req_copy(psm2_mq_req_t req, int psm3_mq_handle_data(psm2_mq_t mq, psm2_mq_req_t req, uint32_t offset, const void *buf, uint32_t nbytes -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , int use_gdrcopy, psm2_ep_t ep #endif ) @@ -245,7 +245,7 @@ psm3_mq_handle_data(psm2_mq_t mq, psm2_mq_req_t req, rc = MQ_RET_UNEXP_OK; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU psm3_mq_req_copy(req, offset, buf, nbytes, use_gdrcopy, ep); #else psm3_mq_req_copy(req, offset, buf, nbytes); @@ -416,7 +416,7 @@ psm3_mq_handle_rts(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, if (paylen) { // payload of RTS can contain a single packet synchronous MPI msg psm3_mq_mtucpy(req->req_data.buf, payload, paylen); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (req->is_buf_gpu_mem) { stats->rndv_rts_cuCopy_recv++; stats->rndv_rts_cuCopy_recv_bytes += paylen; @@ -474,7 +474,7 @@ psm3_mq_handle_rts(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, if (paylen) { req->req_data.buf = psm3_mq_sysbuf_alloc(mq, paylen); psmi_assert(paylen == 0 || req->req_data.buf != NULL); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU psm3_mq_mtucpy_host_mem(req->req_data.buf, payload, paylen); #else psm3_mq_mtucpy(req->req_data.buf, payload, paylen); @@ -521,9 +521,9 @@ psm3_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, psm2_mq_req_t req; uint32_t msglen; psmi_mtucpy_fn_t psmi_mtucpy_fn; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU int use_gdrcopy = 0; -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ psm2_mq_tag_t *tag = (psm2_mq_tag_t *)_tag; if (msgorder && (req = psm3_mq_req_match(mq, src, tag, 1))) { @@ -543,7 +543,7 @@ psm3_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, switch (opcode) { case MQ_MSG_TINY: /* mq_copy_tiny() can handle zero byte */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (!req->is_buf_gpu_mem) { mq_copy_tiny_host_mem((uint32_t *) user_buffer, (uint32_t *) payload, msglen); stats->tiny_cpu_recv++; @@ -561,7 +561,7 @@ psm3_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, user_buffer = req->req_data.buf; #endif mq_copy_tiny((uint32_t *) user_buffer, (uint32_t *) payload, msglen); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU stats->tiny_cuCopy_recv++; stats->tiny_cuCopy_recv_bytes += msglen; } @@ -577,7 +577,7 @@ psm3_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, case MQ_MSG_SHORT: /* message fits in 1 payload */ psmi_mtucpy_fn = psm3_mq_mtucpy; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (!req->is_buf_gpu_mem) { psmi_mtucpy_fn = psm3_mq_mtucpy_host_mem; stats->short_cpu_recv++; @@ -589,15 +589,10 @@ psm3_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t 
src, uint32_t *_tag, (unsigned long)req->req_data.buf, msglen, 1, mq->ep))) { psmi_mtucpy_fn = psm3_mq_mtucpy_host_mem; -#ifdef PSM_ONEAPI - use_gdrcopy = 1; -#endif stats->short_gdrcopy_recv++; stats->short_gdrcopy_recv_bytes += msglen; } else { user_buffer = req->req_data.buf; -#endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) stats->short_cuCopy_recv++; stats->short_cuCopy_recv_bytes += msglen; } @@ -635,7 +630,7 @@ psm3_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, _HFI_VDBG("exp MSG_EAGER of length %d bytes pay=%d\n", msglen, paylen); // !offset -> only count recv msgs on 1st pkt in msg -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (!req->is_buf_gpu_mem) { if (!offset) stats->eager_cpu_recv++; stats->eager_cpu_recv_bytes += paylen; @@ -655,7 +650,7 @@ psm3_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, #endif if (paylen > 0) psm3_mq_handle_data(mq, req, offset, payload, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU paylen, use_gdrcopy, mq->ep); #else paylen); @@ -721,7 +716,7 @@ psm3_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, if (msglen > 0) { req->req_data.buf = psm3_mq_sysbuf_alloc(mq, msglen); psmi_assert(msglen == 0 || req->req_data.buf != NULL); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU mq_copy_tiny_host_mem((uint32_t *) req->req_data.buf, (uint32_t *) payload, msglen); #else @@ -741,14 +736,14 @@ psm3_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, req->req_data.buf = psm3_mq_sysbuf_alloc(mq, msglen); psmi_assert(msglen == 0 || req->req_data.buf != NULL); if (msglen <= paylen) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU psm3_mq_mtucpy_host_mem(req->req_data.buf, payload, msglen); #else psm3_mq_mtucpy(req->req_data.buf, payload, msglen); #endif } else { psmi_assert((msglen & ~0x3) == paylen); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU psm3_mq_mtucpy_host_mem(req->req_data.buf, payload, paylen); #else psm3_mq_mtucpy(req->req_data.buf, payload, paylen); @@ -758,7 +753,7 @@ psm3_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, * copy after the DW payload. */ uint32_t off[] = { offset }; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU mq_copy_tiny_host_mem((uint32_t *)(req->req_data.buf+paylen), (uint32_t *)off, msglen & 0x3); #else @@ -781,7 +776,7 @@ psm3_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, _HFI_VDBG("unexp MSG_EAGER of length %d bytes pay=%d\n", msglen, paylen); if (paylen > 0) -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU psm3_mq_handle_data(mq, req, offset, payload, paylen, 0, NULL); #else psm3_mq_handle_data(mq, req, offset, payload, paylen); @@ -807,7 +802,7 @@ psm3_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, return MQ_RET_UNEXP_OK; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) // declared inline in psm_mq_internal.h for non-CUDA +#ifdef PSM_HAVE_GPU // declared inline in psm_mq_internal.h for non-GPU // perform the actual copy for an psmi_mq_irecv_inner. We copy from a sysbuf // (req->req_data.buf) to the actual user buffer (buf) and keep statistics. // is_buf_gpu_mem indicates if buf is a gpu buffer @@ -826,22 +821,22 @@ void psm3_mq_recv_copy(psm2_mq_t mq, psm2_mq_req_t req, uint8_t is_buf_gpu_mem, return; } if (!is_buf_gpu_mem) { - psmi_assert(! 
PSMI_IS_GPU_ENABLED || !PSMI_IS_GPU_MEM(buf)); + psmi_assert(!PSM3_IS_GPU_MEM(buf)); mq->stats.rx_sysbuf_cpu_num++; mq->stats.rx_sysbuf_cpu_bytes += copysz; psmi_mtucpy_fn = psm3_mq_mtucpy_host_mem; - // len could be huge, so limit ourselves to gdr_copy_limit_recv - // Note to get here copysz <= gdr_copy_limit_recv + // len could be huge, so limit ourselves to psm3_gpu_gdr_copy_limit_recv + // Note to get here copysz <= psm3_gpu_gdr_copy_limit_recv } else if (PSMI_USE_GDR_COPY_RECV(copysz) && NULL != (ubuf = psmi_hal_gdr_convert_gpu_to_host_addr((unsigned long)buf, - min(gdr_copy_limit_recv, len), 1, + min(psm3_gpu_gdr_copy_limit_recv, len), 1, mq->ep))) { - psmi_assert(! PSMI_IS_GPU_ENABLED || PSMI_IS_GPU_MEM(buf)); + psmi_assert(! PSM3_GPU_IS_ENABLED || PSM3_IS_GPU_MEM(buf)); psmi_mtucpy_fn = psm3_mq_mtucpy_host_mem; mq->stats.rx_sysbuf_gdrcopy_num++; mq->stats.rx_sysbuf_gdrcopy_bytes += copysz; } else { - psmi_assert(! PSMI_IS_GPU_ENABLED || PSMI_IS_GPU_MEM(buf)); + psmi_assert(! PSM3_GPU_IS_ENABLED || PSM3_IS_GPU_MEM(buf)); ubuf = buf; mq->stats.rx_sysbuf_cuCopy_num++; mq->stats.rx_sysbuf_cuCopy_bytes += copysz; @@ -849,7 +844,7 @@ void psm3_mq_recv_copy(psm2_mq_t mq, psm2_mq_req_t req, uint8_t is_buf_gpu_mem, if (copysz) psmi_mtucpy_fn(ubuf, (const void *)req->req_data.buf, copysz); } -#endif // defined(PSM_CUDA) || defined(PSM_ONEAPI) +#endif // PSM_HAVE_GPU // we landed an out of order message in a sysbuf and can now process it // ureq is where we landed it. If found, ereq is the user posted receive. @@ -873,13 +868,13 @@ int psm3_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq) case MQ_STATE_COMPLETE: if (ureq->req_data.buf != NULL) { /* 0-byte don't alloc a sysreq_data.buf */ psm3_mq_recv_copy(mq, ureq, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU ereq->is_buf_gpu_mem, #endif ereq->req_data.buf, ereq->req_data.buf_len, msglen); psm3_mq_sysbuf_free(mq, ureq->req_data.buf); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU } else { mq->stats.rx_sysbuf_cpu_num++; // zero length #endif @@ -895,7 +890,7 @@ int psm3_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq) ereq->send_msgoff = ureq->send_msgoff; ereq->recv_msgoff = min(ureq->recv_msgoff, msglen); psm3_mq_recv_copy(mq, ureq, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU ereq->is_buf_gpu_mem, #endif ereq->req_data.buf, @@ -913,7 +908,7 @@ int psm3_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq) ereq->recv_msgoff = min(ureq->recv_msgoff, msglen); if (ereq->send_msgoff) { // only have sysbuf if RTS w/payload psm3_mq_recv_copy(mq, ureq, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU ereq->is_buf_gpu_mem, #endif ereq->req_data.buf, diff --git a/prov/psm3/psm3/psm_nic_select.c b/prov/psm3/psm3/psm_nic_select.c index 58d3ab72b15..cbd8943d457 100644 --- a/prov/psm3/psm3/psm_nic_select.c +++ b/prov/psm3/psm3/psm_nic_select.c @@ -72,6 +72,22 @@ #endif #endif +// PSM3_NIC_SELECTION_ALG choices. +// ALG_NUMA is the default. This option spreads the NIC selection within the +// local CPU socket's NICs (NUMA). +// If it is preferred to spread the job over the entire set of NICs within the +// system, use ALG_ANY. +// For systems with PCIe switches for GPU Direct, GPU_CENTRIC is typically best. +// For GPU systems w/o switches, CPU_CENTRIC may be best.
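As a quick illustration of the comment above -- not part of this patch -- a launcher or test harness could pick one of these algorithms through the environment before the endpoint is opened. The accepted strings ("rr", "p", "rra", "crr" and, when built with PSM_HAVE_GPU_CENTRIC_AFFINITY, "grr") are the ones recognized by parse_selection_alg() introduced later in this file; the helper name and its argument are hypothetical.

#include <stdlib.h>

/* Hypothetical helper: choose the NIC selection algorithm for this job.
 * "grr" maps to PSMI_NIC_SEL_ALG_GPU_CENTRIC (PCIe-switch GPU systems),
 * "crr" to PSMI_NIC_SEL_ALG_CPU_CENTRIC, and the default "rr" to
 * PSMI_NIC_SEL_ALG_NUMA. */
static void example_set_nic_selection(int have_pcie_switches)
{
	setenv("PSM3_NIC_SELECTION_ALG",
	       have_pcie_switches ? "grr" : "crr", 1);
}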
+#define PSMI_NIC_SEL_ALG_NUMA 0 /* Round Robin within NUMA */ +#define PSMI_NIC_SEL_ALG_FIRST 1 /* First Active NIC */ +#define PSMI_NIC_SEL_ALG_ANY 2 /* Round Robin All */ +#define PSMI_NIC_SEL_ALG_CPU_CENTRIC 3 /* Round Robin, prefer CPU distance */ +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY +#define PSMI_NIC_SEL_ALG_GPU_CENTRIC 4 /* Round Robin, prefer GPU distance */ +#endif + + // subnuma is risky right now, so disable and explore in future //#ifdef PSM_USE_HWLOC //#define PSM3_HAVE_CPU_SUBNUMA @@ -152,7 +168,7 @@ psm3_get_uuid_hash(psm2_uuid_t const uuid) int psm3_get_current_proc_location() { - int core_id, node_id; + int core_id, node_id; core_id = sched_getcpu(); if (core_id < 0) @@ -250,66 +266,6 @@ int psm3_get_max_cpu_numa() return max_cpu_numa; } -/* search the list of all units for those which are active - * and optionally match the given NUMA node_id (when node_id >= 0) - * returns the number of active units found. - * Note get_unit_active tests for active ports, valid addresses and - * performs filtering as done in get_port_subnets - */ -static int -hfi_find_active_hfis(int nunits, int node_id, int *saved_hfis) -{ - int found = 0, unit_id; - - for (unit_id = 0; unit_id < nunits; unit_id++) { - int node_id_i; - - if (psmi_hal_get_unit_active(unit_id) <= 0) - continue; - - if (node_id < 0) { - saved_hfis[found++] = unit_id; - _HFI_DBG("RoundRobinAll Found NIC unit= %d, local rank=%d.\n", - unit_id, psm3_get_mylocalrank()); - } else if (!psmi_hal_get_node_id(unit_id, &node_id_i) - && node_id_i == node_id) { - saved_hfis[found++] = unit_id; - _HFI_DBG("RoundRobin Found NIC unit= %d, node = %d, local rank=%d.\n", - unit_id, node_id, psm3_get_mylocalrank()); - } - } - return found; -} - -// select NIC across all NICs, use a hash of job_id and local rank to -// distribute local ranks across NICs and to attempt to distribute -// jobs across NICs. -// TBD - if know never have >1 job per node, could ignore job_id, perhaps -// have an env to exclude job_id from hash so NIC selection is deterministic -static void -psmi_spread_nic_selection(psm2_uuid_t const job_key, long *unit_start, - long *unit_end, int nunits) -{ - int found, saved_hfis[nunits]; - - /* we are going to look at: - (a hash of the job key plus the local rank id) mod nunits. */ - found = hfi_find_active_hfis(nunits, -1, saved_hfis); - if (found) - *unit_start = saved_hfis[((psm3_get_mylocalrank()+1) + - psm3_get_uuid_hash(job_key)) % found]; - else - // none found, caller will fail, start is a don't care - *unit_start = 0; - /* just in case, caller will check all other units, with wrap */ - if (*unit_start > 0) - *unit_end = *unit_start - 1; - else - *unit_end = nunits-1; - _HFI_DBG("RoundRobinAll Will select 1st viable NIC unit= %ld to %ld.\n", - *unit_start, *unit_end); -} - static int psm3_create_and_open_affinity_shm(psm2_uuid_t const job_key) { @@ -411,57 +367,6 @@ psm3_create_and_open_affinity_shm(psm2_uuid_t const job_key) return -1; } -/* - * Spread HFI selection between units if we find more than one within a socket. - */ -static void -psmi_spread_hfi_within_socket(long *unit_start, long *unit_end, int node_id, - int *saved_hfis, int found, psm2_uuid_t const job_key) -{ - int ret, shm_location; - - /* - * Take affinity lock and open shared memory region to be able to - * accurately determine which HFI to pick for this process. If any - * issues, bail by picking first known HFI. 
- */ - if (!psm3_affinity_semaphore_open) - goto spread_hfi_fallback; - - ret = psm3_create_and_open_affinity_shm(job_key); - if (ret < 0) - goto spread_hfi_fallback; - - // one shm entry per CPU NUMA domain - // The entry contains the next round robin NIC to use - // in the form of a index into saved_hfis - // saved_hfis has a list of all the NUMA local active NICs - shm_location = AFFINITY_SHM_HFI_INDEX_LOCATION + node_id; - if (shm_location > PSMI_PAGESIZE) - goto spread_hfi_fallback; - - /* Start critical section to read/write shm object */ - if (psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name)) { - _HFI_VDBG("Could not enter critical section to update NIC index\n"); - goto spread_hfi_fallback; - } - - *unit_start = *unit_end = saved_hfis[psm3_shared_affinity_ptr[shm_location]]; - psm3_shared_affinity_ptr[shm_location] = - (psm3_shared_affinity_ptr[shm_location] + 1) % found; - _HFI_DBG("RoundRobin Selected NIC unit= %ld, Next NIC=%ld, node = %d, local rank=%d, found=%d.\n", - *unit_start, psm3_shared_affinity_ptr[shm_location], node_id, - psm3_get_mylocalrank(), found); - - /* End Critical Section */ - psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); - - return; - -spread_hfi_fallback: - *unit_start = *unit_end = saved_hfis[0]; -} - static void psm3_create_affinity_semaphores(psm2_uuid_t const job_key) { @@ -730,9 +635,9 @@ void nic_info_filter_gpu_distance(struct nic_info *nic_info, unsigned ninfo) unsigned i; int min_distance = INT_MAX; // smallest distance found unsigned found = 0; - struct pci_addr gpu_pci_addr; + struct pci_addr gpu_pci_addr = { 0 }; - if (! PSMI_IS_GPU_ENABLED) + if (! PSM3_GPU_IS_ENABLED) return; psm3_deferred_hwloc_topology_init(); @@ -740,66 +645,9 @@ void nic_info_filter_gpu_distance(struct nic_info *nic_info, unsigned ninfo) return; // hwloc incorrect version psmi_assert(psm3_hwloc_topology_initialized); - // Get current GPU PCIe address to gpu_pci_addr; -#ifdef PSM_CUDA - { - int domain, bus, dev; - int num_devices; - CUdevice device; - - PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices); - _HFI_DBG("%d Cuda GPUs found\n", num_devices); - if (! num_devices) - return; - - if (num_devices == 1) { - PSMI_CUDA_CALL(cuDeviceGet, &device, 0); - } else { - // all GPUs will be visible to process, see if app chose one first - CUcontext ctxt = {0}; - if (! psmi_cuCtxGetCurrent || psmi_cuCtxGetCurrent(&ctxt) || ! ctxt) { - _HFI_DBG("Unable to get Cuda ctxt\n"); - //PSMI_CUDA_CALL(cuDeviceGet, &device, 0); - return; - } else { - PSMI_CUDA_CALL(cuCtxGetDevice, &device); - } - } - _HFI_DBG("Using Cuda GPU %d\n", device); - PSMI_CUDA_CALL(cuDeviceGetAttribute, - &domain, - CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, - device); - PSMI_CUDA_CALL(cuDeviceGetAttribute, - &bus, - CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, - device); - PSMI_CUDA_CALL(cuDeviceGetAttribute, - &dev, - CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, - device); - gpu_pci_addr.domain = domain; - gpu_pci_addr.bus = bus; - gpu_pci_addr.dev = dev; - gpu_pci_addr.func = 0; - } -#elif defined(PSM_ONEAPI) - { - ze_pci_ext_properties_t PciProperties; - - _HFI_DBG("%d Level Zero GPUs found\n", num_ze_devices); - if (! 
num_ze_devices) - return; - - // caling middleware will have limited GPUs visible to process - PSMI_ONEAPI_ZE_CALL(zeDevicePciGetPropertiesExt, - ze_devices[0].dev, &PciProperties); - gpu_pci_addr.domain = PciProperties.address.domain; - gpu_pci_addr.bus = PciProperties.address.bus; - gpu_pci_addr.dev = PciProperties.address.device; - gpu_pci_addr.func = PciProperties.address.function; - } -#endif + // Get current GPU's PCIe address to gpu_pci_addr; + PSM3_GPU_GET_PCI_ADDR( &gpu_pci_addr.domain, &gpu_pci_addr.bus, + &gpu_pci_addr.dev, &gpu_pci_addr.func); _HFI_DBG("GPU PCIe address is %04x:%02x:%02x.%x\n", gpu_pci_addr.domain, gpu_pci_addr.bus, gpu_pci_addr.dev, gpu_pci_addr.func); @@ -847,6 +695,14 @@ void nic_info_filter_gpu_distance(struct nic_info *nic_info, unsigned ninfo) } #endif /* PSM_HAVE_GPU_CENTRIC_AFFINITY */ +// filter down the list of NICs solely based on CPU NUMA locality +static void nic_info_filter_cpu_numa(struct nic_info *nic_info, + unsigned ninfo) +{ + _HFI_DBG("Filtering NICs with CPU NUMA Strategy\n"); + nic_info_filter_numa(nic_info, ninfo); +} + // filter down the list of NICs with a CPU locality focus as priority // if present, the GPU is considered last. If the GPU is NUMA local // to the CPU, the GPU filter can further limit NICs to those close to the @@ -1008,7 +864,7 @@ psm3_open_shm_scoreboard_and_select_nic( goto fallback; } - // balance among procceses within current node + // balance among processes within current node nic_info_filter_refcount(nic_info, ninfo, psm3_shared_affinity_nic_refcount_ptr, nunits, "local node"); @@ -1057,198 +913,58 @@ void psm3_dec_nic_refcount(int unit_id) } } -psm2_error_t -psm3_compute_start_and_end_unit_cpu_centric( - psm2_uuid_t const job_key, - long *unit_start,long *unit_end, int nunits) +static int parse_selection_alg(const char *str) { - unsigned index; - unsigned ninfo; - struct nic_info nic_info[PSMI_MAX_RAILS]; - - // caller will enumerate addr_index, just just get all active ports - ninfo = nic_info_init(nic_info, nunits, 0); - if (! ninfo) { - // should not happen, caller already confirmed there is >1 active unit - // mimic what caller of psm3_compute_start_and_end_unit would do - return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3 no nic units are active"); - } - - nic_info_filter_cpu_centric(nic_info, ninfo); - - index = psm3_open_shm_scoreboard_and_select_nic(nic_info, ninfo, - job_key, nunits); - psmi_assert(index >= 0 && index < ninfo); - - // caller will select 1st active port and an addr_index within unit - *unit_start = *unit_end = nic_info[index].unit; - return PSM2_OK; -} - + if (!strcasecmp(str, "Round Robin") + || !strcasecmp(str, "RoundRobin") + || !strcasecmp(str, "rr")) + return PSMI_NIC_SEL_ALG_NUMA; + else if (!strcasecmp(str, "Packed") + || !strcasecmp(str, "p")) + return PSMI_NIC_SEL_ALG_FIRST; + else if (!strcasecmp(str, "Round Robin All") + || !strcasecmp(str, "RoundRobinAll") + || !strcasecmp(str, "rra")) + return PSMI_NIC_SEL_ALG_ANY; + else if (!strcasecmp(str, "CPU Centric Round Robin") + || !strcasecmp(str, "CpuRoundRobin") + || !strcasecmp(str, "crr")) + return PSMI_NIC_SEL_ALG_CPU_CENTRIC; #ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY -psm2_error_t -psm3_compute_start_and_end_unit_gpu_centric( - psm2_uuid_t const job_key, - long *unit_start,long *unit_end, int nunits) -{ - unsigned index; - unsigned ninfo; - struct nic_info nic_info[PSMI_MAX_RAILS]; - - // caller will enumerate addr_index, just just get all active ports - ninfo = nic_info_init(nic_info, nunits, 0); - if (! 
ninfo) { - // should not happen, caller already confirmed there is >1 active unit - // mimic what caller of psm3_compute_start_and_end_unit would do - return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3 no nic units are active"); - } - - nic_info_filter_gpu_centric(nic_info, ninfo); - - index = psm3_open_shm_scoreboard_and_select_nic(nic_info, ninfo, - job_key, nunits); - psmi_assert(index >= 0 && index < ninfo); - - // caller will select 1st active port and an addr_index within unit - *unit_start = *unit_end = nic_info[index].unit; - return PSM2_OK; + else if (!strcasecmp(str, "GPU Centric Round Robin") + || !strcasecmp(str, "GpuRoundRobin") + || !strcasecmp(str, "grr")) + return PSMI_NIC_SEL_ALG_GPU_CENTRIC; +#endif + else + return -1; } -#endif /* PSM_HAVE_GPU_CENTRIC_AFFINITY */ -// return set of units to consider and which to start at. -// caller will use 1st active unit which can be opened. -// caller will wrap around so it's valid for start >= end -// Note: When using multiple rails per PSM process, higher level code will -// walk through desired units and unit_param will specify a specific unit -// if unit_param is PSM3_NIC_ANY, this will pick starting point for nic search -psm2_error_t -psm3_compute_start_and_end_unit(long unit_param, long addr_index, - int nunitsactive, int nunits, - psm2_uuid_t const job_key, - long *unit_start, long *unit_end) +/* check for valid PSM3_SELECTION_ALG + * returns: + * 0 - valid + * -1 - empty string + * -2 - invalid syntax + */ +static int parse_check_selection_alg(int type, const union psmi_envvar_val val, + void *ptr, size_t errstr_size, char errstr[]) { - unsigned short nic_sel_alg = PSMI_UNIT_SEL_ALG_ACROSS; - int node_id, found = 0; - int saved_hfis[nunits]; - - /* if the user did not set PSM3_NIC then ... */ - if (unit_param == PSM3_NIC_ANY) - { - if (nunitsactive > 1) { - // if NICs are on different planes (non-routed subnets) - // we need to have all ranks default to the same plane - // so force 1st active NIC in that case - int have_subnet = 0, unit_id; - psmi_subnet128_t got_subnet = { }; - for (unit_id = 0; unit_id < nunits; unit_id++) { - psmi_subnet128_t subnet; - if (psmi_hal_get_unit_active(unit_id) <= 0) - continue; - if (0 != psmi_hal_get_port_subnet(unit_id, 1 /* VERBS_PORT*/, - addr_index>0?addr_index:0, - &subnet, NULL, NULL, NULL)) - continue; // can't access NIC - if (! have_subnet) { - have_subnet = 1; - got_subnet = subnet; - } else if (! psm3_subnets_match(got_subnet, - subnet)) { - // active units have different tech - // (IB/OPA vs Eth) or different subnets - // caller will pick 1st active unit - *unit_start = 0; - *unit_end = nunits - 1; - _HFI_DBG("Multi-Plane config: Will select 1st viable NIC unit= %ld to %ld.\n", - *unit_start, *unit_end); - return PSM2_OK; - } - } - } - - /* Get the actual selection algorithm from the environment: */ - nic_sel_alg = psmi_parse_nic_selection_algorithm(); - /* If round-robin is selection algorithm and ... */ - if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS) && - /* there are more than 1 active units then ... */ - (nunitsactive > 1)) - { - /* - * Pick an HFI on same root complex as current task. - * linux IPC ensures balanced NIC usage within job. - * If none found, fall back to - * RoundRobinAll load-balancing algorithm. 
- */ - node_id = psm3_get_current_proc_location(); - if (node_id >= 0) { - found = hfi_find_active_hfis(nunits, node_id, - saved_hfis); - if (found > 1) { - psm3_create_affinity_semaphores(job_key); - psmi_spread_hfi_within_socket(unit_start, unit_end, - node_id, saved_hfis, - found, job_key); - } else if (found == 1) { - *unit_start = *unit_end = saved_hfis[0]; - _HFI_DBG("RoundRobin Selected NIC unit= %ld, node = %d, local rank=%d, found=%d.\n", - *unit_start, node_id, - psm3_get_mylocalrank(), found); - } - } - - if (node_id < 0 || !found) { - _HFI_DBG("RoundRobin No local NIC found, using RoundRobinAll, node = %d, local rank=%d, found=%d.\n", - node_id, - psm3_get_mylocalrank(), found); - psmi_spread_nic_selection(job_key, unit_start, - unit_end, nunits); - } - } else if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS_ALL) && - (nunitsactive > 1)) { - psmi_spread_nic_selection(job_key, unit_start, - unit_end, nunits); - } else if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_CPU_CENTRIC) && - (nunitsactive > 1)) { - return psm3_compute_start_and_end_unit_cpu_centric(job_key, - unit_start, unit_end, nunits); -#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY - } else if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_GPU_CENTRIC) && - (nunitsactive > 1)) { - return psm3_compute_start_and_end_unit_gpu_centric(job_key, - unit_start, unit_end, nunits); -#endif - } else { // PSMI_UNIT_SEL_ALG_WITHIN or only 1 active unit - // caller will pick 1st active unit - *unit_start = 0; - *unit_end = nunits - 1; - _HFI_DBG("%s: Will select 1st viable NIC unit= %ld to %ld.\n", - (nic_sel_alg == PSMI_UNIT_SEL_ALG_WITHIN) - ?"Packed":"Only 1 viable NIC", - *unit_start, *unit_end); - } - } else if (unit_param >= 0) { - /* the user specified PSM3_NIC, we use it. */ - *unit_start = *unit_end = unit_param; - _HFI_DBG("Caller selected NIC %ld.\n", *unit_start); - } else { - psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3 can't open unit: %ld for reading and writing", - unit_param); - return PSM2_EP_DEVICE_FAILURE; - } - - return PSM2_OK; + psmi_assert(type == PSMI_ENVVAR_TYPE_STR); + if (! val.e_str || ! *val.e_str) + return -1; + if (parse_selection_alg(val.e_str) < 0) + return -2; + return 0; } static int psmi_parse_nic_selection_algorithm(void) { union psmi_envvar_val env_nic_alg; - int nic_alg = PSMI_UNIT_SEL_ALG_ACROSS; + int nic_alg; const char* PSM3_NIC_SELECTION_ALG_HELP = - "NIC Device Selection Algorithm to use. Round Robin[RoundRobin or rr] (Default) " + "Round Robin[RoundRobin or rr] (Default)" ", Packed[p], Round Robin All[RoundRobinAll or rra]," #ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY " CPU Centric Round Robin [CpuRoundRobin or crr]" @@ -1257,43 +973,106 @@ int psmi_parse_nic_selection_algorithm(void) " or CPU Centric Round Robin [CpuRoundRobin or crr]"; #endif + psm3_getenv_range("PSM3_NIC_SELECTION_ALG", + "NIC Device Selection Algorithm", + PSM3_NIC_SELECTION_ALG_HELP, + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)"rr", + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + parse_check_selection_alg, NULL, &env_nic_alg); - /* If a specific unit is set in the environment, use that one. 
*/ - psm3_getenv("PSM3_NIC_SELECTION_ALG", PSM3_NIC_SELECTION_ALG_HELP, - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, - (union psmi_envvar_val)"rr", &env_nic_alg); - - if (!strcasecmp(env_nic_alg.e_str, "Round Robin") - || !strcasecmp(env_nic_alg.e_str, "RoundRobin") - || !strcasecmp(env_nic_alg.e_str, "rr")) - nic_alg = PSMI_UNIT_SEL_ALG_ACROSS; - else if (!strcasecmp(env_nic_alg.e_str, "Packed") - || !strcasecmp(env_nic_alg.e_str, "p")) - nic_alg = PSMI_UNIT_SEL_ALG_WITHIN; - else if (!strcasecmp(env_nic_alg.e_str, "Round Robin All") - || !strcasecmp(env_nic_alg.e_str, "RoundRobinAll") - || !strcasecmp(env_nic_alg.e_str, "rra")) - nic_alg = PSMI_UNIT_SEL_ALG_ACROSS_ALL; - else if (!strcasecmp(env_nic_alg.e_str, "CPU Centric Round Robin") - || !strcasecmp(env_nic_alg.e_str, "CpuRoundRobin") - || !strcasecmp(env_nic_alg.e_str, "crr")) - nic_alg = PSMI_UNIT_SEL_ALG_CPU_CENTRIC; + nic_alg = parse_selection_alg(env_nic_alg.e_str); + psmi_assert(nic_alg >= 0); + return nic_alg; +} + +// Autoselect one unit for non-multirail operation. +// caller will select 1st active port and an addr_index within unit +// returns the unit number or -1 if unable to find an active unit +int +psm3_autoselect_one(long addr_index, int nunits, psm2_uuid_t const job_key) +{ + unsigned short nic_sel_alg; + unsigned first_active = nunits; // invalid value. for error check + int have_subnet = 0, unit_id; + psmi_subnet128_t got_subnet = { }; + unsigned ninfo; + struct nic_info nic_info[PSMI_MAX_RAILS]; + unsigned index; + int nunitsactive = 0; + + // find first_active, also if NICs are on different planes + // (non-routed subnets) we need to have all ranks default to the + // same plane so force 1st active NIC in that case + for (unit_id = 0; unit_id < nunits; unit_id++) { + psmi_subnet128_t subnet; + if (psmi_hal_get_unit_active(unit_id) <= 0) + continue; + if (0 != psmi_hal_get_port_subnet(unit_id, 1 /* VERBS_PORT*/, + addr_index>0?addr_index:0, + &subnet, NULL, NULL, NULL)) + continue; // can't access NIC + // found an active viable NIC + nunitsactive++; + if (! have_subnet) { + have_subnet = 1; + got_subnet = subnet; + first_active = unit_id; + } else if (! psm3_subnets_match(got_subnet, subnet)) { + // Active units have different tech (IB/OPA vs Eth) + // or different subnets. + // Use 1st active unit so all ranks in job can communicate + _HFI_DBG("Multi-Plane config: Using 1st viable NIC unit= %u.\n", + first_active); + return first_active; + } + } + if (nunitsactive == 0) + return -1; + + nic_sel_alg = psmi_parse_nic_selection_algorithm(); + + if (nunitsactive <= 1 || nic_sel_alg == PSMI_NIC_SEL_ALG_FIRST) { + // pick 1st active unit + _HFI_DBG("%s: Selected 1st viable NIC unit= %u.\n", + (nic_sel_alg == PSMI_NIC_SEL_ALG_FIRST) + ?"Packed":"Only 1 viable NIC", + first_active); + return first_active; + } + + ninfo = nic_info_init(nic_info, nunits, 0); + if (! 
ninfo) { + // should not happen, already confirmed there is >1 active unit + return -1; + } + switch (nic_sel_alg) { + default: + case PSMI_NIC_SEL_ALG_NUMA: /* round-robin is selection algorithm */ + nic_info_filter_cpu_numa(nic_info, ninfo); + break; + case PSMI_NIC_SEL_ALG_ANY: + // we will use any active unit + _HFI_DBG("No further NIC filtering\n"); + break; + case PSMI_NIC_SEL_ALG_CPU_CENTRIC: + nic_info_filter_cpu_centric(nic_info, ninfo); + break; #ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY - else if (!strcasecmp(env_nic_alg.e_str, "GPU Centric Round Robin") - || !strcasecmp(env_nic_alg.e_str, "GpuRoundRobin") - || !strcasecmp(env_nic_alg.e_str, "grr")) - nic_alg = PSMI_UNIT_SEL_ALG_GPU_CENTRIC; + case PSMI_NIC_SEL_ALG_GPU_CENTRIC: + nic_info_filter_gpu_centric(nic_info, ninfo); + break; #endif - else { - _HFI_INFO( - "Invalid value for PSM3_NIC_SELECTION_ALG ('%s') %-40s Using: %s\n", - env_nic_alg.e_str, PSM3_NIC_SELECTION_ALG_HELP, "RoundRobin"); - nic_alg = PSMI_UNIT_SEL_ALG_ACROSS; } - return nic_alg; + index = psm3_open_shm_scoreboard_and_select_nic(nic_info, ninfo, + job_key, nunits); + psmi_assert(index >= 0 && index < ninfo); + + return nic_info[index].unit; } + /* parse a list of NIC rails for PSM3_MULTIRAIL_MAP * map is in format: unit:port-addr_index,unit:port-addr_index,...;unit.... * where :port is optional (default of 1) and unit can be name or number @@ -1590,109 +1369,95 @@ psm3_copy_nic_info_to_multitrail_config( } } -// select a list of NICs to use, optimizing for CPU locality first +// Multirail enabled, see if PSM3_MULTIRAIL_MAP is selecting NICs +// for PSM3_MULTIRAIL=1 or 2, PSM3_MULTIRAIL_MAP can explicitly select NICs. +// returns: +// PSM2_OK - PSM3_MULTIRAIL_MAP specified and valid, multirail_config updated +// PSM2_EP_NO_DEVICE - PSM3_MULTIRAIL_MAP not specified or invalid static psm2_error_t -psm3_ep_multirail_autoselect_cpu_centric(uint32_t nunits, +psm3_ep_multirail_map(int multirail_mode, struct multirail_config *multirail_config) { - unsigned ninfo; - struct nic_info nic_info[PSMI_MAX_RAILS]; + int ret; + union psmi_envvar_val env_multirail_map; + int map_index; - // enumerate addr_index too - ninfo = nic_info_init(nic_info, nunits, 1); - if (! ninfo) { - // caller will try single NIC selection next - multirail_config->num_rails = 0; - return PSM2_OK; + // PSM3_MUTLIRAIL_MAP only allowed for PSM3_MULTIRAIL=1 or 2 + // We treat invalid input, such as bad syntax or selection of an unusable + // port (down/missing/etc), as a fatal error instead of attempting to run + // on the default PSM3_MULTIRAIL_MAP config. This helps avoid + // inconsistent NIC selections, especially for down ports, which may + // cause confusing behaviors or errors. + // If PSM3_MULTIRAIL_MAP contains multiple lists of NICs, then + // if PSM3_MULTIRAIL=1 - use local rank index (0, ...) to select + // if PSM3_MULTIRAIL=2 - use process NUMA (0, ...) 
to select + if (multirail_mode == 1) { + map_index = psm3_get_mylocalrank(); + } else if (multirail_mode == 2) { + map_index = psm3_get_current_proc_location(); + if (map_index < 0) { + return psm3_handle_error(PSMI_EP_NORETURN, + PSM2_EP_DEVICE_FAILURE, + "Unable to get NUMA location of current process\n"); + } + } else { + return PSM2_EP_NO_DEVICE; // caller will ignore MULTIRAIL_MAP } - - nic_info_filter_cpu_centric(nic_info, ninfo); - - // we will use all unfiltered units - - // ensure psm3_context_set_affinity doesn't unnecessarily narrow CPU - // selection, it will be called per rail and if rails are in - // different CPU NUMA could have an undesired impact - setenv("PSM3_NO_AFFINITY", "1", 1); - - psm3_copy_nic_info_to_multitrail_config(nic_info, ninfo, multirail_config); - return PSM2_OK; -} - -#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY -// select a list of NICs to use, optimizing for GPU locality first -static psm2_error_t -psm3_ep_multirail_autoselect_gpu_centric(uint32_t nunits, - struct multirail_config *multirail_config) -{ - unsigned ninfo; - struct nic_info nic_info[PSMI_MAX_RAILS]; - - // enumerate addr_index too - ninfo = nic_info_init(nic_info, nunits, 1); - if (! ninfo) { - // caller will try single NIC selection next + ret = psm3_getenv_range("PSM3_MULTIRAIL_MAP", + "Explicit NIC selections for each rail", + "Specified as:\n" + " rail,rail,...;rail,rail,...\n" +#if 0 + "Where rail can be: unit:port-addr_index or unit\n" +#else + "Where rail can be: unit-addr_index or unit\n" +#endif + "unit can be device name or unit number\n" +#if 0 + "where :port is optional (default of 1)\n" +#endif + "addr_index can be 0 to PSM3_ADDR_PER_NIC-1, or 'any' or 'all'\n" + "When addr_index is omitted, it defaults to 'all'\n" + "When more than 1 set of rails is present (each set is separated by ;),\n" + "the set to use for a given process is selected based on PSM3_MULTIRAIL.\n" + " 1 - use local rank number to select\n" + " 2 - use local CPU NUMA to select\n" + "When empty, PSM3 will autoselect NICs as controlled by PSM3_MULTIRAIL.", + PSMI_ENVVAR_LEVEL_USER|PSMI_ENVVAR_FLAG_FATAL, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)"", + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + psm3_parse_check_multirail_map, &map_index, &env_multirail_map); + if (ret < 0) { // syntax error in input, ret error instead of using default + psmi_assert(0); // should not get here since specified FLAG_FATAL multirail_config->num_rails = 0; - return PSM2_OK; + return psm3_handle_error(PSMI_EP_NORETURN, + PSM2_EP_DEVICE_FAILURE, + "Invalid value for PSM3_MULTIRAIL_MAP: '%s', can't proceed\n", + env_multirail_map.e_str); } - - nic_info_filter_gpu_centric(nic_info, ninfo); - - // we will use all unfiltered units - - // ensure psm3_context_set_affinity doesn't unnecessarily narrow CPU - // selection, it will be called per rail and if rails are in - // different CPU NUMA could have an undesired impact - setenv("PSM3_NO_AFFINITY", "1", 1); - - psm3_copy_nic_info_to_multitrail_config(nic_info, ninfo, multirail_config); - return PSM2_OK; -} -#endif /* PSM_HAVE_GPU_CENTRIC_AFFINITY */ - -// for use in psm3_ep_multirail_autoselect so can sort rails -// by subnet and addr_index -struct rail_info { - psmi_subnet128_t subnet; - unsigned unit; - unsigned port; - unsigned addr_index; -}; - -static int cmpfunc(const void *p1, const void *p2) -{ - struct rail_info *a = ((struct rail_info *) p1); - struct rail_info *b = ((struct rail_info *) p2); - int ret; - - ret = psmi_subnet128_cmp(a->subnet, b->subnet); - if (ret == 
0) { - if (a->addr_index < b->addr_index) - return -1; - else if (a->addr_index > b->addr_index) - return 1; + if (! ret) { + // valid input + if (psm3_parse_multirail_map(env_multirail_map.e_str, map_index, 0, NULL, + multirail_config) < 0) { + // already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); + } + return PSM2_OK; } - return ret; + return PSM2_EP_NO_DEVICE; } // Multirail enabled, autoselect one or more NICs for this process -// multirail_mode is PSM3_MULTIRAIL selection (1=all NICs, 2=NUMA local NICs) +// multirail_mode is PSM3_MULTIRAIL selection +// (1=all NICs, 2=NUMA local NICs, 3=cpu centric, 4=gpu centric) static psm2_error_t psm3_ep_multirail_autoselect(int multirail_mode, struct multirail_config *multirail_config) { uint32_t num_units = 0; - psmi_subnet128_t subnet; - unsigned i, j, k, count = 0; - int ret; psm2_error_t err = PSM2_OK; - struct rail_info rail_info[PSMI_MAX_RAILS]; - int multirail_within_socket_used = 0; - int node_id = -1, found = 0; - - if (multirail_mode == 2) - multirail_within_socket_used = 1; - + unsigned ninfo; + struct nic_info nic_info[PSMI_MAX_RAILS]; if ((err = psm3_ep_num_devunits(&num_units))) { return err; @@ -1705,87 +1470,41 @@ psm3_ep_multirail_autoselect(int multirail_mode, num_units = PSMI_MAX_RAILS; } - if (multirail_mode == 3) - return psm3_ep_multirail_autoselect_cpu_centric(num_units, multirail_config); -#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY - if (multirail_mode == 4) - return psm3_ep_multirail_autoselect_gpu_centric(num_units, multirail_config); -#endif - - /* - * PSM3_MULTIRAIL=2 functionality- - * - Try to find at least find one NIC in the same root - * complex. If none found, continue to run and - * use remaining NIC in the system. - * - If we do find at least one NIC in same root complex, we - * go ahead and add to list. - */ - if (multirail_within_socket_used) { - node_id = psm3_get_current_proc_location(); - for (i = 0; i < num_units; i++) { - if (psmi_hal_get_unit_active(i) <= 0) - continue; - int node_id_i; - - if (!psmi_hal_get_node_id(i, &node_id_i)) { - if (node_id_i == node_id) { - found = 1; - break; - } - } - } + // enumerate addr_index too + ninfo = nic_info_init(nic_info, num_units, 1); + if (! ninfo) { + // caller will try single NIC selection next + multirail_config->num_rails = 0; + return PSM2_OK; } -/* - * Get all the ports and addr_index with a valid lid and gid, one port per unit. - * but up to PSM3_ADDR_PER_NIC addresses. 
If we are using the NUMA selection - * algorithm and found at list 1 NUMA local NIC above, limit the list to NUMA - * local NICs, otherwise list all NICs - */ - for (i = 0; i < num_units; i++) { - int node_id_i; - - if (!psmi_hal_get_node_id(i, &node_id_i)) - { - if (multirail_within_socket_used && - found && (node_id_i != node_id)) - continue; - } - - for (j = PSM3_NIC_MIN_PORT; j <= PSM3_NIC_MAX_PORT; j++) { - int got_port = 0; - for (k = 0; k < psm3_addr_per_nic; k++) { - ret = psmi_hal_get_port_lid(i, j, k); - if (ret <= 0) - continue; - ret = psmi_hal_get_port_subnet(i, j, k, &subnet, NULL, NULL, NULL); - if (ret == -1) - continue; - rail_info[count].subnet = subnet; - rail_info[count].unit = i; - rail_info[count].port = j; - rail_info[count].addr_index = k; - got_port = 1; - count++; - } - if (got_port) // one port per unit - break; - } + switch (multirail_mode) { + default: + case 1: + // we will use all active units + _HFI_DBG("No further NIC filtering\n"); + break; + case 2: + nic_info_filter_cpu_numa(nic_info, ninfo); + break; + case 3: + nic_info_filter_cpu_centric(nic_info, ninfo); + break; +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY + case 4: + nic_info_filter_gpu_centric(nic_info, ninfo); + break; +#endif /* PSM_HAVE_GPU_CENTRIC_AFFINITY */ } -/* - * Sort all the ports within rail_info from small to big. - * This is for multiple fabrics, and we use fabric with the - * smallest subnet to make the master connection. - */ - qsort(rail_info, count, sizeof(rail_info[0]), cmpfunc); + // we will use all unfiltered units - for (i = 0; i < count; i++) { - multirail_config->units[i] = rail_info[i].unit; - multirail_config->ports[i] = rail_info[i].port; - multirail_config->addr_indexes[i] = rail_info[i].addr_index; - } - multirail_config->num_rails = count; + // ensure psm3_context_set_affinity doesn't unnecessarily narrow CPU + // selection, it will be called per rail and if rails are in + // different CPU NUMA could have an undesired impact + setenv("PSM3_NO_AFFINITY", "1", 1); + + psm3_copy_nic_info_to_multitrail_config(nic_info, ninfo, multirail_config); return PSM2_OK; } @@ -1793,7 +1512,8 @@ psm3_ep_multirail_autoselect(int multirail_mode, // list of unit/port/addr_index in multirail_config. // When multirail_config->num_rails is returned as 0, multirail is not enabled // and other mechanisms (PSM3_NIC, PSM3_NIC_SELECTION_ALG) must be -// used by the caller to select a single NIC for the process. +// used by the caller to select a single NIC for the process +// via psm3_autoselect_one(). // This can return num_rails==1 if exactly 1 NIC is to be used by this process // or num_rails>1 if this process is to stripe data across multiple NICs // in which case the 1st NIC in multirail_config should be used as the @@ -1801,10 +1521,7 @@ psm3_ep_multirail_autoselect(int multirail_mode, psm2_error_t psm3_ep_multirail(struct multirail_config *multirail_config) { - int ret; union psmi_envvar_val env_multirail; - union psmi_envvar_val env_multirail_map; - int map_index; psm3_getenv_range("PSM3_MULTIRAIL", "Control use of multiple NICs", @@ -1863,71 +1580,9 @@ psm3_ep_multirail(struct multirail_config *multirail_config) return PSM2_OK; } - if (env_multirail.e_int == 1 || env_multirail.e_int == 2) { - // TBD - move this code to a separate function - // for PSM3_MULTIRAIL=1 or 2, PSM3_MULTIRAIL_MAP can explicitly select NICs. 
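To tie the two new entry points together, the fragment below sketches the caller-side flow described in the comment above: try psm3_ep_multirail() first and, when it reports num_rails==0, fall back to single-NIC selection via psm3_autoselect_one(). This is illustrative only; the real call sites live elsewhere (e.g. the endpoint open path), and the helper name and its arguments are assumed. Likewise, a map such as PSM3_MULTIRAIL_MAP="irdma0-0,irdma1-0;irdma2-0,irdma3-0" (hypothetical device names) would bypass autoselection entirely per the help text above.

/* Sketch of the selection flow under the contract documented above. */
static psm2_error_t example_pick_nics(long addr_index, int nunits,
			psm2_uuid_t const job_key, struct multirail_config *mr)
{
	psm2_error_t err = psm3_ep_multirail(mr);
	if (err != PSM2_OK)
		return err;
	if (mr->num_rails == 0) {
		/* multirail not enabled: select a single NIC for this process */
		int unit = psm3_autoselect_one(addr_index, nunits, job_key);
		if (unit < 0)
			return PSM2_EP_DEVICE_FAILURE;
		/* caller then opens 'unit' on its first active port */
	} else {
		/* caller opens mr->units[0..num_rails-1] and stripes across them */
	}
	return PSM2_OK;
}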
- // We treat invalid input, such as bad syntax or selection of an unusable - // port (down/missing/etc), as a fatal error instead of attempting to run - // on the default PSM3_MULTIRAIL_MAP config. This helps avoid - // inconsistent NIC selections, especially for down ports, which may - // cause confusing behaviors or errors. - // If PSM3_MULTIRAIL_MAP contains multiple lists of NICs, then - // if PSM3_MULTIRAIL=1 - use local rank index (0, ...) to select - // if PSM3_MULTIRAIL=2 - use process NUMA (0, ...) to select - if (env_multirail.e_int == 1) { - map_index = psm3_get_mylocalrank(); - } else if (env_multirail.e_int == 2) { - map_index = psm3_get_current_proc_location(); - if (map_index < 0) { - return psm3_handle_error(PSMI_EP_NORETURN, - PSM2_EP_DEVICE_FAILURE, - "Unable to get NUMA location of current process\n"); - } - } else { - psmi_assert(0); - } - ret = psm3_getenv_range("PSM3_MULTIRAIL_MAP", - "Explicit NIC selections for each rail", - "Specified as:\n" - " rail,rail,...;rail,rail,...\n" -#if 0 - "Where rail can be: unit:port-addr_index or unit\n" -#else - "Where rail can be: unit-addr_index or unit\n" -#endif - "unit can be device name or unit number\n" -#if 0 - "where :port is optional (default of 1)\n" -#endif - "addr_index can be 0 to PSM3_ADDR_PER_NIC-1, or 'any' or 'all'\n" - "When addr_index is omitted, it defaults to 'all'\n" - "When more than 1 set of rails is present (each set is separated by ;),\n" - "the set to use for a given process is selected based on PSM3_MULTIRAIL.\n" - " 1 - use local rank number to select\n" - " 2 - use local CPU NUMA to select\n" - "When empty, PSM3 will autoselect NICs as controlled by PSM3_MULTIRAIL.", - PSMI_ENVVAR_LEVEL_USER|PSMI_ENVVAR_FLAG_FATAL, PSMI_ENVVAR_TYPE_STR, - (union psmi_envvar_val)"", - (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, - psm3_parse_check_multirail_map, &map_index, &env_multirail_map); - if (ret < 0) { // syntax error in input, ret error instead of using default - psmi_assert(0); // should not get here since specified FLAG_FATAL - multirail_config->num_rails = 0; - return psm3_handle_error(PSMI_EP_NORETURN, - PSM2_EP_DEVICE_FAILURE, - "Invalid value for PSM3_MULTIRAIL_MAP: '%s', can't proceed\n", - env_multirail_map.e_str); - } - if (! ret) { - // valid input - if (psm3_parse_multirail_map(env_multirail_map.e_str, map_index, 0, NULL, - multirail_config) < 0) { - // already checked, shouldn't get parse errors nor empty strings - psmi_assert(0); - } - return PSM2_OK; - } - } + // see if PSM3_MULTIRAIL_MAP is manually selecting NICs + if (psm3_ep_multirail_map(env_multirail.e_int, multirail_config) == PSM2_OK) + return PSM2_OK; // multirail enabled, automatically select 1 or more NICs return psm3_ep_multirail_autoselect(env_multirail.e_int, multirail_config); diff --git a/prov/psm3/psm3/psm_nic_select.h b/prov/psm3/psm3/psm_nic_select.h index cfd23ea1081..c69b52b0e83 100644 --- a/prov/psm3/psm3/psm_nic_select.h +++ b/prov/psm3/psm3/psm_nic_select.h @@ -60,29 +60,6 @@ #ifndef _PSM_NIC_SELECT_H #define _PSM_NIC_SELECT_H -// PSM3_NIC_SELECTION_ALG choices -/* - * round robin contexts across HFIs, then - * ports; this is the default. - * This option spreads the HFI selection within the local socket. - * If it is preferred to spread job over over entire set of - * HFIs within the system, see ALG_ACROSS_ALL below. 
- */ -#define PSMI_UNIT_SEL_ALG_ACROSS PSM_HAL_ALG_ACROSS - -#define PSMI_UNIT_SEL_ALG_ACROSS_ALL PSM_HAL_ALG_ACROSS_ALL - -/* - * use all contexts on an HFI (round robin - * active ports within), then next HFI - */ -#define PSMI_UNIT_SEL_ALG_WITHIN PSM_HAL_ALG_WITHIN - -#define PSMI_UNIT_SEL_ALG_CPU_CENTRIC PSM_HAL_ALG_CPU_CENTRIC -#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY -#define PSMI_UNIT_SEL_ALG_GPU_CENTRIC PSM_HAL_ALG_GPU_CENTRIC -#endif - struct multirail_config { int num_rails; uint32_t units[PSMI_MAX_RAILS]; @@ -90,18 +67,15 @@ struct multirail_config { int addr_indexes[PSMI_MAX_RAILS]; }; -// return set of units to consider and which to start at. -// caller will use 1st active unit which can be opened. -// caller will wrap around so it's valid for start >= end -// Note: When using multiple rails per PSM process, higher level code will -// walk through desired units and unit_param will specify a specific unit -// if unit_param is PSM3_NIC_ANY, this will pick starting point for nic search -psm2_error_t -psm3_compute_start_and_end_unit(long unit_param, long addr_index, - int nunitsactive,int nunits, - psm2_uuid_t const job_key, - long *unit_start,long *unit_end); +// Autoselect one unit for non-multirail operation. +// caller will select 1st active port and an addr_index within unit +// returns the unit number or -1 if unable to find an active unit +int +psm3_autoselect_one(long addr_index, int nunits, psm2_uuid_t const job_key); +// determine if PSM3_MULTIRAIL is enabled, and if so select the rails +// and place the list in multirail_config. If multirail is not enabled +// multirail_config.num_rails will be set to 0 psm2_error_t psm3_ep_multirail(struct multirail_config *multirail_config); diff --git a/prov/psm3/psm3/psm_oneapi_ze.c b/prov/psm3/psm3/psm_oneapi_ze.c deleted file mode 100644 index 2090fb68326..00000000000 --- a/prov/psm3/psm3/psm_oneapi_ze.c +++ /dev/null @@ -1,1040 +0,0 @@ -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2021 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2021 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ -#ifdef PSM_ONEAPI -#include -#include -#include -#include -#include -#include -#include -#include -#include "psm_user.h" -#include "psm_mq_internal.h" -#include "ptl_am/psm_am_internal.h" -#include "psmi_wrappers.h" - -#ifndef PSM_HAVE_PIDFD -static int psm3_ze_dev_fds[MAX_ZE_DEVICES]; -int psm3_num_ze_dev_fds; -#endif -int psm3_oneapi_immed_sync_copy; -int psm3_oneapi_immed_async_copy; -unsigned psm3_oneapi_parallel_dtod_copy_thresh; - -const char* psmi_oneapi_ze_result_to_string(const ze_result_t result) { -#define ZE_RESULT_CASE(RES) case ZE_RESULT_##RES: return STRINGIFY(RES) - - switch (result) { - ZE_RESULT_CASE(SUCCESS); - ZE_RESULT_CASE(NOT_READY); - ZE_RESULT_CASE(ERROR_UNINITIALIZED); - ZE_RESULT_CASE(ERROR_DEVICE_LOST); - ZE_RESULT_CASE(ERROR_INVALID_ARGUMENT); - ZE_RESULT_CASE(ERROR_OUT_OF_HOST_MEMORY); - ZE_RESULT_CASE(ERROR_OUT_OF_DEVICE_MEMORY); - ZE_RESULT_CASE(ERROR_MODULE_BUILD_FAILURE); - ZE_RESULT_CASE(ERROR_INSUFFICIENT_PERMISSIONS); - ZE_RESULT_CASE(ERROR_NOT_AVAILABLE); - ZE_RESULT_CASE(ERROR_UNSUPPORTED_VERSION); - ZE_RESULT_CASE(ERROR_UNSUPPORTED_FEATURE); - ZE_RESULT_CASE(ERROR_INVALID_NULL_HANDLE); - ZE_RESULT_CASE(ERROR_HANDLE_OBJECT_IN_USE); - ZE_RESULT_CASE(ERROR_INVALID_NULL_POINTER); - ZE_RESULT_CASE(ERROR_INVALID_SIZE); - ZE_RESULT_CASE(ERROR_UNSUPPORTED_SIZE); - ZE_RESULT_CASE(ERROR_UNSUPPORTED_ALIGNMENT); - ZE_RESULT_CASE(ERROR_INVALID_SYNCHRONIZATION_OBJECT); - ZE_RESULT_CASE(ERROR_INVALID_ENUMERATION); - ZE_RESULT_CASE(ERROR_UNSUPPORTED_ENUMERATION); - ZE_RESULT_CASE(ERROR_UNSUPPORTED_IMAGE_FORMAT); - ZE_RESULT_CASE(ERROR_INVALID_NATIVE_BINARY); - ZE_RESULT_CASE(ERROR_INVALID_GLOBAL_NAME); - ZE_RESULT_CASE(ERROR_INVALID_KERNEL_NAME); - ZE_RESULT_CASE(ERROR_INVALID_FUNCTION_NAME); - ZE_RESULT_CASE(ERROR_INVALID_GROUP_SIZE_DIMENSION); - ZE_RESULT_CASE(ERROR_INVALID_GLOBAL_WIDTH_DIMENSION); - ZE_RESULT_CASE(ERROR_INVALID_KERNEL_ARGUMENT_INDEX); - ZE_RESULT_CASE(ERROR_INVALID_KERNEL_ARGUMENT_SIZE); - ZE_RESULT_CASE(ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE); - ZE_RESULT_CASE(ERROR_INVALID_COMMAND_LIST_TYPE); - ZE_RESULT_CASE(ERROR_OVERLAPPING_REGIONS); - ZE_RESULT_CASE(ERROR_UNKNOWN); - default: - return "Unknown error"; - } - -#undef ZE_RESULT_CASE -} - -// when allocating bounce buffers either malloc w/Import or -// zeMemAllocHost can be used. zeMemAllocHost tends to perform -// better in the subsequent GPU copy's AppendMemoryCopy. However -// zeMemAllocHost results in a GPU-like address which requires dmabuf -// so we can't use zeMemAllocHost for DMA to/from the bounce buffer -// unless rv is available to handle GPU addresses (eg. 
PSM3_GPUDIRECT=1) - -void *psm3_oneapi_ze_host_alloc_malloc(unsigned size) -{ - void *ret_ptr = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size); -#ifndef PSM3_NO_ONEAPI_IMPORT - PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, ze_driver, ret_ptr, size); -#endif - return ret_ptr; -} - -void psm3_oneapi_ze_host_free_malloc(void *ptr) -{ -#ifndef PSM3_NO_ONEAPI_IMPORT - PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, ptr); -#endif - psmi_free(ptr); -} - -#ifndef PSM3_USE_ONEAPI_MALLOC -void *psm3_oneapi_ze_host_alloc_zemem(unsigned size) -{ - void *ret_ptr; - ze_host_mem_alloc_desc_t host_desc = { - .stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, - .flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW - }; - PSMI_ONEAPI_ZE_CALL(zeMemAllocHost, ze_context, - &host_desc, size, 8, &ret_ptr); - return ret_ptr; -} - -void psm3_oneapi_ze_host_free_zemem(void *ptr) -{ - PSMI_ONEAPI_ZE_CALL(zeMemFree, ze_context, ptr); -} - -void *(*psm3_oneapi_ze_host_alloc)(unsigned size) = psm3_oneapi_ze_host_alloc_malloc; -void (*psm3_oneapi_ze_host_free)(void *ptr) = psm3_oneapi_ze_host_free_malloc; -int psm3_oneapi_ze_using_zemem_alloc = 0; -#endif /* PSM3_USE_ONEAPI_MALLOC */ - -// this is only called if GPU Direct is enabled in rv such that -// GDR Copy and/or RDMA MRs can provide GPU-like addresses to rv -void psm3_oneapi_ze_can_use_zemem() -{ -#ifndef PSM3_USE_ONEAPI_MALLOC - psm3_oneapi_ze_host_alloc = psm3_oneapi_ze_host_alloc_zemem; - psm3_oneapi_ze_host_free = psm3_oneapi_ze_host_free_zemem; - psm3_oneapi_ze_using_zemem_alloc = 1; -#endif -} - -// synchronous GPU memcpy -void psmi_oneapi_ze_memcpy(void *dstptr, const void *srcptr, size_t size) -{ - struct ze_dev_ctxt *ctxt; - - psmi_assert(size > 0); - ctxt = psmi_oneapi_dev_ctxt_get(dstptr); - if (!ctxt) { - ctxt = psmi_oneapi_dev_ctxt_get(srcptr); - if (!ctxt) { - _HFI_ERROR("dst %p src %p not GPU buf for copying\n", - dstptr, srcptr); - return; - } - } - if (psm3_oneapi_immed_sync_copy) { - PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl, - dstptr, srcptr, size, NULL, 0, NULL); - } else { - PSMI_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->cl); - PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl, - dstptr, srcptr, size, NULL, 0, NULL); - PSMI_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->cl); - PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->cq, - 1, &ctxt->cl, NULL); - PSMI_ONEAPI_ZE_CALL(zeCommandQueueSynchronize, ctxt->cq, UINT32_MAX); - } -} - -// synchronous GPU memcpy DTOD (xeLink) -void psmi_oneapi_ze_memcpy_DTOD(void *dstptr, const void *srcptr, size_t size) -{ - struct ze_dev_ctxt *ctxt; - - psmi_assert(size > 0); - ctxt = psmi_oneapi_dev_ctxt_get(dstptr); - if (!ctxt) { - _HFI_ERROR("dst %p src %p not GPU buf for copying\n", - dstptr, srcptr); - return; - } - if (size <= psm3_oneapi_parallel_dtod_copy_thresh) { - if (psm3_oneapi_immed_sync_copy) { - PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl, - dstptr, srcptr, size, NULL, 0, NULL); - } else { - PSMI_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->cl); - PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl, - dstptr, srcptr, size, NULL, 0, NULL); - PSMI_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->cl); - PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->cq, - 1, &ctxt->cl, NULL); - PSMI_ONEAPI_ZE_CALL(zeCommandQueueSynchronize, ctxt->cq, UINT32_MAX); - } - } else { - // for large DTOD copies, start 2 parallel commands - // then wait for both - size_t size0 = ROUNDUP64P2(size/2, 64*1024); - size_t size1 = size - size0; - - if 
(psm3_oneapi_immed_sync_copy) { - PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl0, - dstptr, srcptr, size0, ctxt->copy_status0, 0, NULL); - - PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl1, - (void*)((uintptr_t)dstptr+size0), - (void*)((uintptr_t)srcptr+size0), size1, ctxt->copy_status1, - 0, NULL); - } else { - PSMI_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->async_cl0); - PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl0, - dstptr, srcptr, size0, ctxt->copy_status0, 0, NULL); - PSMI_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->async_cl0); - PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->async_cq0, - 1, &ctxt->async_cl0, NULL); - - PSMI_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->async_cl1); - PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl1, - (void*)((uintptr_t)dstptr+size0), - (void*)((uintptr_t)srcptr+size0), size1, ctxt->copy_status1, - 0, NULL); - PSMI_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->async_cl1); - PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->async_cq1, - 1, &ctxt->async_cl1, NULL); - } - // 2nd copy may be slightly smaller so waity for it first so - // can potentially hide its Reset latency while 1st copy completes - PSMI_ONEAPI_ZE_CALL(zeEventHostSynchronize, ctxt->copy_status1, UINT32_MAX); - PSMI_ONEAPI_ZE_CALL(zeEventHostReset, ctxt->copy_status1); - - PSMI_ONEAPI_ZE_CALL(zeEventHostSynchronize, ctxt->copy_status0, UINT32_MAX); - PSMI_ONEAPI_ZE_CALL(zeEventHostReset, ctxt->copy_status0); - } -} - -// for pipelined async GPU memcpy -// *p_cq is left as NULL when psm3_oneapi_immed_async_copy enabled -void psmi_oneapi_async_cmd_create(struct ze_dev_ctxt *ctxt, - ze_command_queue_handle_t *p_cq, ze_command_list_handle_t *p_cl) -{ - psmi_assert(! *p_cl); - if (psm3_oneapi_immed_async_copy) { - ze_command_queue_desc_t cq_desc = { - .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, - .flags = 0, - .mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, - .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL - }; - cq_desc.ordinal = ctxt->ordinal; - cq_desc.index = ctxt->index++; - ctxt->index %= ctxt->num_queues; - PSMI_ONEAPI_ZE_CALL(zeCommandListCreateImmediate, - ze_context, ctxt->dev, &cq_desc, p_cl); - } else { - if (! *p_cq) { - ze_command_queue_desc_t cq_desc = { - .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, - .flags = 0, - .mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, - .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL - }; - cq_desc.ordinal = ctxt->ordinal; - cq_desc.index = ctxt->index++; - ctxt->index %= ctxt->num_queues; - PSMI_ONEAPI_ZE_CALL(zeCommandQueueCreate, - ze_context, ctxt->dev, &cq_desc, p_cq); - } - ze_command_list_desc_t cl_desc = { - .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, - .flags = 0 - }; - cl_desc.commandQueueGroupOrdinal = ctxt->ordinal; - PSMI_ONEAPI_ZE_CALL(zeCommandListCreate, - ze_context, ctxt->dev, &cl_desc, p_cl); - } -} - -#ifndef PSM_HAVE_PIDFD -/* - * psmi_ze_init_fds - initialize the file descriptors (ze_dev_fds) - * - * Open the file descriptors for our GPUs (psm3_ze_dev_fds[]) - * - * The file descriptors are used in intra-node communication to pass to peers - * via socket with sendmsg/recvmsg SCM_RIGHTS message type. 
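For reference, the SCM_RIGHTS handoff described above reduces to the minimal, self-contained sketch below (illustration only, not code from this change; example_send_fd is an invented name). The kernel duplicates the descriptor into the receiving process, which is how each peer ends up with its own usable copy of the GPU dev FDs:

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Send one fd over a connected AF_UNIX socket using an SCM_RIGHTS
 * control message; psmi_sendmsg_fds() does the equivalent for an
 * array of GPU dev FDs with the local epid as the regular payload. */
static int example_send_fd(int sock, int fd)
{
	char payload = 'F';			/* any non-empty payload */
	char ctrl[CMSG_SPACE(sizeof(int))];
	struct iovec iov = { .iov_base = &payload, .iov_len = sizeof(payload) };
	struct msghdr msg;
	struct cmsghdr *cmsg;

	memset(&msg, 0, sizeof(msg));
	memset(ctrl, 0, sizeof(ctrl));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = ctrl;
	msg.msg_controllen = sizeof(ctrl);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;		/* kernel dups the fd into the peer */
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

	return sendmsg(sock, &msg, 0);		/* <0 on error, else bytes sent */
}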
- * - */ - -psm2_error_t psm3_ze_init_fds(void) -{ - const char *dev_dir = "/dev/dri/by-path/"; - const char *suffix = "-render"; - DIR *dir; - struct dirent *ent = NULL; - char dev_name[NAME_MAX]; - int i = 0, ret; - - if (psm3_num_ze_dev_fds) - return PSM2_OK; - - dir = opendir(dev_dir); - if (dir == NULL) - return PSM2_INTERNAL_ERR; - - while ((ent = readdir(dir)) != NULL) { - if (ent->d_name[0] == '.' || - strstr(ent->d_name, suffix) == NULL) - continue; - - memset(dev_name, 0, sizeof(dev_name)); - ret = snprintf(dev_name, NAME_MAX, "%s%s", dev_dir, ent->d_name); - if (ret < 0 || ret >= NAME_MAX) { - _HFI_INFO("GPU dev name too long: %s%s\n", dev_dir, ent->d_name); - goto err; - } - - psm3_ze_dev_fds[i] = open(dev_name, O_RDWR); - if (psm3_ze_dev_fds[i] == -1) { - _HFI_INFO("Failed to open %s GPU dev FD: %s\n", dev_name, - strerror(errno)); - goto err; - } - _HFI_DBG("Opened %s GPU dev FD: %d\n", dev_name, - psm3_ze_dev_fds[i]); - i++; - psm3_num_ze_dev_fds++; - } - (void) closedir(dir); - _HFI_DBG("Opened %d GPU dev FDs\n", psm3_num_ze_dev_fds); - return PSM2_OK; - -err: - (void) closedir(dir); - return PSM2_INTERNAL_ERR; -} - -/* - * psmi_ze_get_dev_fds - fetch device file descriptors - * - * Returns a pointer to ze_dev_fds while putting the number - * of fds into the in/out nfds parameter - * - */ - -int *psm3_ze_get_dev_fds(int *nfds) -{ - *nfds = psm3_num_ze_dev_fds; - return psm3_ze_dev_fds; -} - -/* - * psmi_sendmsg_fds - send device file descriptors over socket w/ sendmsg - * - * Prepares message of type SCM_RIGHTS, copies file descriptors as payload, - * and sends over socket via sendmsg while creating appropriate fd numbers - * for dest (effectively a dup(2) of our file descriptor) - * - * returns -errno on error or number of bytes sent (>0) on success - */ - -static int psmi_sendmsg_fds(int sock, int *fds, int nfds, psm2_epid_t epid) -{ - struct msghdr msg; - struct cmsghdr *cmsg; - struct iovec iov; - int64_t peer_id = *(int64_t *)&epid; - char *ctrl_buf; - size_t ctrl_size; - int ret; - - ctrl_size = sizeof(*fds) * nfds; - ctrl_buf = (char *)psmi_calloc(NULL, UNDEFINED, 1, CMSG_SPACE(ctrl_size)); - if (!ctrl_buf) - return -ENOMEM; - - iov.iov_base = &peer_id; - iov.iov_len = sizeof(peer_id); - - memset(&msg, 0, sizeof(msg)); - msg.msg_control = ctrl_buf; - msg.msg_controllen = CMSG_SPACE(ctrl_size); - - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - - cmsg = CMSG_FIRSTHDR(&msg); - cmsg->cmsg_level = SOL_SOCKET; - cmsg->cmsg_type = SCM_RIGHTS; - cmsg->cmsg_len = CMSG_LEN(ctrl_size); - memcpy(CMSG_DATA(cmsg), fds, ctrl_size); - - ret = sendmsg(sock, &msg, 0); - if (ret < 0) - ret = -errno; - else if (! ret) - ret = -EAGAIN; - - psmi_free(ctrl_buf); - return ret; -} - -/* - * psmi_recvmsg_fds - receive device file descriptors from socket w/ recvmsg - * - * Prepares message buffer of type SCM_RIGHTS, receives message from socket - * via recvmsg, and copies device file descriptors to in/out parameter. 
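The matching receive side, again only as a standalone sketch with an invented name (psmi_recvmsg_fd handles a whole array of FDs and cross-checks the sender's epid). The descriptors delivered through CMSG_DATA are new fds in the receiving process, which is why the surrounding code notes they must be closed when no longer needed:

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Receive one fd sent with SCM_RIGHTS; *fd_out becomes a new descriptor
 * in this process and must eventually be closed by the caller. */
static int example_recv_fd(int sock, int *fd_out)
{
	char payload;
	char ctrl[CMSG_SPACE(sizeof(int))];
	struct iovec iov = { .iov_base = &payload, .iov_len = sizeof(payload) };
	struct msghdr msg;
	struct cmsghdr *cmsg;

	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = ctrl;
	msg.msg_controllen = sizeof(ctrl);

	if (recvmsg(sock, &msg, 0) <= 0)
		return -1;
	cmsg = CMSG_FIRSTHDR(&msg);
	if (!cmsg || cmsg->cmsg_level != SOL_SOCKET ||
	    cmsg->cmsg_type != SCM_RIGHTS ||
	    cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
		return -1;			/* truncated or unexpected message */
	memcpy(fd_out, CMSG_DATA(cmsg), sizeof(int));
	return 0;
}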
- * The received file descriptors are usable in our process and need to - * be closed when done being used - * - * returns -errno on error or number of bytes received (>0) on success - */ - -static int psmi_recvmsg_fd(int sock, int *fds, int nfds, psm2_epid_t epid) -{ - struct msghdr msg; - struct cmsghdr *cmsg; - struct iovec iov; - int64_t peer_id = *(int64_t *)&epid; - char *ctrl_buf; - size_t ctrl_size; - int ret; - - ctrl_size = sizeof(*fds) * nfds; - ctrl_buf = (char *)psmi_calloc(NULL, UNDEFINED, 1, CMSG_SPACE(ctrl_size)); - if (!ctrl_buf) - return -ENOMEM; - - iov.iov_base = &peer_id; - iov.iov_len = sizeof(peer_id); - - memset(&msg, 0, sizeof(msg)); - msg.msg_control = ctrl_buf; - msg.msg_controllen = CMSG_SPACE(ctrl_size); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - - ret = recvmsg(sock, &msg, 0); - if (ret < 0) { - ret = -errno; - } else if (ret != sizeof(peer_id)) { - _HFI_CONNDBG("recvmsg from: %s returns %d expect %u\n", - psm3_epid_fmt_addr(epid, 0), ret, - (unsigned)sizeof(peer_id) ); - ret = -EAGAIN; - goto out; - } - - psmi_assert(!(msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))); - cmsg = CMSG_FIRSTHDR(&msg); - psmi_assert(cmsg && cmsg->cmsg_len == CMSG_LEN(ctrl_size) && - cmsg->cmsg_level == SOL_SOCKET && - cmsg->cmsg_type == SCM_RIGHTS && CMSG_DATA(cmsg)); - memcpy(fds, CMSG_DATA(cmsg), ctrl_size); -out: - psmi_free(ctrl_buf); - return ret; -} - -/* - * psm3_ze_init_ipc_socket - initialize ipc socket in ep - * - * Set up the AF_UNIX ipc socket in the ep for listen mode. Name it - * using our epid, and bind it. - * - */ - -psm2_error_t psm3_ze_init_ipc_socket(ptl_t *ptl_gen) -{ - struct ptl_am *ptl = (struct ptl_am *)ptl_gen; - psm2_error_t err = PSM2_OK; - int ret; - struct sockaddr_un sockaddr = {0}; - socklen_t len = sizeof(sockaddr); - - if ((ptl->ep->ze_ipc_socket = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { - _HFI_ERROR("error creating GPU dev FDs AF_UNIX sock: %s\n", - strerror(errno)); - err = PSM2_INTERNAL_ERR; - goto fail; - } - - sockaddr.sun_family = AF_UNIX; - snprintf(sockaddr.sun_path, 108, "/dev/shm/psm3_shm.ze_sock2.%ld.%s", - (long int) getuid(), psm3_epid_fmt_internal(ptl->epid, 0)); - ptl->ep->listen_sockname = psmi_strdup(NULL, sockaddr.sun_path); - if (ptl->ep->listen_sockname == NULL) { - err = PSM2_NO_MEMORY; - goto fail; - } - - if ((ret = bind(ptl->ep->ze_ipc_socket, (struct sockaddr *) &sockaddr, len)) < 0) { - _HFI_ERROR("error binding GPU dev FDs AF_UNIX sock to %s: %s\n", - sockaddr.sun_path, strerror(errno)); - err = PSM2_INTERNAL_ERR; - goto fail; - } - - if ((ret = listen(ptl->ep->ze_ipc_socket, 256)) < 0) { - _HFI_ERROR("error listening on GPU dev FDs AF_UNIX sock %s: %s\n", - sockaddr.sun_path, strerror(errno)); - err = PSM2_INTERNAL_ERR; - goto fail; - } - return PSM2_OK; - -fail: - if (ptl->ep->ze_ipc_socket >= 0) - close(ptl->ep->ze_ipc_socket); - ptl->ep->ze_ipc_socket = -1; - if (ptl->ep->listen_sockname) - psmi_free(ptl->ep->listen_sockname); - ptl->ep->listen_sockname = NULL; - return err; -} - -/* - * psm3_receive_ze_dev_fds - receive the dev fds on the listen socket - * - * Set up the listen socket to be polled for POLLIN. When the event is - * received, accept for the new socket and then read the peer epid, - * and locate the epaddr for it. Then receive the dev fds to be stored - * in the am_epaddr. 
- * - * returns: - * PSM_OK - GPU dev FDs received from a peer - * PSM2_OK_NO_PROGRESS - nothing received - * other - error - */ - -static psm2_error_t psm3_receive_ze_dev_fds(ptl_t *ptl_gen) -{ - struct ptl_am *ptl = (struct ptl_am *)ptl_gen; - psm2_error_t err = PSM2_OK; - struct pollfd fdset; - int newsock = -1; - - fdset.fd = ptl->ep->ze_ipc_socket; - fdset.events = POLLIN; - - if (poll(&fdset, 1, 0) <= 0) - return PSM2_OK_NO_PROGRESS; - - { - struct sockaddr_un sockaddr = {0}; - socklen_t len = sizeof(sockaddr); - int nfds = psm3_num_ze_dev_fds; - int nread; - psm2_epid_t epid; - psm2_epaddr_t epaddr; - am_epaddr_t *am_epaddr; - - newsock = accept(ptl->ep->ze_ipc_socket, (struct sockaddr *)&sockaddr, &len); - if (newsock < 0) { - _HFI_ERROR("GPU dev FDs AF_UNIX accept failed: %s\n", - strerror(errno)); - err = PSM2_INTERNAL_ERR; - goto fail; - } else { - int ret; - // technically we could get less than we asked for and need to - // call recv again in future but our transfers are small enough - // we should get it all - if ((nread = recv(newsock, &epid, sizeof(epid), 0)) < 0) { - _HFI_ERROR("GPU dev FDs AF_UNIX recv failed: %s\n", - strerror(errno)); - err = PSM2_INTERNAL_ERR; - goto fail; - } - if (nread != sizeof(epid)) { - _HFI_ERROR("GPU dev FDs AF_UNIX recv incomplete: %d\n", nread); - err = PSM2_INTERNAL_ERR; - goto fail; - } - // we only poll for recv FDs after processing a am_shm connect - // so the epid should always be known - if ((epaddr = psm3_epid_lookup(ptl->ep, epid)) == NULL) { - _HFI_ERROR("Peer Unknown, unable to receive GPU dev FDs from: %s\n", - psm3_epid_fmt_addr(epid, 0)); - err = PSM2_INTERNAL_ERR; - goto fail; - } - am_epaddr = (am_epaddr_t *)epaddr; - am_epaddr->num_peer_fds = nfds; - ret = psmi_recvmsg_fd(newsock, am_epaddr->peer_fds, nfds, ptl->epid); - if (ret <= 0) { - _HFI_ERROR("Unable to recvmsg %d GPU dev FDs from: %s: %s\n", - nfds, psm3_epid_fmt_addr(epid, 0), - strerror(-ret)); - err = PSM2_INTERNAL_ERR; - goto fail; - } - _HFI_CONNDBG("%d GPU dev FDs Received from: %s\n", - nfds, psm3_epid_fmt_addr(epid, 0)); - } - } - -fail: - if (newsock >= 0) - close(newsock); - return err; -} - -/* - * psm3_send_dev_fds - do next step to send the dev fds to the peer's - * listen socket - * - * Check the connected state and proceed accordingly: - * - ZE_SOCK_NOT_CONNECTED - * We have not done anything yet, so connect and send our epid, - * followed by the dev fds. Set state to ZE_SOCK_DEV_FDS_SENT - * - ZE_SOCK_DEV_FDS_SENT - * The dev fds have been sent. Issue ioctl to see if the output - * queue has been emptied indicating that the peer has read the data. - * If so, set state to ZE_SOCK_DEV_FDS_SENT_AND_RECD. - * - ZE_SOCK_DEV_FDS_SENT_AND_RECD - * We are done, just return. 
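The "has the peer read our FDs yet" test in the ZE_SOCK_DEV_FDS_SENT state above is a SIOCOUTQ query, which reports how many sent bytes are still sitting unread in the socket's output queue. A minimal sketch of that check, for illustration only (example_peer_drained is an invented name):

#include <sys/ioctl.h>
#include <linux/sockios.h>

/* Returns 1 once the peer has drained everything we sent on the socket,
 * 0 if bytes are still pending, -1 on an ioctl error. */
static int example_peer_drained(int sock)
{
	int pending = 0;

	if (ioctl(sock, SIOCOUTQ, &pending) != 0)
		return -1;
	return pending == 0;
}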
- * - * returns: - * PSM_OK - next step completed - * PSM2_OK_NO_PROGRESS - nothing to do - * other - error - */ - -psm2_error_t psm3_send_dev_fds(ptl_t *ptl_gen, psm2_epaddr_t epaddr) -{ - am_epaddr_t *am_epaddr = (am_epaddr_t *)epaddr; - - switch (am_epaddr->sock_connected_state) { - case ZE_SOCK_DEV_FDS_SENT_AND_RECD: - return PSM2_OK_NO_PROGRESS; - break; - - case ZE_SOCK_DEV_FDS_SENT: - { - int pending; - - psmi_assert(am_epaddr->sock >= 0); - if_pf (ioctl(am_epaddr->sock, SIOCOUTQ, &pending) != 0) { - return psm3_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "error sending dev FDs: %s\n", strerror(errno)); - } - if (pending == 0) { - am_epaddr->sock_connected_state = ZE_SOCK_DEV_FDS_SENT_AND_RECD; - _HFI_CONNDBG("GPU dev FDs Send Completed to: %s\n", - psm3_epid_fmt_addr(epaddr->epid, 0)); - close(am_epaddr->sock); - am_epaddr->sock = -1; - return PSM2_OK; - } - // be paranoid just in case 1st call to send_dev_fds for given - // epaddr gets here - if (! ((struct ptl_am *)ptl_gen)->ep->need_dev_fds_poll) - _HFI_CONNDBG("restart GPU dev FDs poll\n"); - ((struct ptl_am *)ptl_gen)->ep->need_dev_fds_poll = 1; - return PSM2_OK_NO_PROGRESS; - break; - } - - case ZE_SOCK_NOT_CONNECTED: - { - struct ptl_am *ptl = (struct ptl_am *)ptl_gen; - struct sockaddr_un sockaddr = {0}; - socklen_t len = sizeof(sockaddr); - psm2_epid_t peer_epid = epaddr->epid; - int *fds, nfds; - - if (!ptl->ep->need_dev_fds_poll) - _HFI_CONNDBG("restart GPU dev FDs poll\n"); - ptl->ep->need_dev_fds_poll = 1; - - fds = psm3_ze_get_dev_fds(&nfds); - - if ((am_epaddr->sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { - _HFI_ERROR("error creating GPU dev FDs AF_UNIX sock: %s\n", - strerror(errno)); - goto fail; - } - - sockaddr.sun_family = AF_UNIX; - snprintf(sockaddr.sun_path, 108, "/dev/shm/psm3_shm.ze_sock2.%ld.%s", - (long int) getuid(), psm3_epid_fmt_internal(peer_epid, 0)); - - if (connect(am_epaddr->sock, (struct sockaddr *) &sockaddr, len) < 0) { - _HFI_ERROR("GPU dev FDs connect to %s (via %s) failed: %s\n", - psm3_epid_fmt_addr(epaddr->epid, 0), - sockaddr.sun_path, strerror(errno)); - goto fail; - } else { - int ret; - ret = send(am_epaddr->sock, &ptl->epid, sizeof(ptl->epid), 0); - if (ret < 0) { - _HFI_ERROR("GPU dev FDs send to %s (via %s) failed: %s\n", - psm3_epid_fmt_addr(epaddr->epid, 0), - sockaddr.sun_path, strerror(errno)); - goto fail; - } - - ret = psmi_sendmsg_fds(am_epaddr->sock, fds, nfds, peer_epid); - if (ret <= 0) { - /* ret is -errno */ - _HFI_ERROR("GPU dev FDs sendmsg to %s (via %s) failed: %s\n", - psm3_epid_fmt_addr(epaddr->epid, 0), - sockaddr.sun_path, strerror(-ret)); - goto fail; - } - am_epaddr->sock_connected_state = ZE_SOCK_DEV_FDS_SENT; - _HFI_CONNDBG("%d GPU dev FDs Posted Send to: %s (via %s)\n", - nfds, psm3_epid_fmt_addr(epaddr->epid, 0), - sockaddr.sun_path); - return PSM2_OK; - } - /* NOTREACHED */ - break; - } - - default: - return PSM2_INTERNAL_ERR; - break; - } - /* NOTREACHED */ - return PSM2_INTERNAL_ERR; - -fail: - if (am_epaddr->sock >= 0) - close(am_epaddr->sock); - am_epaddr->sock = -1; - return PSM2_INTERNAL_ERR; -} - -// simple test if dev_fds bi-dir exchange completed for given epaddr -// 1 = yes, 0 = no -static -int psm3_dev_fds_exchanged(psm2_epaddr_t epaddr) -{ - am_epaddr_t *am_epaddr = (am_epaddr_t *)epaddr; - return (am_epaddr->sock_connected_state == ZE_SOCK_DEV_FDS_SENT_AND_RECD - && am_epaddr->num_peer_fds) ; -} - -/* - * psm3_check_dev_fds_exchanged - check that dev fds have been bi-dir exchanged - * with given peer. 
Poll to try and move forward as needed. - * - * connect state ZE_SOCK_DEV_FDS_SENT_AND_RECD indicates peer has received - * our send of dev_fds - * - * num_peer_fds indicates if we received peer's fds. - * - * if both are satisfied, exchange is complete, return PSM2_OK - * - *Returns: - * PSM2_OK - both are done - * PSM2_OK_NO_PROGRESS - more work needed - * other - error - */ -psm2_error_t psm3_check_dev_fds_exchanged(ptl_t *ptl_gen, psm2_epaddr_t epaddr) -{ - psm2_error_t err; - psm2_error_t ret; - am_epaddr_t *am_epaddr = (am_epaddr_t *)epaddr; - - psmi_assert(epaddr); - psmi_assert(! psm3_epid_zero_internal(epaddr->epid)); - - if (psm3_dev_fds_exchanged(epaddr)) - return PSM2_OK; - - if (am_epaddr->cstate_outgoing != AMSH_CSTATE_OUTGOING_ESTABLISHED - && am_epaddr->cstate_incoming != AMSH_CSTATE_INCOMING_ESTABLISHED) - return PSM2_OK_NO_PROGRESS; - - // try to move forward 1 step - err = psm3_send_dev_fds(ptl_gen, epaddr); - if (am_epaddr->sock_connected_state == ZE_SOCK_DEV_FDS_SENT_AND_RECD) - err = PSM2_OK; - else /* err will be NO_PROGRESS or worse */ - err = psm3_error_cmp(err, PSM2_OK_NO_PROGRESS); - - // only poll recv if we need to - ret = PSM2_OK_NO_PROGRESS; // keep KW happy - if (am_epaddr->num_peer_fds == 0) - ret = psm3_receive_ze_dev_fds(ptl_gen); - if (am_epaddr->num_peer_fds) - ret = PSM2_OK; - - /* worst err, NO_PROGRESS is worse than PSM2_OK */ - return psm3_error_cmp(ret, err); -} - -/* - * psm3_poll_dev_fds_exchanged - poll to make forward progress on - * GPU dev FDs exchange - * - * Loop through the epaddrs in am_ep and check_dev_fds_exchanged - * - * Returns: - * PSM2_OK - we found some work to do and made progress - * PSM2_OK_NO_PROGRESS - didn't find anything to do - * other - error - */ - -psm2_error_t psm3_poll_dev_fds_exchange(ptl_t *ptl_gen) -{ - struct ptl_am *ptl = (struct ptl_am *)ptl_gen; - psm2_error_t err = PSM2_OK_NO_PROGRESS; - psm2_error_t ret; - int i; - int num_left = 0; - - err = psm3_receive_ze_dev_fds(ptl_gen); - - for (i = 0; i <= ptl->max_ep_idx; i++) { - am_epaddr_t *am_epaddr = (am_epaddr_t *)ptl->am_ep[i].epaddr; - - if (!am_epaddr || psm3_epid_zero_internal(ptl->am_ep[i].epid)) - continue; - - if (psm3_dev_fds_exchanged(&am_epaddr->epaddr)) - continue; - - num_left++; // causes one extra poll if complete now below, but no harm - - // don't try if uni-dir REQ/REP is incomplete - if (am_epaddr->cstate_outgoing != AMSH_CSTATE_OUTGOING_ESTABLISHED - && am_epaddr->cstate_incoming != AMSH_CSTATE_INCOMING_ESTABLISHED) - continue; - - // try to move forward 1 step - ret = psm3_send_dev_fds(ptl_gen, &am_epaddr->epaddr); - if (ret > PSM2_OK_NO_PROGRESS) - err = psm3_error_cmp(ret, err); - else if (ret == PSM2_OK && err == PSM2_OK_NO_PROGRESS) - err = ret; - } - if (num_left == 0 && ptl->ep->need_dev_fds_poll) - _HFI_CONNDBG("stop GPU dev FDs poll\n"); - ptl->ep->need_dev_fds_poll = (num_left != 0); - - return err; -} - -psm2_error_t psm3_sock_detach(ptl_t *ptl_gen) -{ - struct ptl_am *ptl = (struct ptl_am *)ptl_gen; - - if (ptl->ep->ze_ipc_socket >= 0) - close(ptl->ep->ze_ipc_socket); - ptl->ep->ze_ipc_socket = -1; - if (ptl->ep->listen_sockname) { - unlink(ptl->ep->listen_sockname); - psmi_free(ptl->ep->listen_sockname); - } - ptl->ep->listen_sockname = NULL; - return PSM2_OK; -} -#endif /* not PSM_HAVE_PIDFD */ - -#ifndef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE -static int psm3_ipc_handle_cached(const void *buf, - ze_ipc_mem_handle_t ipc_handle) -{ - static int first = 1; - static int cached = 0; - ze_ipc_mem_handle_t tmp_ipc_handle; - int tmp_fd; - - 
/* Only detect the first time */ - if (!first) - return cached; - - PSMI_ONEAPI_ZE_CALL(zeMemGetIpcHandle, ze_context, - buf, &tmp_ipc_handle); - tmp_fd = *(uint32_t *)tmp_ipc_handle.data; - if (tmp_fd == *(uint32_t *)ipc_handle.data) - cached = 1; - else - close(tmp_fd); - - first = 0; - _HFI_VDBG("fd %u tmp_fd %d cached %d\n", *(uint32_t *)ipc_handle.data, - tmp_fd, cached); - - return cached; -} -#endif - -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE -// queue for delayed Put to get better GetIpcHandle performance -// while having an upper bound on number of active Ipc Handles -// sized based on PSM3_ONEAPI_PUTQUEUE_SIZE -struct { - psmi_lock_t lock; - struct oneapi_handle_array { - uint8_t valid; - ze_ipc_mem_handle_t ipc_handle; - } *array; - unsigned index; // where to add next entry and remove oldest - int size; // number of slots in queue, -1 disables put -} psm3_oneapi_putqueue; -#endif /* PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE */ - -psm2_error_t psmi_oneapi_putqueue_alloc(void) -{ -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE - union psmi_envvar_val env; - psm3_getenv("PSM3_ONEAPI_PUTQUEUE_SIZE", - "How many Ipc Handle Puts to queue for shm send and nic Direct GPU Access [-1 disables Put, 0 disables queue]", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)ONEAPI_PUTQUEUE_SIZE, &env); - _HFI_DBG("OneApi PutQueue Size=%d\n", env.e_int); - psm3_oneapi_putqueue.size = env.e_int; - if (env.e_int > 0) { - psm3_oneapi_putqueue.array = (struct oneapi_handle_array *)psmi_calloc( - PSMI_EP_NONE, UNDEFINED, env.e_int, - sizeof(*psm3_oneapi_putqueue.array)); - if (! psm3_oneapi_putqueue.array) - return PSM2_NO_MEMORY; - psm3_oneapi_putqueue.index = 0; - psmi_init_lock(&psm3_oneapi_putqueue.lock); - } -#endif /* PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE */ - return PSM2_OK; -} - -void psm3_put_ipc_handle(const void *buf, ze_ipc_mem_handle_t ipc_handle) -{ -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE - if (! 
psm3_oneapi_putqueue.array) { // queue disabled - if (psm3_oneapi_putqueue.size >= 0) // negative size disables Put - PSMI_ONEAPI_ZE_CALL(zeMemPutIpcHandle, ze_context, ipc_handle); - return; - } - PSMI_LOCK(psm3_oneapi_putqueue.lock); - if (psm3_oneapi_putqueue.array[psm3_oneapi_putqueue.index].valid) { - // Put the oldest one to make room for new entry - ze_ipc_mem_handle_t tmp_ipc_handle = - psm3_oneapi_putqueue.array[psm3_oneapi_putqueue.index].ipc_handle; - PSMI_ONEAPI_ZE_CALL(zeMemPutIpcHandle, ze_context, tmp_ipc_handle); - } - // queue the new one - psm3_oneapi_putqueue.array[psm3_oneapi_putqueue.index].valid = 1; - psm3_oneapi_putqueue.array[psm3_oneapi_putqueue.index++].ipc_handle = ipc_handle; - psm3_oneapi_putqueue.index %= psm3_oneapi_putqueue.size; - PSMI_UNLOCK(psm3_oneapi_putqueue.lock); -#else /* PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE */ - // for older Agama with handle "cache" but no reference counting - // no way to put handle without affecting all IOs using that buffer - // on ATS w/o Agama handle cache, no benefit to holding onto fd so close - if (!psm3_ipc_handle_cached(buf, ipc_handle)) - close(*(uint32_t *)ipc_handle.data); -#endif /* PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE */ -} - -void psmi_oneapi_putqueue_free(void) -{ -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE -#if 0 // we are shutting down, so don't worry about Putting the queued handles - int i; - - // no need for lock, destroying object, no more callers - for (i=0; i < psm3_oneapi_putqueue.size; i++) { - if (psm3_oneapi_putqueue.array[i].valid) { - ze_ipc_mem_handle_t ipc_handle = psm3_oneapi_putqueue.array[i].ipc_handle; - PSMI_ONEAPI_ZE_CALL(zeMemPutIpcHandle, ze_context, ipc_handle); - } - } -#endif /* 0 */ - if (psm3_oneapi_putqueue.array) { - psmi_free(psm3_oneapi_putqueue.array); - psm3_oneapi_putqueue.array = NULL; - psmi_destroy_lock(&psm3_oneapi_putqueue.lock); - } -#endif /* PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE */ -} - -/* - * get OneAPI alloc_id for a GPU address - * - * The address should be part of a buffer allocated from an OneAPI - * library call (zeMemAllocDevice() or zeMemAllocHost()). - * The alloc_id changes on each OneAPI allocation call. PSM3/rv uses the - * alloc_id to determine if a cache hit is a potentially stale entry which - * should be invalidated. - */ -uint64_t psm3_oneapi_ze_get_alloc_id(void *addr, uint8_t *type) -{ - ze_memory_allocation_properties_t mem_props = { - .stype = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES - }; - ze_device_handle_t device; - - PSMI_ONEAPI_ZE_CALL(zeMemGetAllocProperties, ze_context, - addr, &mem_props, &device); - if (type) - *type = (uint8_t)mem_props.type; - /* - * id is unique across all allocates on all devices within a given - * process - */ - return mem_props.id; -} - -#endif // PSM_ONEAPI diff --git a/prov/psm3/psm3/psm_rndv_mod.c b/prov/psm3/psm3/psm_rndv_mod.c index 1daa81f5c2c..b754320f7d2 100644 --- a/prov/psm3/psm3/psm_rndv_mod.c +++ b/prov/psm3/psm3/psm_rndv_mod.c @@ -102,154 +102,6 @@ struct irdma_mem_reg_req { //#define my_calloc(nmemb, size) (psmi_calloc(PSMI_EP_NONE, NETWORK_BUFFERS, (nmemb), (size))) #define my_free(p) (psmi_free(p)) -#ifdef PSM_CUDA -static int gpu_pin_check; // PSM3_GPU_PIN_CHECK -static uint64_t *gpu_bars; -static int num_gpu_bars = 0; -static uint64_t min_gpu_bar_size; - -// The second BAR address is where the GPU will map GPUDirect memory. -// The beginning of this BAR is reserved for non-GPUDirect uses. 
-// However, it has been observed that in some multi-process -// pinning failures, HED-2035, the nvidia_p2p_get_pages can foul up -// it's IOMMU after which the next successful pin will incorrectly -// return the 1st physical address of the BAR for the pinned pages. -// In this case it will report this same physical address for other GPU virtual -// addresses and cause RDMA to use the wrong memory. -// As a workaround, we gather the Region 1 BAR address start for each -// GPU and if we see this address returned as the phys_addr of a mmapped -// GPUDirect Copy or the iova of a GPU MR we fail the job before it can -// corrupt any more application data. -static uint64_t get_nvidia_bar_addr(int domain, int bus, int slot) -{ - char sysfs[100]; - int ret; - FILE *f; - unsigned long long start_addr, end_addr, bar_size; - - ret = snprintf(sysfs, sizeof(sysfs), - "/sys/class/pci_bus/%04x:%02x/device/%04x:%02x:%02x.0/resource", - domain, bus, domain, bus, slot); - psmi_assert_always(ret < sizeof(sysfs)); - f = fopen(sysfs, "r"); - if (! f) { - if (gpu_pin_check) { - _HFI_ERROR("Unable to open %s for GPU BAR Address: %s\n", - sysfs, strerror(errno)); - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "Unable to get GPU BAR address\n"); - } - return 0; - } - // for each BAR region, start, end and flags are listed in hex - // nVidia uses the 2nd BAR region (aka Region #1) to map peer to peer - // accesses into it's potentially larger GPU local memory space - ret = fscanf(f, "%*x %*x %*x %llx %llx", &start_addr, &end_addr); - if (ret != 2) { - if (gpu_pin_check) { - _HFI_ERROR("Unable to get GPU BAR Address from %s: %s\n", - sysfs, strerror(errno)); - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "Unable to get GPU BAR address\n"); - } - fclose(f); - return 0; - } - fclose(f); - - bar_size = (end_addr - start_addr) + 1; - _HFI_DBG("GPU BAR Addr from %s is 0x%llx - 0x%llx (size 0x%llx)\n", sysfs, start_addr, end_addr, bar_size); - if (! min_gpu_bar_size || bar_size < min_gpu_bar_size) - min_gpu_bar_size = bar_size; - return start_addr; -} - -void psm2_get_gpu_bars(void) -{ - int num_devices, dev; - union psmi_envvar_val env; - - psm3_getenv("PSM3_GPU_PIN_CHECK", - "Enable sanity check of physical addresses mapped into GPU BAR space (Enabled by default)", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)1, &env); - gpu_pin_check = env.e_int; - - PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices); - gpu_bars = psmi_calloc(PSMI_EP_NONE, UNDEFINED, num_devices, sizeof(gpu_bars[0])); - if (! gpu_bars) - return; // psmi_calloc will have exited for Out of Memory - - if (gpu_pin_check) - num_gpu_bars = num_devices; - - for (dev = 0; dev < num_devices; dev++) { - CUdevice device; - int domain, bus, slot; - - PSMI_CUDA_CALL(cuDeviceGet, &device, dev); - PSMI_CUDA_CALL(cuDeviceGetAttribute, - &domain, - CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, - device); - PSMI_CUDA_CALL(cuDeviceGetAttribute, - &bus, - CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, - device); - PSMI_CUDA_CALL(cuDeviceGetAttribute, - &slot, - CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, - device); - gpu_bars[dev] = get_nvidia_bar_addr(domain, bus, slot); - } -} - -static psm2_error_t psm2_check_phys_addr(uint64_t phys_addr) -{ - int i; - for (i=0; i < num_gpu_bars; i++) { - if (phys_addr == gpu_bars[i]) { - _HFI_ERROR("Incorrect Physical Address (0x%"PRIx64") returned by nVidia driver. PSM3 exiting to avoid data corruption. 
Job may be rerun with PSM3_GPUDIRECT=0 to avoid this issue.\n", - phys_addr); - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "Incorrect Physical Address returned by nVidia driver\n"); - psmi_assert_always(0); - return PSM2_INTERNAL_ERR; - } - } - return PSM2_OK; -} -#endif - -#ifdef PSM_ONEAPI -// PSM3_RV_GPU_IGNORE_ALLOC_ID allows internal testing -// =0 -> default, alloc_id used to identify new buffers which have same -// virt addr as an existing cache entry. In which case a cache miss -// and invalidation of the old cache entry occurs. -// =1 -> an alloc_id of 0 is always used. This has been demonstrated to -// cause false cache hits which can lead to landing data in safe but -// incorrect pages. Useful only for development experiments and tests. -// =2 -> for cache miss performance testing. This will use a different alloc_id -// per IO which will force cache invalidation on every IO. So no -// MR/mmap cache hits will occur, but all the normal MR handling will -// occur just as if there was a miss when running in normal mode -static int ignore_alloc_id; // PSM3_RV_GPU_IGNORE_ALLOC_ID -static uint64_t fake_alloc_id; // for when PSM3_RV_GPU_IGNORE_ALLOC_ID==2 -#endif - -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) -uint64_t psm3_min_gpu_bar_size(void) -{ -#ifdef PSM_ONEAPI - // not yet implemented - // psmi_assert_always(0); - return 0; -#else - return min_gpu_bar_size; -#endif -} -#endif - static int rv_map_event_ring(psm3_rv_t rv, struct rv_event_ring* ring, int entries, int offset) { @@ -309,15 +161,6 @@ psm3_rv_t psm3_rv_open(const char *devname, struct local_info *loc_info) int ret; int save_errno; -#ifdef PSM_ONEAPI - union psmi_envvar_val env; - - psm3_getenv("PSM3_RV_GPU_IGNORE_ALLOC_ID", - "Disable use of alloc_id to identify GPU MRs to invalidate in RV GPU cache. 1=ignore, 2=use fake id to get 100% miss", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)0, &env); - ignore_alloc_id = env.e_int; -#endif loc_info->capability = 0; rv = (psm3_rv_t)my_calloc(1, sizeof(struct psm2_rv)); if (! 
rv) { @@ -340,20 +183,15 @@ psm3_rv_t psm3_rv_open(const char *devname, struct local_info *loc_info) if ((loc_info->rdma_mode & RV_RDMA_MODE_MASK) == RV_RDMA_MODE_USER) qparams.capability |= RV_CAP_USER_MR; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU qparams.gpu_major_rev = RV_GPU_ABI_VER_MAJOR; qparams.gpu_minor_rev = RV_GPU_ABI_VER_MINOR; if ((loc_info->rdma_mode & RV_RDMA_MODE_GPU) || (loc_info->rdma_mode & RV_RDMA_MODE_MASK) == RV_RDMA_MODE_GPU_ONLY) { - qparams.capability |= RV_CAP_GPU_DIRECT | RV_CAP_EVICT; -#ifdef PSM_CUDA - qparams.capability |= RV_CAP_NVIDIA_GPU; -#endif -#ifdef PSM_ONEAPI - qparams.capability |= RV_CAP_INTEL_GPU; -#endif + qparams.capability |= RV_CAP_GPU_DIRECT | RV_CAP_EVICT + | PSM3_GPU_RV_CAPABILITY_EXPECTED; } -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ if ((ret = ioctl(rv->fd, RV_IOCTL_CAPABILITY, &qparams)) != 0) { int save_cap_errno = errno; @@ -379,74 +217,57 @@ psm3_rv_t psm3_rv_open(const char *devname, struct local_info *loc_info) loc_info->minor_rev = qparams.minor_rev; loc_info->capability = qparams.capability; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU loc_info->gpu_major_rev = qparams.gpu_major_rev; loc_info->gpu_minor_rev = qparams.gpu_minor_rev; rv->ioctl_gpu_pin_mmap = RV_IOCTL_GPU_PIN_MMAP; if ((loc_info->rdma_mode & RV_RDMA_MODE_GPU) || (loc_info->rdma_mode & RV_RDMA_MODE_MASK) == RV_RDMA_MODE_GPU_ONLY) { - if (!(qparams.capability & RV_CAP_GPU_DIRECT)) { - // caller will warn and avoid GPUDirect use -#ifdef PSM_CUDA - _HFI_INFO("WARNING: Mismatch: PSM3(cuda) vs RV(non-GPU).\n"); -#else - _HFI_INFO("WARNING: Mismatch: PSM3(oneapi-ze) vs RV(non-GPU).\n"); -#endif - loc_info->rdma_mode &= ~(RV_RDMA_MODE_GPU|RV_RDMA_MODE_UPSIZE_GPU); - if ((loc_info->rdma_mode & RV_RDMA_MODE_MASK) == RV_RDMA_MODE_GPU_ONLY) - goto fail_sockets; - } -#ifdef PSM_CUDA - if ((qparams.capability & (RV_CAP_INTEL_GPU | RV_CAP_NVIDIA_GPU)) == - RV_CAP_INTEL_GPU) { - // caller will warn and avoid GPUDirect use - _HFI_INFO("WARNING: Mismatch: PSM3(cuda) vs RV(oneapi-ze).\n"); - loc_info->rdma_mode &= ~(RV_RDMA_MODE_GPU|RV_RDMA_MODE_UPSIZE_GPU); - loc_info->capability &= ~RV_CAP_GPU_DIRECT; - if ((loc_info->rdma_mode & RV_RDMA_MODE_MASK) == RV_RDMA_MODE_GPU_ONLY) - goto fail_sockets; - } #ifdef RV_GPU_ABI_VER_MINOR_0 /* not defined if compile against older RV header */ // RV GPU API <= 1.0 is ok, ioctl different but arg subset if (loc_info->gpu_major_rev <= RV_GPU_ABI_VER_MAJOR_1 && loc_info->gpu_minor_rev <= RV_GPU_ABI_VER_MINOR_0) rv->ioctl_gpu_pin_mmap = RV_IOCTL_GPU_PIN_MMAP_R0; #endif -#endif /* CUDA */ -#ifdef PSM_ONEAPI - if ((qparams.capability & (RV_CAP_INTEL_GPU | RV_CAP_NVIDIA_GPU)) == - RV_CAP_NVIDIA_GPU) { + if (!(qparams.capability & RV_CAP_GPU_DIRECT)) { + // caller will warn and avoid GPUDirect use + _HFI_INFO("WARNING: Mismatch: PSM3" PSM3_GPU_TYPES " vs RV non-GPU.\n"); + loc_info->rdma_mode &= ~(RV_RDMA_MODE_GPU|RV_RDMA_MODE_UPSIZE_GPU); + if ((loc_info->rdma_mode & RV_RDMA_MODE_MASK) == RV_RDMA_MODE_GPU_ONLY) + goto fail_sockets; + } + if (!(qparams.capability & PSM3_GPU_RV_CAPABILITY_EXPECTED)) { // caller will warn and avoid GPUDirect use - _HFI_INFO("WARNING: Mismatch: PSM3(oneapi-ze) vs RV(cuda).\n"); + char buf1[100]; + char buf2[100]; + PSM3_GPU_RV_CAP_STRING(buf1, sizeof(buf1), PSM3_GPU_RV_CAPABILITY_EXPECTED); + PSM3_GPU_RV_CAP_STRING(buf2, sizeof(buf2), loc_info->capability); + _HFI_INFO("WARNING: Mismatch: PSM3 %s vs RV %s\n", buf1, buf2); loc_info->rdma_mode 
&= ~(RV_RDMA_MODE_GPU|RV_RDMA_MODE_UPSIZE_GPU); loc_info->capability &= ~RV_CAP_GPU_DIRECT; if ((loc_info->rdma_mode & RV_RDMA_MODE_MASK) == RV_RDMA_MODE_GPU_ONLY) goto fail_sockets; } -#ifdef RV_GPU_ABI_VER_MINOR_0 - // RV GPU API <= 1.0 does not have track GPU alloc_id - // RV GPU API <= 1.1 requires munmap_unpin - // so if RV GPU API <= 1.1, do not allow GPUDirect - if (loc_info->gpu_major_rev <= RV_GPU_ABI_VER_MAJOR_1 - && loc_info->gpu_minor_rev <= RV_GPU_ABI_VER_MINOR_1) { - _HFI_INFO("WARNING: Mismatch: Unsupported RV(oneapi-ze) revision.\n"); + if ((PSM3_GPU_RV_MAJOR_REV_FAIL && PSM3_GPU_RV_MINOR_REV_FAIL) + && loc_info->gpu_major_rev <= PSM3_GPU_RV_MAJOR_REV_FAIL + && loc_info->gpu_minor_rev <= PSM3_GPU_RV_MINOR_REV_FAIL) { + char buf2[100]; + PSM3_GPU_RV_CAP_STRING(buf2, sizeof(buf2), loc_info->capability); + _HFI_INFO("WARNING: Mismatch: Unsupported RV %s revision (v%u.%u) ne > v%u.%u.\n", + buf2, loc_info->gpu_major_rev, loc_info->gpu_minor_rev, + PSM3_GPU_RV_MAJOR_REV_FAIL, PSM3_GPU_RV_MINOR_REV_FAIL); loc_info->rdma_mode &= ~(RV_RDMA_MODE_GPU|RV_RDMA_MODE_UPSIZE_GPU); loc_info->capability &= ~RV_CAP_GPU_DIRECT; if ((loc_info->rdma_mode & RV_RDMA_MODE_MASK) == RV_RDMA_MODE_GPU_ONLY) goto fail_sockets; } -#else - /* not defined if compile against older RV header */ -#error "Intel GPU Support requires version 1.1 or newer rv_user_ioctls.h header" -#endif -#endif /* PSM_ONEAPI */ if (!(qparams.capability & RV_CAP_EVICT)) { save_errno = ENOTSUP; _HFI_ERROR("Error: rv lacks EVICT ioctl, needed for GPU Support\n"); goto fail; } } -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ if ((loc_info->rdma_mode & RV_RDMA_MODE_MASK) == RV_RDMA_MODE_USER && !(qparams.capability & RV_CAP_USER_MR)) { save_errno = ENOTSUP; @@ -460,11 +281,17 @@ psm3_rv_t psm3_rv_open(const char *devname, struct local_info *loc_info) && loc_info->minor_rev <= RV_ABI_VER_MINOR_1) rv->ioctl_reg_mem = RV_IOCTL_REG_MEM_R1; #endif +#ifdef RV_ABI_VER_MINOR_4 /* not defined if compile against older RV header */ + // RV API <= 1.4 is ok, ioctl different but arg subset + if (loc_info->major_rev <= RV_ABI_VER_MAJOR_1 && + loc_info->minor_rev <= RV_ABI_VER_MINOR_4) + rv->ioctl_reg_mem = RV_IOCTL_REG_MEM_R4; +#endif memset(&aparams, 0, sizeof(aparams)); snprintf(aparams.in.dev_name, RV_MAX_DEV_NAME_LEN, "%s", devname); aparams.in.mr_cache_size = loc_info->mr_cache_size; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU aparams.in.gpu_cache_size = loc_info->gpu_cache_size; #endif aparams.in.rdma_mode = loc_info->rdma_mode; @@ -500,7 +327,7 @@ psm3_rv_t psm3_rv_open(const char *devname, struct local_info *loc_info) goto fail; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (loc_info->rdma_mode & RV_RDMA_MODE_GPU) { loc_info->rv_index = aparams.out_gpu.rv_index; loc_info->mr_cache_size = aparams.out_gpu.mr_cache_size; @@ -513,7 +340,7 @@ psm3_rv_t psm3_rv_open(const char *devname, struct local_info *loc_info) loc_info->max_fmr_size = aparams.out_gpu.max_fmr_size; #endif } else { -#endif +#endif /* PSM_HAVE_GPU */ loc_info->rv_index = aparams.out.rv_index; loc_info->mr_cache_size = aparams.out.mr_cache_size; loc_info->q_depth = aparams.out.q_depth; @@ -523,7 +350,7 @@ psm3_rv_t psm3_rv_open(const char *devname, struct local_info *loc_info) loc_info->minor_rev > RV_ABI_VER_MINOR_3) loc_info->max_fmr_size = aparams.out.max_fmr_size; #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU loc_info->gpu_cache_size = 0; } #endif @@ -538,43 
+365,37 @@ psm3_rv_t psm3_rv_open(const char *devname, struct local_info *loc_info) } #ifndef RV_CAP_GPU_DIRECT -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #error "Inconsistent build. RV_CAP_GPU_DIRECT must be defined for GPU builds. Must use GPU enabled rv headers" #else -// lifted from rv_user_ioctls.h +// lifted from rv_user_ioctls.h so code builds below and can report if runtime +// RV supports an unknown GPU type #define RV_CAP_GPU_DIRECT (1UL << 63) #endif - #endif - if (loc_info->capability & RV_CAP_GPU_DIRECT) -#ifdef PSM_CUDA - psm3_print_identify("%s %s run-time rv interface v%d.%d%s gpu v%d.%d cuda\n", - psm3_get_mylabel(), psm3_ident_tag, - loc_info->major_rev, - loc_info->minor_rev, - (loc_info->capability & RV_CAP_USER_MR)?" user_mr":"", - loc_info->gpu_major_rev, - loc_info->gpu_minor_rev); -#elif defined(PSM_ONEAPI) - psm3_print_identify("%s %s run-time rv interface v%d.%d%s gpu v%d.%d oneapi-ze\n", +#endif /* ! RV_CAP_GPUDIRECT */ + if (loc_info->capability & RV_CAP_GPU_DIRECT) { + // RV has GPU capability +#ifdef PSM_HAVE_GPU + char buf[100]; + PSM3_GPU_RV_CAP_STRING(buf, sizeof(buf), loc_info->capability); + psm3_print_identify("%s %s run-time rv interface v%u.%u%s gpu v%u.%u%s\n", psm3_get_mylabel(), psm3_ident_tag, - loc_info->major_rev, - loc_info->minor_rev, + loc_info->major_rev, loc_info->minor_rev, (loc_info->capability & RV_CAP_USER_MR)?" user_mr":"", - loc_info->gpu_major_rev, - loc_info->gpu_minor_rev); -#else - psm3_print_identify("%s %s run-time rv interface v%d.%d%s cuda\n", + loc_info->gpu_major_rev, loc_info->gpu_minor_rev, buf); +#else /* PSM_HAVE_GPU */ + psm3_print_identify("%s %s run-time rv interface v%u.%u%s gpu unknown\n", psm3_get_mylabel(), psm3_ident_tag, - loc_info->major_rev, - loc_info->minor_rev, + loc_info->major_rev, loc_info->minor_rev, (loc_info->capability & RV_CAP_USER_MR)?" user_mr":""); -#endif /* PSM_CUDA */ - else - psm3_print_identify("%s %s run-time rv interface v%d.%d%s\n", +#endif /* PSM_HAVE_GPU */ + } else { + psm3_print_identify("%s %s run-time rv interface v%u.%u%s\n", psm3_get_mylabel(), psm3_ident_tag, loc_info->major_rev, loc_info->minor_rev, (loc_info->capability & RV_CAP_USER_MR)?" 
user_mr":""); + } return rv; fail: if (rv) { @@ -583,7 +404,7 @@ psm3_rv_t psm3_rv_open(const char *devname, struct local_info *loc_info) errno = save_errno; return NULL; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU fail_sockets: // unacceptable RV module for sockets use case, just fail open loc_info->rdma_mode = 0; @@ -650,7 +471,7 @@ int psm3_rv_get_cache_stats(psm3_rv_t rv, struct psm3_rv_cache_stats *stats) return -1; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU int psm3_rv_gpu_get_cache_stats(psm3_rv_t rv, struct psm3_rv_gpu_cache_stats *stats) { struct rv_gpu_cache_stats_params_out sparams; @@ -1049,10 +870,11 @@ void psm3_rv_destroy_conn(psm3_rv_conn_t conn) my_free(conn); } +#ifdef PSM_HAVE_REG_MR psm3_rv_mr_t psm3_rv_reg_mem(psm3_rv_t rv, int cmd_fd_int, struct ibv_pd *pd, void *addr, uint64_t length, int access -#ifdef PSM_ONEAPI - , uint64_t alloc_id +#ifdef PSM_HAVE_GPU + , union psm3_verbs_mr_gpu_specific *gpu_specific #endif ) { @@ -1060,9 +882,8 @@ psm3_rv_mr_t psm3_rv_reg_mem(psm3_rv_t rv, int cmd_fd_int, struct ibv_pd *pd, struct rv_mem_params mparams; struct irdma_mem_reg_req req; int save_errno; -#ifdef PSM_ONEAPI - ze_ipc_mem_handle_t ipc_handle; - uint64_t handle_fd = 0; +#ifdef PSM_HAVE_GPU + union psm3_gpu_rv_reg_mmap_mem_scratchpad gpu_scratchpad = { }; #endif if (!rv || (!pd && !(access & IBV_ACCESS_KERNEL))) { @@ -1070,7 +891,7 @@ psm3_rv_mr_t psm3_rv_reg_mem(psm3_rv_t rv, int cmd_fd_int, struct ibv_pd *pd, goto fail; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #ifdef PSM_FI if_pf((access & IBV_ACCESS_IS_GPU_ADDR) && PSM3_FAULTINJ_ENABLED()) { PSM3_FAULTINJ_STATIC_DECL(fi_gpu_reg_mr, "gpu_reg_mr", @@ -1096,27 +917,13 @@ psm3_rv_mr_t psm3_rv_reg_mem(psm3_rv_t rv, int cmd_fd_int, struct ibv_pd *pd, mparams.in.ibv_pd_handle = pd->handle; mparams.in.cmd_fd_int = cmd_fd_int; mparams.in.access = access; -#ifdef PSM_ONEAPI +#ifdef PSM_HAVE_GPU if (access & IBV_ACCESS_IS_GPU_ADDR) { - PSMI_ONEAPI_ZE_CALL(zeMemGetIpcHandle, ze_context, - (const void *)addr, &ipc_handle); -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE - PSMI_ONEAPI_ZE_CALL(zeMemGetFileDescriptorFromIpcHandleExp, ze_context, ipc_handle, &handle_fd); -#else - handle_fd = *(uint32_t *)ipc_handle.data; -#endif - mparams.in.ipc_handle = (uint32_t)handle_fd; - if (!mparams.in.ipc_handle) { - _HFI_ERROR("zeMemGetIpcHandle for %p returned empty handle: 0x%02x%02x%02x%02x %02x%02x%02x%02x\n", - addr, ipc_handle.data[0], ipc_handle.data[1], - ipc_handle.data[2], ipc_handle.data[3], - ipc_handle.data[4], ipc_handle.data[5], - ipc_handle.data[6], ipc_handle.data[7]); - // tends to mean out of fd's - save_errno = ENOSPC; + if (0 != (save_errno = PSM3_GPU_INIT_RV_REG_MR_PARAMS(addr, + length, access, &mparams, + gpu_specific, &gpu_scratchpad))) { goto fail; } - mparams.in.alloc_id = ignore_alloc_id?(ignore_alloc_id==1?0:fake_alloc_id++):alloc_id; } #endif mparams.in.addr = (uint64_t)addr; @@ -1130,9 +937,9 @@ psm3_rv_mr_t psm3_rv_reg_mem(psm3_rv_t rv, int cmd_fd_int, struct ibv_pd *pd, save_errno = errno; goto fail; } -#ifdef PSM_CUDA +#ifdef PSM_HAVE_GPU if ((access & IBV_ACCESS_IS_GPU_ADDR) - && PSM2_OK != psm2_check_phys_addr(mparams.out.iova)) { + && PSM2_OK != PSM3_GPU_CHECK_PHYS_ADDR(mparams.out.iova)) { (void)psm3_rv_dereg_mem(rv, mr); errno = EFAULT; return NULL; @@ -1157,12 +964,8 @@ psm3_rv_mr_t psm3_rv_reg_mem(psm3_rv_t rv, int cmd_fd_int, struct ibv_pd *pd, } errno = save_errno; exit: -#ifdef PSM_ONEAPI - if (handle_fd) { - save_errno = errno; - 
psm3_put_ipc_handle((const void *)addr, ipc_handle); - errno = save_errno; - } +#ifdef PSM_HAVE_GPU + PSM3_GPU_RV_REG_MMAP_CLEANUP(addr, length, access, &gpu_scratchpad); #endif return mr; } @@ -1186,18 +989,16 @@ int psm3_rv_dereg_mem(psm3_rv_t rv, psm3_rv_mr_t mr) my_free(mr); return 0; } +#endif /* PSM_HAVE_REG_MR */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU void * psm3_rv_pin_and_mmap(psm3_rv_t rv, uintptr_t pageaddr, uint64_t pagelen, int access) { struct rv_gpu_mem_params params; int ret; void *ret_ptr = NULL; -#ifdef PSM_ONEAPI - ze_ipc_mem_handle_t ipc_handle; - uint64_t handle_fd = 0; -#endif + union psm3_gpu_rv_reg_mmap_mem_scratchpad gpu_scratchpad = { }; #ifdef PSM_FI if_pf(PSM3_FAULTINJ_ENABLED()) { @@ -1215,63 +1016,30 @@ void * psm3_rv_pin_and_mmap(psm3_rv_t rv, uintptr_t pageaddr, params.in.gpu_buf_addr = pageaddr; params.in.gpu_buf_size = pagelen; params.in.access = access; -#ifdef PSM_ONEAPI if (access & IBV_ACCESS_IS_GPU_ADDR) { - ze_memory_allocation_properties_t mem_props = { - .stype = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES - }; - ze_device_handle_t device; - - PSMI_ONEAPI_ZE_CALL(zeMemGetIpcHandle, ze_context, - (const void *)pageaddr, &ipc_handle); -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE - PSMI_ONEAPI_ZE_CALL(zeMemGetFileDescriptorFromIpcHandleExp, ze_context, ipc_handle, &handle_fd); -#else - handle_fd = *(uint32_t *)ipc_handle.data; -#endif - params.in.ipc_handle = (uint32_t)handle_fd; - if (!params.in.ipc_handle) { - _HFI_ERROR("No ipc_handle: 0x%02x%02x%02x%02x %02x%02x%02x%02x\n", - ipc_handle.data[0], ipc_handle.data[1], - ipc_handle.data[2], ipc_handle.data[3], - ipc_handle.data[4], ipc_handle.data[5], - ipc_handle.data[6], ipc_handle.data[7]); - errno = EFAULT; + if (0 != (errno = PSM3_GPU_INIT_RV_PIN_MMAP_PARAMS( + (void*)pageaddr, pagelen, access, + ¶ms, &gpu_scratchpad))) { goto exit; } - PSMI_ONEAPI_ZE_CALL(zeMemGetAllocProperties, ze_context, - (const void *)pageaddr, &mem_props, &device); - // id is unique across all allocs on all devices in a process - params.in.alloc_id = ignore_alloc_id?(ignore_alloc_id==1?0:fake_alloc_id++):mem_props.id; - _HFI_VDBG("pageaddr 0x%"PRIx64" pagelen %"PRIu64" id %"PRIu64" access 0x%x\n", - (uint64_t)pageaddr, pagelen, mem_props.id, access); } -#endif if ((ret = ioctl(rv->fd, rv->ioctl_gpu_pin_mmap, ¶ms)) != 0) goto exit; -#ifdef PSM_CUDA - if (PSM2_OK != psm2_check_phys_addr(params.out.phys_addr)) { + if (PSM2_OK != PSM3_GPU_CHECK_PHYS_ADDR(params.out.phys_addr)) { (void)psm3_rv_evict_exact(rv, (void*)pageaddr, pagelen, access); errno = EFAULT; goto exit; } -#endif // return mapped host address or NULL with errno set ret_ptr = (void *)(uintptr_t)params.out.host_buf_addr; exit: -#ifdef PSM_ONEAPI - if (handle_fd) { - int save_errno = errno; - psm3_put_ipc_handle((const void *)pageaddr, ipc_handle); - errno = save_errno; - } -#endif + PSM3_GPU_RV_REG_MMAP_CLEANUP((void*)pageaddr, pagelen, access, &gpu_scratchpad); return ret_ptr; } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ // addr, length, access are what was used in a previous call to // __psm_rv_reg_mem or psm3_rv_pin_and_mmap @@ -1343,7 +1111,7 @@ int64_t psm3_rv_evict_range(psm3_rv_t rv, void *addr, uint64_t length) #endif } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // this will remove from the GPU cache all entries which include // addresses between addr and addr+length-1 inclusive if it's // refcount is 0. 
In the case of reg_mem, a matching call @@ -1377,7 +1145,7 @@ int64_t psm3_rv_evict_gpu_range(psm3_rv_t rv, uintptr_t addr, uint64_t length) return -1; #endif } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ // this will remove from the cache up to the amount specified // Only entries with a refcount of 0 are removed. @@ -1413,7 +1181,7 @@ int64_t psm3_rv_evict_amount(psm3_rv_t rv, uint64_t bytes, uint32_t count) #endif } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // this will remove from the GPU cache up to the amount specified // Only entries with a refcount of 0 are removed. // In the case of reg_mem, a matching call @@ -1447,7 +1215,7 @@ int64_t psm3_rv_evict_gpu_amount(psm3_rv_t rv, uint64_t bytes, uint32_t count) return -1; #endif } -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ int psm3_rv_post_rdma_write_immed(psm3_rv_t rv, psm3_rv_conn_t conn, void *loc_buf, psm3_rv_mr_t loc_mr, diff --git a/prov/psm3/psm3/psm_rndv_mod.h b/prov/psm3/psm3/psm_rndv_mod.h index d6f0001a37c..59bd49a87ea 100644 --- a/prov/psm3/psm3/psm_rndv_mod.h +++ b/prov/psm3/psm3/psm_rndv_mod.h @@ -63,17 +63,9 @@ #include #include -#if defined(PSM_ONEAPI) -#ifndef RV_IOCTL_CAPABILITY -// TBD we could have configure test this and disable PSM3_HAVE_RNDV_MOD -// or perhaps even disable/fail oneapi in configure -#error "PSM_ONEAPI requires rv_user_ioctls.h 1.3 (w/GPU 1.2) or later" -#endif -#endif - struct local_info { uint32_t mr_cache_size; // in MBs -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint32_t gpu_cache_size; // in MBs #endif uint8_t rdma_mode; // RV_RDMA_MODE_* @@ -101,7 +93,7 @@ struct local_info { // output from RNDV driver uint16_t major_rev; // driver ABI rev uint16_t minor_rev; // driver ABI rev -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint16_t gpu_major_rev; // driver GPU ABI rev uint16_t gpu_minor_rev; // driver GPU ABI rev #endif @@ -118,7 +110,7 @@ struct rv_event_ring { struct psm2_rv { int fd; /* file handle used to issue ioctls to rv driver */ -#if defined(NVIDIA_GPU_DIRECT) || defined(INTEL_GPU_DIRECT) +#ifdef RV_CAP_GPU_DIRECT unsigned int ioctl_gpu_pin_mmap; #endif unsigned int ioctl_reg_mem; @@ -153,7 +145,7 @@ typedef struct psm3_rv_mr *psm3_rv_mr_t; #define psm3_rv_cache_stats rv_cache_stats_params_out -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #define psm3_rv_gpu_cache_stats rv_gpu_cache_stats_params_out #endif @@ -171,7 +163,7 @@ static inline uint16_t psm3_rv_get_user_minor_bldtime_version(void) return RV_ABI_VER_MINOR; } -#if defined(NVIDIA_GPU_DIRECT) || defined(INTEL_GPU_DIRECT) +#ifdef RV_CAP_GPU_DIRECT static inline uint16_t psm3_rv_get_gpu_user_major_bldtime_version(void) { return RV_GPU_ABI_VER_MAJOR; @@ -181,8 +173,6 @@ static inline uint16_t psm3_rv_get_gpu_user_minor_bldtime_version(void) { return RV_GPU_ABI_VER_MINOR; } - -extern uint64_t psm3_min_gpu_bar_size(void); #endif extern int psm3_rv_available(); @@ -194,7 +184,7 @@ extern int psm3_rv_close(psm3_rv_t rv); extern int psm3_rv_get_cache_stats(psm3_rv_t rv, struct psm3_rv_cache_stats *stats); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU extern int psm3_rv_gpu_get_cache_stats(psm3_rv_t rv, struct psm3_rv_gpu_cache_stats *stats); #endif @@ -221,14 +211,16 @@ extern int psm3_rv_disconnect(psm3_rv_conn_t conn); extern void psm3_rv_destroy_conn(psm3_rv_conn_t conn); +#ifdef PSM_HAVE_REG_MR extern psm3_rv_mr_t psm3_rv_reg_mem(psm3_rv_t rv, int cmd_fd, struct ibv_pd *pd, void 
*addr, uint64_t length, int access -#ifdef PSM_ONEAPI - , uint64_t alloc_id +#ifdef PSM_HAVE_GPU + , union psm3_verbs_mr_gpu_specific *gpu_specific #endif ); extern int psm3_rv_dereg_mem(psm3_rv_t rv, psm3_rv_mr_t mr); +#endif extern void * psm3_rv_pin_and_mmap(psm3_rv_t rv, uintptr_t pageaddr, uint64_t pagelen, int access); @@ -240,7 +232,7 @@ extern int64_t psm3_rv_evict_range(psm3_rv_t rv, void *addr, uint64_t length); extern int64_t psm3_rv_evict_amount(psm3_rv_t rv, uint64_t bytes, uint32_t count); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU extern int64_t psm3_rv_evict_gpu_range(psm3_rv_t rv, uintptr_t addr, uint64_t length); diff --git a/prov/psm3/psm3/psm_sysbuf.c b/prov/psm3/psm3/psm_sysbuf.c index 698507e8528..cc87e160273 100644 --- a/prov/psm3/psm3/psm_sysbuf.c +++ b/prov/psm3/psm3/psm_sysbuf.c @@ -99,7 +99,7 @@ void psm3_mq_sysbuf_init(psm2_mq_t mq) // eager message size (aka PSM3_MTU). // replenishing_rate is how many we add to pool at a time, there is // no upper bound to the pool. -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint32_t gpu_block_sizes[] = {256, 512, 1024, 2048, 4096, 8192, 65536, 262144, (uint32_t)-1}; uint32_t gpu_replenishing_rate[] = {128, 64, 32, 16, 8, 4, 2, 2, 0}; uint32_t block_sizes[] = {256, 512, 1024, 2048, 4096, 8192, (uint32_t)-1, (uint32_t)-1, (uint32_t)-1}; @@ -111,8 +111,8 @@ void psm3_mq_sysbuf_init(psm2_mq_t mq) if (mq->mem_ctrl_is_init) return; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (PSMI_IS_GPU_ENABLED) { +#ifdef PSM_HAVE_GPU + if (PSM3_GPU_IS_ENABLED) { memcpy(block_sizes, gpu_block_sizes, sizeof(block_sizes)); memcpy(replenishing_rate, gpu_replenishing_rate, sizeof(replenishing_rate)); } @@ -160,36 +160,7 @@ void psm3_mq_sysbuf_fini(psm2_mq_t mq) // free all buffers that is currently no for (i=0; i < MM_NUM_OF_POOLS; i++) { while ((block = mq->handler_index[i].free_list) != NULL) { mq->handler_index[i].free_list = block->next; -#if defined(PSM_CUDA) && !defined(PSM_NO_CUDA_REGISTER) - if (PSMI_IS_GPU_ENABLED && cu_ctxt) { - /* ignore NOT_REGISTERED in case cuda initialized late */ - /* ignore other errors as context could be destroyed before this */ - CUresult cudaerr; - //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, - // cuMemHostUnregister, block); - psmi_count_cuMemHostUnregister++; - cudaerr = psmi_cuMemHostUnregister(block); - if (cudaerr) { - const char *pStr = NULL; - psmi_count_cuGetErrorString++; - psmi_cuGetErrorString(cudaerr, &pStr); - _HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n", - cudaerr, pStr?pStr:"Unknown"); - } - } -#endif -#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - if (PSMI_IS_GPU_ENABLED) { - ze_result_t result; - //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, block); - psmi_count_zexDriverReleaseImportedPointer++; - result = psmi_zexDriverReleaseImportedPointer(ze_driver, - block); - if (result != ZE_RESULT_SUCCESS) { - _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); - } - } -#endif + PSM3_GPU_UNREGISTER_HOSTMEM(block); psmi_free(block); } } @@ -229,20 +200,9 @@ void *psm3_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t alloc_size) new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz); if (new_block) { -#if defined(PSM_CUDA) && !defined(PSM_NO_CUDA_REGISTER) - // for transient buffers, no use Importing, adds cost for - // CPU copy, just pay GPU cost on the copy, we use once & free - //if (PSMI_IS_GPU_ENABLED 
&& check_have_cuda_ctxt()) - // PSMI_CUDA_CALL(cuMemHostRegister, new_block, newsz, - // CU_MEMHOSTALLOC_PORTABLE); -#endif -#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) // for transient buffers, no use Importing, adds cost for // CPU copy, just pay GPU cost on the copy, we use once & free - //if (PSMI_IS_GPU_ENABLED) - // PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, ze_driver, - // new_block, newsz); -#endif + //PSM3_GPU_REGISTER_HOSTMEM(new_block, newsz); new_block->mem_handler = mm_handler; new_block++; mm_handler->total_alloc++; @@ -257,22 +217,9 @@ void *psm3_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t alloc_size) new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz); if (new_block) { -#if defined(PSM_CUDA) && !defined(PSM_NO_CUDA_REGISTER) - // By registering memory with Cuds, we make - // cuMemcpy* run faster for copies between - // GPU and this sysbuf - if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt()) - PSMI_CUDA_CALL(cuMemHostRegister, new_block, newsz, - CU_MEMHOSTALLOC_PORTABLE); -#endif -#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - // By registering memory with Level Zero, we make - // zeCommandListAppendMemoryCopy run faster for copies between - // GPU and this sysbuf - if (PSMI_IS_GPU_ENABLED) - PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, ze_driver, - new_block, newsz); -#endif + // By registering memory with GPU, we make GPU memcpy + // run faster for copies between GPU and this sysbuf + PSM3_GPU_REGISTER_HOSTMEM(new_block, newsz); mm_handler->current_available++; mm_handler->total_alloc++; mq->mem_ctrl_total_bytes += newsz; @@ -309,22 +256,9 @@ void psm3_mq_sysbuf_free(psm2_mq_t mq, void * mem_to_free) mm_handler = block_to_free->mem_handler; if (mm_handler->flags & MM_FLAG_TRANSIENT) { -#if defined(PSM_CUDA) && !defined(PSM_NO_CUDA_REGISTER) // for transient buffers, no use Importing, adds cost for // CPU copy, just pay GPU cost on the copy, we use once & free - //if (PSMI_IS_GPU_ENABLED && cu_ctxt) { - // /* ignore NOT_REGISTERED in case cuda initialized late */ - // CUresult cudaerr; - // PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, - // cuMemHostUnregister, block_to_free); - //} -#endif -#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - // for transient buffers, no use Importing, adds cost for - // CPU copy, just pay GPU cost on the copy, we use once & free - //if (PSMI_IS_GPU_ENABLED) - // PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, block_to_free); -#endif + // PSM3_GPU_UNREGISTER_HOSTMEM(block_to_free); psmi_free(block_to_free); } else { block_to_free->next = mm_handler->free_list; diff --git a/prov/psm3/psm3/psm_sysbuf.h b/prov/psm3/psm3/psm_sysbuf.h index 31ff116d088..5ab4604b014 100644 --- a/prov/psm3/psm3/psm_sysbuf.h +++ b/prov/psm3/psm3/psm_sysbuf.h @@ -58,7 +58,7 @@ #include "psm_user.h" -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #define MM_NUM_OF_POOLS 9 #else #define MM_NUM_OF_POOLS 7 diff --git a/prov/psm3/psm3/psm_user.h b/prov/psm3/psm3/psm_user.h index 28a6e9de4dd..fa2e739b004 100644 --- a/prov/psm3/psm3/psm_user.h +++ b/prov/psm3/psm3/psm_user.h @@ -60,24 +60,6 @@ extern "C" { #endif -#if defined(PSM_CUDA) -// if defined, do not use cuMemHostRegister for malloced pipeline -// copy bounce buffers -// otherwise, use cuMemHostRegister when malloc buffer -//#define PSM3_NO_CUDA_REGISTER -#endif - -#if defined(PSM_ONEAPI) -// if defined, use malloc for pipeline copy bounce buffers -// otherwise, use zeMemAllocHost -//#define 
PSM3_USE_ONEAPI_MALLOC - -// if defined, do not use zexDriverImportExternalPointer for malloced pipeline -// copy bounce buffers -// otherwise, use zexDriverImportExternalPointer when malloc buffer -//#define PSM3_NO_ONEAPI_IMPORT -#endif - /* Instead of testing a HAL cap mask bit at runtime (in addition to thresholds), * we only test thresholds, especially in the ips_proto_mq.c datapath. * To allow for slightly more optimized builds, a few build time capability @@ -107,6 +89,10 @@ extern "C" { #ifdef PSM_VERBS #define PSM_HAVE_RDMA #endif + +// psm_config.h will define PSM_HAVE_GPU as needed +#include "psm_config.h" + #ifdef RNDV_MOD /* This is used to guard all RNDV_MOD code in the main parts of PSM * so that RNDV_MOD code is only really enabled when a HAL present is able @@ -117,17 +103,16 @@ extern "C" { * HALs instead of testing specific HAL flags like PSM_VERBS or PSM_SOCKETS. * Thus, when adding a new HAL, the generic code need not be revisited. */ -#if defined(PSM_VERBS) || (defined(PSM_SOCKETS) && (defined(PSM_CUDA) || defined(PSM_ONEAPI))) +#if defined(PSM_VERBS) || (defined(PSM_SOCKETS) && defined(PSM_HAVE_GPU)) #define PSM_HAVE_RNDV_MOD -#endif /* VERBS || (SOCKETS && (CUDA||ONEAPI)) */ +#endif /* VERBS || (SOCKETS && GPU) */ #endif /* RNDV_MOD */ -#if (defined(PSM_CUDA) || defined(PSM_ONEAPI)) && defined(PSM_USE_HWLOC) +#if defined(PSM_HAVE_GPU) && defined(PSM_USE_HWLOC) #define PSM_HAVE_GPU_CENTRIC_AFFINITY #endif -#include "psm_config.h" #include #include @@ -148,25 +133,6 @@ extern "C" { #include "psm_log.h" #include "psm_perf.h" -#ifdef PSM_CUDA -#ifndef PSM_CUDA_MOCK -#include -#include -#include - -#if CUDA_VERSION < 7000 -#error Please update CUDA driver, required minimum version is 7.0 -#endif -#else -// included in stand-alone unit test that does not use real CUDA functions -#include "psmi_cuda_mock.h" -#endif /* PSM_CUDA_MOCK */ -#elif defined(PSM_ONEAPI) -#include -#include -#endif - - #define PSMI_LOCK_NO_OWNER ((pthread_t)(-1)) #define _PSMI_IN_USER_H @@ -182,9 +148,10 @@ typedef void *psmi_hal_hw_context; #include "psm_utils.h" #include "psm_timer.h" #include "psm_mpool.h" +#include "gpu/psm_gpu_hal.h" #ifdef PSM_HAVE_REG_MR #include "psm_verbs_mr.h" -#ifdef RNDV_MOD +#ifdef PSM_HAVE_RNDV_MOD #include "psm_rndv_mod.h" #endif #endif @@ -427,28 +394,11 @@ void psmi_profile_reblock(int did_no_progress) __attribute__ ((weak)); #define PSMI_PROFILE_REBLOCK(noprog) #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) -extern int is_gdr_copy_enabled; -/* This limit dictates when the sender turns off - * GDR Copy and uses SDMA. The limit needs to be less than equal - * GPU RNDV threshold (psm3_gpu_thresh_rndv) - * set to 0 if GDR Copy disabled - */ -extern uint32_t gdr_copy_limit_send; -/* This limit dictates when the reciever turns off - * GDR Copy. The limit needs to be less than equal - * GPU RNDV threshold (psm3_gpu_thresh_rndv) - * set to 0 if GDR Copy disabled - */ -extern uint32_t gdr_copy_limit_recv; -extern int is_gpudirect_enabled; // only for use during parsing of other params -extern int _device_support_gpudirect; -extern uint32_t gpudirect_rdma_send_limit; -extern uint32_t gpudirect_rdma_recv_limit; -extern uint32_t psm3_gpu_thresh_rndv; - -#define MAX_ZE_DEVICES 8 +#define COMPILE_TIME_ASSERT(NAME,COND) extern char NAME[1/ COND] +#ifdef PSM_HAVE_GPU +// Host bounce buffers. Used during pipelined GPU copies for +// large rendezvous IOs. 
struct ips_gpu_hostbuf { STAILQ_ENTRY(ips_gpu_hostbuf) req_next; STAILQ_ENTRY(ips_gpu_hostbuf) next; @@ -457,785 +407,14 @@ struct ips_gpu_hostbuf { * pulled from a mpool or dynamically * allocated using calloc. */ uint8_t is_tempbuf; -#ifdef PSM_CUDA - CUevent copy_status; -#elif defined(PSM_ONEAPI) - ze_event_pool_handle_t event_pool; - ze_command_list_handle_t command_lists[MAX_ZE_DEVICES]; - ze_event_handle_t copy_status; - int cur_dev_inx; -#endif + uint8_t pad1; + uint16_t pad2; + // aligned to 64 bit boundary + union gpu_hostbuf_gpu_specific gpu_specific; psm2_mq_req_t req; void* host_buf; void* gpu_buf; }; -#endif - -#ifdef PSM_CUDA - -extern int is_cuda_enabled; -extern int _device_support_unified_addr; -extern int _gpu_p2p_supported; -extern int my_gpu_device; -extern int cuda_lib_version; -extern int cuda_runtime_ver; -extern CUcontext cu_ctxt; -extern void *psmi_cuda_lib; -#endif // PSM_CUDA - -#ifdef PSM_ONEAPI - -int psmi_oneapi_ze_initialize(void); -psm2_error_t psm3_ze_init_fds(void); -int *psm3_ze_get_dev_fds(int *nfds); - -extern int is_oneapi_ze_enabled; -extern int _gpu_p2p_supported; -extern int my_gpu_device; -#ifndef PSM_HAVE_PIDFD -extern int psm3_num_ze_dev_fds; -#endif - -struct ze_dev_ctxt { - ze_device_handle_t dev; - int dev_index; /* Index in ze_devices[] */ - uint32_t ordinal; /* CmdQGrp ordinal for the 1st copy_only engine */ - uint32_t index; /* Cmdqueue index within the CmdQGrp */ - uint32_t num_queues; /* Number of queues in the CmdQGrp */ - // for most sync copies - ze_command_queue_handle_t cq; // NULL if psm3_oneapi_immed_sync_copy - ze_command_list_handle_t cl; - // fields below are only used for large DTOD sync copy so can do 2 - // parallel async copies then wait for both - ze_event_handle_t copy_status0; - ze_event_handle_t copy_status1; - ze_command_list_handle_t async_cl0; - ze_command_list_handle_t async_cl1; - ze_command_queue_handle_t async_cq0;// NULL if psm3_oneapi_immed_sync_copy - ze_command_queue_handle_t async_cq1;// NULL if psm3_oneapi_immed_sync_copy - ze_event_pool_handle_t event_pool; -}; - -extern ze_api_version_t zel_api_version; -extern zel_version_t zel_lib_version; -extern ze_context_handle_t ze_context; -extern ze_driver_handle_t ze_driver; -extern struct ze_dev_ctxt ze_devices[MAX_ZE_DEVICES]; -extern int num_ze_devices; -extern struct ze_dev_ctxt *cur_ze_dev; -extern int psm3_oneapi_immed_sync_copy; -extern int psm3_oneapi_immed_async_copy; -extern unsigned psm3_oneapi_parallel_dtod_copy_thresh; - -const char* psmi_oneapi_ze_result_to_string(const ze_result_t result); -void psmi_oneapi_async_cmd_create(struct ze_dev_ctxt *ctxt, - ze_command_queue_handle_t *p_cq, ze_command_list_handle_t *p_cl); -#ifndef PSM_HAVE_PIDFD -psm2_error_t psm3_sock_detach(ptl_t *ptl_gen); -psm2_error_t psm3_ze_init_ipc_socket(ptl_t *ptl_gen); -psm2_error_t psm3_send_dev_fds(ptl_t *ptl_gen, psm2_epaddr_t epaddr); -psm2_error_t psm3_check_dev_fds_exchanged(ptl_t *ptl_gen, psm2_epaddr_t epaddr); -psm2_error_t psm3_poll_dev_fds_exchange(ptl_t *ptl_gen); -#endif - -#ifdef PSM3_USE_ONEAPI_MALLOC -void *psm3_oneapi_ze_host_alloc_malloc(unsigned size); -void psm3_oneapi_ze_host_free_malloc(void *ptr); -#else -extern void *(*psm3_oneapi_ze_host_alloc)(unsigned size); -extern void (*psm3_oneapi_ze_host_free)(void *ptr); -extern int psm3_oneapi_ze_using_zemem_alloc; -#endif -extern void psm3_oneapi_ze_can_use_zemem(); - -void psmi_oneapi_ze_memcpy(void *dstptr, const void *srcptr, size_t size); -void psmi_oneapi_ze_memcpy_DTOD(void *dstptr, const void *srcptr, 
size_t size); - -static inline -int device_support_gpudirect() -{ - if (likely(_device_support_gpudirect > -1)) return _device_support_gpudirect; - - /* Is there any device property that can indicate this? */ - _device_support_gpudirect = 1; - return _device_support_gpudirect; -} -#endif // PSM_ONEAPI - -#ifdef PSM_CUDA -extern CUresult (*psmi_cuInit)(unsigned int Flags ); -extern CUresult (*psmi_cuCtxDetach)(CUcontext c); -extern CUresult (*psmi_cuCtxGetCurrent)(CUcontext *c); -extern CUresult (*psmi_cuCtxSetCurrent)(CUcontext c); -extern CUresult (*psmi_cuPointerGetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); -extern CUresult (*psmi_cuPointerSetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); -extern CUresult (*psmi_cuDeviceCanAccessPeer)(int *canAccessPeer, CUdevice dev, CUdevice peerDev); -extern CUresult (*psmi_cuDeviceGet)(CUdevice* device, int ordinal); -extern CUresult (*psmi_cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev); -extern CUresult (*psmi_cuDriverGetVersion)(int* driverVersion); -extern CUresult (*psmi_cuDeviceGetCount)(int* count); -extern CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags); -extern CUresult (*psmi_cuStreamDestroy)(CUstream phStream); -extern CUresult (*psmi_cuStreamSynchronize)(CUstream phStream); -extern CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags); -extern CUresult (*psmi_cuEventDestroy)(CUevent hEvent); -extern CUresult (*psmi_cuEventQuery)(CUevent hEvent); -extern CUresult (*psmi_cuEventRecord)(CUevent hEvent, CUstream hStream); -extern CUresult (*psmi_cuEventSynchronize)(CUevent hEvent); -extern CUresult (*psmi_cuMemHostAlloc)(void** pp, size_t bytesize, unsigned int Flags); -extern CUresult (*psmi_cuMemFreeHost)(void* p); -extern CUresult (*psmi_cuMemHostRegister)(void* p, size_t bytesize, unsigned int Flags); -extern CUresult (*psmi_cuMemHostUnregister)(void* p); -extern CUresult (*psmi_cuMemcpy)(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); -extern CUresult (*psmi_cuMemcpyDtoD)(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); -extern CUresult (*psmi_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); -extern CUresult (*psmi_cuMemcpyHtoD)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount); -extern CUresult (*psmi_cuMemcpyDtoHAsync)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); -extern CUresult (*psmi_cuMemcpyHtoDAsync)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount, CUstream hStream); -extern CUresult (*psmi_cuIpcGetMemHandle)(CUipcMemHandle* pHandle, CUdeviceptr dptr); -extern CUresult (*psmi_cuIpcOpenMemHandle)(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags); -extern CUresult (*psmi_cuIpcCloseMemHandle)(CUdeviceptr dptr); -extern CUresult (*psmi_cuMemGetAddressRange)(CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr); -extern CUresult (*psmi_cuDevicePrimaryCtxGetState)(CUdevice dev, unsigned int* flags, int* active); -extern CUresult (*psmi_cuDevicePrimaryCtxRetain)(CUcontext* pctx, CUdevice dev); -extern CUresult (*psmi_cuCtxGetDevice)(CUdevice* device); -extern CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device); -extern CUresult (*psmi_cuGetErrorString)(CUresult error, const char **pStr); -extern cudaError_t (*psmi_cudaRuntimeGetVersion)(int* runtimeVersion); -#endif // PSM_CUDA - -#ifdef PSM_ONEAPI -extern ze_result_t (*psmi_zeInit)(ze_init_flags_t flags); -extern ze_result_t (*psmi_zeDriverGet)(uint32_t *pCount, 
ze_driver_handle_t *phDrivers); -#ifndef PSM3_NO_ONEAPI_IMPORT -extern ze_result_t (*psmi_zexDriverImportExternalPointer)(ze_driver_handle_t hDriver, void *ptr, size_t size); -extern ze_result_t (*psmi_zexDriverReleaseImportedPointer)(ze_driver_handle_t hDriver, void *ptr); -#endif -extern ze_result_t (*psmi_zeDeviceGet)(ze_driver_handle_t hDriver, uint32_t *pCount, ze_device_handle_t *phDevices); -extern ze_result_t (*psmi_zeDevicePciGetPropertiesExt)(ze_device_handle_t hDevice, ze_pci_ext_properties_t *pPciProperties); -#ifndef PSM3_NO_ONEAPI_IMPORT -extern ze_result_t (*psmi_zeDriverGetExtensionFunctionAddress)(ze_driver_handle_t hDriver, const char *name, void **ppFunctionAddress); -#endif -extern ze_result_t (*psmi_zeContextCreate)(ze_driver_handle_t hDriver, const ze_context_desc_t *desc, ze_context_handle_t *phContext); -extern ze_result_t (*psmi_zeContextDestroy)(ze_context_handle_t hContext); -extern ze_result_t (*psmi_zeCommandQueueCreate)(ze_context_handle_t hContext, ze_device_handle_t hDevice,const ze_command_queue_desc_t *desc, ze_command_queue_handle_t *phCommandQueue); -extern ze_result_t (*psmi_zeCommandQueueDestroy)(ze_command_queue_handle_t hCommandQueue); -extern ze_result_t (*psmi_zeCommandQueueExecuteCommandLists)(ze_command_queue_handle_t hCommandQueue, uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, ze_fence_handle_t hFence); -extern ze_result_t (*psmi_zeCommandQueueSynchronize)(ze_command_queue_handle_t hCommandQueue, uint64_t timeout); -extern ze_result_t (*psmi_zeCommandListCreate)(ze_context_handle_t hContext, ze_device_handle_t hDevice, const ze_command_list_desc_t *desc, ze_command_list_handle_t *phCommandList); -extern ze_result_t (*psmi_zeCommandListDestroy)(ze_command_list_handle_t hCommandList); -extern ze_result_t (*psmi_zeCommandListClose)(ze_command_list_handle_t hCommandList); -extern ze_result_t (*psmi_zeCommandListReset)(ze_command_list_handle_t hCommandList); -extern ze_result_t (*psmi_zeCommandListCreateImmediate)(ze_context_handle_t hContext, ze_device_handle_t hDevice, const ze_command_queue_desc_t *desc, ze_command_list_handle_t *phCommandList); -extern ze_result_t (*psmi_zeCommandListAppendMemoryCopy)(ze_command_list_handle_t hCommandList, void *dstptr, const void *srcptr, size_t size, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents); -extern ze_result_t (*psmi_zeCommandListAppendSignalEvent)(ze_command_list_handle_t hCommandList, ze_event_handle_t hEvent); -extern ze_result_t (*psmi_zeDeviceCanAccessPeer)(ze_device_handle_t hDevice, ze_device_handle_t hPeerDevice, ze_bool_t *value); -extern ze_result_t (*psmi_zeDeviceGetCommandQueueGroupProperties)(ze_device_handle_t hDevice, uint32_t *pCount, ze_command_queue_group_properties_t *pCommandQueueGroupProperties); -extern ze_result_t (*psmi_zeMemAllocHost)(ze_context_handle_t hContext, const ze_host_mem_alloc_desc_t *host_desc, size_t size, size_t alignment, void **pptr); -extern ze_result_t (*psmi_zeMemAllocDevice)(ze_context_handle_t hContext, const ze_device_mem_alloc_desc_t *device_desc, size_t size, size_t alignment, ze_device_handle_t hDevice, void **pptr); -extern ze_result_t (*psmi_zeMemFree)(ze_context_handle_t hContext, void *ptr); -extern ze_result_t (*psmi_zeMemGetIpcHandle)(ze_context_handle_t hContext, const void *ptr, ze_ipc_mem_handle_t *pIpcHandle); -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE -extern ze_result_t (*psmi_zeMemGetIpcHandleFromFileDescriptorExp)(ze_context_handle_t hContext, uint64_t handle, ze_ipc_mem_handle_t 
*pIpcHandle); -extern ze_result_t (*psmi_zeMemGetFileDescriptorFromIpcHandleExp)(ze_context_handle_t hContext, ze_ipc_mem_handle_t ipcHandle, uint64_t *pHandle); -extern ze_result_t (*psmi_zeMemPutIpcHandle)(ze_context_handle_t hContext, ze_ipc_mem_handle_t handle); -#endif -extern ze_result_t (*psmi_zeMemOpenIpcHandle)(ze_context_handle_t hContext,ze_device_handle_t hDevice, ze_ipc_mem_handle_t handle, ze_ipc_memory_flags_t flags, void **pptr); -extern ze_result_t (*psmi_zeMemCloseIpcHandle)(ze_context_handle_t hContext, const void *ptr); -extern ze_result_t (*psmi_zeMemGetAddressRange)(ze_context_handle_t hContext, const void *ptr, void **pBase, size_t *pSize); -extern ze_result_t (*psmi_zeMemGetAllocProperties)(ze_context_handle_t hContext, const void *ptr, ze_memory_allocation_properties_t *pMemAllocProperties, ze_device_handle_t *phDevice); -extern ze_result_t (*psmi_zeEventPoolCreate)(ze_context_handle_t hContext, const ze_event_pool_desc_t *desc, uint32_t numDevices, ze_device_handle_t *phDevices, ze_event_pool_handle_t *phEventPool); -extern ze_result_t (*psmi_zeEventPoolDestroy)(ze_event_pool_handle_t hEventPool); -extern ze_result_t (*psmi_zeEventCreate)(ze_event_pool_handle_t hEventPool, const ze_event_desc_t *desc, ze_event_handle_t *phEvent); -extern ze_result_t (*psmi_zeEventDestroy)(ze_event_handle_t hEvent); -extern ze_result_t (*psmi_zeEventQueryStatus)(ze_event_handle_t hEvent); -extern ze_result_t (*psmi_zeEventHostSynchronize)(ze_event_handle_t hEvent, uint64_t timeout); -extern ze_result_t (*psmi_zeEventHostReset)(ze_event_handle_t hEvent); -extern ze_result_t (*psmi_zelLoaderGetVersions)(size_t *num_elems, zel_component_version_t *versions); - -#endif // PSM_ONEAPI - -#ifdef PSM_CUDA -extern uint64_t psmi_count_cuInit; -extern uint64_t psmi_count_cuCtxDetach; -extern uint64_t psmi_count_cuCtxGetCurrent; -extern uint64_t psmi_count_cuCtxSetCurrent; -extern uint64_t psmi_count_cuPointerGetAttribute; -extern uint64_t psmi_count_cuPointerSetAttribute; -extern uint64_t psmi_count_cuDeviceCanAccessPeer; -extern uint64_t psmi_count_cuDeviceGet; -extern uint64_t psmi_count_cuDeviceGetAttribute; -extern uint64_t psmi_count_cuDriverGetVersion; -extern uint64_t psmi_count_cuDeviceGetCount; -extern uint64_t psmi_count_cuStreamCreate; -extern uint64_t psmi_count_cuStreamDestroy; -extern uint64_t psmi_count_cuStreamSynchronize; -extern uint64_t psmi_count_cuEventCreate; -extern uint64_t psmi_count_cuEventDestroy; -extern uint64_t psmi_count_cuEventQuery; -extern uint64_t psmi_count_cuEventRecord; -extern uint64_t psmi_count_cuEventSynchronize; -extern uint64_t psmi_count_cuMemHostAlloc; -extern uint64_t psmi_count_cuMemFreeHost; -extern uint64_t psmi_count_cuMemHostRegister; -extern uint64_t psmi_count_cuMemHostUnregister; -extern uint64_t psmi_count_cuMemcpy; -extern uint64_t psmi_count_cuMemcpyDtoD; -extern uint64_t psmi_count_cuMemcpyDtoH; -extern uint64_t psmi_count_cuMemcpyHtoD; -extern uint64_t psmi_count_cuMemcpyDtoHAsync; -extern uint64_t psmi_count_cuMemcpyHtoDAsync; -extern uint64_t psmi_count_cuIpcGetMemHandle; -extern uint64_t psmi_count_cuIpcOpenMemHandle; -extern uint64_t psmi_count_cuIpcCloseMemHandle; -extern uint64_t psmi_count_cuMemGetAddressRange; -extern uint64_t psmi_count_cuDevicePrimaryCtxGetState; -extern uint64_t psmi_count_cuDevicePrimaryCtxRetain; -extern uint64_t psmi_count_cuCtxGetDevice; -extern uint64_t psmi_count_cuDevicePrimaryCtxRelease; -extern uint64_t psmi_count_cuGetErrorString; -extern uint64_t psmi_count_cudaRuntimeGetVersion; -#endif // 
PSM_CUDA - -#ifdef PSM_ONEAPI -extern uint64_t psmi_count_zeInit; -extern uint64_t psmi_count_zeDriverGet; -#ifndef PSM3_NO_ONEAPI_IMPORT -extern uint64_t psmi_count_zexDriverImportExternalPointer; -extern uint64_t psmi_count_zexDriverReleaseImportedPointer; -#endif -extern uint64_t psmi_count_zeDeviceGet; -extern uint64_t psmi_count_zeDevicePciGetPropertiesExt; -#ifndef PSM3_NO_ONEAPI_IMPORT -extern uint64_t psmi_count_zeDriverGetExtensionFunctionAddress; -#endif -extern uint64_t psmi_count_zeContextCreate; -extern uint64_t psmi_count_zeContextDestroy; -extern uint64_t psmi_count_zeCommandQueueCreate; -extern uint64_t psmi_count_zeCommandQueueDestroy; -extern uint64_t psmi_count_zeCommandQueueExecuteCommandLists; -extern uint64_t psmi_count_zeCommandQueueSynchronize; -extern uint64_t psmi_count_zeCommandListCreate; -extern uint64_t psmi_count_zeCommandListDestroy; -extern uint64_t psmi_count_zeCommandListClose; -extern uint64_t psmi_count_zeCommandListReset; -extern uint64_t psmi_count_zeCommandListCreateImmediate; -extern uint64_t psmi_count_zeCommandListAppendMemoryCopy; -extern uint64_t psmi_count_zeCommandListAppendSignalEvent; -extern uint64_t psmi_count_zeDeviceCanAccessPeer; -extern uint64_t psmi_count_zeDeviceGetCommandQueueGroupProperties; -extern uint64_t psmi_count_zeMemAllocHost; -extern uint64_t psmi_count_zeMemAllocDevice; -extern uint64_t psmi_count_zeMemFree; -extern uint64_t psmi_count_zeMemGetIpcHandle; -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE -extern uint64_t psmi_count_zeMemGetIpcHandleFromFileDescriptorExp; -extern uint64_t psmi_count_zeMemGetFileDescriptorFromIpcHandleExp; -extern uint64_t psmi_count_zeMemPutIpcHandle; -#endif -extern uint64_t psmi_count_zeMemOpenIpcHandle; -extern uint64_t psmi_count_zeMemCloseIpcHandle; -extern uint64_t psmi_count_zeMemGetAddressRange; -extern uint64_t psmi_count_zeMemGetAllocProperties; -extern uint64_t psmi_count_zeEventPoolCreate; -extern uint64_t psmi_count_zeEventPoolDestroy; -extern uint64_t psmi_count_zeEventCreate; -extern uint64_t psmi_count_zeEventDestroy; -extern uint64_t psmi_count_zeEventQueryStatus; -extern uint64_t psmi_count_zeEventHostSynchronize; -extern uint64_t psmi_count_zeEventHostReset; -extern uint64_t psmi_count_zelLoaderGetVersions; -#endif // PSM_ONEAPI - -#ifdef PSM_CUDA -static int check_set_cuda_ctxt(void) -{ - CUresult err; - CUcontext tmpctxt = {0}; - - if (unlikely(!psmi_cuCtxGetCurrent || !psmi_cuCtxSetCurrent)) - return 0; - - err = psmi_cuCtxGetCurrent(&tmpctxt); - if (likely(!err)) { - if (unlikely(!tmpctxt && cu_ctxt)) { - err = psmi_cuCtxSetCurrent(cu_ctxt); - return !!err; - } else if (unlikely(tmpctxt && !cu_ctxt)) { - cu_ctxt = tmpctxt; - } - } - return 0; -} - -/* Make sure have a real GPU job. Set cu_ctxt if available */ -PSMI_ALWAYS_INLINE( -int check_have_cuda_ctxt(void)) -{ - if (! cu_ctxt) { - if (unlikely(check_set_cuda_ctxt())) { \ - psm3_handle_error(PSMI_EP_NORETURN, \ - PSM2_INTERNAL_ERR, "Failed to set/synchronize" \ - " CUDA context.\n"); \ - } \ - } - return (cu_ctxt != NULL); -} - - -#define PSMI_CUDA_CALL(func, args...) 
do { \ - CUresult cudaerr; \ - if (unlikely(check_set_cuda_ctxt())) { \ - psm3_handle_error(PSMI_EP_NORETURN, \ - PSM2_INTERNAL_ERR, "Failed to set/synchronize" \ - " CUDA context.\n"); \ - } \ - psmi_count_##func++; \ - cudaerr = (CUresult)psmi_##func(args); \ - if (cudaerr != CUDA_SUCCESS) { \ - const char *pStr = NULL; \ - psmi_count_cuGetErrorString++; \ - psmi_cuGetErrorString(cudaerr, &pStr); \ - _HFI_ERROR( \ - "CUDA failure: %s() (at %s:%d)" \ - " returned %d: %s\n", \ - #func, __FILE__, __LINE__, cudaerr, \ - pStr?pStr:"Unknown"); \ - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ - "Error returned from CUDA function %s.\n", #func);\ - } \ - } while (0) -#endif // PSM_CUDA - -#ifdef PSM_ONEAPI - -#define PSMI_ONEAPI_ZE_CALL(func, args...) do { \ - ze_result_t result; \ - psmi_count_##func++; \ - result = psmi_##func(args); \ - if(result != ZE_RESULT_SUCCESS) { \ - _HFI_ERROR( "OneAPI Level Zero failure: %s() (at %s:%d)" \ - " returned 0x%x: %s\n", \ - #func, __FILE__, __LINE__, result, \ - psmi_oneapi_ze_result_to_string(result)); \ - psm3_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ - "Error returned from OneAPI Level Zero function %s.\n", #func); \ - } \ -} while (0) - -void psmi_oneapi_cmd_create_all(void); -void psmi_oneapi_cmd_destroy_all(void); -uint64_t psm3_oneapi_ze_get_alloc_id(void *addr, uint8_t *type); - -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE -#define ONEAPI_PUTQUEUE_SIZE -1 -#endif -psm2_error_t psmi_oneapi_putqueue_alloc(void); -void psmi_oneapi_putqueue_free(void); - -/* - * Two usages: - * (1) ctxt == NULL: check if the buffer is allocated from Level-zero. - * In this case, change cur_ze_dev if device has changed. - * (2) ctxt != NULL: try to get the device context. - * In this case, don't change cur_ze_dev. - */ -PSMI_ALWAYS_INLINE( -int -_psmi_is_oneapi_ze_mem(const void *ptr, struct ze_dev_ctxt **ctxt)) -{ - ze_memory_allocation_properties_t mem_props = { - ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES - }; - ze_device_handle_t dev; - ze_result_t result; - int ret = 0; - - psmi_count_zeMemGetAllocProperties++; - result = psmi_zeMemGetAllocProperties(ze_context, ptr, &mem_props, - &dev); - if (result == ZE_RESULT_SUCCESS && - (mem_props.type != ZE_MEMORY_TYPE_UNKNOWN)) { - ret = 1; - _HFI_VDBG("ptr %p type %d dev %p cur_ze_dev %p\n", - ptr, mem_props.type, dev, cur_ze_dev->dev); - /* - * Check if the gpu device has changed. - * If we are trying to get the device context (!ctxt), - * don't change cur_ze_dev. - * If the buffer is allocated through zeMemAllocHost, - * there will be no device associated with it (dev == NULL). - * In this case, use the current device context. 
- */ - if (!dev) { - if (ctxt) - *ctxt = cur_ze_dev; - return ret; - } - if (ctxt || (!ctxt && dev != cur_ze_dev->dev)) { - int i; - - for (i = 0; i < num_ze_devices; i++) { - if (ze_devices[i].dev == dev) { - if (ctxt) - *ctxt = &ze_devices[i]; - else - cur_ze_dev = &ze_devices[i]; - break; - } - } - _HFI_VDBG("check ze_device[%d-%d] for dev %p: no match\n", 0, num_ze_devices-1, dev); - } - } - - return ret; -} - - -PSMI_ALWAYS_INLINE( -struct ze_dev_ctxt * -psmi_oneapi_dev_ctxt_get(const void *ptr)) -{ - struct ze_dev_ctxt *ctxt = NULL; - - _psmi_is_oneapi_ze_mem(ptr, &ctxt); - - return ctxt; -} - -#define PSMI_IS_ONEAPI_ZE_ENABLED likely(is_oneapi_ze_enabled) -#define PSMI_IS_ONEAPI_ZE_DISABLED unlikely(!is_oneapi_ze_enabled) -#define PSMI_IS_ONEAPI_ZE_MEM(ptr) _psmi_is_oneapi_ze_mem(ptr, NULL) - -#endif // PSM_ONEAPI - -#ifdef PSM_CUDA -PSMI_ALWAYS_INLINE( -void verify_device_support_unified_addr()) -{ - if (likely(_device_support_unified_addr > -1)) return; - - int num_devices, dev; - - /* Check if all devices support Unified Virtual Addressing. */ - PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices); - - _device_support_unified_addr = 1; - - for (dev = 0; dev < num_devices; dev++) { - CUdevice device; - PSMI_CUDA_CALL(cuDeviceGet, &device, dev); - int unifiedAddressing; - PSMI_CUDA_CALL(cuDeviceGetAttribute, - &unifiedAddressing, - CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, - device); - - if (unifiedAddressing !=1) { - psm3_handle_error(PSMI_EP_NORETURN, PSM2_EP_DEVICE_FAILURE, - "CUDA device %d does not support Unified Virtual Addressing.\n", - dev); - } - } - - return; -} - -PSMI_ALWAYS_INLINE( -int device_support_gpudirect()) -{ - if (likely(_device_support_gpudirect > -1)) return _device_support_gpudirect; - - int num_devices, dev; - - /* Check if all devices support GPU Direct RDMA based on version. */ - PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices); - - _device_support_gpudirect = 1; - - for (dev = 0; dev < num_devices; dev++) { - CUdevice device; - PSMI_CUDA_CALL(cuDeviceGet, &device, dev); - - int major; - PSMI_CUDA_CALL(cuDeviceGetAttribute, - &major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device); - if (major < 3) { - _device_support_gpudirect = 0; - _HFI_INFO("CUDA device %d does not support GPUDirect RDMA (Non-fatal error)\n", dev); - } - } - - return _device_support_gpudirect; -} - -PSMI_ALWAYS_INLINE( -int gpu_p2p_supported()) -{ - if (likely(_gpu_p2p_supported > -1)) return _gpu_p2p_supported; - - _gpu_p2p_supported = 0; - - if (unlikely(!is_cuda_enabled)) { - _HFI_DBG("returning 0 (cuda disabled)\n"); - return 0; - } - - /* Check which devices the current device has p2p access to. 
*/ - CUdevice current_device; - CUcontext current_context; - int num_devices, dev_idx; - PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices); - - if (num_devices > 1) { - PSMI_CUDA_CALL(cuCtxGetCurrent, ¤t_context); - if (current_context == NULL) { - _HFI_INFO("Unable to find active CUDA context, assuming P2P not supported\n"); - return 0; - } - PSMI_CUDA_CALL(cuCtxGetDevice, ¤t_device); - } - - for (dev_idx = 0; dev_idx < num_devices; dev_idx++) { - CUdevice device; - PSMI_CUDA_CALL(cuDeviceGet, &device, dev_idx); - - if (num_devices > 1 && device != current_device) { - int canAccessPeer = 0; - PSMI_CUDA_CALL(cuDeviceCanAccessPeer, &canAccessPeer, - current_device, device); - - if (canAccessPeer != 1) - _HFI_DBG("CUDA device %d does not support P2P from current device (Non-fatal error)\n", dev_idx); - else - _gpu_p2p_supported |= (1 << dev_idx); - } else { - /* Always support p2p on the same GPU */ - my_gpu_device = dev_idx; - _gpu_p2p_supported |= (1 << dev_idx); - } - } - - _HFI_DBG("returning (0x%x), device 0x%x (%d)\n", _gpu_p2p_supported, (1 << my_gpu_device), my_gpu_device); - return _gpu_p2p_supported; -} - -/** - * Similar to PSMI_CUDA_CALL() except does not error out - * if func(args) returns CUDA_SUCCESS or except_err - * - * Invoker must provide 'CUresult cudaerr' in invoked scope - * so invoker can inspect whether cudaerr == CUDA_SUCCESS or - * cudaerr == except_err after expanded code is executed. - * - * As except_err is an allowed value, message is printed at - * DBG level. - */ -#define PSMI_CUDA_CALL_EXCEPT(except_err, func, args...) do { \ - if (unlikely(check_set_cuda_ctxt())) { \ - psm3_handle_error(PSMI_EP_NORETURN, \ - PSM2_INTERNAL_ERR, "Failed to " \ - "set/synchronize CUDA context.\n"); \ - } \ - psmi_count_##func++; \ - cudaerr = psmi_##func(args); \ - if (cudaerr != CUDA_SUCCESS && cudaerr != except_err) { \ - const char *pStr = NULL; \ - psmi_count_cuGetErrorString++; \ - psmi_cuGetErrorString(cudaerr, &pStr); \ - if (cu_ctxt == NULL) \ - _HFI_ERROR( \ - "Check if CUDA is initialized" \ - "before psm3_ep_open call \n"); \ - _HFI_ERROR( \ - "CUDA failure: %s() (at %s:%d)" \ - " returned %d: %s\n", \ - #func, __FILE__, __LINE__, cudaerr, \ - pStr?pStr:"Unknown"); \ - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ - "Error returned from CUDA function %s.\n", #func);\ - } else if (cudaerr == except_err) { \ - const char *pStr = NULL; \ - psmi_count_cuGetErrorString++; \ - psmi_cuGetErrorString(cudaerr, &pStr); \ - _HFI_DBG( \ - "CUDA non-zero return value: %s() (at %s:%d)" \ - " returned %d: %s\n", \ - #func, __FILE__, __LINE__, cudaerr, \ - pStr?pStr:"Unknown"); \ - } \ - } while (0) - -#define PSMI_CUDA_CHECK_EVENT(event, cudaerr) do { \ - psmi_count_cuEventQuery++; \ - cudaerr = psmi_cuEventQuery(event); \ - if ((cudaerr != CUDA_SUCCESS) && \ - (cudaerr != CUDA_ERROR_NOT_READY)) { \ - const char *pStr = NULL; \ - psmi_count_cuGetErrorString++; \ - psmi_cuGetErrorString(cudaerr, &pStr); \ - _HFI_ERROR( \ - "CUDA failure: %s() (at %s:%d) returned %d: %s\n", \ - "cuEventQuery", __FILE__, __LINE__, cudaerr, \ - pStr?pStr:"Unknown"); \ - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ - "Error returned from CUDA function cuEventQuery.\n");\ - } \ - } while (0) - -#define PSMI_CUDA_DLSYM(psmi_cuda_lib,func) do { \ - psmi_##func = dlsym(psmi_cuda_lib, STRINGIFY(func)); \ - if (!psmi_##func) { \ - psm3_handle_error(PSMI_EP_NORETURN, \ - PSM2_INTERNAL_ERR, \ - " Unable to resolve %s symbol" \ - " in CUDA libraries.\n",STRINGIFY(func));\ - } \ -} while (0) 
-#endif // PSM_CUDA - -#ifdef PSM_ONEAPI - -PSMI_ALWAYS_INLINE( -int gpu_p2p_supported()) -{ - - uint32_t num_devices = 0; - uint32_t dev; - ze_device_handle_t devices[MAX_ZE_DEVICES]; - - if (likely(_gpu_p2p_supported > -1)) return _gpu_p2p_supported; - - if (unlikely(!is_oneapi_ze_enabled)) { - _gpu_p2p_supported=0; - return 0; - } - - _gpu_p2p_supported = 0; - - PSMI_ONEAPI_ZE_CALL(zeDeviceGet, ze_driver, &num_devices, NULL); - if (num_devices > MAX_ZE_DEVICES) - num_devices = MAX_ZE_DEVICES; - PSMI_ONEAPI_ZE_CALL(zeDeviceGet, ze_driver, &num_devices, devices); - - for (dev = 0; dev < num_devices; dev++) { - ze_device_handle_t device; - device = devices[dev]; - - if (num_devices > 1 && device != cur_ze_dev->dev) { - ze_bool_t canAccessPeer = 0; - - PSMI_ONEAPI_ZE_CALL(zeDeviceCanAccessPeer, cur_ze_dev->dev, - device, &canAccessPeer); - if (canAccessPeer != 1) - _HFI_DBG("ONEAPI device %d does not support P2P from current device (Non-fatal error)\n", dev); - else - _gpu_p2p_supported |= (1 << dev); - } else { - /* Always support p2p on the same GPU */ - my_gpu_device = dev; - _gpu_p2p_supported |= (1 << dev); - } - } - - return _gpu_p2p_supported; -} - -#define PSMI_ONEAPI_ZE_DLSYM(lib_ptr, func) do { \ - psmi_##func = dlsym(lib_ptr, STRINGIFY(func)); \ - if (!psmi_##func) { \ - psm3_handle_error(PSMI_EP_NORETURN, \ - PSM2_INTERNAL_ERR, \ - "Unable to resolve %s symbol " \ - "in OneAPI Level Zero library.\n", STRINGIFY(func)); \ - } \ -} while (0) - -static inline -int _psm3_oneapi_ze_memcpy_done(const struct ips_gpu_hostbuf *ghb) -{ - ze_result_t result; - psmi_count_zeEventQueryStatus++; - - result = psmi_zeEventQueryStatus(ghb->copy_status); - if (result == ZE_RESULT_SUCCESS) { - return 1; - } else if (result == ZE_RESULT_NOT_READY) { - return 0; - } else { - _HFI_ERROR("OneAPI Level Zero failure: %s() (at %s:%d) returned 0x%x: %s\n", - "zeEventQueryStatus", __FILE__, __LINE__, result, - psmi_oneapi_ze_result_to_string(result)); - psm3_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "Error returned from OneAPI Level Zero function %s.\n", - "zeEventQueryStatus"); - } - return 0; -} - -#endif // PSM_ONEAPI - -#ifdef PSM_CUDA -PSMI_ALWAYS_INLINE( -int -_psmi_is_cuda_mem(const void *ptr)) -{ - CUresult cres; - CUmemorytype mt; - unsigned uvm = 0; - psmi_count_cuPointerGetAttribute++; - cres = psmi_cuPointerGetAttribute( - &mt, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr) ptr); - if ((cres == CUDA_SUCCESS) && (mt == CU_MEMORYTYPE_DEVICE)) { - psmi_count_cuPointerGetAttribute++; - cres = psmi_cuPointerGetAttribute( - &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr); - if ((cres == CUDA_SUCCESS) && (uvm == 0)) - return 1; - else - return 0; - } else - return 0; -} - -#define PSMI_IS_CUDA_ENABLED likely(is_cuda_enabled) -#define PSMI_IS_CUDA_DISABLED unlikely(!is_cuda_enabled) -#define PSMI_IS_CUDA_MEM(p) _psmi_is_cuda_mem(p) -extern void psm2_get_gpu_bars(void); - -/* - * CUDA documentation dictates the use of SYNC_MEMOPS attribute - * when the buffer pointer received into PSM has been allocated - * by the application. This guarantees that all memory operations - * to this region of memory (used by multiple layers of the stack) - * always synchronize. 
- */ -static inline -void psmi_cuda_set_attr_sync_memops(const void *ubuf) -{ - int true_flag = 1; - - PSMI_CUDA_CALL(cuPointerSetAttribute, &true_flag, - CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr) ubuf); -} - -static inline -int _psm3_cuda_memcpy_done(const struct ips_gpu_hostbuf *chb) -{ - CUresult status; - PSMI_CUDA_CHECK_EVENT(chb->copy_status, status); - return (status == CUDA_SUCCESS); -} - -#endif /* PSM_CUDA */ - -#define COMPILE_TIME_ASSERT(NAME,COND) extern char NAME[1/ COND] - -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - -extern uint64_t psm3_gpu_cache_evict; enum psm2_chb_match_type { /* Complete data found in a single chb */ @@ -1252,7 +431,7 @@ void psmi_gpu_hostbuf_alloc_func(int is_alloc, void *context, void *obj); #define GPU_HOSTBUFFER_LIMITS { \ .env = "PSM3_GPU_BOUNCEBUFFERS_MAX", \ - .descr = "Max CUDA bounce buffers (in MB)", \ + .descr = "Max GPU Pipeline bounce buffers (in MB)", \ .env_level = PSMI_ENVVAR_LEVEL_HIDDEN, \ .minval = 1, \ .maxval = 1<<30, \ @@ -1265,364 +444,8 @@ struct ips_gpu_hostbuf_mpool_cb_context { unsigned bufsz; }; -PSMI_ALWAYS_INLINE( -int -_psmi_is_gdr_copy_enabled()) -{ - return is_gdr_copy_enabled; -} - -// Only valid if called for a GPU buffer -#define PSMI_USE_GDR_COPY_RECV(len) ((len) >=1 && (len) <= gdr_copy_limit_recv) -#define PSMI_IS_GDR_COPY_ENABLED _psmi_is_gdr_copy_enabled() -#define PSM3_IS_BUFFER_GPU_MEM(buf, len) \ - ((len) && PSMI_IS_GPU_ENABLED && PSMI_IS_GPU_MEM(buf)) #endif -#ifdef PSM_CUDA - -#define PSM3_GPU_PREPARE_HTOD_MEMCPYS(protoexp) \ - do { \ - protoexp->cudastream_recv = NULL; \ - } while (0) -#define PSM3_GPU_PREPARE_DTOH_MEMCPYS(proto) \ - do { \ - proto->cudastream_send = NULL; \ - } while (0) -#define PSM3_GPU_SHUTDOWN_HTOD_MEMCPYS(protoexp) \ - do { \ - if (protoexp->cudastream_recv != NULL) { \ - PSMI_CUDA_CALL(cuStreamDestroy, \ - protoexp->cudastream_recv); \ - } \ - } while (0) -#define PSM3_GPU_SHUTDOWN_DTOH_MEMCPYS(proto) \ - do { \ - if (proto->cudastream_send) { \ - PSMI_CUDA_CALL(cuStreamDestroy, \ - proto->cudastream_send); \ - } \ - } while (0) -#define PSM3_GPU_MEMCPY_HTOD_START(protoexp, ghb, len) \ - do { \ - if (protoexp->cudastream_recv == NULL) { \ - PSMI_CUDA_CALL(cuStreamCreate, \ - &protoexp->cudastream_recv, \ - CU_STREAM_NON_BLOCKING); \ - } \ - PSMI_CUDA_CALL(cuMemcpyHtoDAsync, \ - (CUdeviceptr)ghb->gpu_buf, ghb->host_buf, \ - len, protoexp->cudastream_recv); \ - if (ghb->copy_status == NULL) { \ - PSMI_CUDA_CALL(cuEventCreate, \ - &ghb->copy_status, CU_EVENT_DEFAULT); \ - } \ - PSMI_CUDA_CALL(cuEventRecord, ghb->copy_status, \ - protoexp->cudastream_recv); \ - } while (0) -#define PSM3_GPU_MEMCPY_DTOH_START(proto, ghb, len) \ - do { \ - if (proto->cudastream_send == NULL) { \ - PSMI_CUDA_CALL(cuStreamCreate, \ - &proto->cudastream_send, \ - CU_STREAM_NON_BLOCKING); \ - } \ - if (ghb->copy_status == NULL) { \ - PSMI_CUDA_CALL(cuEventCreate, \ - &ghb->copy_status, CU_EVENT_DEFAULT); \ - } \ - PSMI_CUDA_CALL(cuMemcpyDtoHAsync, \ - ghb->host_buf, (CUdeviceptr)ghb->gpu_buf, \ - len, proto->cudastream_send); \ - PSMI_CUDA_CALL(cuEventRecord, ghb->copy_status, \ - proto->cudastream_send); \ - } while (0) -#define PSM3_GPU_MEMCPY_DONE(ghb) \ - _psm3_cuda_memcpy_done(ghb) -#define PSM3_GPU_HOSTBUF_LAZY_INIT(ghb) \ - do { \ - ghb->copy_status = NULL; \ - ghb->host_buf = NULL; \ - } while (0) -#define PSM3_GPU_HOSTBUF_RESET(ghb) \ - do { \ - } while (0) -#define PSM3_GPU_HOSTBUF_DESTROY(ghb) \ - do { \ - if (ghb->copy_status != NULL) { \ - PSMI_CUDA_CALL(cuEventDestroy, \ - 
ghb->copy_status); \ - } \ - if (ghb->host_buf != NULL) { \ - PSMI_CUDA_CALL(cuMemFreeHost, \ - ghb->host_buf); \ - } \ - } while (0) -#define PSM3_GPU_MEMCPY_DTOD(dstptr, srcptr, len) \ - do { PSMI_CUDA_CALL(cuMemcpyDtoD, (CUdeviceptr)(dstptr), (CUdeviceptr)(srcptr), (len)); } while (0) -#define PSM3_GPU_MEMCPY_HTOD(dstptr, srcptr, len) \ - do { PSMI_CUDA_CALL(cuMemcpyHtoD, (CUdeviceptr)(dstptr), (srcptr), (len)); } while (0) -#define PSM3_GPU_SYNCHRONIZE_MEMCPY() \ - do {PSMI_CUDA_CALL(cuStreamSynchronize, 0);} while (0) -#define PSM3_GPU_HOST_ALLOC(ret_ptr, size) \ - do { \ - PSMI_CUDA_CALL(cuMemHostAlloc, (void **)(ret_ptr), \ - (size),CU_MEMHOSTALLOC_PORTABLE); \ - } while (0) -#define PSM3_GPU_HOST_FREE(ptr) \ - do { \ - PSMI_CUDA_CALL(cuMemFreeHost, (void *)ptr); \ - } while (0) -// HOST_ALLOC memory treated as CPU memory for Verbs MRs -#define PSM3_GPU_ADDR_SEND_MR(mqreq) \ - ( (mqreq)->is_buf_gpu_mem && ! (mqreq)->gpu_hostbuf_used ) -#define PSM3_GPU_ADDR_RECV_MR(tidrecvc, mqreq) \ - ( (tidrecvc)->is_ptr_gpu_backed ) -#define PSM3_MARK_BUF_SYNCHRONOUS(buf) do { psmi_cuda_set_attr_sync_memops(buf); } while (0) -#define PSM3_GPU_MEMCPY_DTOH(dstptr, srcptr, len) \ - do { PSMI_CUDA_CALL(cuMemcpyDtoH, dstptr, (CUdeviceptr)(srcptr), len); } while (0) -#define PSM3_GPU_MEMCPY(dstptr, srcptr, len) \ - do { PSMI_CUDA_CALL(cuMemcpy, (CUdeviceptr)(dstptr), (CUdeviceptr)(srcptr), len); } while (0) -#define PSMI_IS_GPU_ENABLED PSMI_IS_CUDA_ENABLED -#define PSMI_IS_GPU_DISABLED PSMI_IS_CUDA_DISABLED -#define PSMI_IS_GPU_MEM(x) PSMI_IS_CUDA_MEM(x) - -#elif defined(PSM_ONEAPI) -#define PSM3_GPU_PREPARE_HTOD_MEMCPYS(protoexp) \ - do { \ - int i; \ - \ - for (i = 0; i < MAX_ZE_DEVICES; i++) \ - protoexp->cq_recvs[i] = NULL; \ - } while (0) -#define PSM3_GPU_PREPARE_DTOH_MEMCPYS(proto) \ - do { \ - int i; \ - \ - for (i = 0; i < MAX_ZE_DEVICES; i++) \ - proto->cq_sends[i] = NULL; \ - } while (0) -#define PSM3_GPU_SHUTDOWN_HTOD_MEMCPYS(protoexp) \ - do { \ - int i; \ - \ - for (i = 0; i < MAX_ZE_DEVICES; i++) { \ - if (protoexp->cq_recvs[i]) { \ - PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, \ - protoexp->cq_recvs[i]); \ - protoexp->cq_recvs[i] = NULL; \ - } \ - } \ - } while (0) -#define PSM3_GPU_SHUTDOWN_DTOH_MEMCPYS(proto) \ - do { \ - int i; \ - \ - for (i = 0; i < MAX_ZE_DEVICES; i++) { \ - if (proto->cq_sends[i]) { \ - PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, \ - proto->cq_sends[i]); \ - proto->cq_sends[i] = NULL; \ - } \ - } \ - } while (0) - -#define PSM3_GPU_MEMCPY_HTOD_START(protoexp, ghb, len) \ - do { \ - ze_event_pool_desc_t pool_desc = { \ - .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, \ - .flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE, \ - .count = 1 \ - }; \ - ze_event_desc_t event_desc = { \ - .stype = ZE_STRUCTURE_TYPE_EVENT_DESC, \ - .signal = ZE_EVENT_SCOPE_FLAG_HOST, \ - .wait = ZE_EVENT_SCOPE_FLAG_HOST, \ - .index = 0 \ - }; \ - struct ze_dev_ctxt *ctxt; \ - int inx; \ - \ - ctxt = psmi_oneapi_dev_ctxt_get(ghb->gpu_buf); \ - if (!ctxt) \ - psm3_handle_error(PSMI_EP_NORETURN, \ - PSM2_INTERNAL_ERR, \ - "%s HTOD: unknown GPU device for addr %p\n", \ - __FUNCTION__, ghb->gpu_buf);\ - if (ghb->event_pool == NULL) { \ - PSMI_ONEAPI_ZE_CALL(zeEventPoolCreate, \ - ze_context, &pool_desc, 0, NULL, \ - &ghb->event_pool); \ - } \ - if (ghb->copy_status == NULL) { \ - PSMI_ONEAPI_ZE_CALL(zeEventCreate, \ - ghb->event_pool, &event_desc, \ - &ghb->copy_status); \ - } \ - inx = ctxt->dev_index; \ - if (! 
ghb->command_lists[inx]) { \ - psmi_oneapi_async_cmd_create(ctxt, \ - &protoexp->cq_recvs[inx], \ - &ghb->command_lists[inx]); \ - } \ - ghb->cur_dev_inx = inx; \ - PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, \ - ghb->command_lists[inx], \ - ghb->gpu_buf, ghb->host_buf, len, \ - ghb->copy_status, 0, NULL); \ - if (! psm3_oneapi_immed_async_copy) { \ - PSMI_ONEAPI_ZE_CALL(zeCommandListClose, \ - ghb->command_lists[inx]); \ - PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists,\ - protoexp->cq_recvs[inx], 1, \ - &ghb->command_lists[inx], NULL); \ - } \ - } while (0) -#define PSM3_GPU_MEMCPY_DTOH_START(proto, ghb, len) \ - do { \ - ze_event_pool_desc_t pool_desc = { \ - .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, \ - .flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE, \ - .count = 1 \ - }; \ - ze_event_desc_t event_desc = { \ - .stype = ZE_STRUCTURE_TYPE_EVENT_DESC, \ - .signal = ZE_EVENT_SCOPE_FLAG_HOST, \ - .wait = ZE_EVENT_SCOPE_FLAG_HOST, \ - .index = 0 \ - }; \ - struct ze_dev_ctxt *ctxt; \ - int inx; \ - \ - ctxt = psmi_oneapi_dev_ctxt_get(ghb->gpu_buf); \ - if (!ctxt) \ - psm3_handle_error(PSMI_EP_NORETURN, \ - PSM2_INTERNAL_ERR, \ - "%s DTOH: unknown GPU device for addr %p\n", \ - __FUNCTION__, ghb->gpu_buf);\ - if (ghb->event_pool == NULL) { \ - PSMI_ONEAPI_ZE_CALL(zeEventPoolCreate, \ - ze_context, &pool_desc, 0, NULL, \ - &ghb->event_pool); \ - } \ - if (ghb->copy_status == NULL) { \ - PSMI_ONEAPI_ZE_CALL(zeEventCreate, \ - ghb->event_pool, &event_desc, \ - &ghb->copy_status); \ - } \ - inx = ctxt->dev_index; \ - if (! ghb->command_lists[inx]) { \ - psmi_oneapi_async_cmd_create(ctxt, \ - &proto->cq_sends[inx], \ - &ghb->command_lists[inx]); \ - } \ - ghb->cur_dev_inx = inx; \ - PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, \ - ghb->command_lists[inx], \ - ghb->host_buf, ghb->gpu_buf, len, \ - ghb->copy_status, 0, NULL); \ - if (! psm3_oneapi_immed_async_copy) { \ - PSMI_ONEAPI_ZE_CALL(zeCommandListClose, \ - ghb->command_lists[inx]); \ - PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists,\ - proto->cq_sends[inx], 1, \ - &ghb->command_lists[inx], NULL); \ - } \ - } while (0) -#define PSM3_GPU_MEMCPY_DONE(ghb) \ - _psm3_oneapi_ze_memcpy_done(ghb) -#define PSM3_GPU_HOSTBUF_LAZY_INIT(ghb) \ - do { \ - int i; \ - \ - ghb->event_pool = NULL; \ - ghb->copy_status = NULL; \ - for (i = 0; i < MAX_ZE_DEVICES; i++) \ - ghb->command_lists[i] = NULL; \ - ghb->host_buf = NULL; \ - } while (0) -#define PSM3_GPU_HOSTBUF_RESET(ghb) \ - do { \ - if (! 
psm3_oneapi_immed_async_copy) { \ - PSMI_ONEAPI_ZE_CALL(zeCommandListReset, \ - ghb->command_lists[ghb->cur_dev_inx]);\ - } \ - PSMI_ONEAPI_ZE_CALL(zeEventHostReset, \ - ghb->copy_status); \ - } while (0) -#define PSM3_GPU_HOSTBUF_DESTROY(ghb) \ - do { \ - int i; \ - \ - if (ghb->copy_status != NULL) { \ - PSMI_ONEAPI_ZE_CALL(zeEventDestroy, \ - ghb->copy_status); \ - } \ - if (ghb->host_buf != NULL) { \ - PSM3_ONEAPI_ZE_HOST_FREE(ghb->host_buf); \ - } \ - if (ghb->event_pool != NULL) { \ - PSMI_ONEAPI_ZE_CALL(zeEventPoolDestroy, \ - ghb->event_pool); \ - } \ - for (i = 0; i < MAX_ZE_DEVICES; i++) { \ - if (ghb->command_lists[i]) { \ - PSMI_ONEAPI_ZE_CALL( \ - zeCommandListDestroy, \ - ghb->command_lists[i]); \ - ghb->command_lists[i] = NULL; \ - } \ - } \ - } while (0) -#define PSM3_GPU_MEMCPY_DTOD(dstptr, srcptr, len) \ - do { psmi_oneapi_ze_memcpy_DTOD(dstptr, srcptr, len); } while(0) -#define PSM3_GPU_MEMCPY_HTOD(dstptr, srcptr, len) \ - do { psmi_oneapi_ze_memcpy(dstptr, srcptr, len); } while(0) -#define PSM3_GPU_SYNCHRONIZE_MEMCPY() \ - do { /* not needed for OneAPI ZE */ } while (0) -#ifdef PSM3_USE_ONEAPI_MALLOC -#define PSM3_GPU_HOST_ALLOC(ret_ptr, size) \ - do { \ - *ret_ptr = psm3_oneapi_ze_host_alloc_malloc(size); \ - } while (0) -#define PSM3_ONEAPI_ZE_HOST_FREE(ptr) \ - psm3_oneapi_ze_host_free_malloc(ptr) -// HOST_ALLOC memory treated as CPU memory for Verbs MRs -#define PSM3_GPU_ADDR_SEND_MR(mqreq) \ - ( (mqreq)->is_buf_gpu_mem && ! (mqreq)->gpu_hostbuf_used ) -#define PSM3_GPU_ADDR_RECV_MR(tidrecvc, mqreq) \ - ( (tidrecvc)->is_ptr_gpu_backed ) -#else /* PSM3_USE_ONEAPI_MALLOC */ -#define PSM3_GPU_HOST_ALLOC(ret_ptr, size) \ - do { \ - *ret_ptr = (*psm3_oneapi_ze_host_alloc)(size); \ - } while (0) -#define PSM3_ONEAPI_ZE_HOST_FREE(ptr) \ - (*psm3_oneapi_ze_host_free)(ptr) -// HOST_ALLOC memory treated as GPU memory for Verbs MRs -// Note: gpu_hostbuf_used" only set if is_buf_gpu_mem -#define PSM3_GPU_ADDR_SEND_MR(mqreq) \ - ( (mqreq)->is_buf_gpu_mem && \ - (! (mqreq)->gpu_hostbuf_used || psm3_oneapi_ze_using_zemem_alloc )) -#define PSM3_GPU_ADDR_RECV_MR(tidrecvc, mqreq) \ - ( (tidrecvc)->is_ptr_gpu_backed \ - || ((mqreq)->gpu_hostbuf_used && psm3_oneapi_ze_using_zemem_alloc)) -#endif /* PSM3_USE_ONEAPI_MALLOC */ -#define PSM3_GPU_HOST_FREE(ptr) PSM3_ONEAPI_ZE_HOST_FREE(ptr) -#define PSM3_MARK_BUF_SYNCHRONOUS(buf) do { /* not needed for OneAPI ZE */ } while (0) -#define PSM3_GPU_MEMCPY_DTOH(dstptr, srcptr, len) \ - do { psmi_oneapi_ze_memcpy(dstptr, srcptr, len); } while (0) -#define PSM3_GPU_MEMCPY(dstptr, srcptr, len) \ - do { psmi_oneapi_ze_memcpy(dstptr, srcptr, len); } while (0) -#define PSMI_IS_GPU_ENABLED PSMI_IS_ONEAPI_ZE_ENABLED -#define PSMI_IS_GPU_DISABLED PSMI_IS_ONEAPI_ZE_DISABLED -#define PSMI_IS_GPU_MEM(x) PSMI_IS_ONEAPI_ZE_MEM(x) - -void psm3_put_ipc_handle(const void *buf, ze_ipc_mem_handle_t ipc_handle); - -#endif /* elif PSM_ONEAPI */ - - #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/prov/psm3/psm3/psm_utils.c b/prov/psm3/psm3/psm_utils.c index 0f1a3fe1d5d..d3f272a2041 100644 --- a/prov/psm3/psm3/psm_utils.c +++ b/prov/psm3/psm3/psm_utils.c @@ -2495,7 +2495,7 @@ int psm3_parse_memmode(void) } } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // we need PSM3_GPUDIRECT config early to influence rdmamode defaults, // MR Cache mode and whether we need to open RV. 
// As such we don't check PSMI_HAL_CAP_GPUDIRECT flag here, but @@ -2552,7 +2552,7 @@ unsigned psmi_parse_gpudirect_rdma_send_limit(int force) psm3_getenv_range("PSM3_GPUDIRECT_RDMA_SEND_LIMIT", "GPUDirect RDMA feature on send side will be switched off for messages larger than limit.", NULL, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)UINT_MAX, + (union psmi_envvar_val)psm3_gpu_gpudirect_rdma_send_limit_default, (union psmi_envvar_val)0, (union psmi_envvar_val)UINT_MAX, NULL, NULL, &envval); @@ -2584,11 +2584,7 @@ unsigned psmi_parse_gpudirect_rdma_recv_limit(int force) psm3_getenv_range("PSM3_GPUDIRECT_RDMA_RECV_LIMIT", "GPUDirect RDMA feature on receive side will be switched off for messages larger than limit.", NULL, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, -#ifdef PSM_CUDA - (union psmi_envvar_val)UINT_MAX, -#elif defined(PSM_ONEAPI) - (union psmi_envvar_val)1, -#endif + (union psmi_envvar_val)psm3_gpu_gpudirect_rdma_recv_limit_default, (union psmi_envvar_val)0, (union psmi_envvar_val)UINT_MAX, NULL, NULL, &envval); @@ -2597,9 +2593,9 @@ unsigned psmi_parse_gpudirect_rdma_recv_limit(int force) have_value = 1; return saved; } -#endif // PSM_CUDA || PSM_ONEAPI +#endif // PSM_HAVE_GPU -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* Size of RV GPU Cache - only used for PSM3_GPUDIRECT=1 * otherwise returns 0 */ @@ -2619,7 +2615,7 @@ unsigned psmi_parse_gpudirect_rv_gpu_cache_size(int reload) // min size is (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) * // chunk size (psm3_mq_max_window_rv(mq, 1) after // psmi_mq_initialize_params) - if (PSMI_IS_GPU_ENABLED && psmi_parse_gpudirect() ) { + if (PSM3_GPU_IS_ENABLED && psmi_parse_gpudirect() ) { psm3_getenv("PSM3_RV_GPU_CACHE_SIZE", "kernel space GPU cache size" " (MBs, 0 lets rv module decide) [0]", @@ -2634,7 +2630,7 @@ unsigned psmi_parse_gpudirect_rv_gpu_cache_size(int reload) return saved; } -#endif // PSM_CUDA || PSM_ONEAPI +#endif // PSM_HAVE_GPU #ifdef PSM_HAVE_REG_MR /* Send DMA Enable */ @@ -2817,32 +2813,13 @@ void psm3_print_rank_identify(void) if (identify_shown) return; -#ifdef PSM_CUDA - char cudart_ver[64] = "unknown"; - if (cuda_runtime_ver) - snprintf(cudart_ver, sizeof(cudart_ver), "%d.%d", - cuda_runtime_ver / 1000, (cuda_runtime_ver % 1000) / 10); - snprintf(accel_vers, sizeof(accel_vers), "%s %s CUDA Runtime %s built against interface %d.%d\n", - psm3_get_mylabel(), psm3_ident_tag, - cudart_ver, CUDA_VERSION / 1000, (CUDA_VERSION % 1000) / 10); -#elif defined(PSM_ONEAPI) - char ze_api_ver[64] = "unknown"; - char ze_loader_ver[64] = "unknown"; - if (zel_api_version) - snprintf(ze_api_ver, sizeof(ze_api_ver), "%d.%d", - ZE_MAJOR_VERSION(zel_api_version), ZE_MINOR_VERSION(zel_api_version)); - if (zel_lib_version.major || zel_lib_version.minor || zel_lib_version.patch) - snprintf(ze_loader_ver, sizeof(ze_loader_ver), "v%d.%d.%d", - zel_lib_version.major, zel_lib_version.minor, zel_lib_version.patch); - snprintf(accel_vers, sizeof(accel_vers), "%s %s Level-Zero Runtime %s (%s) built against interface %d.%d\n", - psm3_get_mylabel(), psm3_ident_tag, - ze_api_ver, ze_loader_ver, - ZE_MAJOR_VERSION(ZE_API_VERSION_CURRENT), ZE_MINOR_VERSION(ZE_API_VERSION_CURRENT)); +#ifdef PSM_HAVE_GPU + PSM3_GPU_IDENTIFY(accel_vers, sizeof(accel_vers)); #endif identify_shown = 1; strcat(strcat(ofed_delta," built for IEFS OFA DELTA "),psm3_IEFS_version); - psm3_print_identify("%s %s PSM3 v%d.%d%s%s\n" + psm3_print_identify("%s %s PSM3 v%d.%d"PSM3_GPU_TYPES"%s\n" "%s %s location %s\n" "%s %s build 
date %s\n" "%s %s src checksum %s\n" @@ -2853,13 +2830,6 @@ void psm3_print_rank_identify(void) "%s %s CPU Core %d NUMA %d PID %d\n", psm3_get_mylabel(), psm3_ident_tag, PSM2_VERNO_MAJOR,PSM2_VERNO_MINOR, -#ifdef PSM_CUDA - "-cuda", -#elif defined(PSM_ONEAPI) - "-oneapi-ze", -#else - "", -#endif (strcmp(psm3_IEFS_version,"") != 0) ? ofed_delta : "", psm3_get_mylabel(), psm3_ident_tag, dladdr(psm3_init, &info_psm) ? @@ -4032,6 +4002,28 @@ psmi_coreopt_ctl(const void *core_obj, int optname, epaddr->usr_ep_ctxt = optval; } break; + case PSM2_CORE_OPT_EP_CUDA_PERMITTED: + { + psm2_ep_t ep_core = (psm2_ep_t)core_obj; + if (!ep_core) + return psm3_handle_error(NULL, PSM2_PARAM_ERR, "Invalid endpoint"); + + if (*optlen < sizeof(bool)) { + err = psm3_handle_error(NULL, PSM2_PARAM_ERR, + "Option len insufficient for bool (%"PRIu64")", *optlen); + *optlen = sizeof(bool); + return err; + } + + PSM_EP_FOR_EACH_MCTXT(ep_core, ep) { + err = get + ? PSM3_GPU_GET_CUDA_PERMITTED(ep, (bool *)optval) + : PSM3_GPU_SET_CUDA_PERMITTED(ep, *(bool *)optval); + if (err) + return err; + } + } + break; default: /* Unknown/unrecognized option */ err = psm3_handle_error(NULL, @@ -5067,12 +5059,51 @@ void psm3_touch_mmap(void *m, size_t bytes) void psm3_memcpy(void *dest, const void *src, uint32_t len) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (len && PSMI_IS_GPU_ENABLED && - (PSMI_IS_GPU_MEM(dest) || PSMI_IS_GPU_MEM((void *)src))) { +#ifdef PSM_HAVE_GPU + if (len && + (PSM3_IS_GPU_MEM(dest) || PSM3_IS_GPU_MEM((void *)src))) { PSM3_GPU_MEMCPY(dest, src, len); return; } #endif memcpy(dest, src, len); } + +void psm3_ep_memcpy(psm2_ep_t ep, void *dest, const void *src, uint32_t len) +{ +#ifdef PSM_HAVE_GPU + // if CUDA is disallowed, attempt gdrcopy instead + if_pf (!len) + return; + + const bool src_is_gpu = PSM3_IS_GPU_MEM(src); + const bool dest_is_gpu = PSM3_IS_GPU_MEM(dest); + + if (src_is_gpu || dest_is_gpu) { + // if the GPU HAL provides memcpy, prefer it + if (PSM3_GPU_IS_MEMCPY_PERMITTED(ep)) { + PSM3_GPU_MEMCPY(dest, src, len); + return; + } + + // otherwise, avoid GPU-driven memcpy paths by mapping the + // device buffer and issuing a CPU driven gdrcopy + if (src_is_gpu) { + src = psmi_hal_gdr_convert_gpu_to_host_addr( + (unsigned long)src, len, 0, ep); + psmi_assert_always(src); + } + if (dest_is_gpu) { + dest = psmi_hal_gdr_convert_gpu_to_host_addr( + (unsigned long)dest, len, 0, ep); + psmi_assert_always(dest); + } + + // buffers cpu-accessible; fall through to host memcpy + } +#else + // no GPU support: fall through to host memcpy +#endif + + memcpy(dest, src, len); +} diff --git a/prov/psm3/psm3/psm_utils.h b/prov/psm3/psm3/psm_utils.h index 57742fc39ea..fc33bb25b5b 100644 --- a/prov/psm3/psm3/psm_utils.h +++ b/prov/psm3/psm3/psm_utils.h @@ -437,7 +437,7 @@ void psm3_print_identify(const char *fmt, ...) 
\ #ifdef PSM_HAVE_REG_MR unsigned psm3_parse_senddma(void); #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU unsigned psmi_parse_gpudirect(void); unsigned psmi_parse_gpudirect_rdma_send_limit(int force); unsigned psmi_parse_gpudirect_rdma_recv_limit(int force); @@ -455,36 +455,6 @@ void psm3_syslog(psm2_ep_t ep, int to_console, int level, void *psm3_memcpyo(void *dst, const void *src, size_t n); uint32_t psm3_crc(unsigned char *buf, int len); -/* - * Internal CPUID detection - */ -#define CPUID_FAMILY_MASK 0x00000f00 -#define CPUID_MODEL_MASK 0x000000f0 -#define CPUID_EXMODEL_MASK 0x000f0000 - -/* - * CPUID return values - */ -#define CPUID_FAMILY_XEON 0x00000600 -/* - * cpuid function 0, returns "GeniuneIntel" in EBX,ECX,EDX - * due to Little Endian and Hex it is not so obvious - */ -#define CPUID_GENUINE_INTEL_EBX 0x756e6547 /* "uneG" - Little Endian "Genu" */ -#define CPUID_GENUINE_INTEL_ECX 0x6c65746e /* "Ieni" - Little Endian "ineI" */ -#define CPUID_GENUINE_INTEL_EDX 0x49656e69 /* "letn" - Little Endian "ntel" */ - -/* - * These values are internal only, not real register values - */ -#define CPUID_GENUINE_INTEL 0xf0000000 -#define CPUID_MODEL_UNDEFINED -1 - -/* - * Global model so we can tune defaults better for specific cpu's - */ -extern uint32_t psm3_cpu_model; - /* * Diagnostics, all in psm_diags.c */ diff --git a/prov/psm3/psm3/psm_verbs_mr.c b/prov/psm3/psm3/psm_verbs_mr.c index fa8fdf39499..d17d3fd5177 100644 --- a/prov/psm3/psm3/psm_verbs_mr.c +++ b/prov/psm3/psm3/psm_verbs_mr.c @@ -246,7 +246,7 @@ struct psm2_mr_cache { uint32_t limit_nonpri_inuse; uint64_t limit_nonpri_inuse_bytes; #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t limit_nonpri_gpu_inuse_bytes; #endif psm3_rv_t rv; @@ -309,7 +309,7 @@ struct psm2_mr_cache { uint64_t inuse_send_bytes; uint64_t max_inuse_send_bytes; #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t gpu_inuse_bytes; uint64_t max_gpu_inuse_bytes; uint64_t gpu_inuse_recv_bytes; @@ -323,7 +323,7 @@ struct psm2_mr_cache { #ifdef PSM_HAVE_RNDV_MOD struct psm3_rv_cache_stats rv_stats; // statistics from rv module // will remain 0 if rv not open -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU struct psm3_rv_gpu_cache_stats rv_gpu_stats; // GPU statistics from rv module // will remain 0 if rv not open #endif @@ -393,13 +393,11 @@ static int mr_cache_key_cmp(const struct psm3_verbs_mr *a, return -1; else if (a->access > b->access) return 1; -#ifdef PSM_ONEAPI - if (a->alloc_id < b->alloc_id) - return -1; - else if (a->alloc_id > b->alloc_id) - return 1; -#endif +#ifdef PSM_HAVE_GPU + return PSM3_GPU_CMP_MR(&a->gpu_specific, &b->gpu_specific); +#else return 0; +#endif } // rbtree.c uses these defines to establish some of it's code and @@ -766,7 +764,7 @@ static void update_stats_inc_inuse(psm2_mr_cache_t cache, uint64_t length, { INC_STAT(cache, inuse, max_inuse); #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (access & IBV_ACCESS_IS_GPU_ADDR) ADD_STAT(cache, length, gpu_inuse_bytes, max_gpu_inuse_bytes); else @@ -778,7 +776,7 @@ static void update_stats_inc_inuse(psm2_mr_cache_t cache, uint64_t length, if (access & IBV_ACCESS_REMOTE_WRITE) { INC_STAT(cache, inuse_recv, max_inuse_recv); #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (access & IBV_ACCESS_IS_GPU_ADDR) ADD_STAT(cache, length, gpu_inuse_recv_bytes, 
max_gpu_inuse_recv_bytes); else @@ -788,7 +786,7 @@ static void update_stats_inc_inuse(psm2_mr_cache_t cache, uint64_t length, } else { INC_STAT(cache, inuse_send, max_inuse_send); #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (access & IBV_ACCESS_IS_GPU_ADDR) ADD_STAT(cache, length, gpu_inuse_send_bytes, max_gpu_inuse_send_bytes); else @@ -805,7 +803,7 @@ static void update_stats_dec_inuse(psm2_mr_cache_t cache, uint64_t length, { cache->inuse--; #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (access & IBV_ACCESS_IS_GPU_ADDR) cache->gpu_inuse_bytes -= length; else @@ -817,7 +815,7 @@ static void update_stats_dec_inuse(psm2_mr_cache_t cache, uint64_t length, if (access & IBV_ACCESS_REMOTE_WRITE) { cache->inuse_recv--; #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (access & IBV_ACCESS_IS_GPU_ADDR) cache->gpu_inuse_recv_bytes -= length; else @@ -827,7 +825,7 @@ static void update_stats_dec_inuse(psm2_mr_cache_t cache, uint64_t length, } else { cache->inuse_send--; #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (access & IBV_ACCESS_IS_GPU_ADDR) cache->gpu_inuse_send_bytes -= length; else @@ -869,7 +867,7 @@ static void update_stats_inc_full(psm2_mr_cache_t cache, bool priority, psm2_error_t set_cache_limit_nonpri_rv_kern(psm2_mr_cache_t cache, uint32_t limit_entries, uint32_t pri_entries, uint64_t pri_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint64_t gpu_pri_size #endif ) @@ -885,20 +883,20 @@ psm2_error_t set_cache_limit_nonpri_rv_kern(psm2_mr_cache_t cache, return PSM2_PARAM_ERR; } cache->limit_nonpri_inuse_bytes = (uint64_t)ep->rv_mr_cache_size*MEGABYTE - pri_size; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (PSMI_IS_GPU_ENABLED) { +#ifdef PSM_HAVE_GPU + if (PSM3_GPU_IS_ENABLED) { // For GPU, due to GdrCopy, we can't undersize cache. // Otherwise RDMA MRs could consume all the // cache space and leave a gdrcopy pin/mmap stuck // retrying indefinitely. 
If we want to allow undersize // GPU cache, we need to have gdrcopy pin/mmap failures // also invoke progress functions to release MRs - if (psm3_min_gpu_bar_size()) { - uint64_t max_recommend = psm3_min_gpu_bar_size() - 32*MEGABYTE; + if (PSM3_GPU_MIN_BAR_SIZE()) { + uint64_t max_recommend = PSM3_GPU_MIN_BAR_SIZE() - 32*MEGABYTE; if ((uint64_t)ep->rv_gpu_cache_size*MEGABYTE >= max_recommend) { _HFI_INFO("Warning: PSM3_RV_GPU_CACHE_SIZE=%u too large for smallest GPU's BAR size of %"PRIu64" (< %"PRIu64" total of endpoint-rail-qp recommended)\n", ep->rv_gpu_cache_size, - (psm3_min_gpu_bar_size() + MEGABYTE-1)/MEGABYTE, + (PSM3_GPU_MIN_BAR_SIZE() + MEGABYTE-1)/MEGABYTE, max_recommend/MEGABYTE); } } @@ -911,7 +909,7 @@ psm2_error_t set_cache_limit_nonpri_rv_kern(psm2_mr_cache_t cache, } _HFI_MMDBG("CPU cache %u GPU cache %u\n", ep->rv_mr_cache_size, ep->rv_gpu_cache_size); -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ return PSM2_OK; } #endif // PSM_HAVE_RNDV_MOD @@ -924,14 +922,14 @@ psm2_error_t set_cache_limit_nonpri_rv_kern(psm2_mr_cache_t cache, psm2_error_t set_cache_limit_nonpri_user(psm2_mr_cache_t cache, uint32_t limit_entries, uint32_t pri_entries, uint64_t pri_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint64_t gpu_pri_size #endif ) { cache->limit_nonpri_inuse_bytes = cache->limit_bytes - pri_size; #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // N/A to GPU cache->limit_nonpri_gpu_inuse_bytes = 0; #endif @@ -944,14 +942,14 @@ psm2_error_t set_cache_limit_nonpri_user(psm2_mr_cache_t cache, psm2_error_t set_cache_limit_nonpri_none(psm2_mr_cache_t cache, uint32_t limit_entries, uint32_t pri_entries, uint64_t pri_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint64_t gpu_pri_size #endif ) { cache->limit_nonpri_inuse_bytes = UINT64_MAX; #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // N/A to GPU cache->limit_nonpri_gpu_inuse_bytes = 0; #endif @@ -964,7 +962,7 @@ psm2_mr_cache_t psm3_verbs_alloc_mr_cache(psm2_ep_t ep, uint32_t limit_entries, uint8_t cache_mode, uint32_t limit_size_mb, uint32_t pri_entries, uint64_t pri_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint64_t gpu_pri_size #endif ) @@ -1010,7 +1008,7 @@ psm2_mr_cache_t psm3_verbs_alloc_mr_cache(psm2_ep_t ep, cache->reg_mr_fn = psm3_verbs_reg_mr_not_user; cache->release_mr_fn = psm3_verbs_release_mr_not_user; err = set_cache_limit_nonpri_none(cache, limit_entries, pri_entries, pri_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , gpu_pri_size #endif ); @@ -1021,7 +1019,7 @@ psm2_mr_cache_t psm3_verbs_alloc_mr_cache(psm2_ep_t ep, cache->reg_mr_fn = psm3_verbs_reg_mr_not_user; cache->release_mr_fn = psm3_verbs_release_mr_not_user; err = set_cache_limit_nonpri_rv_kern(cache, limit_entries, pri_entries, pri_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , gpu_pri_size #endif ); @@ -1032,7 +1030,7 @@ psm2_mr_cache_t psm3_verbs_alloc_mr_cache(psm2_ep_t ep, cache->reg_mr_fn = psm3_verbs_reg_mr_user; cache->release_mr_fn = psm3_verbs_release_mr_user; err = set_cache_limit_nonpri_user(cache, limit_entries, pri_entries, pri_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , gpu_pri_size #endif ); @@ -1042,7 +1040,7 @@ psm2_mr_cache_t psm3_verbs_alloc_mr_cache(psm2_ep_t ep, cache->reg_mr_fn = psm3_verbs_reg_mr_not_user; cache->release_mr_fn = 
psm3_verbs_release_mr_user_noinval; err = set_cache_limit_nonpri_user(cache, limit_entries, pri_entries, pri_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , gpu_pri_size #endif ); @@ -1060,7 +1058,7 @@ psm2_mr_cache_t psm3_verbs_alloc_mr_cache(psm2_ep_t ep, cache->cmd_fd = ep->cmd_fd; #endif // PSM_HAVE_RNDV_MOD cache->pd = ep->pd; -#if defined(PSM_HAVE_RNDV_MOD) && (defined(PSM_CUDA) || defined(PSM_ONEAPI)) +#if defined(PSM_HAVE_RNDV_MOD) && defined(PSM_HAVE_GPU) _HFI_MMDBG("cache alloc: limit_entries=%u limit_bytes=%"PRIu64" limit_nonpri_inuse=%u limit_nonpri_inuse_bytes=%"PRIu64" limit_nonpri_gpu_inuse_bytes=%"PRIu64", pri_entries=%u pri_size=%"PRIu64" gpu_pri_size=%"PRIu64"\n", cache->limit_entries, cache->limit_bytes, cache->limit_nonpri_inuse, cache->limit_nonpri_inuse_bytes, cache->limit_nonpri_gpu_inuse_bytes, @@ -1128,7 +1126,7 @@ int psm3_verbs_mr_cache_allows_user_mr(psm2_mr_cache_t cache) static inline int have_nonpri_space(psm2_mr_cache_t cache, uint64_t length, int access) { #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (access & IBV_ACCESS_IS_GPU_ADDR) return (cache->inuse < cache->limit_nonpri_inuse && cache->gpu_inuse_bytes + length < cache->limit_nonpri_gpu_inuse_bytes); @@ -1356,13 +1354,13 @@ static psm3_verbs_mr_t prep_and_reg_mr(psm2_mr_cache_t cache, // user space QPs for everything mrc->mr.rv_mr = psm3_rv_reg_mem(cache->rv, cache->cmd_fd, cache->pd, key->addr, key->length, key->access -#ifdef PSM_ONEAPI - , key->alloc_id +#ifdef PSM_HAVE_GPU + , &key->gpu_specific #endif ); if (! mrc->mr.rv_mr) { save_errno = errno; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (save_errno == ENOMEM && priority) (void)psm3_gpu_evict_some(cache->ep, key->length, key->access); #endif @@ -1377,13 +1375,13 @@ static psm3_verbs_mr_t prep_and_reg_mr(psm2_mr_cache_t cache, (key->access&IBV_ACCESS_RDMA)?NULL :cache->pd, key->addr, key->length, key->access -#ifdef PSM_ONEAPI - , key->alloc_id +#ifdef PSM_HAVE_GPU + , &key->gpu_specific #endif ); if (! mrc->mr.rv_mr) { save_errno = errno; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (save_errno == ENOMEM && priority) (void)psm3_gpu_evict_some(cache->ep, key->length, key->access); #endif @@ -1411,8 +1409,8 @@ static psm3_verbs_mr_t prep_and_reg_mr(psm2_mr_cache_t cache, mrc->addr = key->addr; mrc->length = key->length; mrc->access = key->access; -#ifdef PSM_ONEAPI - mrc->alloc_id = key->alloc_id; +#ifdef PSM_HAVE_GPU + mrc->gpu_specific = key->gpu_specific; #endif ADD_STAT(cache, mrc->length, registered_bytes, max_registered_bytes); /* Reset the fail counter */ @@ -1636,80 +1634,21 @@ struct psm3_verbs_mr * psm3_verbs_reg_mr(psm2_mr_cache_t cache, bool priority, return NULL; } #else -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - psmi_assert(!!(access & IBV_ACCESS_IS_GPU_ADDR) == (PSMI_IS_GPU_ENABLED && PSMI_IS_GPU_MEM(addr))); -#ifdef PSM_ONEAPI - if (access & IBV_ACCESS_IS_GPU_ADDR) { -#define MAX_USER_MR_SIZE (32 * 1024) - void *base; - size_t len; - PSMI_ONEAPI_ZE_CALL(zeMemGetAddressRange, ze_context, - (const void *)addr, &base, &len); - /* - * Need to register MR with base address and total length. - * However, for Mellanox cards, the max buffer size for a - * user MR registered through the rv module is 32k bytes. - * Otherwise, it will fail with IB_WC_MW_BIND_ERR. 
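/*
 * Illustrative sketch (not part of the patch): the OneAPI-specific rounding
 * removed in this hunk (now hidden behind PSM3_GPU_ROUNDUP_RV_REG_MR) widens
 * a requested registration to a chunk of the underlying allocation, capped by
 * a device limit, so nearby sub-buffers of one allocation can share an MR.
 * A simplified, hypothetical version of that chunking, assuming limit != 0:
 */
#include <stdint.h>

/* Expand [*addr, *addr + *len) to the limit-sized chunk of the allocation
 * [base, base + alloc_len) that contains it, but only when the request still
 * fits entirely inside that chunk; otherwise leave the request unchanged. */
static void roundup_reg_mr_sketch(uint64_t base, uint64_t alloc_len,
				  uint64_t limit, uint64_t *addr, uint64_t *len)
{
	uint64_t end = base + alloc_len;		/* one past the allocation */
	uint64_t offset = *addr - base;			/* request offset in allocation */
	uint64_t start = base + (offset / limit) * limit; /* chunk start */
	uint64_t chunk = end - start;
	if (chunk > limit)
		chunk = limit;
	if (*addr + *len <= start + chunk) {		/* request fits in the chunk */
		*addr = start;
		*len = chunk;
	}
}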
For fast - * registration MR through RV (kernel MR and GPU MR), there - * is also a upper limit (max_fast_reg_page_list_len) imposed - * by the underlying RDMA device (eg 256MB for mlx5). - */ - if (strncasecmp(cache->ep->dev_name, "mlx5_0", 3) == 0 && - !(access & IBV_ACCESS_KERNEL)) { - if (len > MAX_USER_MR_SIZE) { - /* - * Change only if the buffer stays in the first - * 32k - */ - if (((char *)addr + length) <= - ((char *)base + MAX_USER_MR_SIZE)) { - addr = base; - length = MAX_USER_MR_SIZE; - } - } else { - addr = base; - length = len; - } - } else { - uint64_t start, end; - uint64_t mr_len; - uint64_t offset; - uint64_t limit = cache->ep->verbs_ep.max_fmr_size; - - /* Buffer end + 1 */ - end = (uint64_t)base + len; - /* Offset of the requested buffer chunk */ - offset = (uint64_t)addr - (uint64_t)base; - /* Start address of next MR */ - start = (uint64_t)base + (offset / limit) * limit; - mr_len = end - start; - if (mr_len > limit) - mr_len = limit; - /* - * Change only if the chunk does not cross the - * (start + mr_len) boundary, Otherwise, - * Just register the requested chunk. - */ - if (((uint64_t)addr + length) <= (start + mr_len)) - { - addr = (void *)start; - length = mr_len; - } - } - } -#endif /* PSM_ONEAPI */ -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#ifdef PSM_HAVE_GPU + psmi_assert(!!(access & IBV_ACCESS_IS_GPU_ADDR) == (PSM3_IS_GPU_MEM(addr))); + if (access & IBV_ACCESS_IS_GPU_ADDR) + PSM3_GPU_ROUNDUP_RV_REG_MR(cache->ep, &addr, &length, access); +#endif /* PSM_HAVE_GPU */ #endif /* PSM_HAVE_RNDV_MOD */ struct psm3_verbs_mr key = { // our search key .addr = addr, .length = length, .access = access, -#ifdef PSM_ONEAPI - .alloc_id = (access & IBV_ACCESS_IS_GPU_ADDR)? - psm3_oneapi_ze_get_alloc_id(addr, NULL) : 0 -#endif }; +#ifdef PSM_HAVE_GPU + PSM3_GPU_INIT_MR(addr, length, access, &key.gpu_specific); +#endif _HFI_MMDBG("pri %d "MRC_FMT"\n", priority, MR_OUT_MRC(&key)); return (*cache->reg_mr_fn)(cache, priority, &key); @@ -1817,9 +1756,9 @@ void psm3_verbs_release_mr(struct psm3_verbs_mr *mrc) void psm3_verbs_free_mr_cache(psm2_mr_cache_t cache) { // don't pollute stats with our shutdown activity -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #ifdef PSM_HAVE_RNDV_MOD - if (cache->rv && PSMI_IS_GPU_ENABLED) + if (cache->rv && PSM3_GPU_IS_ENABLED) psm3_stats_deregister_type(PSMI_STATSTYPE_MR_CACHE, &cache->rv_gpu_stats); #endif @@ -2310,11 +2249,11 @@ static uint64_t mr_cache_rv_miss_rate(void *context) return 0; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static uint64_t mr_cache_rv_gpu_limit_size(void *context) { psm2_mr_cache_t cache = container_of(context, struct psm2_mr_cache, rv_gpu_stats); - if (cache->rv && PSMI_IS_GPU_ENABLED ) { + if (cache->rv && PSM3_GPU_IS_ENABLED ) { // this is a little sly, we know the stats processing routines will // call the accessors in the order from the entries list // so we use the 1st of the rv statistics accessors to get @@ -2447,7 +2386,7 @@ static uint64_t mr_cache_rv_gpu_miss_rate_mmap(void *context) else return 0; } -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ #endif // PSM_HAVE_RNDV_MOD @@ -2478,7 +2417,7 @@ static void register_cache_stats(psm2_mr_cache_t cache) MPSPAWN_STATS_REDUCTION_ALL, NULL, &cache->limit_nonpri_inuse_bytes), #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECL("limit_nonpri_gpu_inuse_bytes", "Limit of total registered non-priority GPU MR bytes in cache", 
MPSPAWN_STATS_REDUCTION_ALL, @@ -2572,7 +2511,7 @@ static void register_cache_stats(psm2_mr_cache_t cache) &cache->umr_cache.stats.max_dereg_queued_cnt), #endif /* UMR_CACHE */ #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("gpu_inuse_bytes", "Current registered GPU MR bytes with an active IO", &cache->gpu_inuse_bytes), @@ -2778,7 +2717,7 @@ static void register_cache_stats(psm2_mr_cache_t cache) PSMI_HOWMANY(entries), psm3_epid_fmt_internal(cache->ep->epid, 0), cache, cache->ep->dev_name); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #ifdef PSM_HAVE_RNDV_MOD struct psmi_stats_entry gpu_entries[] = { PSMI_STATS_DECL_HELP("Kernel RV GPU Cache Configuration:"), @@ -3009,7 +2948,7 @@ static void register_cache_stats(psm2_mr_cache_t cache) "Number of GPU RDMA write bytes successfully posted", (uint64_t*)&cache->rv_gpu_stats.gpu_post_write_bytes), }; - if (cache->rv && PSMI_IS_GPU_ENABLED && cache->ep->rv_gpu_cache_size) { + if (cache->rv && PSM3_GPU_IS_ENABLED && cache->ep->rv_gpu_cache_size) { psm3_stats_register_type("MR_GPU_Cache_Statistics", "Kernel RV GPU MR and mmap cache for an endpoint in the process\n" "When Direct GPU transfers are enabled, an additional " @@ -3031,7 +2970,7 @@ static void register_cache_stats(psm2_mr_cache_t cache) &cache->rv_gpu_stats, cache->ep->dev_name); } #endif /* PSM_HAVE_RNDV_MOD */ -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ } #endif /* PSM_HAVE_REG_MR */ diff --git a/prov/psm3/psm3/psm_verbs_mr.h b/prov/psm3/psm3/psm_verbs_mr.h index 83c34944a84..e04bc759e13 100644 --- a/prov/psm3/psm3/psm_verbs_mr.h +++ b/prov/psm3/psm3/psm_verbs_mr.h @@ -191,8 +191,8 @@ struct psm3_verbs_mr { void *addr; uint64_t length; uint32_t access; -#if defined(PSM_ONEAPI) - uint64_t alloc_id; +#ifdef PSM_HAVE_GPU + union psm3_verbs_mr_gpu_specific gpu_specific; #endif // below is for queue of cache entries available for reuse (refcount==0) // only used when cache_mode==1 @@ -211,11 +211,11 @@ extern unsigned psm3_mr_cache_debug; #define MR_OUT_RANGE(addr, len) (uint64_t)(uintptr_t)(addr), \ (uint64_t)(uintptr_t)(addr)+(uint64_t)(len)-1, \ (uint64_t)(len) -#ifdef PSM_ONEAPI -#define MRC_FMT "0x%"PRIx64":0x%"PRIx64" (len 0x%"PRIx64") id %"PRIu64 \ - " access 0x%x" -#define MR_OUT_MRC(mrc) MR_OUT_RANGE((mrc)->addr, (mrc)->length), \ - (mrc)->alloc_id, (mrc)->access +#ifdef PSM_HAVE_GPU +#define MRC_FMT "0x%"PRIx64":0x%"PRIx64" (len 0x%"PRIx64") access 0x%x" \ + PSM3_GPU_MRC_FMT +#define MR_OUT_MRC(mrc) MR_OUT_RANGE((mrc)->addr, (mrc)->length), (mrc)->access \ + PSM3_GPU_OUT_MRC(&(mrc)->gpu_specific) #else #define MRC_FMT "0x%"PRIx64":0x%"PRIx64" (len 0x%"PRIx64") access 0x%x" #define MR_OUT_MRC(mrc) MR_OUT_RANGE((mrc)->addr, (mrc)->length), (mrc)->access @@ -225,7 +225,7 @@ extern psm2_mr_cache_t psm3_verbs_alloc_mr_cache(psm2_ep_t ep, uint32_t limit_entries, uint8_t cache_mode, uint32_t limit_size_mb, uint32_t limit_pri_entries, uint64_t pri_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint64_t gpu_pri_size #endif ); diff --git a/prov/psm3/psm3/ptl.h b/prov/psm3/psm3/ptl.h index 44110636411..f6ca3e0d98b 100644 --- a/prov/psm3/psm3/ptl.h +++ b/prov/psm3/psm3/ptl.h @@ -133,7 +133,7 @@ struct ptl_arg { struct ptl_strategy_stats { uint64_t tiny_cpu_isend; uint64_t tiny_cpu_isend_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t tiny_gdrcopy_isend; uint64_t tiny_gdrcopy_isend_bytes; uint64_t tiny_cuCopy_isend; @@ -141,7 
+141,7 @@ struct ptl_strategy_stats { #endif uint64_t tiny_cpu_send; uint64_t tiny_cpu_send_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t tiny_gdrcopy_send; uint64_t tiny_gdrcopy_send_bytes; uint64_t tiny_cuCopy_send; @@ -152,7 +152,7 @@ struct ptl_strategy_stats { uint64_t tiny_cpu_recv_bytes; uint64_t tiny_sysbuf_recv; /* to unexpected Q sysbuf */ /* incl 0 byte */ uint64_t tiny_sysbuf_recv_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t tiny_gdrcopy_recv; uint64_t tiny_gdrcopy_recv_bytes; uint64_t tiny_cuCopy_recv; @@ -163,7 +163,7 @@ struct ptl_strategy_stats { uint64_t short_copy_cpu_isend_bytes; uint64_t short_dma_cpu_isend; uint64_t short_dma_cpu_isend_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t short_gdrcopy_isend; uint64_t short_gdrcopy_isend_bytes; uint64_t short_cuCopy_send; @@ -176,7 +176,7 @@ struct ptl_strategy_stats { uint64_t short_dma_cpu_send; uint64_t short_dma_cpu_send_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t short_gdrcopy_send; uint64_t short_gdrcopy_send_bytes; uint64_t short_cuCopy_isend; @@ -189,7 +189,7 @@ struct ptl_strategy_stats { uint64_t short_cpu_recv_bytes; uint64_t short_sysbuf_recv; /* to unexpected Q sysbuf */ uint64_t short_sysbuf_recv_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t short_gdrcopy_recv; uint64_t short_gdrcopy_recv_bytes; uint64_t short_cuCopy_recv; @@ -202,7 +202,7 @@ struct ptl_strategy_stats { uint64_t eager_dma_cpu_isend_bytes; uint64_t eager_sysbuf_recv; /* to unexpected Q sysbuf */ uint64_t eager_sysbuf_recv_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t eager_cuCopy_isend; uint64_t eager_cuCopy_isend_bytes; uint64_t eager_gdr_isend; @@ -212,7 +212,7 @@ struct ptl_strategy_stats { uint64_t eager_copy_cpu_send_bytes; uint64_t eager_dma_cpu_send; uint64_t eager_dma_cpu_send_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t eager_cuCopy_send; uint64_t eager_cuCopy_send_bytes; uint64_t eager_gdr_send; @@ -221,7 +221,7 @@ struct ptl_strategy_stats { uint64_t eager_cpu_recv; uint64_t eager_cpu_recv_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t eager_gdrcopy_recv; uint64_t eager_gdrcopy_recv_bytes; uint64_t eager_cuCopy_recv; @@ -230,13 +230,13 @@ struct ptl_strategy_stats { uint64_t rndv_cpu_isend; uint64_t rndv_cpu_isend_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t rndv_gpu_isend; uint64_t rndv_gpu_isend_bytes; #endif uint64_t rndv_cpu_send; uint64_t rndv_cpu_send_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t rndv_gpu_send; uint64_t rndv_gpu_send_bytes; #endif @@ -246,7 +246,7 @@ struct ptl_strategy_stats { uint64_t rndv_rts_cpu_recv_bytes; uint64_t rndv_rts_sysbuf_recv; uint64_t rndv_rts_sysbuf_recv_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t rndv_rts_cuCopy_recv; uint64_t rndv_rts_cuCopy_recv_bytes; #endif @@ -261,7 +261,7 @@ struct ptl_strategy_stats { uint64_t rndv_long_cpu_recv_bytes; uint64_t rndv_long_gpu_recv; /* per RTS */ uint64_t rndv_long_gpu_recv_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t rndv_long_cuCopy_recv; uint64_t rndv_long_cuCopy_recv_bytes; uint64_t rndv_long_gdrcopy_recv; @@ -274,7 +274,7 @@ struct ptl_strategy_stats { uint64_t rndv_long_copy_cpu_send_bytes; uint64_t 
rndv_long_dma_cpu_send; /* per CTS (1 per RTS) */ uint64_t rndv_long_dma_cpu_send_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t rndv_long_cuCopy_send; /* per CTS (1 per RTS) */ uint64_t rndv_long_cuCopy_send_bytes; uint64_t rndv_long_gdrcopy_send; /* per CTS (1 per RTS) */ @@ -286,7 +286,7 @@ struct ptl_strategy_stats { /* RDMA approach selected by receiver */ uint64_t rndv_rdma_cpu_recv; /* per RTS */ uint64_t rndv_rdma_cpu_recv_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t rndv_rdma_gdr_recv; /* per RTS */ uint64_t rndv_rdma_gdr_recv_bytes; uint64_t rndv_rdma_hbuf_recv; /* per RTS */ @@ -297,7 +297,7 @@ struct ptl_strategy_stats { /* RDMA may use >= 1 CTS per RTS */ uint64_t rndv_rdma_cpu_send; /* per CTS */ uint64_t rndv_rdma_cpu_send_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t rndv_rdma_gdr_send; /* per CTS */ uint64_t rndv_rdma_gdr_send_bytes; uint64_t rndv_rdma_hbuf_send; /* per CTS */ diff --git a/prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.c b/prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.c deleted file mode 100644 index bc4b4798b16..00000000000 --- a/prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.c +++ /dev/null @@ -1,515 +0,0 @@ -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2016 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2016 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#ifdef PSM_CUDA - -#include "psm_user.h" -#include "am_cuda_memhandle_cache.h" - -/* - * rbtree cruft - */ -struct _cl_map_item; - -typedef struct -{ - unsigned long start; /* start virtual address */ - CUipcMemHandle cuda_ipc_handle; /* cuda ipc mem handle */ - CUdeviceptr cuda_ipc_dev_ptr;/* Cuda device pointer */ - psm2_epid_t epid; - struct _cl_map_item* i_prev; /* idle queue previous */ - struct _cl_map_item* i_next; /* idle queue next */ -}__attribute__ ((aligned (128))) rbtree_cuda_memhandle_cache_mapitem_pl_t; - -typedef struct { - uint32_t nelems; /* number of elements in the cache */ -} rbtree_cuda_memhandle_cache_map_pl_t; - -static psm2_error_t am_cuda_memhandle_mpool_alloc( - am_cuda_memhandle_cache_t cache, uint32_t memcache_size); - -/* - * Custom comparator - */ -typedef rbtree_cuda_memhandle_cache_mapitem_pl_t cuda_cache_item; - -static int cuda_cache_key_cmp(const cuda_cache_item *a, const cuda_cache_item *b) -{ - // we use epid as part of cache key so multi-ep and multi-process jobs - // can have a better cache hit rate. In some cases we may end up with - // cache entries for the same buffer with different epid's all within the - // same multi-ep rank, but this does no harm other than to waste some - // cache space. By including epid in key_cmp we have a chance to have - // separate cache entries for the same sbuf address in different - // sender's GPU virtual address space. - switch (psm3_epid_cmp_internal(a->epid, b->epid)) { - case -1: return -1; - case 1: return 1; - default: - break; - } - - // The sender has used cuMemGetAddressRange to normalize the address - // so we can simply compare the start address of the allocation. - // Note cuIpcOpenMemHandle only needs the start address as well, so we - // ignore length - if (a->start < b->start) - return -1; - if (b->start < a->start) - return 1; - - return 0; -} - - -/* - * Necessary rbtree cruft - */ -#define RBTREE_MI_PL rbtree_cuda_memhandle_cache_mapitem_pl_t -#define RBTREE_MAP_PL rbtree_cuda_memhandle_cache_map_pl_t -#define RBTREE_CMP(a,b) cuda_cache_key_cmp((a), (b)) -#define RBTREE_ASSERT psmi_assert -#define RBTREE_MAP_COUNT(PAYLOAD_PTR) ((PAYLOAD_PTR)->nelems) -#define RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR - -#include "psm3_rbtree.h" -#include "psm3_rbtree.c" - -/* - * Convenience rbtree cruft - */ -#define NELEMS(cache) ((cache)->map.payload.nelems) - -#define IHEAD(cache) ((cache)->map.root) -#define LAST(cache) (IHEAD(cache)->payload.i_prev) -#define FIRST(cache) (IHEAD(cache)->payload.i_next) -#define INEXT(x) ((x)->payload.i_next) -#define IPREV(x) ((x)->payload.i_prev) - -/* - * Actual module data - */ -struct am_cuda_memhandle_cache { - cl_qmap_t map; - mpool_t mpool; - uint32_t size; - psm2_mq_stats_t *stats; -}; - -static void print_cuda_memhandle_cache_stats(psm2_mq_stats_t *stats) -{ - _HFI_DBG("limit=%lu,maxelems=%lu,hit=%lu,miss=%lu,evict=%lu,remove=%lu,clear=%lu\n", - stats->gpu_ipc_cache_limit, stats->gpu_ipc_cache_max_nelems, - stats->gpu_ipc_cache_hit, stats->gpu_ipc_cache_miss, - stats->gpu_ipc_cache_evict, stats->gpu_ipc_cache_remove, - stats->gpu_ipc_cache_clear); -} - -/* - * This is the callback function when mempool are resized or destroyed. - * Upon calling cache fini mpool is detroyed which in turn calls this callback - * which helps in closing all memhandles. 
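/*
 * Illustrative sketch (not part of the patch): the comparator in the deleted
 * cache above orders entries first by sender epid and then by the normalized
 * base address of the allocation, so each peer's GPU address space gets its
 * own entries. A generic shape of that two-level key compare, with a
 * hypothetical epid_cmp_sketch() standing in for psm3_epid_cmp_internal():
 */
#include <stdint.h>

struct ipc_key_sketch {
	uint64_t epid;		/* stand-in for psm2_epid_t */
	uint64_t start;		/* allocation base address on the sender */
};

static int epid_cmp_sketch(uint64_t a, uint64_t b)
{ return (a < b) ? -1 : (a > b) ? 1 : 0; }

static int ipc_key_cmp_sketch(const struct ipc_key_sketch *a,
			      const struct ipc_key_sketch *b)
{
	int r = epid_cmp_sketch(a->epid, b->epid);	/* group by sending endpoint */
	if (r)
		return r;
	/* within one sender, the normalized start address identifies the allocation */
	if (a->start < b->start)
		return -1;
	if (b->start < a->start)
		return 1;
	return 0;
}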
- */ -static void -psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj) -{ - cl_map_item_t* memcache_item = (cl_map_item_t*)obj; - if (!is_alloc) { - if(memcache_item->payload.start) - PSMI_CUDA_CALL(cuIpcCloseMemHandle, - memcache_item->payload.cuda_ipc_dev_ptr); - } -} - -/* - * Creating mempool for cuda memhandle cache nodes. - */ -static psm2_error_t -am_cuda_memhandle_mpool_alloc(am_cuda_memhandle_cache_t cache, - uint32_t memcache_size) -{ - psm2_error_t err; - if (memcache_size < 1) - return PSM2_PARAM_ERR; - - cache->size = memcache_size; - /* Creating a memory pool of size PSM3_CUDA_MEMCACHE_SIZE - * which includes the Root and NIL items - */ - cache->mpool = psm3_mpool_create_for_gpu(sizeof(cl_map_item_t), - cache->size, - cache->size, 0, - UNDEFINED, NULL, NULL, - psmi_cuda_memhandle_cache_alloc_func, - NULL); - if (cache->mpool == NULL) { - err = psm3_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, - "Couldn't allocate CUDA host receive buffer pool"); - return err; - } - return PSM2_OK; -} - -/* - * allocate and initialize memhandle cache - * including rbtree. - */ -psm2_error_t am_cuda_memhandle_cache_alloc(am_cuda_memhandle_cache_t *cachep, - uint32_t memcache_size, - psm2_mq_stats_t *stats) -{ - cl_map_item_t *root = NULL, *nil_item = NULL; - - *cachep = (am_cuda_memhandle_cache_t)psmi_calloc( - NULL, UNDEFINED, 1, sizeof(**cachep)); - if (! *cachep) - return PSM2_NO_MEMORY; - - psm2_error_t err = am_cuda_memhandle_mpool_alloc(*cachep, memcache_size); - if (err != PSM2_OK) - goto fail; - - root = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); - if (root == NULL) { - err = PSM2_NO_MEMORY; - goto fail; - } - nil_item = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); - if (nil_item == NULL) { - err = PSM2_NO_MEMORY; - goto fail; - } - - nil_item->payload.start = 0; - nil_item->payload.epid = psm3_epid_zeroed_internal(); - ips_cl_qmap_init(&(*cachep)->map,root,nil_item); - NELEMS(*cachep) = 0; - - (*cachep)->stats = stats; - - stats->gpu_ipc_cache_limit = memcache_size; - stats->gpu_ipc_cache_nelems = 0; - stats->gpu_ipc_cache_max_nelems = 0; - stats->gpu_ipc_cache_hit = 0; - stats->gpu_ipc_cache_miss = 0; - stats->gpu_ipc_cache_evict = 0; - stats->gpu_ipc_cache_remove = 0; - stats->gpu_ipc_cache_clear = 0; - - return PSM2_OK; - -fail: - if (nil_item) - psmi_free(nil_item); - if (root) - psmi_free(root); - if ((*cachep)->mpool) - psm3_mpool_destroy((*cachep)->mpool); - psmi_free(*cachep); - return err; -} - -void am_cuda_memhandle_cache_free(am_cuda_memhandle_cache_t cache) -{ - print_cuda_memhandle_cache_stats(cache->stats); - - if (cache->map.nil_item) - psmi_free(cache->map.nil_item); - if (cache->map.root) - psmi_free(cache->map.root); - if (cache->mpool) - psm3_mpool_destroy(cache->mpool); - psmi_free(cache); -} - -/* - * Insert at the head of Idleq. - */ -static void -am_cuda_idleq_insert(am_cuda_memhandle_cache_t cache, cl_map_item_t* memcache_item) -{ - if (FIRST(cache) == NULL) { - FIRST(cache) = memcache_item; - LAST(cache) = memcache_item; - return; - } - INEXT(FIRST(cache)) = memcache_item; - IPREV(memcache_item) = FIRST(cache); - FIRST(cache) = memcache_item; - INEXT(FIRST(cache)) = NULL; - return; -} - -/* - * Remove least recent used element. 
- */ -static void -am_cuda_idleq_remove_last(am_cuda_memhandle_cache_t cache, cl_map_item_t* memcache_item) -{ - if (!INEXT(memcache_item)) { - LAST(cache) = NULL; - FIRST(cache) = NULL; - } else { - LAST(cache) = INEXT(memcache_item); - IPREV(LAST(cache)) = NULL; - } - // Null-out now-removed memcache_item's next and prev pointers out of - // an abundance of caution - INEXT(memcache_item) = IPREV(memcache_item) = NULL; -} - -static void -am_cuda_idleq_remove(am_cuda_memhandle_cache_t cache, cl_map_item_t* memcache_item) -{ - if (LAST(cache) == memcache_item) { - am_cuda_idleq_remove_last(cache, memcache_item); - } else if (FIRST(cache) == memcache_item) { - FIRST(cache) = IPREV(memcache_item); - INEXT(FIRST(cache)) = NULL; - } else { - INEXT(IPREV(memcache_item)) = INEXT(memcache_item); - IPREV(INEXT(memcache_item)) = IPREV(memcache_item); - } - // Null-out now-removed memcache_item's next and prev pointers out of - // an abundance of caution - INEXT(memcache_item) = IPREV(memcache_item) = NULL; -} - -static void -am_cuda_idleq_reorder(am_cuda_memhandle_cache_t cache, cl_map_item_t* memcache_item) -{ - if (FIRST(cache) == memcache_item && LAST(cache) == memcache_item ) { - return; - } - am_cuda_idleq_remove(cache, memcache_item); - am_cuda_idleq_insert(cache, memcache_item); - return; -} - -/* - * After a successful cache hit, item is validated by doing a - * memcmp on the handle stored and the handle we receive from the - * sender. If the validation fails the item is removed from the idleq, - * the rbtree, is put back into the mpool and cuIpcCloseMemHandle function - * is called. - * Cuda ipcMemHandles for distinct allocations are unique, even if the - * allocation was at the same address. So this check catches stale cache - * entries. - */ -static psm2_error_t -am_cuda_memhandle_cache_validate(am_cuda_memhandle_cache_t cache, - cl_map_item_t* memcache_item, - uintptr_t sbuf, CUipcMemHandle* handle, - psm2_epid_t epid) -{ - psmi_assert(!psm3_epid_cmp_internal(epid, memcache_item->payload.epid)); - psmi_assert(sbuf == memcache_item->payload.start); - if (0 == memcmp(handle, &memcache_item->payload.cuda_ipc_handle, - sizeof(CUipcMemHandle))) { - return PSM2_OK; - } - _HFI_DBG("cache collision: new entry start=%lu\n", sbuf); - - cache->stats->gpu_ipc_cache_remove++; - ips_cl_qmap_remove_item(&cache->map, memcache_item); - cache->stats->gpu_ipc_cache_nelems--; - PSMI_CUDA_CALL(cuIpcCloseMemHandle, - memcache_item->payload.cuda_ipc_dev_ptr); - am_cuda_idleq_remove(cache, memcache_item); - memset(memcache_item, 0, sizeof(*memcache_item)); - psm3_mpool_put(memcache_item); - return PSM2_OK_NO_PROGRESS; -} - -/* - * Current eviction policy: Least Recently Used. 
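/*
 * Illustrative sketch (not part of the patch): the idle-queue helpers in the
 * deleted cache above keep entries on an intrusive recency list; a hit moves
 * the entry to the front and eviction takes the tail. A minimal doubly linked
 * LRU of that shape, independent of the rbtree the real cache also maintains:
 */
#include <stddef.h>

struct lru_node_sketch {
	struct lru_node_sketch *prev;	/* toward more recently used */
	struct lru_node_sketch *next;	/* toward less recently used */
};

struct lru_sketch {
	struct lru_node_sketch *first;	/* most recently used */
	struct lru_node_sketch *last;	/* least recently used: eviction victim */
};

static void lru_remove_sketch(struct lru_sketch *q, struct lru_node_sketch *n)
{
	if (n->prev) n->prev->next = n->next; else q->first = n->next;
	if (n->next) n->next->prev = n->prev; else q->last = n->prev;
	n->prev = n->next = NULL;
}

static void lru_insert_front_sketch(struct lru_sketch *q, struct lru_node_sketch *n)
{
	n->prev = NULL;
	n->next = q->first;
	if (q->first) q->first->prev = n; else q->last = n;
	q->first = n;
}

/* on a cache hit: refresh recency; on overflow: evict q->last via lru_remove_sketch() */
static void lru_touch_sketch(struct lru_sketch *q, struct lru_node_sketch *n)
{
	lru_remove_sketch(q, n);
	lru_insert_front_sketch(q, n);
}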
- */ -static void -am_cuda_memhandle_cache_evict(am_cuda_memhandle_cache_t cache) -{ - cache->stats->gpu_ipc_cache_evict++; - cl_map_item_t *p_item = LAST(cache); - _HFI_VDBG("Removing (epid=%s,start=%lu,dev_ptr=0x%llX,it=%p) from cuda_memhandle_cachemap.\n", - psm3_epid_fmt_internal(p_item->payload.epid, 0), p_item->payload.start, - p_item->payload.cuda_ipc_dev_ptr, p_item); - ips_cl_qmap_remove_item(&cache->map, p_item); - cache->stats->gpu_ipc_cache_nelems--; - PSMI_CUDA_CALL(cuIpcCloseMemHandle, p_item->payload.cuda_ipc_dev_ptr); - am_cuda_idleq_remove_last(cache, p_item); - memset(p_item, 0, sizeof(*p_item)); - psm3_mpool_put(p_item); -} - -static psm2_error_t -am_cuda_memhandle_cache_register(am_cuda_memhandle_cache_t cache, - uintptr_t sbuf, CUipcMemHandle* handle, - psm2_epid_t epid, - CUdeviceptr cuda_ipc_dev_ptr) -{ - if (NELEMS(cache) == cache->size) - am_cuda_memhandle_cache_evict(cache); - - cl_map_item_t* memcache_item = psm3_mpool_get(cache->mpool); - /* memcache_item cannot be NULL as we evict - * before the call to mpool_get. Check has - * been fixed to help with klockwork analysis. - */ - if (memcache_item == NULL) - return PSM2_NO_MEMORY; - memcache_item->payload.start = sbuf; - memcache_item->payload.cuda_ipc_handle = *handle; - memcache_item->payload.cuda_ipc_dev_ptr = cuda_ipc_dev_ptr; - memcache_item->payload.epid = epid; - ips_cl_qmap_insert_item(&cache->map, memcache_item); - cache->stats->gpu_ipc_cache_nelems++; - if (cache->stats->gpu_ipc_cache_nelems > cache->stats->gpu_ipc_cache_max_nelems) - cache->stats->gpu_ipc_cache_max_nelems = cache->stats->gpu_ipc_cache_nelems; - am_cuda_idleq_insert(cache, memcache_item); - return PSM2_OK; -} - -static void am_cuda_memhandle_cache_clear(am_cuda_memhandle_cache_t cache) -{ - _HFI_DBG("Closing all handles, clearing cuda_memhandle_cachemap and idleq. NELEMS=%u\n", NELEMS(cache)); - while (NELEMS(cache)) { - am_cuda_memhandle_cache_evict(cache); - } - cache->stats->gpu_ipc_cache_clear++; - _HFI_DBG("Closed all handles, cleared cuda_memhandle_cachemap and idleq. NELEMS=%u\n", NELEMS(cache)); -} - -/* - * The key used to search the cache is the senders buf address pointer and - * epid. The sender will have used cuMemGetAddressRange - * to find the start of the memory containing the buffer (supplied as sbuf). - * Upon match, we must validate the entry we find and may need to replace it. - */ -CUdeviceptr -am_cuda_memhandle_acquire(am_cuda_memhandle_cache_t cache, - uintptr_t sbuf, CUipcMemHandle* handle, - psm2_epid_t epid) -{ - _HFI_VDBG("sbuf=%lu,handle=%p,epid=%s\n", - sbuf, handle, psm3_epid_fmt_internal(epid, 0)); - - CUdeviceptr cuda_ipc_dev_ptr; - if(! cache) { - PSMI_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr, - *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); - return cuda_ipc_dev_ptr; - } - - cuda_cache_item key = { - .start = (unsigned long) sbuf, - .epid = epid - }; - - /* - * preconditions: - * 1) buffer [start,epid) may or may not be in cachemap already - * 2) there are no duplicate entries in cachemap - * postconditions: - * 1) buffer is in cachemap with same handle, epid - * 2) there are no duplicate entries in cachemap - * - * The key used to search the cache is the senders buf address pointer - * and epid. - * Upon a succesful hit in the cache, additional validation is required - * as the handle could be stale. 
- */ - cl_map_item_t *p_item = ips_cl_qmap_searchv(&cache->map, &key); - if (p_item->payload.start) { - // confirm the entry for sbuf matches the handle and is not stale - if (am_cuda_memhandle_cache_validate(cache, p_item, sbuf, handle, epid) == PSM2_OK) { - cache->stats->gpu_ipc_cache_hit++; - am_cuda_idleq_reorder(cache, p_item); - return p_item->payload.cuda_ipc_dev_ptr; - } - - // buffer found was stale am_cuda_memhandle_cache_validate() - // closed and removed existing entry. - // Should find no more duplicates -#ifdef PSM_DEBUG - p_item = ips_cl_qmap_searchv(&cache->map, &key); - psmi_assert(! p_item->payload.start); -#endif - } - cache->stats->gpu_ipc_cache_miss++; - - CUresult cudaerr; - PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_ALREADY_MAPPED, cuIpcOpenMemHandle, - &cuda_ipc_dev_ptr, *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); - - if (cudaerr == CUDA_ERROR_ALREADY_MAPPED) { - // remote memory already mapped. Close all handles, clear cache, - // and try again - am_cuda_memhandle_cache_clear(cache); - PSMI_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr, *handle, - CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); - } - - am_cuda_memhandle_cache_register(cache, sbuf, handle, - epid, cuda_ipc_dev_ptr); - return cuda_ipc_dev_ptr; -} - -void -am_cuda_memhandle_release(am_cuda_memhandle_cache_t cache, - CUdeviceptr cuda_ipc_dev_ptr) -{ - if(! cache) - PSMI_CUDA_CALL(cuIpcCloseMemHandle, cuda_ipc_dev_ptr); - return; -} - -#endif diff --git a/prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.h b/prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.h deleted file mode 100644 index 4b1cf744545..00000000000 --- a/prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.h +++ /dev/null @@ -1,91 +0,0 @@ -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2016 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2016 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
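/*
 * Illustrative sketch (not part of the patch): am_cuda_memhandle_acquire()
 * above looks up the sender's (epid, base address), revalidates the stored
 * IPC handle on a hit, and only opens a new mapping on a miss or a stale
 * entry. A deliberately tiny one-slot version of that control flow, with a
 * hypothetical open_ipc_sketch() standing in for cuIpcOpenMemHandle():
 */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct ipc_handle_sketch { unsigned char bytes[64]; };	/* stand-in for CUipcMemHandle */

static struct {
	bool valid;
	uint64_t epid, start;			/* cache key */
	struct ipc_handle_sketch handle;	/* handle seen last time */
	void *mapped;				/* mapping returned last time */
} slot_sketch;

static void *open_ipc_sketch(const struct ipc_handle_sketch *h)
{ return (void *)h; }				/* placeholder "mapping" */

static void *acquire_sketch(uint64_t epid, uint64_t start,
			    const struct ipc_handle_sketch *h)
{
	if (slot_sketch.valid && slot_sketch.epid == epid &&
	    slot_sketch.start == start) {
		if (!memcmp(h, &slot_sketch.handle, sizeof(*h)))
			return slot_sketch.mapped;	/* hit: reuse mapping */
		slot_sketch.valid = false;		/* stale: new allocation at same address */
	}
	void *mapped = open_ipc_sketch(h);		/* miss or stale: open again */
	slot_sketch.valid = true;
	slot_sketch.epid = epid;
	slot_sketch.start = start;
	slot_sketch.handle = *h;
	slot_sketch.mapped = mapped;
	return mapped;
}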
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifdef PSM_CUDA - -#ifndef _AM_CUDA_MEMHANDLE_CACHE_H -#define _AM_CUDA_MEMHANDLE_CACHE_H - -#include "psm_user.h" -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define CUDA_MEMHANDLE_CACHE_SIZE 64 - -struct am_cuda_memhandle_cache; // opaque since contains rbtree fields -typedef struct am_cuda_memhandle_cache *am_cuda_memhandle_cache_t; - -psm2_error_t am_cuda_memhandle_cache_alloc(am_cuda_memhandle_cache_t *cachep, - uint32_t memcache_size, - psm2_mq_stats_t *stats); - -CUdeviceptr -am_cuda_memhandle_acquire(am_cuda_memhandle_cache_t cache, - uintptr_t sbuf, CUipcMemHandle* handle, - psm2_epid_t epid); -void -am_cuda_memhandle_release(am_cuda_memhandle_cache_t cache, - CUdeviceptr cuda_ipc_dev_ptr); - -void am_cuda_memhandle_cache_free(am_cuda_memhandle_cache_t cache); - -#ifdef __cplusplus -} /* extern "C" */ -#endif - -#endif /* _AM_CUDA_MEMHANDLE_CACHE_H */ - -#endif /* PSM_CUDA */ diff --git a/prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.c b/prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.c deleted file mode 100644 index ac561c6d32f..00000000000 --- a/prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.c +++ /dev/null @@ -1,696 +0,0 @@ -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2022 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2022 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifdef PSM_ONEAPI - -#include "psm_user.h" -#include "psm_am_internal.h" -#include "am_oneapi_memhandle_cache.h" -#include -#include -#ifdef HAVE_DRM -#include -#include -#endif -#ifdef HAVE_LIBDRM -#include -#include -#endif -#ifdef PSM_HAVE_PIDFD -#include -#endif - -#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) -/* - * rbtree cruft - */ -struct _cl_map_item; - -typedef struct -{ - unsigned long start; /* start(base) virtual address - in peer process */ - uint32_t ze_handle; /* Sender's GEM handle or fd */ - uint64_t alloc_id; /* ze alloc_id */ - void *buf_ptr; /* buffer pointer in this - process */ - psm2_epid_t epid; - struct _cl_map_item* i_prev; /* idle queue previous */ - struct _cl_map_item* i_next; /* idle queue next */ - am_ze_memhandle_cache_t cache; /* only for gem_handle close */ -}__attribute__ ((aligned (128))) rbtree_ze_memhandle_cache_mapitem_pl_t; - -typedef struct { - uint32_t nelems; /* number of elements in the cache */ -} rbtree_ze_memhandle_cache_map_pl_t; - -static psm2_error_t am_ze_memhandle_mpool_alloc( - am_ze_memhandle_cache_t cache, uint32_t memcache_size); -static void am_ze_memhandle_delete(void *buf_ptr); - -/* - * Custom comparator - */ -typedef rbtree_ze_memhandle_cache_mapitem_pl_t ze_cache_item; - -static int ze_cache_key_cmp(const ze_cache_item *a, const ze_cache_item *b) -{ - // we use epid as part of cache key so multi-ep and multi-process jobs - // can have a better cache hit rate. In some cases we may end up with - // cache entries for the same buffer with different epid's all within the - // same multi-ep rank, but this does no harm other than to waste some - // cache space. By including epid in key_cmp we have a chance to have - // separate cache entries for the same sbuf address in different - // sender's GPU virtual address space. - switch (psm3_epid_cmp_internal(a->epid, b->epid)) { - case -1: return -1; - case 1: return 1; - default: - break; - } - - // The sender has used zeMemGetAddressRange to normalize the address - // so we can simply compare the start address of the allocation. 
- // Note zeMemOpenIpcHandle only needs the start address as well, so we - // ignore length - if (a->start < b->start) - return -1; - if (b->start < a->start) - return 1; - - return 0; -} - - -/* - * Necessary rbtree cruft - */ -#define RBTREE_MI_PL rbtree_ze_memhandle_cache_mapitem_pl_t -#define RBTREE_MAP_PL rbtree_ze_memhandle_cache_map_pl_t -#define RBTREE_CMP(a,b) ze_cache_key_cmp((a), (b)) -#define RBTREE_ASSERT psmi_assert -#define RBTREE_MAP_COUNT(PAYLOAD_PTR) ((PAYLOAD_PTR)->nelems) -#define RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR - -#include "psm3_rbtree.h" -#include "psm3_rbtree.c" - -/* - * Convenience rbtree cruft - */ -#define NELEMS(cache) ((cache)->map.payload.nelems) - -#define IHEAD(cache) ((cache)->map.root) -#define LAST(cache) (IHEAD(cache)->payload.i_prev) -#define FIRST(cache) (IHEAD(cache)->payload.i_next) -#define INEXT(x) ((x)->payload.i_next) -#define IPREV(x) ((x)->payload.i_prev) - -/* - * Actual module data - */ -struct am_ze_memhandle_cache { - cl_qmap_t map; - mpool_t mpool; - uint32_t size; - psm2_mq_stats_t *stats; -}; - -static void print_ze_memhandle_cache_stats(psm2_mq_stats_t *stats) -{ - _HFI_DBG("limit=%lu,maxelems=%lu,hit=%lu,miss=%lu,evict=%lu,remove=%lu,clear=%lu\n", - stats->gpu_ipc_cache_limit, stats->gpu_ipc_cache_max_nelems, - stats->gpu_ipc_cache_hit, stats->gpu_ipc_cache_miss, - stats->gpu_ipc_cache_evict, stats->gpu_ipc_cache_remove, - stats->gpu_ipc_cache_clear); -} - -/* - * This is the callback function when mempool are resized or destroyed. - * Upon calling cache free mpool is destroyed which in turn calls this callback - * which helps in closing all memhandles. - * TBD - only called for !is_alloc when destroying so could avoid keeping - * cache pointer in memcache_item. But when GEM_CLOSE is not needed - * memhandle_delete won't need destroyng flag and can remove cache pointer then - */ -static void -psmi_ze_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj) -{ - cl_map_item_t* memcache_item = (cl_map_item_t*)obj; - if (!is_alloc) { - if(memcache_item->payload.start) - am_ze_memhandle_delete(memcache_item->payload.buf_ptr); - } -} - -/* - * Creating mempool for ze memhandle cache nodes. - */ -static psm2_error_t -am_ze_memhandle_mpool_alloc(am_ze_memhandle_cache_t cache, - uint32_t memcache_size) -{ - psm2_error_t err; - if (memcache_size < 1) - return PSM2_PARAM_ERR; - - cache->size = memcache_size; - /* Creating a memory pool of size PSM3_ONEAPI_MEMCACHE_SIZE - * which includes the Root and NIL items - */ - cache->mpool = psm3_mpool_create_for_gpu(sizeof(cl_map_item_t), - cache->size, - cache->size, 0, - UNDEFINED, NULL, NULL, - psmi_ze_memhandle_cache_alloc_func, - NULL); - if (cache->mpool == NULL) { - err = psm3_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, - "Couldn't allocate ONEAPI host receive buffer pool"); - return err; - } - return PSM2_OK; -} - -/* - * allocate and initialize memhandle cache - * including rbtree. - */ -psm2_error_t am_ze_memhandle_cache_alloc(am_ze_memhandle_cache_t *cachep, - uint32_t memcache_size, - psm2_mq_stats_t *stats) -{ - cl_map_item_t *root = NULL, *nil_item = NULL; - - *cachep = (am_ze_memhandle_cache_t)psmi_calloc( - NULL, UNDEFINED, 1, sizeof(**cachep)); - if (! 
*cachep) - return PSM2_NO_MEMORY; - - psm2_error_t err = am_ze_memhandle_mpool_alloc(*cachep, memcache_size); - if (err != PSM2_OK) - return err; - - root = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); - if (root == NULL) { - err = PSM2_NO_MEMORY; - goto fail; - } - nil_item = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); - if (nil_item == NULL) { - err = PSM2_NO_MEMORY; - goto fail; - } - - nil_item->payload.start = 0; - nil_item->payload.epid = psm3_epid_zeroed_internal(); - ips_cl_qmap_init(&(*cachep)->map,root,nil_item); - NELEMS(*cachep) = 0; - - (*cachep)->stats = stats; - - stats->gpu_ipc_cache_limit = memcache_size; - stats->gpu_ipc_cache_nelems = 0; - stats->gpu_ipc_cache_max_nelems = 0; - stats->gpu_ipc_cache_hit = 0; - stats->gpu_ipc_cache_miss = 0; - stats->gpu_ipc_cache_evict = 0; - stats->gpu_ipc_cache_remove = 0; - stats->gpu_ipc_cache_clear = 0; - - return PSM2_OK; - -fail: - if (nil_item) - psmi_free(nil_item); - if (root) - psmi_free(root); - if ((*cachep)->mpool) - psm3_mpool_destroy((*cachep)->mpool); - psmi_free(*cachep); - return err; -} - -void am_ze_memhandle_cache_free(am_ze_memhandle_cache_t cache) -{ - print_ze_memhandle_cache_stats(cache->stats); - - if (cache->map.nil_item) - psmi_free(cache->map.nil_item); - if (cache->map.root) - psmi_free(cache->map.root); - if (cache->mpool) - psm3_mpool_destroy(cache->mpool); - psmi_free(cache); -} - -/* - * Insert at the head of Idleq. - */ -static void -am_ze_idleq_insert(am_ze_memhandle_cache_t cache, cl_map_item_t* memcache_item) -{ - if (FIRST(cache) == NULL) { - FIRST(cache) = memcache_item; - LAST(cache) = memcache_item; - return; - } - INEXT(FIRST(cache)) = memcache_item; - IPREV(memcache_item) = FIRST(cache); - FIRST(cache) = memcache_item; - INEXT(FIRST(cache)) = NULL; - return; -} - -/* - * Remove least recent used element. - */ -static void -am_ze_idleq_remove_last(am_ze_memhandle_cache_t cache, cl_map_item_t* memcache_item) -{ - if (!INEXT(memcache_item)) { - LAST(cache) = NULL; - FIRST(cache) = NULL; - } else { - LAST(cache) = INEXT(memcache_item); - IPREV(LAST(cache)) = NULL; - } - // Null-out now-removed memcache_item's next and prev pointers out of - // an abundance of caution - INEXT(memcache_item) = IPREV(memcache_item) = NULL; -} - -static void -am_ze_idleq_remove(am_ze_memhandle_cache_t cache, cl_map_item_t* memcache_item) -{ - if (LAST(cache) == memcache_item) { - am_ze_idleq_remove_last(cache, memcache_item); - } else if (FIRST(cache) == memcache_item) { - FIRST(cache) = IPREV(memcache_item); - INEXT(FIRST(cache)) = NULL; - } else { - INEXT(IPREV(memcache_item)) = INEXT(memcache_item); - IPREV(INEXT(memcache_item)) = IPREV(memcache_item); - } - // Null-out now-removed memcache_item's next and prev pointers out of - // an abundance of caution - INEXT(memcache_item) = IPREV(memcache_item) = NULL; -} - -static void -am_ze_idleq_reorder(am_ze_memhandle_cache_t cache, cl_map_item_t* memcache_item) -{ - if (FIRST(cache) == memcache_item && LAST(cache) == memcache_item ) { - return; - } - am_ze_idleq_remove(cache, memcache_item); - am_ze_idleq_insert(cache, memcache_item); - return; -} - -/* - * After a successful cache hit, item is validated by doing a - * memcmp on the handle stored and the handle we receive from the - * sender. If the validation fails the item is removed from the idleq, - * the rbtree, is put back into the mpool and ZeMemCloseIpcHandle function - * is called. 
- * Level Zero's alloc_id will be unique per allocation, even if the allocation - * was at the same address. In some cases, but not always, the ipc_handle - * will also be different. So we validate both, although just checking alloc_id - * would be sufficient. - */ - -static psm2_error_t -am_ze_memhandle_cache_validate(am_ze_memhandle_cache_t cache, - cl_map_item_t* memcache_item, - uintptr_t sbuf, uint32_t handle, - psm2_epid_t epid, uint64_t alloc_id) -{ - psmi_assert(!psm3_epid_cmp_internal(epid, memcache_item->payload.epid)); - psmi_assert(sbuf == memcache_item->payload.start); - if (handle == memcache_item->payload.ze_handle && - alloc_id == memcache_item->payload.alloc_id) { - return PSM2_OK; - } - _HFI_DBG("cache remove stale entry: new start=%lu,handle=%u,alloc_id=%lu\n", - sbuf, handle, alloc_id); - - cache->stats->gpu_ipc_cache_remove++; - ips_cl_qmap_remove_item(&cache->map, memcache_item); - cache->stats->gpu_ipc_cache_nelems--; - am_ze_memhandle_delete(memcache_item->payload.buf_ptr); - am_ze_idleq_remove(cache, memcache_item); - memset(memcache_item, 0, sizeof(*memcache_item)); - psm3_mpool_put(memcache_item); - return PSM2_OK_NO_PROGRESS; -} - -/* - * Current eviction policy: Least Recently Used. - */ -static void -am_ze_memhandle_cache_evict(am_ze_memhandle_cache_t cache) -{ - cache->stats->gpu_ipc_cache_evict++; - cl_map_item_t *p_item = LAST(cache); - _HFI_VDBG("Removing (epid=%s,start=%lu,dev_ptr=%p,it=%p) from ze_memhandle_cachemap.\n", - psm3_epid_fmt_internal(p_item->payload.epid, 0), p_item->payload.start, - p_item->payload.buf_ptr, p_item); - ips_cl_qmap_remove_item(&cache->map, p_item); - cache->stats->gpu_ipc_cache_nelems--; - am_ze_memhandle_delete(p_item->payload.buf_ptr); - am_ze_idleq_remove_last(cache, p_item); - memset(p_item, 0, sizeof(*p_item)); - psm3_mpool_put(p_item); -} - -static psm2_error_t -am_ze_memhandle_cache_register(am_ze_memhandle_cache_t cache, - uintptr_t sbuf, uint32_t handle, - psm2_epid_t epid, - void *buf_ptr, uint64_t alloc_id) -{ - if (NELEMS(cache) == cache->size) - am_ze_memhandle_cache_evict(cache); - - cl_map_item_t* memcache_item = psm3_mpool_get(cache->mpool); - /* memcache_item cannot be NULL as we evict - * before the call to mpool_get. Check has - * been fixed to help with klockwork analysis. 
- */ - if (memcache_item == NULL) - return PSM2_NO_MEMORY; - memcache_item->payload.start = sbuf; - memcache_item->payload.ze_handle = handle; - memcache_item->payload.buf_ptr = buf_ptr; - memcache_item->payload.alloc_id = alloc_id; - memcache_item->payload.epid = epid; - memcache_item->payload.cache = cache; - ips_cl_qmap_insert_item(&cache->map, memcache_item); - cache->stats->gpu_ipc_cache_nelems++; - if (cache->stats->gpu_ipc_cache_nelems > cache->stats->gpu_ipc_cache_max_nelems) - cache->stats->gpu_ipc_cache_max_nelems = cache->stats->gpu_ipc_cache_nelems; - am_ze_idleq_insert(cache, memcache_item); - _HFI_VDBG("registered: handle %u sbuf 0x%lx ptr %p alloc_id %lu\n", - handle, sbuf, buf_ptr, alloc_id); - return PSM2_OK; -} - -#ifndef PSM_HAVE_PIDFD -static inline psm2_error_t am_ze_prepare_fds_for_ipc_import( - uint32_t gem_handle, int device_index, int *ipc_fd, - psm2_epaddr_t epaddr) -{ - am_epaddr_t *am_epaddr = (am_epaddr_t*)epaddr; - int fd; - struct drm_prime_handle open_fd = {0, 0, -1}; - - if (device_index >= num_ze_devices) { - _HFI_ERROR("am_ze_memhandle_acquire received invalid device_index from peer: %d\n", - device_index); - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "device_index " - "invalid - received from peer: %d", - device_index); - return PSM2_INTERNAL_ERR; - } - fd = am_epaddr->peer_fds[device_index]; - cur_ze_dev = &ze_devices[device_index]; - open_fd.flags = DRM_CLOEXEC | DRM_RDWR; - open_fd.handle = gem_handle; - if (ioctl(fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &open_fd) < 0) { - _HFI_ERROR("ioctl failed for DRM_IOCTL_PRIME_HANDLE_TO_FD: %s\n", strerror(errno)); - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "ioctl " - "failed for DRM_IOCTL_PRIME_HANDLE_TO_FD errno=%d", - errno); - return PSM2_INTERNAL_ERR; - } - *ipc_fd = open_fd.fd; - - return PSM2_OK; -} -#else -static inline psm2_error_t am_ze_prepare_fds_for_ipc_import( - uint32_t handle, int device_index, int *ipc_fd, - psm2_epaddr_t epaddr) -{ - int fd; - am_epaddr_t *am_epaddr = (am_epaddr_t *)epaddr; - - fd = syscall(__NR_pidfd_getfd, am_epaddr->pidfd, handle, 0); - if (fd < 0) { - _HFI_ERROR("pidfd_getfd failed %d: %s\n", fd, strerror(errno)); - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "pidfd_getfd failed errno=%d (%s)", - errno, strerror(errno)); - return PSM2_INTERNAL_ERR; - } - *ipc_fd = fd; - - return PSM2_OK; -} -#endif /* PSM_HAVE_PIDFD */ -#endif /* defined(HAVE_DRM) || defined(HAVE_LIBDRM) */ - -#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) -static void *am_ze_import_ipc_buf(uint32_t fd, uint8_t alloc_type) -{ - ze_external_memory_import_fd_t import_desc = {}; - void *ze_ipc_buf = NULL; - - import_desc.stype = ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMPORT_FD; - import_desc.flags = ZE_EXTERNAL_MEMORY_TYPE_FLAG_DMA_BUF; - import_desc.fd = fd; - - switch(alloc_type) { - case ZE_MEMORY_TYPE_HOST: - { - ze_host_mem_alloc_desc_t host_desc = {}; - - host_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC; - host_desc.pNext = &import_desc; - /* size & alignment are not used since this is an import.*/ - PSMI_ONEAPI_ZE_CALL(zeMemAllocHost, ze_context, &host_desc, - 0, 0, &ze_ipc_buf); - } - break; - case ZE_MEMORY_TYPE_DEVICE: - { - ze_device_mem_alloc_desc_t dev_desc = {}; - - dev_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; - dev_desc.pNext = &import_desc; - /* size & alignment are not used since this is an import. 
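/*
 * Illustrative sketch, not part of the patch: the PSM_HAVE_PIDFD branch above
 * duplicates the sender's dma-buf file descriptor straight out of the peer
 * process with pidfd_getfd(2), while the fallback branch goes through DRM
 * PRIME ioctls on pre-exchanged device fds.  The sketch assumes a kernel that
 * provides pidfd_open(2)/pidfd_getfd(2) (>= 5.6), that <sys/syscall.h> exposes
 * the SYS_pidfd_* numbers, and that peer_pid and the fd number in the peer's
 * fd table were exchanged beforehand (here via the shm connect protocol and
 * the RTS payload).
 */
#define _GNU_SOURCE
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>

static int fetch_peer_fd(pid_t peer_pid, int peer_fd)
{
	int pidfd, fd;

	/* typically opened once per connection and cached */
	pidfd = syscall(SYS_pidfd_open, peer_pid, 0);
	if (pidfd < 0) {
		fprintf(stderr, "pidfd_open(%d) failed: %s\n",
			(int)peer_pid, strerror(errno));
		return -1;
	}

	/* duplicate the peer's descriptor into our own fd table */
	fd = syscall(SYS_pidfd_getfd, pidfd, peer_fd, 0);
	if (fd < 0)
		fprintf(stderr, "pidfd_getfd failed: %s\n", strerror(errno));

	close(pidfd);
	return fd;	/* ready for the dma-buf import path shown above */
}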
*/ - PSMI_ONEAPI_ZE_CALL(zeMemAllocDevice, ze_context, &dev_desc, - 0, 0, cur_ze_dev->dev, &ze_ipc_buf); - } - break; - default: - _HFI_ERROR("Invalid alloc_type %u for fd %u\n", - alloc_type, fd); - return NULL; - } - - return ze_ipc_buf; -} -#endif /* defined(HAVE_DRM) || defined(HAVE_LIBDRM) */ - -/* - * The key used to search the cache is the senders buf address pointer and - * epid. The sender will have used zeMemGetAddressRange - * to find the start of the memory containing the buffer (supplied as sbuf) - * Upon match, we must validate the entry we find and may need to replace it. - */ -void * -am_ze_memhandle_acquire(am_ze_memhandle_cache_t cache, - uintptr_t sbuf, uint32_t handle, - psm2_epaddr_t epaddr, int device_index, - uint64_t alloc_id, uint8_t alloc_type) -{ - void *buf_ptr = NULL; - psm2_epid_t epid = epaddr->epid; -#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) - int ipc_fd = -1; -#endif - _HFI_VDBG("sbuf=%lu,handle=%u,epid=%s\n", - sbuf, handle, psm3_epid_fmt_internal(epid, 0)); -#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) - - if (!cache) { - if (am_ze_prepare_fds_for_ipc_import(handle, device_index, &ipc_fd, - epaddr) == PSM2_OK) { - buf_ptr = am_ze_import_ipc_buf(ipc_fd, alloc_type); - if (ipc_fd >= 0) { - if (close(ipc_fd) < 0) { - _HFI_ERROR("close failed for ipc_fd: %s\n", strerror(errno)); - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "close " - "failed for ipc_fd %d errno=%d", - ipc_fd, errno); - return NULL; - } - } - } - return buf_ptr; - } - - ze_cache_item key = { - .start = (unsigned long) sbuf, - .epid = epid - }; - - /* - * preconditions: - * 1) buffer [start,epid) may or may not be in cache->map already - * 2) there are no duplicate entries in cache->map - * postconditions: - * 1) buffer is in cache->map with same handle, epid, alloc_id - * 2) there are no duplicate entries in cache->map - * - * The key used to search the cache is the senders buf address pointer - * and epid. - * Upon a succesful hit in the cache, additional validation is required - * as the handle or alloc_id could be stale. - */ - cl_map_item_t *p_item = ips_cl_qmap_searchv(&cache->map, &key); - if (p_item->payload.start) { - // confirm the entry for sbuf matches the handle and is not stale - if (am_ze_memhandle_cache_validate(cache, p_item, sbuf, handle, - epid, alloc_id) == - PSM2_OK) { - cache->stats->gpu_ipc_cache_hit++; - am_ze_idleq_reorder(cache, p_item); - return p_item->payload.buf_ptr; - } - - // buffer found was stale am_oneapi_memhandle_cache_validate() - // closed and removed existing entry. - // Should find no more duplicates -#ifdef PSM_DEBUG - p_item = ips_cl_qmap_searchv(&cache->map, &key); - psmi_assert(! 
p_item->payload.start); -#endif - } - cache->stats->gpu_ipc_cache_miss++; - - if (am_ze_prepare_fds_for_ipc_import(handle, device_index, &ipc_fd, - epaddr) == PSM2_OK) { - buf_ptr = am_ze_import_ipc_buf(ipc_fd, alloc_type); - if (ipc_fd >= 0) { - if (close(ipc_fd) < 0) { - _HFI_ERROR("close failed for ipc_fd: %s\n", strerror(errno)); - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "close " - "failed for ipc_fd %d errno=%d", - ipc_fd, errno); - return NULL; - } - } - if (!buf_ptr) - return NULL; - } else { - return NULL; - } - - am_ze_memhandle_cache_register(cache, sbuf, handle, epid, buf_ptr, - alloc_id); - return buf_ptr; -#else // if no drm, set up to return NULL as oneapi ipc handles don't work without drm - buf_ptr = NULL; - return buf_ptr; -#endif // defined(HAVE_DRM) || defined(HAVE_LIBDRM) - -} - -#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) -void am_ze_memhandle_delete(void *buf_ptr) -{ - /* Release the reference to the buffer */ - PSMI_ONEAPI_ZE_CALL(zeMemFree, ze_context, buf_ptr); - -#ifndef PSM_HAVE_PIDFD - /* - * If pidfd is not used, we need to call GEM_CLOSE ioctl to remove the - * GEM handle from the handle cache of the peer device file's - * private file data in the kernel to avoid handle leak. However, we - * will have a potential risk condition that will fail a later request: - * (1) 3 requests with buf1, buf2, and buf1 are sent from sender side. - * Requests 1 and 3 uses the same buffer and therefore have the - * same gem_handle1. - * (2) buf1 is received and put into cache; - * (3) buf2 is received and buf1 is evicted from cache due to some - * condition (small cache size). As a result, gem_handle1 is closed - * through GEM_CLOSE ioctl. buf2 is put into cache. - * (4) Request 3 (with buf1) is received and HANDLE_TO_FD ioctl will - * fail because the gem_handle has been removed from peer device - * file's handle cache. - * For this reason, we prefer to leak the GEM handle over calling - * GEM_CLOSE. - */ -#endif -} -#endif /* HAVE_DRM or HAVE_LIBDRM */ - -void -am_ze_memhandle_release(am_ze_memhandle_cache_t cache, - void *buf_ptr) -{ -#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) - if (!cache) - am_ze_memhandle_delete(buf_ptr); -#endif // defined(HAVE_DRM) || defined(HAVE_LIBDRM) - return; -} - -#endif /* PSM_ONEAPI */ diff --git a/prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.h b/prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.h deleted file mode 100644 index 12539540507..00000000000 --- a/prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2022 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2022 Intel Corporation. 
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifdef PSM_ONEAPI - -#ifndef _AM_ONEAPI_MEMHANDLE_H -#define _AM_ONEAPI_MEMHANDLE_H - -#include "psm_user.h" -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define ONEAPI_MEMHANDLE_CACHE_SIZE 64 - -struct am_ze_memhandle_cache; // opaque since contains rbtree fields -typedef struct am_ze_memhandle_cache *am_ze_memhandle_cache_t; - -struct am_oneapi_ze_ipc_info { - uint32_t handle; /* GEM handle or file descriptor */ - uint8_t alloc_type; /* allocation type */ -}; -typedef struct am_oneapi_ze_ipc_info *am_oneapi_ze_ipc_info_t; - -psm2_error_t am_ze_memhandle_cache_alloc(am_ze_memhandle_cache_t *cachep, - uint32_t memcache_size, - psm2_mq_stats_t *stats); - -void * -am_ze_memhandle_acquire(am_ze_memhandle_cache_t cache, - uintptr_t sbuf, uint32_t handle, - psm2_epaddr_t epaddr, int device_index, - uint64_t alloc_id, uint8_t alloc_type); -void -am_ze_memhandle_release(am_ze_memhandle_cache_t cache, void *buf_ptr); - -void am_ze_memhandle_cache_free(am_ze_memhandle_cache_t cache); - -#ifdef __cplusplus -} /* extern "C" */ -#endif - -#endif /* _AM_ONEAPI_MEMHANDLE_H */ - -#endif /* PSM_ONEAPI */ diff --git a/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c b/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c index 89dbdd6cd87..722f9fdbb1b 100644 --- a/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c +++ b/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c @@ -59,6 +59,7 @@ #include #include #include +#include #include "psm_user.h" #include "psm_mq_internal.h" @@ -66,27 +67,6 @@ #include "cmarw.h" #include "psmi_wrappers.h" -#ifdef PSM_CUDA -#include "am_cuda_memhandle_cache.h" -#endif - -#ifdef PSM_ONEAPI -#include "am_oneapi_memhandle_cache.h" -#ifdef HAVE_DRM -#include -#include -#include -#endif -#ifdef HAVE_LIBDRM -#include -#include -#include -#endif -#ifdef PSM_HAVE_PIDFD -#include -#endif -#endif - /* AMLONG_PAYLOAD is number of bytes available in a bulk packet for payload. 
*/ #define AMLONG_PAYLOAD(FifoLong) ((FifoLong) - sizeof(am_pkt_bulk_t)) @@ -169,9 +149,9 @@ static uint32_t create_extra_ep_data() { uint32_t ret = getpid(); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* PID is at maximum 22 bits */ - ret |= my_gpu_device << 22; + ret |= psm3_my_gpu_device << 22; #endif return ret; @@ -190,12 +170,14 @@ static void am_update_directory(struct am_ctl_nodeinfo *, size_t segsz); static void amsh_atexit() { - static ips_atomic_t atexit_once = { 0 }; + static atomic_int atexit_once = 0; + int expected = 0; + psm2_ep_t ep; struct ptl_am *ptl; /* bail out if previous value is non-zero */ - if (ips_atomic_cmpxchg(&atexit_once, 0, 1) != 0) + if (!atomic_compare_exchange_strong(&atexit_once, &expected, 1)) return; ep = psm3_opened_endpoint; @@ -363,16 +345,7 @@ psm2_error_t psm3_shm_create(ptl_t *ptl_gen) } memset((void *) mapptr, 0, segsz); /* touch all of my pages */ -#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) - if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt()) - PSMI_CUDA_CALL(cuMemHostRegister, mapptr, segsz, - CU_MEMHOSTALLOC_PORTABLE); -#endif -#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - if (PSMI_IS_GPU_ENABLED) - PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, ze_driver, - mapptr, segsz); -#endif + PSM3_GPU_REGISTER_HOSTMEM(mapptr, segsz); /* Our own ep's info for ptl_am resides at the start of the shm object. Other processes need some of this info to @@ -421,36 +394,8 @@ psm2_error_t psm3_do_unmap(struct am_ctl_nodeinfo *nodeinfo) { psm2_error_t err = PSM2_OK; -#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) - if (PSMI_IS_GPU_ENABLED && cu_ctxt) { - /* ignore NOT_REGISTERED in case cuda initialized late */ - /* ignore other errors as context could be destroyed before this */ - CUresult cudaerr; - //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, - // cuMemHostUnregister, (void*)nodeinfo->amsh_shmbase); - psmi_count_cuMemHostUnregister++; - cudaerr = psmi_cuMemHostUnregister((void*)nodeinfo->amsh_shmbase); - if (cudaerr) { - const char *pStr = NULL; - psmi_count_cuGetErrorString++; - psmi_cuGetErrorString(cudaerr, &pStr); - _HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n", - cudaerr, pStr?pStr:"Unknown"); - } - } -#endif -#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - if (PSMI_IS_GPU_ENABLED) { - ze_result_t result; - //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, - // (void *)nodeinfo->amsh_shmbase); - psmi_count_zexDriverReleaseImportedPointer++; - result = psmi_zexDriverReleaseImportedPointer(ze_driver, - (void *)nodeinfo->amsh_shmbase); - if (result != ZE_RESULT_SUCCESS) { - _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); - } - } +#ifdef PSM_HAVE_GPU + PSM3_GPU_UNREGISTER_HOSTMEM((void*)nodeinfo->amsh_shmbase); #endif if (munmap((void *)nodeinfo->amsh_shmbase, am_ctl_sizeof_seg(nodeinfo))) { err = @@ -583,15 +528,8 @@ psm2_error_t psm3_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm // read every page in segment so faulted into our address space psm3_touch_mmap(dest_mapptr, segsz); -#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) - if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt()) - PSMI_CUDA_CALL(cuMemHostRegister, dest_mapptr, segsz, - CU_MEMHOSTALLOC_PORTABLE); -#endif -#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - if (PSMI_IS_GPU_ENABLED) - PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, 
ze_driver, - dest_mapptr, segsz); +#ifdef PSM_HAVE_GPU + PSM3_GPU_REGISTER_HOSTMEM(dest_mapptr, segsz); #endif shmidx = -1; @@ -732,36 +670,8 @@ psm2_error_t psm3_shm_detach(ptl_t *ptl_gen) shm_unlink(ptl->amsh_keyname); psmi_free(ptl->amsh_keyname); -#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) - if (PSMI_IS_GPU_ENABLED && cu_ctxt) { - /* ignore NOT_REGISTERED in case cuda initialized late */ - /* ignore other errors as context could be destroyed before this */ - CUresult cudaerr; - //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, - // cuMemHostUnregister, (void*)shmbase); - psmi_count_cuMemHostUnregister++; - cudaerr = psmi_cuMemHostUnregister((void*)shmbase); - if (cudaerr) { - const char *pStr = NULL; - psmi_count_cuGetErrorString++; - psmi_cuGetErrorString(cudaerr, &pStr); - _HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n", - cudaerr, pStr?pStr:"Unknown"); - } - } -#endif -#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - if (PSMI_IS_GPU_ENABLED) { - ze_result_t result; - //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, - // (void *)shmbase); - psmi_count_zexDriverReleaseImportedPointer++; - result = psmi_zexDriverReleaseImportedPointer(ze_driver, - (void *)shmbase); - if (result != ZE_RESULT_SUCCESS) { - _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); - } - } +#ifdef PSM_HAVE_GPU + PSM3_GPU_UNREGISTER_HOSTMEM((void*)shmbase); #endif if (munmap((void *)shmbase, am_ctl_sizeof_block(ptl))) { err = @@ -882,26 +792,11 @@ amsh_epaddr_add(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t shmidx, psm2_epaddr_t amaddr->return_shmidx = -1; amaddr->cstate_outgoing = AMSH_CSTATE_OUTGOING_NONE; amaddr->cstate_incoming = AMSH_CSTATE_INCOMING_NONE; -#ifdef PSM_ONEAPI -#ifdef PSM_HAVE_PIDFD - amaddr->pidfd = syscall(SYS_pidfd_open, ptl->am_ep[shmidx].pid, 0); - if (amaddr->pidfd < 0) { - _HFI_ERROR("pidfd_open failed: pid %u, ret %d (%s)\n", - ptl->am_ep[shmidx].pid, amaddr->pidfd, - strerror(errno)); +#ifdef PSM_HAVE_GPU + err = PSM3_GPU_SHM_EPADDR_ADD(ptl, amaddr); + if (err) goto fail; - } -#else - amaddr->num_peer_fds = 0; - { - int i; - for (i=0; i < MAX_ZE_DEVICES; i++) - amaddr->peer_fds[i] = -1; - } - amaddr->sock_connected_state = ZE_SOCK_NOT_CONNECTED; - amaddr->sock = -1; #endif -#endif /* PSM_ONEAPI */ /* other setup */ ptl->am_ep[shmidx].epaddr = epaddr; @@ -952,23 +847,8 @@ amsh_epaddr_update(ptl_t *ptl_gen, psm2_epaddr_t epaddr) return; } -struct ptl_connection_req { - int isdone; - int op; /* connect or disconnect */ - int numep; - int numep_left; - int phase; - - int *epid_mask; - const psm2_epid_t *epids; /* input epid list */ - psm2_epaddr_t *epaddr; - psm2_error_t *errors; /* inout errors */ - - /* Used for connect/disconnect */ - psm2_amarg_t args[6]; -}; - static + void amsh_free_epaddr(ptl_t *ptl_gen, psm2_epaddr_t epaddr) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; @@ -979,29 +859,11 @@ void amsh_free_epaddr(ptl_t *ptl_gen, psm2_epaddr_t epaddr) psmi_assert(ptl->am_ep[amaddr->shmidx].epaddr == epaddr); if (ptl->am_ep[amaddr->shmidx].epaddr == epaddr) ptl->am_ep[amaddr->shmidx].epaddr = NULL; -#ifdef PSM_ONEAPI -#ifdef PSM_HAVE_PIDFD - if (amaddr->pidfd >= 0) - close(amaddr->pidfd); -#else - { - int i; - for (i=0; i < MAX_ZE_DEVICES; i++) - if (amaddr->peer_fds[i] >= 0) - close(amaddr->peer_fds[i]); - } - if (amaddr->sock >= 0) - close(amaddr->sock); -#endif -#endif /* PSM_ONEAPI */ + PSM3_GPU_SHM_EPADDR_FREE(amaddr); 
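/*
 * Illustrative sketch, not part of the patch: amsh_atexit() above now uses
 * C11 <stdatomic.h> for its run-once guard instead of the provider's
 * ips_atomic_cmpxchg().  The same pattern in isolation:
 */
#include <stdatomic.h>
#include <stdio.h>

static void run_once(void)
{
	static atomic_int once = 0;
	int expected = 0;

	/* only the first caller swaps 0 -> 1 and performs the work */
	if (!atomic_compare_exchange_strong(&once, &expected, 1))
		return;

	printf("cleanup runs exactly once\n");
}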
psmi_free(epaddr); return; } -#define PTL_OP_CONNECT 0 -#define PTL_OP_DISCONNECT 1 -#define PTL_OP_ABORT 2 - static psm2_error_t amsh_ep_connreq_init(ptl_t *ptl_gen, int op, /* connect, disconnect or abort */ @@ -1009,17 +871,17 @@ amsh_ep_connreq_init(ptl_t *ptl_gen, int op, /* connect, disconnect or abort */ const int array_of_epid_mask[], psm2_error_t *array_of_errors, psm2_epaddr_t *array_of_epaddr, - struct ptl_connection_req **req_o) + struct am_ptl_connection_req **req_o) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; int i, cstate; psm2_epaddr_t epaddr; psm2_epid_t epid; - struct ptl_connection_req *req = NULL; + struct am_ptl_connection_req *req = NULL; - req = (struct ptl_connection_req *) + req = (struct am_ptl_connection_req *) psmi_calloc(ptl->ep, PER_PEER_ENDPOINT, 1, - sizeof(struct ptl_connection_req)); + sizeof(struct am_ptl_connection_req)); if (req == NULL) return PSM2_NO_MEMORY; req->isdone = 0; @@ -1043,7 +905,7 @@ amsh_ep_connreq_init(ptl_t *ptl_gen, int op, /* connect, disconnect or abort */ req->epid_mask[i] = AMSH_CMASK_NONE; /* no connect by default */ if (!array_of_epid_mask[i]) continue; - if (op == PTL_OP_CONNECT) { + if (op == AM_PTL_OP_CONNECT) { epid = array_of_epid[i]; /* Connect only to other processes reachable by shared memory. The self PTL handles loopback communication, so explicitly @@ -1068,12 +930,11 @@ amsh_ep_connreq_init(ptl_t *ptl_gen, int op, /* connect, disconnect or abort */ if (cstate == AMSH_CSTATE_OUTGOING_ESTABLISHED) { array_of_epaddr[i] = epaddr; array_of_errors[i] = PSM2_OK; -#ifdef PSM_ONEAPI -#ifndef PSM_HAVE_PIDFD - // set done so know to check in amsh_ep_connreq_poll_dev_fds - req->epid_mask[i] = AMSH_CMASK_DONE; -#endif -#endif + if (PSM3_GPU_SHM_DEV_FDS_NEEDED()) { + // set done so know to check in + // PSM3_GPU_SHM_DEV_FDS_CONNEQ_POLL + req->epid_mask[i] = AMSH_CMASK_DONE; + } } else { psmi_assert(cstate == AMSH_CSTATE_OUTGOING_NONE); @@ -1092,7 +953,7 @@ amsh_ep_connreq_init(ptl_t *ptl_gen, int op, /* connect, disconnect or abort */ psmi_assert(epaddr != NULL); _HFI_CONNDBG("Disconnect force=%d epid %s\n", - (op == PTL_OP_ABORT), psm3_epid_fmt_internal(epaddr->epid, 0)); + (op == AM_PTL_OP_ABORT), psm3_epid_fmt_internal(epaddr->epid, 0)); cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing; if (cstate == AMSH_CSTATE_OUTGOING_ESTABLISHED) { req->epid_mask[i] = AMSH_CMASK_PREREQ; @@ -1109,7 +970,7 @@ amsh_ep_connreq_init(ptl_t *ptl_gen, int op, /* connect, disconnect or abort */ if (req->numep_left == 0) { /* nothing to do */ psmi_free(req->epid_mask); psmi_free(req); - if (op != PTL_OP_ABORT) { + if (op != AM_PTL_OP_ABORT) { _HFI_CONNDBG("Nothing to connect, bump up phase\n"); ptl->connect_phase++; } @@ -1123,7 +984,7 @@ amsh_ep_connreq_init(ptl_t *ptl_gen, int op, /* connect, disconnect or abort */ static psm2_error_t -amsh_ep_connreq_poll(ptl_t *ptl_gen, struct ptl_connection_req *req) +amsh_ep_connreq_poll(ptl_t *ptl_gen, struct am_ptl_connection_req *req) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; int i, j, cstate; @@ -1137,7 +998,7 @@ amsh_ep_connreq_poll(ptl_t *ptl_gen, struct ptl_connection_req *req) psmi_assert_always(ptl->connect_phase == req->phase); - if (req->op == PTL_OP_DISCONNECT || req->op == PTL_OP_ABORT) { + if (req->op == AM_PTL_OP_DISCONNECT || req->op == AM_PTL_OP_ABORT) { for (i = 0; i < req->numep; i++) { if (req->epid_mask[i] == AMSH_CMASK_NONE || req->epid_mask[i] == AMSH_CMASK_DONE) @@ -1164,7 +1025,7 @@ amsh_ep_connreq_poll(ptl_t *ptl_gen, struct ptl_connection_req *req) psmi_assert(shmidx != 
(uint16_t)-1); req->args[2].u32w0 = create_extra_ep_data(); req->args[2].u32w1 = PSM2_OK; - if (req->op != PTL_OP_ABORT) + if (req->op != AM_PTL_OP_ABORT) req->args[3].u64w0 = (uint64_t) (uintptr_t) &req->errors[i]; else @@ -1229,12 +1090,7 @@ amsh_ep_connreq_poll(ptl_t *ptl_gen, struct ptl_connection_req *req) ((am_epaddr_t *) epaddr)->cstate_outgoing = AMSH_CSTATE_OUTGOING_ESTABLISHED; req->epid_mask[i] = AMSH_CMASK_DONE; -#ifdef PSM_ONEAPI -#ifndef PSM_HAVE_PIDFD - if (PSMI_IS_GPU_ENABLED) - psm3_send_dev_fds(ptl_gen, epaddr); -#endif -#endif + PSM3_GPU_SHM_DEV_FDS_SEND(ptl, (struct am_epaddr *)epaddr); continue; } } @@ -1341,7 +1197,7 @@ amsh_ep_connreq_poll(ptl_t *ptl_gen, struct ptl_connection_req *req) static psm2_error_t -amsh_ep_connreq_fini(ptl_t *ptl_gen, struct ptl_connection_req *req) +amsh_ep_connreq_fini(ptl_t *ptl_gen, struct am_ptl_connection_req *req) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; psm2_error_t err = PSM2_OK; @@ -1355,13 +1211,13 @@ amsh_ep_connreq_fini(ptl_t *ptl_gen, struct ptl_connection_req *req) /* This prevents future connect replies from referencing data structures * that disappeared. For abort we aren't waiting for DISC_REP so * we want to keep same phase so we accept them after this function */ - if (req->op != PTL_OP_ABORT) + if (req->op != AM_PTL_OP_ABORT) ptl->connect_phase++; /* First process any leftovers in postreq or prereq */ for (i = 0; i < req->numep; i++) { if (req->epid_mask[i] == AMSH_CMASK_NONE - || req->op == PTL_OP_ABORT) + || req->op == AM_PTL_OP_ABORT) continue; else if (req->epid_mask[i] == AMSH_CMASK_POSTREQ) { int cstate; @@ -1370,20 +1226,12 @@ amsh_ep_connreq_fini(ptl_t *ptl_gen, struct ptl_connection_req *req) if (cstate == AMSH_CSTATE_OUTGOING_REPLIED) { ((am_epaddr_t *) req->epaddr[i])->cstate_outgoing = AMSH_CSTATE_OUTGOING_ESTABLISHED; -#ifdef PSM_ONEAPI -#ifndef PSM_HAVE_PIDFD // late connect establish, check once to // see if have GPU dev fds, if not, this one // missed the timelimit and timesout - if (PSMI_IS_GPU_ENABLED && req->op == PTL_OP_CONNECT) - _HFI_CONNDBG("late established, special GPU dev FDs poll\n"); - if (PSMI_IS_GPU_ENABLED && req->op == PTL_OP_CONNECT && - PSM2_OK != psm3_check_dev_fds_exchanged(ptl_gen, - req->epaddr[i])) + if (PSM3_GPU_SHM_DEV_FDS_CHECK_EXCHANGED(ptl, req, i) != PSM2_OK) req->errors[i] = PSM2_TIMEOUT; else -#endif -#endif req->numep_left--; } else { /* never actually got reply */ req->errors[i] = PSM2_TIMEOUT; @@ -1403,7 +1251,7 @@ amsh_ep_connreq_fini(ptl_t *ptl_gen, struct ptl_connection_req *req) for (i = 0; i < req->numep; i++) { if (req->epid_mask[i] == AMSH_CMASK_NONE) continue; - if (req->op == PTL_OP_ABORT + if (req->op == AM_PTL_OP_ABORT && req->epid_mask[i] != AMSH_CMASK_DONE) { req->epid_mask[i] = AMSH_CMASK_DONE; continue; @@ -1415,7 +1263,7 @@ amsh_ep_connreq_fini(ptl_t *ptl_gen, struct ptl_connection_req *req) /* Only free epaddr if they have disconnected from us */ int cstate = ((am_epaddr_t *) req->epaddr[i])->cstate_incoming; if (cstate == AMSH_CSTATE_INCOMING_DISC_REQUESTED) { - if (req->op == PTL_OP_DISCONNECT || req->op == PTL_OP_ABORT) { + if (req->op == AM_PTL_OP_DISCONNECT || req->op == AM_PTL_OP_ABORT) { psmi_assert(req->epaddr[i] != NULL); amsh_free_epaddr(ptl_gen, req->epaddr[i]); req->epaddr[i] = NULL; @@ -1429,39 +1277,6 @@ amsh_ep_connreq_fini(ptl_t *ptl_gen, struct ptl_connection_req *req) return err; } -#ifdef PSM_ONEAPI -#ifndef PSM_HAVE_PIDFD -// check if all successful epid/epaddr in req have exchanged GPU dev FDs -// when called it assumes all 
the good epid have completed so it does not -// check failed epid and just treats them as done for this phase -// return: -// PSM2_OK - all that can be done are done -// PSM2_OK_NO_PROGRESS - more to be done -static -psm2_error_t -amsh_ep_connreq_poll_dev_fds(ptl_t *ptl_gen, struct ptl_connection_req *req) -{ - int num_left = 0; - int i; - - for (i = 0; i < req->numep; i++) { - if (req->epid_mask[i] == AMSH_CMASK_NONE) - continue; - if (req->epid_mask[i] != AMSH_CMASK_DONE || req->errors[i]) - continue; - psmi_assert(req->epaddr[i]); - psmi_assert(! psm3_epid_zero_internal(req->epaddr[i]->epid)); - if (PSM2_OK != psm3_check_dev_fds_exchanged(ptl_gen, req->epaddr[i])) - num_left++; - } - if (num_left == 0) - return PSM2_OK; - else - return PSM2_OK_NO_PROGRESS; // not done everyone yet -} -#endif -#endif /* PSM_ONEAPI */ - /* Wrapper for 2.0's use of connect/disconnect. The plan is to move the * init/poll/fini interface up to the PTL level for 2.2 */ #define CONNREQ_ZERO_POLLS_BEFORE_YIELD 20 @@ -1477,7 +1292,7 @@ amsh_ep_connreq_wrap(ptl_t *ptl_gen, int op, struct ptl_am *ptl = (struct ptl_am *)ptl_gen; psm2_error_t err; uint64_t t_start; - struct ptl_connection_req *req; + struct am_ptl_connection_req *req; int num_polls_noprogress = 0; static int shm_polite_attach = -1; @@ -1503,7 +1318,7 @@ amsh_ep_connreq_wrap(ptl_t *ptl_gen, int op, * there was an error */ return err; - if (op == PTL_OP_ABORT) { + if (op == AM_PTL_OP_ABORT) { int i; /* loop a couple times only, ignore timeout */ /* this will move from PREREQ to POSTREQ and check once @@ -1529,23 +1344,15 @@ amsh_ep_connreq_wrap(ptl_t *ptl_gen, int op, do { psm3_poll_internal(ptl->ep, 1, 0); err = amsh_ep_connreq_poll(ptl_gen, req); - if (err == PSM2_OK) -#ifndef PSM_ONEAPI - break; /* Finished before timeout */ -#elif !defined(PSM_HAVE_PIDFD) - { - if (PSMI_IS_GPU_ENABLED && req->op == PTL_OP_CONNECT) { - if (amsh_ep_connreq_poll_dev_fds(ptl_gen, req) == PSM2_OK) { + if (err == PSM2_OK) { + if (PSM3_GPU_IS_ENABLED && req->op == AM_PTL_OP_CONNECT) { + if (PSM3_GPU_SHM_DEV_FDS_CONNREQ_POLL(ptl, req) == PSM2_OK) break; /* Finished before timeout */ - } else { + else PSMI_YIELD(ptl->ep->mq->progress_lock); - } } else break; } -#else - break; -#endif else if (err != PSM2_OK_NO_PROGRESS) { psmi_free(req->epid_mask); psmi_free(req); @@ -1578,7 +1385,7 @@ amsh_ep_connect(ptl_t *ptl, psm2_error_t *array_of_errors, psm2_epaddr_t *array_of_epaddr, uint64_t timeout_ns) { - return amsh_ep_connreq_wrap(ptl, PTL_OP_CONNECT, numep, array_of_epid, + return amsh_ep_connreq_wrap(ptl, AM_PTL_OP_CONNECT, numep, array_of_epid, array_of_epid_mask, array_of_errors, array_of_epaddr, timeout_ns); } @@ -1591,7 +1398,7 @@ amsh_ep_disconnect(ptl_t *ptl, int force, int numep, psm2_error_t array_of_errors[], uint64_t timeout_ns) { return amsh_ep_connreq_wrap(ptl, - force ? PTL_OP_ABORT : PTL_OP_DISCONNECT, + force ? 
AM_PTL_OP_ABORT : AM_PTL_OP_DISCONNECT, numep, NULL, array_of_epaddr_mask, array_of_errors, array_of_epaddr, @@ -1746,14 +1553,8 @@ amsh_poll_internal_inner(ptl_t *ptl_gen, int replyonly, } while (!QISEMPTY(ptl->reqH.head->flag)); } } -#ifdef PSM_ONEAPI -#ifndef PSM_HAVE_PIDFD - // play err safe, callers ignore errors or expect just OK or NO_PROGRESS - if (((struct ptl_am *)ptl_gen)->ep->need_dev_fds_poll - && psm3_poll_dev_fds_exchange(ptl_gen) != PSM2_OK_NO_PROGRESS) - err = PSM2_OK; -#endif -#endif + + err = PSM3_GPU_SHM_DEV_FDS_POLL((struct ptl_am *)ptl_gen, err); if (is_internal) { if (err == PSM2_OK) /* some progress, no yields */ @@ -2236,25 +2037,8 @@ amsh_mq_rndv(ptl_t *ptl, psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, psm2_mq_tag_t *tag, const void *buf, uint32_t len) { -#ifdef PSM_ONEAPI psm2_amarg_t args[6]; -#else - psm2_amarg_t args[5]; -#endif psm2_error_t err = PSM2_OK; -#ifdef PSM_ONEAPI -#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) -#ifndef PSM_HAVE_PIDFD - int fd; - int *devfds; - int numfds; - int device_index = 0; -#endif - uint64_t handle_fd = 0; - size_t total; -#endif -#endif - args[0].u32w0 = MQ_MSG_LONGRTS; args[0].u32w1 = len; @@ -2271,124 +2055,27 @@ amsh_mq_rndv(ptl_t *ptl, psm2_mq_t mq, psm2_mq_req_t req, req->req_data.send_msglen = len; req->send_msgoff = 0; -#ifdef PSM_CUDA - /* If the send buffer is on gpu, we create a cuda IPC +#ifdef PSM_HAVE_GPU + /* If the send buffer is on gpu, we create a GPU IPC * handle and send it as payload in the RTS */ if (req->is_buf_gpu_mem) { - CUdeviceptr buf_base_ptr; - PSMI_CUDA_CALL(cuMemGetAddressRange, &buf_base_ptr, NULL, (CUdeviceptr)buf); - - /* Offset in GPU buffer from which we copy data, we have to - * send it separetly because this offset is lost - * when cuIpcGetMemHandle is called */ - req->cuda_ipc_offset = (uint32_t)((uintptr_t)buf - (uintptr_t)buf_base_ptr); - args[2].u32w0 = (uint32_t)req->cuda_ipc_offset; - - PSMI_CUDA_CALL(cuIpcGetMemHandle, - &req->cuda_ipc_handle, - (CUdeviceptr) buf); - if (req->flags_internal & PSMI_REQ_FLAG_FASTPATH) { - psm3_am_reqq_add(AMREQUEST_SHORT, ptl, - epaddr, mq_handler_hidx, - args, 5, (void*)&req->cuda_ipc_handle, - sizeof(CUipcMemHandle), NULL, 0); - } else { - psm3_amsh_short_request(ptl, epaddr, mq_handler_hidx, - args, 5, (void*)&req->cuda_ipc_handle, - sizeof(CUipcMemHandle), 0); - } - req->cuda_ipc_handle_attached = 1; - } else -#elif defined(PSM_ONEAPI) - /* If the send buffer is on gpu, we create a oneapi IPC - * handle and send it as payload in the RTS */ - if (req->is_buf_gpu_mem) { -#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) - void *buf_base_ptr; -#ifndef PSM_HAVE_PIDFD - struct drm_prime_handle open_fd = {0, 0, 0}; -#endif - uint64_t alloc_id; - struct am_oneapi_ze_ipc_info info; - -#ifndef PSM_HAVE_PIDFD - devfds = psm3_ze_get_dev_fds(&numfds); - device_index = cur_ze_dev - ze_devices; /* index (offset) in table */ - args[5].u32w0 = device_index; - fd = devfds[device_index]; -#endif - PSMI_ONEAPI_ZE_CALL(zeMemGetAddressRange, ze_context, buf, &buf_base_ptr, &total); - - /* Offset in GPU buffer from which we copy data, we have to - * send it separetly because this offset is lost - * when zeMemGetIpcHandle is called */ - req->ze_ipc_offset = (uint32_t)((uintptr_t)buf - (uintptr_t)buf_base_ptr); - args[2].u32w0 = (uint32_t)req->ze_ipc_offset; - alloc_id = psm3_oneapi_ze_get_alloc_id(buf_base_ptr, &info.alloc_type); -#ifndef PSM_HAVE_PIDFD - args[5].u32w1 = (uint32_t)alloc_id; /* 32-bit for now */ -#else - args[5].u64w0 = alloc_id; -#endif - - 
PSMI_ONEAPI_ZE_CALL(zeMemGetIpcHandle, - ze_context, - (const void *)buf_base_ptr, - &req->ipc_handle); -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE - PSMI_ONEAPI_ZE_CALL(zeMemGetFileDescriptorFromIpcHandleExp, ze_context, req->ipc_handle, &handle_fd); -#else - memcpy(&handle_fd, &req->ipc_handle, sizeof(uint32_t)); -#endif - req->ze_handle_attached = 1; -#ifndef PSM_HAVE_PIDFD - open_fd.fd = (uint32_t)handle_fd; - if (ioctl(fd, DRM_IOCTL_PRIME_FD_TO_HANDLE, &open_fd) < 0) { - struct ptl_am *ptl_am = (struct ptl_am *)ptl; - _HFI_ERROR("ioctl failed for DRM_IOCTL_PRIME_FD_TO_HANDLE: for fd %d: %s", open_fd.fd, strerror(errno)); - psm3_handle_error(ptl_am->ep, PSM2_INTERNAL_ERR, - "ioctl " - "failed for DRM_IOCTL_PRIME_FD_TO_HANDLE for fd %d: errno=%d", - open_fd.fd, errno); - err = PSM2_INTERNAL_ERR; + void *payload; + size_t payload_size; + union am_gpu_rts_payload info; + int narg; + err = PSM3_GPU_SHM_BUILD_RTS((struct ptl_am *)ptl, req, &narg, args, &payload, &payload_size, &info); + if (err) goto fail; - } - _HFI_VDBG("FD_TO_HANDLE: buf %p total 0x%lx base %p alloc_id %lu gem_handle %u\n", - buf, total, buf_base_ptr, alloc_id, open_fd.handle); - info.handle = open_fd.handle; if (req->flags_internal & PSMI_REQ_FLAG_FASTPATH) { psm3_am_reqq_add(AMREQUEST_SHORT, ptl, epaddr, mq_handler_hidx, - args, 6, (void *)&info, - sizeof(info), NULL, 0); + args, narg, payload, payload_size, NULL, 0); } else { psm3_amsh_short_request(ptl, epaddr, mq_handler_hidx, - args, 6, (void *)&info, - sizeof(info), 0); + args, narg, payload, payload_size, 0); } - // for DRM approach once we have the open_fd we could - // PutIpcHandle(ipc_handle) since open_fd has a reference - // however since that is a legacy mode, we focus on the - // prefered mode and have both delay the Put until CTS received -#else - info.handle = (uint32_t)handle_fd; - if (req->flags_internal & PSMI_REQ_FLAG_FASTPATH) { - psm3_am_reqq_add(AMREQUEST_SHORT, ptl, - epaddr, mq_handler_hidx, - args, 6, (void *)&info, - sizeof(info), NULL, 0); - } else { - psm3_amsh_short_request(ptl, epaddr, mq_handler_hidx, - args, 6, (void *)&info, - sizeof(info), 0); - } -#endif /* PSM_HAVE_PIDFD */ -#else // if no drm, error out as oneapi ipc handles don't work without drm - err = PSM2_INTERNAL_ERR; - goto fail; -#endif // defined(HAVE_DRM) || defined(HAVE_LIBDRM) } else -#endif // defined(PSM_ONEAPI) +#endif /* PSM_HAVE_GPU */ if (req->flags_internal & PSMI_REQ_FLAG_FASTPATH) { psm3_am_reqq_add(AMREQUEST_SHORT, ptl, epaddr, mq_handler_hidx, args, 5, NULL, 0, NULL, 0); @@ -2402,10 +2089,8 @@ amsh_mq_rndv(ptl_t *ptl, psm2_mq_t mq, psm2_mq_req_t req, mq->stats.tx_rndv_num++; // tx_rndv_bytes tabulated when get CTS -#ifdef PSM_ONEAPI -#if !defined(PSM_HAVE_PIDFD) || !(defined(HAVE_DRM) || defined(HAVE_LIBDRM)) +#ifdef PSM_HAVE_GPU fail: -#endif #endif return err; } @@ -2485,9 +2170,9 @@ amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, psm2_amarg_t args[3]; psm2_error_t err = PSM2_OK; int is_blocking = (req == NULL); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU int gpu_mem = 0; - int ep_supports_p2p = (1 << ((am_epaddr_t *) epaddr)->gpuid) & gpu_p2p_supported(); + int ep_supports_p2p = (1 << ((am_epaddr_t *) epaddr)->gpuid) & PSM3_GPU_P2P_SUPPORTED(); if (PSM3_IS_BUFFER_GPU_MEM(ubuf, len)) { gpu_mem = 1; @@ -2503,7 +2188,7 @@ amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, goto do_rendezvous; } } else -#endif +#endif /* PSM_HAVE_GPU */ /* SENDSYNC gets priority, assume not used for MPI_isend 
w/INJECT */ /* otherwise use eager for INJECT as caller is waiting */ if ((flags_user & (PSM2_MQ_FLAG_SENDSYNC|PSM2_MQ_FLAG_INJECT)) @@ -2531,14 +2216,12 @@ amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, * mq->completed_q */ req->flags_internal |= (flags_internal | PSMI_REQ_FLAG_IS_INTERNAL); } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU void *host_buf = NULL; req->is_buf_gpu_mem = gpu_mem; if (req->is_buf_gpu_mem) { -#ifdef PSM_CUDA - psmi_cuda_set_attr_sync_memops(ubuf); -#endif + PSM3_GPU_MARK_BUF_SYNCHRONOUS(ubuf); /* Use host buffer for blocking requests if GPU P2P is * unsupported between endpoints. @@ -2553,7 +2236,7 @@ amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, req->is_buf_gpu_mem = 0; } } -#endif +#endif /* PSM_HAVE_GPU */ err = amsh_mq_rndv(epaddr->ptlctl->ptl, mq, req, epaddr, tag, ubuf, len); @@ -2561,7 +2244,7 @@ amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, err = psm3_mq_wait_internal(&req); } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (err == PSM2_OK && host_buf) psmi_free(host_buf); #endif @@ -2664,15 +2347,15 @@ int psm3_get_kassist_mode(int first_ep) union psmi_envvar_val env_kassist; const char *PSM3_KASSIST_MODE_HELP = "PSM Shared memory kernel assist mode " "(cma-put, cma-get, none)"; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // GPU limits KASSIST choices to cma-get or none const char *PSM3_KASSIST_MODE_GPU_HELP = "PSM Shared memory kernel assist mode " "(cma-get, none)"; #endif if (!psm3_getenv("PSM3_KASSIST_MODE", -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - PSMI_IS_GPU_ENABLED? +#ifdef PSM_HAVE_GPU + PSM3_GPU_IS_ENABLED? PSM3_KASSIST_MODE_GPU_HELP:PSM3_KASSIST_MODE_HELP, #else PSM3_KASSIST_MODE_HELP, @@ -2683,8 +2366,8 @@ int psm3_get_kassist_mode(int first_ep) &env_kassist)) { char *s = env_kassist.e_str; if ( -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - ! PSMI_IS_GPU_ENABLED && +#ifdef PSM_HAVE_GPU + ! 
PSM3_GPU_IS_ENABLED && #endif strcasecmp(s, "cma-put") == 0) mode = PSM3_KASSIST_CMA_PUT; @@ -2783,12 +2466,7 @@ amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, ((am_epaddr_t *) epaddr)->pid = pid; ((am_epaddr_t *) epaddr)->gpuid = gpuid; } -#ifdef PSM_ONEAPI -#ifndef PSM_HAVE_PIDFD - if (PSMI_IS_GPU_ENABLED) - psm3_send_dev_fds(ptl_gen, epaddr); -#endif -#endif + PSM3_GPU_SHM_DEV_FDS_SEND(ptl, (struct am_epaddr *)epaddr); /* Rewrite args */ ptl->connect_incoming++; @@ -3030,18 +2708,6 @@ amsh_init(psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) amsh_fifo_getconfig(ptl); -#ifdef PSM_ONEAPI -#ifndef PSM_HAVE_PIDFD - ptl->ep->ze_ipc_socket = -1; - if (PSMI_IS_GPU_ENABLED) { - if ((err = psm3_ze_init_ipc_socket(ptl_gen)) != PSM2_OK) - goto fail; - if ((err = psm3_ze_init_fds()) != PSM2_OK) - goto fail; - } -#endif -#endif - memset(&ptl->amsh_empty_shortpkt, 0, sizeof(ptl->amsh_empty_shortpkt)); memset(&ptl->psmi_am_reqq_fifo, 0, sizeof(ptl->psmi_am_reqq_fifo)); @@ -3095,49 +2761,9 @@ amsh_init(psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) ctl->epaddr_stats_init = NULL; ctl->epaddr_stats_get = NULL; #endif -#ifdef PSM_CUDA - if (PSMI_IS_GPU_ENABLED) { - union psmi_envvar_val env_memcache_enabled; - psm3_getenv("PSM3_CUDA_MEMCACHE_ENABLED", - "PSM cuda ipc memhandle cache enabled (default is enabled)", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val) - 1, &env_memcache_enabled); - if (env_memcache_enabled.e_uint) { - union psmi_envvar_val env_memcache_size; - psm3_getenv("PSM3_CUDA_MEMCACHE_SIZE", - "Size of the cuda ipc memhandle cache ", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val) - CUDA_MEMHANDLE_CACHE_SIZE, &env_memcache_size); - if ((err = am_cuda_memhandle_cache_alloc(&ptl->memhandle_cache, - env_memcache_size.e_uint, &ep->mq->stats) != PSM2_OK)) - goto fail; - } - } -#endif -#ifdef PSM_ONEAPI - if (PSMI_IS_GPU_ENABLED) { - union psmi_envvar_val env_memcache_enabled; - psm3_getenv("PSM3_ONEAPI_MEMCACHE_ENABLED", - "PSM oneapi ipc memhandle cache enabled (default is enabled)", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val) - 1, &env_memcache_enabled); - if (env_memcache_enabled.e_uint) { - union psmi_envvar_val env_memcache_size; - psm3_getenv("PSM3_ONEAPI_MEMCACHE_SIZE", - "Size of the oneapi ipc memhandle cache ", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val) - ONEAPI_MEMHANDLE_CACHE_SIZE, &env_memcache_size); -#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) - if ((err = am_ze_memhandle_cache_alloc(&ptl->memhandle_cache, - env_memcache_size.e_uint, &ep->mq->stats) != PSM2_OK)) - goto fail; -#endif - } - } +#ifdef PSM_HAVE_GPU + if ((err = PSM3_GPU_SHM_INIT(ptl, &ep->mq->stats)) != PSM2_OK) + goto fail; #endif fail: return err; @@ -3235,15 +2861,6 @@ static psm2_error_t amsh_fini(ptl_t *ptl_gen, int force, uint64_t timeout_ns) goto fail; } -#ifdef PSM_ONEAPI -#ifndef PSM_HAVE_PIDFD - if (PSMI_IS_GPU_ENABLED && (err_seg = psm3_sock_detach(ptl_gen))) { - err = err_seg; - goto fail; - } -#endif -#endif - /* This prevents poll calls between now and the point where the endpoint is * deallocated to reference memory that disappeared */ ptl->repH.head = &ptl->amsh_empty_shortpkt; @@ -3252,20 +2869,9 @@ static psm2_error_t amsh_fini(ptl_t *ptl_gen, int force, uint64_t timeout_ns) if (ptl->am_ep) psmi_free(ptl->am_ep); -#ifdef PSM_CUDA - if (ptl->memhandle_cache) - am_cuda_memhandle_cache_free(ptl->memhandle_cache); - ptl->memhandle_cache = NULL; -#endif 
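/*
 * Illustrative sketch, not part of the patch: the CUDA branch removed from
 * amsh_mq_rndv() above (now behind PSM3_GPU_SHM_BUILD_RTS) exported a CUDA IPC
 * handle for the send buffer's allocation and carried the buffer's offset from
 * the allocation base in the RTS, because cuIpcGetMemHandle() only identifies
 * the base.  A minimal stand-alone version of both halves, using the CUDA
 * driver API directly (error handling trimmed):
 */
#include <cuda.h>
#include <stdint.h>

/* sender: describe a device buffer so a peer process can map it */
static void export_for_peer(CUdeviceptr buf, CUipcMemHandle *handle,
			    uint32_t *offset)
{
	CUdeviceptr base;

	cuMemGetAddressRange(&base, NULL, buf);	/* allocation start */
	*offset = (uint32_t)(buf - base);	/* lost by the handle itself */
	cuIpcGetMemHandle(handle, buf);		/* refers to the base */
}

/* receiver: map the peer's allocation and recover the exact pointer */
static CUdeviceptr import_from_peer(CUipcMemHandle handle, uint32_t offset)
{
	CUdeviceptr base;

	cuIpcOpenMemHandle(&base, handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
	return base + offset;	/* close later with cuIpcCloseMemHandle(base) */
}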
-#ifdef PSM_ONEAPI -#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) - if (ptl->memhandle_cache) - am_ze_memhandle_cache_free(ptl->memhandle_cache); -#endif - ptl->memhandle_cache = NULL; -#endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (PSMI_IS_GPU_ENABLED && ptl->gpu_bounce_buf) +#ifdef PSM_HAVE_GPU + PSM3_GPU_SHM_FINALIZE(ptl); + if (PSM3_GPU_IS_ENABLED && ptl->gpu_bounce_buf) PSM3_GPU_HOST_FREE(ptl->gpu_bounce_buf); #endif return PSM2_OK; diff --git a/prov/psm3/psm3/ptl_am/psm_am_internal.h b/prov/psm3/psm3/ptl_am/psm_am_internal.h index 0796dbee9e9..091ae3e3edb 100644 --- a/prov/psm3/psm3/ptl_am/psm_am_internal.h +++ b/prov/psm3/psm3/ptl_am/psm_am_internal.h @@ -58,22 +58,9 @@ #include "am_config.h" #include "../psm_am_internal.h" -#ifdef PSM_CUDA -#include "am_cuda_memhandle_cache.h" -#endif -#ifdef PSM_ONEAPI -#include "am_oneapi_memhandle_cache.h" -#endif #define AMSH_DIRBLOCK_SIZE 128 -#ifdef PSM_ONEAPI -/* sock_connected_state state definitions */ -#define ZE_SOCK_NOT_CONNECTED 0 -#define ZE_SOCK_DEV_FDS_SENT 1 -#define ZE_SOCK_DEV_FDS_SENT_AND_RECD 2 -#endif - typedef struct am_epaddr { /* @@ -84,15 +71,8 @@ struct am_epaddr { uint16_t shmidx; uint16_t return_shmidx; -#ifdef PSM_ONEAPI -#ifdef PSM_HAVE_PIDFD - int pidfd; -#else - int num_peer_fds; - int peer_fds[MAX_ZE_DEVICES]; - int sock_connected_state; - int sock; -#endif +#ifdef PSM_HAVE_GPU + union am_epaddr_gpu_specific gpu_specific; #endif uint32_t cstate_outgoing:3; uint32_t cstate_incoming:3; @@ -105,6 +85,26 @@ struct am_epaddr { uint32_t gpuid:4; } am_epaddr_t; +struct am_ptl_connection_req { + int isdone; + int op; /* connect or disconnect */ + int numep; + int numep_left; + int phase; + + int *epid_mask; + const psm2_epid_t *epids; /* input epid list */ + psm2_epaddr_t *epaddr; + psm2_error_t *errors; /* inout errors */ + + /* Used for connect/disconnect */ + psm2_amarg_t args[6]; +}; + +#define AM_PTL_OP_CONNECT 0 +#define AM_PTL_OP_DISCONNECT 1 +#define AM_PTL_OP_ABORT 2 + /* Up to NSHORT_ARGS are supported via am_pkt_short_t; the remaining arguments are passed using space in am_pkt_bulk_t. One additional argument is added for passing the internal ptl_am handler index. */ @@ -466,13 +466,9 @@ struct ptl_am { struct am_ctl_nodeinfo *self_nodeinfo; /* our local advertized shm */ struct am_ctl_nodeinfo *am_ep; /* local array w/copy of each peer's info */ -#ifdef PSM_CUDA - am_cuda_memhandle_cache_t memhandle_cache; -#endif -#ifdef PSM_ONEAPI - am_ze_memhandle_cache_t memhandle_cache; -#endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU + union ptl_am_gpu_specific gpu_specific; + void *memhandle_cache; #define AMSH_GPU_BOUNCE_BUF_SZ (256*1024) void *gpu_bounce_buf; // for H to D #endif diff --git a/prov/psm3/psm3/ptl_am/ptl.c b/prov/psm3/psm3/ptl_am/ptl.c index a6af3c356ac..b37ac175357 100644 --- a/prov/psm3/psm3/ptl_am/ptl.c +++ b/prov/psm3/psm3/ptl_am/ptl.c @@ -59,13 +59,6 @@ #include "psm_am_internal.h" #include "cmarw.h" -#ifdef PSM_CUDA -#include "am_cuda_memhandle_cache.h" -#endif -#ifdef PSM_ONEAPI -#include "am_oneapi_memhandle_cache.h" -#endif - #ifdef PSM_FI /* * fault injection for psm3_cma_get() and psm3_cma_put(). 
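/*
 * Illustrative sketch, not part of the patch: the PSM3_GPU_* macros and the
 * *_gpu_specific unions used above hide the per-vendor (CUDA / oneAPI Level
 * Zero) code behind a single dispatch point defined elsewhere in this patch.
 * That real definition is not shown in these hunks, so every name below is
 * hypothetical; this is only the general shape of such a HAL.
 */
#include <stddef.h>

struct hypothetical_gpu_ops {
	int  (*register_hostmem)(void *buf, size_t len);
	void (*unregister_hostmem)(const void *buf);
	int  (*shm_init)(void *ptl, void *stats);
	void (*shm_finalize)(void *ptl);
};

/* one table per backend, selected once at initialization time */
extern struct hypothetical_gpu_ops hypothetical_cuda_ops;
extern struct hypothetical_gpu_ops hypothetical_ze_ops;
static struct hypothetical_gpu_ops *gpu_ops;	/* NULL when no GPU */

/* call sites stay vendor-neutral, as in the rewritten code above */
#define HYPOTHETICAL_GPU_REGISTER_HOSTMEM(buf, len) \
	(gpu_ops ? gpu_ops->register_hostmem((buf), (len)) : 0)
#define HYPOTHETICAL_GPU_UNREGISTER_HOSTMEM(buf) \
	do { if (gpu_ops) gpu_ops->unregister_hostmem(buf); } while (0)

/* per-connection vendor state likewise collapses into one union; the member
 * layout here is only a guess based on the fields removed above */
union hypothetical_gpu_epaddr_state {
	struct { int pidfd; int sock; } ze;
	/* other backends add their own members */
};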
@@ -110,63 +103,9 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, _HFI_VDBG("[shm][rndv][recv] req=%p dest=%p len=%d tok=%p\n", req, req->req_data.buf, req->req_data.recv_msglen, tok); -#ifdef PSM_CUDA - if (req->cuda_ipc_handle_attached) { - - CUdeviceptr cuda_ipc_dev_ptr = am_cuda_memhandle_acquire( - ptl->memhandle_cache, - req->rts_sbuf - req->cuda_ipc_offset, - (CUipcMemHandle*)&req->cuda_ipc_handle, - req->rts_peer->epid); - cuda_ipc_dev_ptr = cuda_ipc_dev_ptr + req->cuda_ipc_offset; - /* cuMemcpy into the receive side buffer - * based on its location */ - if (req->is_buf_gpu_mem) { - PSM3_GPU_MEMCPY_DTOD(req->req_data.buf, cuda_ipc_dev_ptr, - req->req_data.recv_msglen); - PSM3_GPU_SYNCHRONIZE_MEMCPY(); - } else { - PSM3_GPU_MEMCPY_DTOH(req->req_data.buf, cuda_ipc_dev_ptr, - req->req_data.recv_msglen); - } +#ifdef PSM_HAVE_GPU + if (PSM3_GPU_SHM_RTSMATCH(ptl, req)) { gpu_ipc_send_completion = 1; - am_cuda_memhandle_release(ptl->memhandle_cache, - cuda_ipc_dev_ptr - req->cuda_ipc_offset); - req->cuda_ipc_handle_attached = 0; - goto send_cts; - } -#endif -#ifdef PSM_ONEAPI - if (req->ze_handle_attached) { - void *buf_ptr = am_ze_memhandle_acquire( - ptl->memhandle_cache, - req->rts_sbuf - req->ze_ipc_offset, req->ze_handle, - req->rts_peer, -#ifndef PSM_HAVE_PIDFD - req->ze_device_index, req->ze_alloc_id, -#else - 0, req->ze_alloc_id, -#endif - req->ze_alloc_type); - psmi_assert_always(buf_ptr != NULL); - buf_ptr = (uint8_t *)buf_ptr + req->ze_ipc_offset; - /* zeMemcpy into the receive side buffer - * based on its location */ - _HFI_VDBG("Copying src %p (offset 0x%x) dst %p msg_len %u\n", - buf_ptr, req->ze_ipc_offset, - req->req_data.buf, req->req_data.recv_msglen); - if (req->is_buf_gpu_mem) { - PSM3_GPU_MEMCPY_DTOD(req->req_data.buf, buf_ptr, - req->req_data.recv_msglen); - PSM3_GPU_SYNCHRONIZE_MEMCPY(); - } else { - PSM3_GPU_MEMCPY_DTOH(req->req_data.buf, buf_ptr, - req->req_data.recv_msglen); - } - gpu_ipc_send_completion = 1; - am_ze_memhandle_release(ptl->memhandle_cache, - (uint8_t *)buf_ptr - req->ze_ipc_offset); - req->ze_handle_attached = 0; goto send_cts; } #endif @@ -175,7 +114,7 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, if ((ptl->kassist_mode & PSM3_KASSIST_GET) && req->req_data.recv_msglen > 0 && (pid = psm3_epaddr_pid(epaddr))) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* If the buffer on the send side is on the host, * we alloc a bounce buffer, use kassist and then * do a cuMemcpy if the buffer on the recv side @@ -213,7 +152,7 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, /* Cuda library has recent optimizations where they do * not guarantee synchronus nature for Host to Device * copies for msg sizes less than 64k. The event record - * and synchronize calls are to guarentee completion. + * and synchronize calls are to guarantee completion. */ PSM3_GPU_SYNCHRONIZE_MEMCPY(); } else { @@ -230,7 +169,7 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, goto fail_cma; psmi_assert_always(nbytes == req->req_data.recv_msglen); } -#else +#else /* PSM_HAVE_GPU */ /* cma can be done in handler context or not. 
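/*
 * Illustrative sketch, not part of the patch: the cma-get / cma-put
 * kernel-assist modes used by the rendezvous path above are built on Linux
 * cross-memory attach (process_vm_readv/process_vm_writev), which copies
 * directly between the address spaces of two cooperating processes given the
 * peer's pid and a pointer valid in the peer.  A bare-bones "get" equivalent:
 */
#define _GNU_SOURCE
#include <sys/uio.h>
#include <sys/types.h>

/* pull nbytes from remote_addr in process pid into our local buffer */
static ssize_t cma_get_example(pid_t pid, const void *remote_addr,
			       void *local_buf, size_t nbytes)
{
	struct iovec local = { .iov_base = local_buf, .iov_len = nbytes };
	struct iovec remote = { .iov_base = (void *)remote_addr,
				.iov_len = nbytes };

	/* may return fewer bytes than requested; callers loop as needed */
	return process_vm_readv(pid, &local, 1, &remote, 1, 0);
}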
*/ size_t nbytes; #ifdef PSM_FI @@ -243,7 +182,7 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, if (nbytes == -1) goto fail_cma; psmi_assert_always(nbytes == req->req_data.recv_msglen); -#endif +#endif /* PSM_HAVE_GPU */ cma_succeed = 1; } @@ -330,7 +269,7 @@ psm3_am_mq_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, default:{ void *sreq = (void *)(uintptr_t) args[3].u64w0; uintptr_t sbuf = (uintptr_t) args[4].u64w0; -#ifdef PSM_ONEAPI +#ifdef PSM_HAVE_GPU psmi_assert(narg == 5 || narg == 6); #else psmi_assert(narg == 5); @@ -343,38 +282,13 @@ psm3_am_mq_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, req->rts_peer = tok->tok.epaddr_incoming; req->ptl_req_ptr = sreq; req->rts_sbuf = sbuf; -#ifdef PSM_CUDA - /* Payload in RTS would mean an IPC handle has been +#ifdef PSM_HAVE_GPU + /* Payload in RTS would mean a GPU IPC handle has been * sent. This would also mean the sender has to * send from a GPU buffer */ - if (buf && len > 0) { - req->cuda_ipc_handle = *((CUipcMemHandle*)buf); - req->cuda_ipc_handle_attached = 1; - req->cuda_ipc_offset = args[2].u32w0; - } -#endif -#ifdef PSM_ONEAPI - /* Payload in RTS would mean an IPC handle has been - * sent. This would also mean the sender has to - * send from a GPU buffer - */ - if (buf && len > 0) { - am_oneapi_ze_ipc_info_t info; - - psmi_assert(narg == 6); - info = (am_oneapi_ze_ipc_info_t)buf; - req->ze_handle = info->handle; - req->ze_alloc_type = info->alloc_type; - req->ze_handle_attached = 1; - req->ze_ipc_offset = args[2].u32w0; -#ifndef PSM_HAVE_PIDFD - req->ze_device_index = args[5].u32w0; - req->ze_alloc_id = args[5].u32w1; -#else - req->ze_alloc_id = args[5].u64w0; -#endif - } + if (buf && len > 0) + PSM3_GPU_SHM_PROCESS_RTS(req, buf, len, narg, args); #endif if (rc == MQ_RET_MATCH_OK) /* we are in handler context, issue a reply */ @@ -397,7 +311,7 @@ psm3_am_mq_handler_data(void *toki, psm2_amarg_t *args, int narg, void *buf, psm2_epaddr_t epaddr = (psm2_epaddr_t) tok->tok.epaddr_incoming; psm2_mq_req_t req = mq_eager_match(tok->mq, epaddr, 0); /* using seqnum 0 */ psmi_assert_always(req != NULL); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU psm3_mq_handle_data(tok->mq, req, args[2].u32w0, buf, len, 0, NULL); #else psm3_mq_handle_data(tok->mq, req, args[2].u32w0, buf, len); @@ -419,34 +333,19 @@ psm3_am_mq_handler_rtsmatch(void *toki, psm2_amarg_t *args, int narg, void *buf, ptl_t *ptl = tok->ptl; psm2_mq_req_t sreq = (psm2_mq_req_t) (uintptr_t) args[0].u64w0; -#ifdef PSM_CUDA - /* If send side req has a cuda ipc handle attached then as soon as we - * get a CTS, we can assume the data has been copied and receiver now - * has a reference for the ipc handle for any receiver handle caching - */ - if (sreq->cuda_ipc_handle_attached) { - sreq->cuda_ipc_handle_attached = 0; - sreq->mq->stats.tx_shm_bytes += sreq->req_data.send_msglen; - sreq->mq->stats.tx_rndv_bytes += sreq->req_data.send_msglen; - psm3_mq_handle_rts_complete(sreq); - return; - } -#endif -#ifdef PSM_ONEAPI - /* If send side req has an ipc handle attached then as soon as we +#ifdef PSM_HAVE_GPU + /* If send side req has a GPU IPC handle attached then as soon as we * get a CTS, we can assume the data has been copied and receiver now * has a reference for the ipc handle for any receiver handle caching */ - if (sreq->ze_handle_attached) { - psm3_put_ipc_handle(sreq->req_data.buf - sreq->ze_ipc_offset, - sreq->ipc_handle); - sreq->ze_handle_attached = 0; + if (PSM3_GPU_SHM_PROCESS_CTS(sreq)) { 
sreq->mq->stats.tx_shm_bytes += sreq->req_data.send_msglen; sreq->mq->stats.tx_rndv_bytes += sreq->req_data.send_msglen; psm3_mq_handle_rts_complete(sreq); return; } #endif + void *dest = (void *)(uintptr_t) args[2].u64w0; uint32_t msglen = args[3].u32w0; psm2_amarg_t rarg[1]; diff --git a/prov/psm3/psm3/ptl_ips/ips_config.h b/prov/psm3/psm3/ptl_ips/ips_config.h index 1a253aa4a23..e66a651ed6c 100644 --- a/prov/psm3/psm3/ptl_ips/ips_config.h +++ b/prov/psm3/psm3/ptl_ips/ips_config.h @@ -69,6 +69,10 @@ #define IPS_PROTO_FLOW_CREDITS_MAX_DEFAULT 128 #define IPS_PROTO_FLOW_CREDITS_STEP_DEFAULT 16 +#define IPS_PROTO_FLOW_CREDITS_RC_MIN_DEFAULT 768 +#define IPS_PROTO_FLOW_CREDITS_RC_MAX_DEFAULT 960 +#define IPS_PROTO_FLOW_CREDITS_RC_MAX 2048 + /* Send retransmission */ #define IPS_PROTO_SPIO_RETRY_US_DEFAULT 2 /* in uS */ @@ -116,7 +120,7 @@ #define IPS_FAULTINJ_UFFD_REGISTER 1000 /* 1 every X uffd REGISTER ENOMEM */ #endif #endif /* PSM_HAVE_REG_MR */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #define IPS_FAULTINJ_GDRMMAP 100 /* 1 every X GPU pin and mmap ENOMEM */ #define IPS_FAULTINJ_GPU_REG_MR 100 /* 1 every X GPU reg_mr */ #endif diff --git a/prov/psm3/psm3/ptl_ips/ips_expected_proto.h b/prov/psm3/psm3/ptl_ips/ips_expected_proto.h index 221706ade25..3e65d85d864 100644 --- a/prov/psm3/psm3/ptl_ips/ips_expected_proto.h +++ b/prov/psm3/psm3/ptl_ips/ips_expected_proto.h @@ -124,19 +124,14 @@ struct ips_protoexp { /* services pend_getreqsq and pend_err_chk_rdma_resp */ struct psmi_timer timer_getreqs; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU STAILQ_HEAD(ips_tid_get_gpupend, /* pending GPU transfers */ ips_tid_get_request) gpupend_getreqsq; struct ips_gpu_hostbuf_mpool_cb_context gpu_hostbuf_recv_cfg; struct ips_gpu_hostbuf_mpool_cb_context gpu_hostbuf_small_recv_cfg; mpool_t gpu_hostbuf_pool_recv; mpool_t gpu_hostbuf_pool_small_recv; -#endif -#ifdef PSM_CUDA - CUstream cudastream_recv; -#elif defined(PSM_ONEAPI) - /* Will not be usd if psm3_oneapi_immed_async_copy */ - ze_command_queue_handle_t cq_recvs[MAX_ZE_DEVICES]; + union ips_protoexp_gpu_specific gpu_specific; #endif }; @@ -194,7 +189,7 @@ struct ips_tid_send_desc { uint8_t reserved:7; #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* As size of gpu_hostbuf is less than equal to window size, * there is a guarantee that the maximum number of host bufs we * would need to attach to a tidsendc would be 2 @@ -239,7 +234,7 @@ struct ips_tid_recv_desc { uint32_t tidflow_nswap_gen; psmi_seqnum_t tidflow_genseq; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU struct ips_gpu_hostbuf *gpu_hostbuf; uint8_t is_ptr_gpu_backed; #endif @@ -282,7 +277,7 @@ struct ips_tid_get_request { uint32_t tidgr_bytesdone; uint32_t tidgr_flags; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU int gpu_hostbuf_used; uint32_t tidgr_gpu_bytesdone; STAILQ_HEAD(ips_tid_getreq_gpu_hostbuf_pend, /* pending exp. 
sends */ @@ -363,7 +358,7 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, ips_tid_session_list *tid_list, uint32_t tid_list_size); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // buffers for GPU send copy pipeline struct ips_gpu_hostbuf* psm3_ips_allocate_send_chb(struct ips_proto *proto, uint32_t nbytes, int allow_temp); diff --git a/prov/psm3/psm3/ptl_ips/ips_proto.c b/prov/psm3/psm3/ptl_ips/ips_proto.c index 372dd75ea56..3705b052672 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto.c @@ -86,11 +86,6 @@ // to play safe we set max credit to 16384 #define IPS_MAX_CREDIT 16384 -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) -uint32_t gpudirect_rdma_send_limit; -uint32_t gpudirect_rdma_recv_limit; -#endif - static void ctrlq_init(struct ips_ctrlq *ctrlq, struct ips_proto *proto); #ifdef PSM_HAVE_REG_MR @@ -98,18 +93,19 @@ static psm2_error_t proto_sdma_init(struct ips_proto *proto); #endif static psm2_error_t ips_proto_register_stats(struct ips_proto *proto); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU void psmi_gpu_hostbuf_alloc_func(int is_alloc, void *context, void *obj) { struct ips_gpu_hostbuf *icb = (struct ips_gpu_hostbuf *)obj; if (is_alloc) { PSM3_GPU_HOSTBUF_LAZY_INIT(icb); + icb->host_buf = NULL; } else { PSM3_GPU_HOSTBUF_DESTROY(icb); } return; } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ static int parse_flow_credits(const char *str, size_t errstr_size, char errstr[], @@ -191,18 +187,32 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, { /* Number of credits per flow */ union psmi_envvar_val env_flow_credits; +#ifdef PSM_VERBS + int min_credits = IPS_PROTOEXP_FLAG_RDMA_QP(proto->ep->rdmamode) == IPS_PROTOEXP_FLAG_RDMA_USER_RC ? + IPS_PROTO_FLOW_CREDITS_RC_MIN_DEFAULT : IPS_PROTO_FLOW_CREDITS_MIN_DEFAULT; + int max_credits = IPS_PROTOEXP_FLAG_RDMA_QP(proto->ep->rdmamode) == IPS_PROTOEXP_FLAG_RDMA_USER_RC ? + IPS_PROTO_FLOW_CREDITS_RC_MAX_DEFAULT : IPS_PROTO_FLOW_CREDITS_MAX_DEFAULT; int tvals[3] = { - min(IPS_PROTO_FLOW_CREDITS_MIN_DEFAULT, num_of_send_desc), - min(IPS_PROTO_FLOW_CREDITS_MAX_DEFAULT, num_of_send_desc), - IPS_PROTO_FLOW_CREDITS_STEP_DEFAULT - }; + min(min_credits, num_of_send_desc), + min(max_credits, num_of_send_desc), + IPS_PROTO_FLOW_CREDITS_STEP_DEFAULT + }; +#else + int tvals[3] = { + min(IPS_PROTO_FLOW_CREDITS_MIN_DEFAULT, num_of_send_desc), + min(IPS_PROTO_FLOW_CREDITS_MAX_DEFAULT, num_of_send_desc), + IPS_PROTO_FLOW_CREDITS_STEP_DEFAULT + }; +#endif char fcredits_def[32]; snprintf(fcredits_def, sizeof(fcredits_def), "%d:%d:%d", tvals[0], tvals[1], tvals[2]); (void)psm3_getenv_range("PSM3_FLOW_CREDITS", "Number of unacked packets (credits) per flow in ", "Specified as min:max:adjust where min and max is the range of credits,\n" - "and adjust is the adjustment amount for adjusting credits", + "and adjust is the adjustment amount for adjusting credits. For PSM3_RDMA=3,\n" + "adjust is ignored. 
Data send pauses when number of unacked packets is beyond\n" + "max credits, and send resumes when the number is below min credits", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR_TUPLES, (union psmi_envvar_val)fcredits_def, (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, @@ -446,7 +456,7 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, goto fail; } if (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSM3_GPU_PREPARE_DTOH_MEMCPYS(proto); #endif if ((err = psm3_ips_protoexp_init(proto, protoexp_flags, @@ -474,59 +484,56 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, &proto->proto_am))) goto fail; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - is_gpudirect_enabled = psmi_parse_gpudirect(); - gpudirect_rdma_send_limit = psmi_parse_gpudirect_rdma_send_limit(0); - gpudirect_rdma_recv_limit = psmi_parse_gpudirect_rdma_recv_limit(0); +#ifdef PSM_HAVE_GPU + psm3_gpu_is_gpudirect_enabled = psmi_parse_gpudirect(); + psm3_gpu_gpudirect_rdma_send_limit = psmi_parse_gpudirect_rdma_send_limit(0); + psm3_gpu_gpudirect_rdma_recv_limit = psmi_parse_gpudirect_rdma_recv_limit(0); +#ifdef PSM_HAVE_RNDV_MOD if (psmi_hal_has_cap(PSM_HAL_CAP_GPUDIRECT)) - is_driver_gpudirect_enabled = 1; + psm3_gpu_is_driver_gpudirect_enabled = 1; /* Check for mismatch between PSM3 and RV module */ -#ifdef PSM_CUDA - if (psmi_hal_has_cap(PSM_HAL_CAP_INTEL_GPU) && - !psmi_hal_has_cap(PSM_HAL_CAP_NVIDIA_GPU)) - is_driver_gpudirect_enabled = 0; + if (! psmi_hal_has_cap(PSM3_GPU_HAL_CAP_EXPECTED)) + psm3_gpu_is_driver_gpudirect_enabled = 0; #else - if (psmi_hal_has_cap(PSM_HAL_CAP_NVIDIA_GPU) && - !psmi_hal_has_cap(PSM_HAL_CAP_INTEL_GPU)) - is_driver_gpudirect_enabled = 0; + psm3_gpu_is_driver_gpudirect_enabled = 0; #endif - if (! is_gpudirect_enabled) { - gpudirect_rdma_send_limit = gpudirect_rdma_recv_limit = 0; - } else if (PSMI_IS_GPU_DISABLED) { -#ifdef PSM_CUDA - // should not happen since we don't dynamically disable CUDA - _HFI_INFO("WARNING: Non-CUDA application, PSM3_GPUDIRECT option ignored\n"); -#else - // should not happen since we don't dynamically disable ONEAPI_ZE - _HFI_INFO("WARNING: Non-ONEAPI_ZE application, PSM3_GPUDIRECT option ignored\n"); -#endif - is_gpudirect_enabled = 0; - gpudirect_rdma_send_limit = gpudirect_rdma_recv_limit = 0; - } else if (!device_support_gpudirect()) { + if (! psm3_gpu_is_gpudirect_enabled) { + psm3_gpu_gpudirect_rdma_send_limit = psm3_gpu_gpudirect_rdma_recv_limit = 0; + } else if (! PSM3_GPU_IS_ENABLED) { + // should not happen since we test psmi_parse_gpudirect earlier + // and it will trigger initialization of the proper GPU. Then + // we provide no disabling of the GPU per EP. + _HFI_INFO("WARNING: Non-GPU application, PSM3_GPUDIRECT option ignored\n"); + psm3_gpu_is_gpudirect_enabled = 0; + psm3_gpu_gpudirect_rdma_send_limit = psm3_gpu_gpudirect_rdma_recv_limit = 0; + } else if (!PSM3_GPU_GPUDIRECT_SUPPORTED()) { _HFI_INFO("WARNING: GPU device does not support GPU Direct, PSM3_GPUDIRECT option ignored\n"); - is_gpudirect_enabled = 0; - gpudirect_rdma_send_limit = gpudirect_rdma_recv_limit = 0; - } else if ( - PSMI_IS_DRIVER_GPUDIRECT_DISABLED) { + psm3_gpu_is_gpudirect_enabled = 0; + psm3_gpu_gpudirect_rdma_send_limit = psm3_gpu_gpudirect_rdma_recv_limit = 0; + } else if (! 
PSM3_GPU_IS_DRIVER_GPUDIRECT_ENABLED) { +#ifdef PSM_HAVE_RNDV_MOD + char buf[100]; + PSM3_GPU_RV_CAP_STRING(buf, sizeof(buf), PSM3_GPU_RV_CAPABILITY_EXPECTED); err = psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, -#ifdef PSM_CUDA - "Unable to start run, PSM3_GPUDIRECT requires rv module with CUDA support.\n"); + "Unable to start run, PSM3_GPUDIRECT requires rv module with %s support.\n", buf); #else - "Unable to start run, PSM3_GPUDIRECT requires rv module with ONEAPI_ZE support.\n"); + err = psm3_handle_error(PSMI_EP_NORETURN, + PSM2_INTERNAL_ERR, + "Unable to start run, PSM3_GPUDIRECT requires rv module with GPU support.\n"); #endif } else if (!(protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED)) { // only GDR Copy and GPU Send DMA allowed - gpudirect_rdma_send_limit = gpudirect_rdma_recv_limit = 0; + psm3_gpu_gpudirect_rdma_send_limit = psm3_gpu_gpudirect_rdma_recv_limit = 0; } else { - if (gpudirect_rdma_send_limit) + if (psm3_gpu_gpudirect_rdma_send_limit) proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND; - if (gpudirect_rdma_recv_limit) + if (psm3_gpu_gpudirect_rdma_recv_limit) proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV; } // from here forward can't use psmi_parse_gpudirect, - // must use is_gpudirect_enabled + // must use psm3_gpu_is_gpudirect_enabled /* The following cases need to be handled: * 1) GPU DIRECT is turned off but GDR COPY is turned on by the user or @@ -536,15 +543,15 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, * 2) GPU DIRECT is on but GDR COPY is turned off by the user - Leave *. this config as it is. */ - if (!is_gpudirect_enabled) - is_gdr_copy_enabled = gdr_copy_limit_send = - gdr_copy_limit_recv = 0; + if (!psm3_gpu_is_gpudirect_enabled) + psm3_gpu_is_gdr_copy_enabled = psm3_gpu_gdr_copy_limit_send = + psm3_gpu_gdr_copy_limit_recv = 0; /* technically this is not needed since we only consider GDRCopy Send * for TINY, SHORT, and single MTU RTS payload. But does no harm. 
*/ - gdr_copy_limit_send = min(gdr_copy_limit_send, proto->ep->mtu); + psm3_gpu_gdr_copy_limit_send = min(psm3_gpu_gdr_copy_limit_send, proto->ep->mtu); - if (PSMI_IS_GPU_ENABLED && + if (PSM3_GPU_IS_ENABLED && (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED)) { struct psmi_rlimit_mpool rlim = GPU_HOSTBUFFER_LIMITS; uint32_t maxsz, chunksz, max_elements; @@ -613,7 +620,7 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, proto->gpu_hostbuf_send_cfg.bufsz, proto->gpu_prefetch_limit); } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #ifdef PSM_HAVE_REG_MR // we allocate MR cache here (as opposed to in protoexp) because @@ -629,7 +636,7 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, uint32_t default_cache_size_mb; // in megabytes uint32_t cache_pri_entries; uint64_t cache_pri_size; // in bytes -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t cache_gpu_pri_size; // in bytes union psmi_envvar_val env_mr_cache_gpu_evict; #endif @@ -707,7 +714,7 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)default_cache_entries, &env_mr_cache_entries); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // cache_gpu_pri_size only used to confirm RV GPU cache size // Without GPU Direct we will not register any GPU MRs // if we have GPU Direct w/o RDMA, no priority pin/MRs except @@ -716,17 +723,17 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, // grow pri_entries to account for it // Note cache_pri_size == 0 if rdmamode not enabled cache_gpu_pri_size = 0; - if (PSMI_IS_GPU_ENABLED && is_gpudirect_enabled) { - if (gpudirect_rdma_send_limit || gpudirect_rdma_recv_limit) + if (PSM3_GPU_IS_ENABLED && psm3_gpu_is_gpudirect_enabled) { + if (psm3_gpu_gpudirect_rdma_send_limit || psm3_gpu_gpudirect_rdma_recv_limit) cache_gpu_pri_size = cache_pri_size; - if (gdr_copy_limit_send || gdr_copy_limit_recv) { + if (psm3_gpu_gdr_copy_limit_send || psm3_gpu_gdr_copy_limit_recv) { // min of one extra for GDRCopy - // largest recv with GDR copy is gdr_copy_limit_recv - // largest send with GDR copy is gdr_copy_limit_send + // largest recv with GDR copy is psm3_gpu_gdr_copy_limit_recv + // largest send with GDR copy is psm3_gpu_gdr_copy_limit_send cache_gpu_pri_size += ROUNDUP64P2(max(proto->epinfo.ep_mtu, - max(gdr_copy_limit_recv, - gdr_copy_limit_send)), + max(psm3_gpu_gdr_copy_limit_recv, + psm3_gpu_gdr_copy_limit_send)), PSMI_GPU_PAGESIZE); } psm3_getenv("PSM3_RV_GPU_CACHE_EVICT", @@ -737,13 +744,13 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, psm3_gpu_cache_evict = (uint64_t)env_mr_cache_gpu_evict.e_uint * 1024; } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ proto->ep->mr_cache = proto->mr_cache = psm3_verbs_alloc_mr_cache(proto->ep, env_mr_cache_entries.e_uint, proto->ep->mr_cache_mode, env_mr_cache_size_mb.e_uint, cache_pri_entries, cache_pri_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , cache_gpu_pri_size #endif ); @@ -763,7 +770,7 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, _HFI_INFO("WARNING: Send DMA requires an MR Cache, disabling PSM3_SDMA\n"); proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = ~0U; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU proto->iovec_gpu_thresh_eager = proto->iovec_gpu_thresh_eager_blocking = ~0U; #endif @@ -771,39 +778,34 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, // without a real cache, Send DMA makes no sense psmi_assert(proto->ep->mr_cache_mode || 
proto->iovec_thresh_eager == ~0); psmi_assert(proto->ep->mr_cache_mode || proto->iovec_thresh_eager_blocking == ~0U); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // without a real cache, GPU Direct Send DMA makes no sense psmi_assert(proto->ep->mr_cache_mode || proto->iovec_gpu_thresh_eager == ~0); psmi_assert(proto->ep->mr_cache_mode || proto->iovec_gpu_thresh_eager_blocking == ~0U); #endif #endif /* PSM_HAVE_REG_MR */ -#ifdef PSM_CUDA - _HFI_DBG("Cuda %d GPU Direct support: driver %d GPU device %d\n", - is_cuda_enabled, is_driver_gpudirect_enabled, _device_support_gpudirect); -#elif defined(PSM_ONEAPI) - _HFI_DBG("OneAPI ZE %d GPU Direct support: driver %d GPU device %d\n", - is_oneapi_ze_enabled, is_driver_gpudirect_enabled, _device_support_gpudirect); -#endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU + _HFI_DBG("GPU ("PSM3_GPU_TYPES") Enabled %d (%s) GPU Direct support: driver %d GPU device %d\n", + PSM3_GPU_IS_ENABLED, PSM3_GPU_TYPE, psm3_gpu_is_driver_gpudirect_enabled, PSM3_GPU_GPUDIRECT_SUPPORTED()); _HFI_DBG("GDR Copy: %d limit send=%u recv=%u gpu_rndv=%u GPU RDMA flags=0x%x limit send=%u recv=%u\n", - is_gdr_copy_enabled, gdr_copy_limit_send, gdr_copy_limit_recv, + psm3_gpu_is_gdr_copy_enabled, psm3_gpu_gdr_copy_limit_send, psm3_gpu_gdr_copy_limit_recv, psm3_gpu_thresh_rndv, proto->flags & (IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV |IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND), - gpudirect_rdma_send_limit, gpudirect_rdma_recv_limit); + psm3_gpu_gpudirect_rdma_send_limit, psm3_gpu_gpudirect_rdma_recv_limit); #ifdef PSM_HAVE_REG_MR _HFI_DBG("send dma thresh: %u %u GPU send DMA thresh %u %u\n", proto->iovec_thresh_eager, proto->iovec_thresh_eager_blocking, proto->iovec_gpu_thresh_eager, proto->iovec_gpu_thresh_eager_blocking); #endif -#else /* PSM_CUDA || PSM_ONEAPI */ +#else /* PSM_HAVE_GPU */ #ifdef PSM_HAVE_REG_MR _HFI_DBG("send dma thresh: %u %u\n", proto->iovec_thresh_eager, proto->iovec_thresh_eager_blocking); #endif -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #ifdef PSM_HAVE_REG_MR _HFI_DBG("rdma: %u MR cache %u\n", proto->ep->rdmamode, proto->ep->mr_cache_mode); @@ -971,9 +973,7 @@ psm3_ips_proto_fini(struct ips_proto *proto) { psm2_error_t err = PSM2_OK; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) PSM3_GPU_SHUTDOWN_DTOH_MEMCPYS(proto); -#endif if ((err = psm3_ips_ibta_fini(proto))) goto fail; @@ -1038,8 +1038,8 @@ proto_sdma_init(struct ips_proto *proto) } } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (! is_gpudirect_enabled +#ifdef PSM_HAVE_GPU + if (! 
psm3_gpu_is_gpudirect_enabled || !psmi_hal_has_cap(PSM_HAL_CAP_GPUDIRECT_SDMA)) env_sdma.e_uint = 0; else @@ -1064,7 +1064,7 @@ proto_sdma_init(struct ips_proto *proto) env_hfiegr.e_uint; } } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ return err; } @@ -1200,7 +1200,7 @@ psm3_ips_proto_timer_ctrlq_callback(struct psmi_timer *timer, uint64_t t_cyc_exp cqe->msg_scb.flow, &cqe->msg_scb, cqe->msg_scb.cksum, 0, PSMI_TRUE, have_cksum, cqe->msg_scb.cksum[0] -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , 0 #endif ); @@ -1309,7 +1309,7 @@ psm3_ips_proto_send_ctrl_message(struct ips_flow *flow, uint8_t message_type, err = psmi_hal_transfer_frame(proto, flow, ctrlscb, payload, paylen, PSMI_TRUE, have_cksum, ctrlscb->cksum[0] -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , 0 #endif ); @@ -1492,7 +1492,7 @@ psm3_ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed) scb->ips_lrh.flags & IPS_SEND_FLAG_PKTCKSUM, scb->cksum[0] -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , IS_TRANSFER_BUF_GPU_MEM(scb) #endif )) @@ -1529,9 +1529,9 @@ psm3_ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed) #endif PSM2_LOG_PKT_STRM(PSM2_LOG_TX,&scb->ips_lrh,"PKT_STRM: payload_size=%d err: %d", scb->payload_size, err); - } else if (err == PSM2_TCP_DATA_SENT) { + } else if (err == PSM2_RELIABLE_DATA_SENT) { // no credits and timers - // TDB - implement credits for TCP + // TDB - implement credits for reliable send GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR); /* perf stats */ scb->scb_flags &= ~IPS_SEND_FLAG_PENDING; num_sent++; @@ -2036,7 +2036,7 @@ ips_proto_register_stats(struct ips_proto *proto) "also carry all or a portion of the message payload.\n" "Large Rendezvous messages may be broken into multiple " "window size chunks each with a separate CTS.\n" -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU "When sending from a GPU application buffer the " "mechanisms include:\n" " - gdrcopy - Direct GPU copy via mmaping GPU memory\n" @@ -2051,7 +2051,7 @@ ips_proto_register_stats(struct ips_proto *proto) "application buffer when it posts the receive. 
" "With the exception of RDMA, all receive mechanisms " "involve some form of copy.\n" -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU "When receiving into a GPU application buffer the " "mechanisms include:\n" " - gdrcopy - Direct GPU copy via mmaping GPU memory\n" @@ -2068,7 +2068,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("tiny_cpu_isend_bytes", "Tiny message bytes sent async from a CPU buffer", &proto->strat_stats.tiny_cpu_isend_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("tiny_gdrcopy_isend", "Tiny messages sent async from a GPU buffer via GDR copy", &proto->strat_stats.tiny_gdrcopy_isend), @@ -2088,7 +2088,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("tiny_cpu_send_bytes", "Tiny message bytes sent sync from a CPU buffer", &proto->strat_stats.tiny_cpu_send_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("tiny_gdrcopy_send", "Tiny messages sent sync from a GPU buffer via GDR copy", &proto->strat_stats.tiny_gdrcopy_send), @@ -2114,7 +2114,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("tiny_sysbuf_recv_bytes", "Tiny message bytes received into a bounce buffer", &proto->strat_stats.tiny_sysbuf_recv_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("tiny_gdrcopy_recv", "Tiny messages received into an application GPU buffer via GDR copy", &proto->strat_stats.tiny_gdrcopy_recv), @@ -2141,7 +2141,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("short_dma_cpu_isend_bytes", "Short message bytes sent async from a CPU buffer via send DMA", &proto->strat_stats.short_dma_cpu_isend_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("short_gdrcopy_isend", "Short messages sent async from a GPU buffer via GDR copy", &proto->strat_stats.short_gdrcopy_isend), @@ -2173,7 +2173,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("short_dma_cpu_send_bytes", "Short message bytes sent sync from a CPU buffer via send DMA", &proto->strat_stats.short_dma_cpu_send_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("short_gdrcopy_send", "Short messages sent sync from a GPU buffer via GDR copy", &proto->strat_stats.short_gdrcopy_send), @@ -2206,7 +2206,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("short_sysbuf_recv_bytes", "Short message bytes received into a bounce buffer", &proto->strat_stats.short_sysbuf_recv_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("short_gdrcopy_recv", "Short messages received into an application GPU buffer via GDR copy", &proto->strat_stats.short_gdrcopy_recv), @@ -2233,7 +2233,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("eager_dma_cpu_isend_bytes", "Eager message bytes sent async from a CPU buffer via send DMA", &proto->strat_stats.eager_dma_cpu_isend_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("eager_cuCopy_isend", "Eager messages sent async from a GPU buffer via GPU copy", &proto->strat_stats.eager_cuCopy_isend), @@ -2259,7 +2259,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("eager_dma_cpu_send_bytes", "Eager message bytes sent sync from a CPU buffer via send DMA", &proto->strat_stats.eager_dma_cpu_send_bytes), -#if 
defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("eager_cuCopy_send", "Eager messages sent sync from a GPU buffer via GPU copy", &proto->strat_stats.eager_cuCopy_send), @@ -2286,7 +2286,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("eager_sysbuf_recv_bytes", "Eager message bytes received into a bounce buffer", &proto->strat_stats.eager_sysbuf_recv_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("eager_gdrcopy_recv", "Eager messages received into an application GPU buffer via GDR copy", &proto->strat_stats.eager_gdrcopy_recv), @@ -2307,7 +2307,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("rndv_cpu_isend_bytes", "Rendezvous message bytes sent async from a CPU buffer", &proto->strat_stats.rndv_cpu_isend_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("rndv_gpu_isend", "Rendezvous messages sent async from a GPU buffer", &proto->strat_stats.rndv_gpu_isend), @@ -2321,7 +2321,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("rndv_cpu_send_bytes", "Rendezvous message bytes sent sync from a CPU buffer", &proto->strat_stats.rndv_cpu_send_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("rndv_gpu_send", "Rendezvous messages sent sync from a GPU buffer", &proto->strat_stats.rndv_gpu_send), @@ -2342,7 +2342,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("rndv_rts_sysbuf_recv_bytes", "RTS packet message bytes received into an bounce buffer", &proto->strat_stats.rndv_rts_sysbuf_recv_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("rndv_rts_cuCopy_recv", "RTS packet messages received into an application GPU buffer via GPU copy", &proto->strat_stats.rndv_rts_cuCopy_recv), @@ -2363,7 +2363,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("rndv_long_cpu_recv_bytes", "Long Data rendezvous message bytes received into an application CPU buffer", &proto->strat_stats.rndv_long_cpu_recv_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("rndv_long_cuCopy_recv", "Long Data rendezvous messages received into an application GPU buffer via GPU copy", &proto->strat_stats.rndv_long_cuCopy_recv), @@ -2390,7 +2390,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("rndv_long_dma_cpu_send_bytes", "Long Data rendezvous message bytes sent from a CPU buffer via send DMA", &proto->strat_stats.rndv_long_dma_cpu_send_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("rndv_long_cuCopy_send", "Long Data rendezvous messages sent from a GPU buffer via GPU copy", &proto->strat_stats.rndv_long_cuCopy_send), @@ -2417,7 +2417,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("rndv_rdma_cpu_recv_bytes", "RDMA rendezvous message bytes received direct into a CPU buffer", &proto->strat_stats.rndv_rdma_cpu_recv_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("rndv_rdma_gdr_recv", "RDMA rendezvous messages received direct into a GPU buffer", &proto->strat_stats.rndv_rdma_gdr_recv), @@ -2437,7 +2437,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("rndv_rdma_cpu_send_bytes", "RDMA rendezvous message bytes sent from a CPU buffer via send RDMA", &proto->strat_stats.rndv_rdma_cpu_send_bytes), -#if 
defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("rndv_rdma_gdr_send", "RDMA rendezvous messages sent from a GPU buffer via send RDMA", &proto->strat_stats.rndv_rdma_gdr_send), diff --git a/prov/psm3/psm3/ptl_ips/ips_proto.h b/prov/psm3/psm3/ptl_ips/ips_proto.h index 47bf7a50c1d..8c81c4dcb49 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto.h +++ b/prov/psm3/psm3/ptl_ips/ips_proto.h @@ -94,8 +94,8 @@ struct ips_epinfo { uint8_t ep_lmc; enum psm3_ibv_rate ep_link_rate; uint16_t ep_sl; /* PSM3_NIC_SL only when path record not used */ - uint32_t ep_mtu; // PSM payload after potential hdr & PSM3_MTU decrease - // or TCP increase beyond wire size + uint32_t ep_mtu; // PSM payload after potential hdr & PSM3_MTU adjustment + // for TCP and RC it can be beyond wire size uint16_t ep_pkey; /* PSM3_PKEY only when path record not used */ uint64_t ep_timeout_ack; /* PSM3_ERRCHK_TIMEOUT if no path record */ uint64_t ep_timeout_ack_max; @@ -356,7 +356,7 @@ struct ips_proto { uint32_t iovec_thresh_eager_blocking; #endif #ifdef PSM_HAVE_REG_MR -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint32_t iovec_gpu_thresh_eager; uint32_t iovec_gpu_thresh_eager_blocking; #endif @@ -431,21 +431,12 @@ struct ips_proto { void *opp_ctxt; struct opp_api opp_fn; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU struct ips_gpu_hostbuf_mpool_cb_context gpu_hostbuf_send_cfg; struct ips_gpu_hostbuf_mpool_cb_context gpu_hostbuf_small_send_cfg; mpool_t gpu_hostbuf_pool_send; mpool_t gpu_hostbuf_pool_small_send; -#endif - -#ifdef PSM_CUDA - CUstream cudastream_send; -#elif defined(PSM_ONEAPI) - /* Will not be used if psm3_oneapi_immed_async_copy */ - ze_command_queue_handle_t cq_sends[MAX_ZE_DEVICES]; -#endif - -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + union ips_proto_gpu_specific gpu_specific; unsigned gpu_prefetch_limit; #endif /* @@ -656,6 +647,21 @@ struct ips_epaddr { uint32_t use_max_inline_data; uint8_t rc_connected; + + // MR for flow->recv_seq_num + struct ibv_mr *recv_seq_mr; + // remote flow->recv_seq_num addr and rkey + uint64_t remote_recv_seq_addr; + uint32_t remote_recv_seq_rkey; + // psn num of remote flow->recv_seq_num + uint32_t remote_recv_psn; + // MR for remote flow->recv_seq_num storage + struct ibv_mr *remote_recv_psn_mr; + // indicare whether we have outstanding RDMA Read for + // remote flow->recv_seq_num + uint8_t remote_seq_outstanding; + // congestion control count + uint16_t cc_count; #endif /* USE_RC */ } verbs; #endif /* PSM_VERBS */ @@ -838,7 +844,7 @@ MOCK_DCL_EPILOGUE(psm3_ips_ibta_init); psm2_error_t psm3_ips_ibta_fini(struct ips_proto *proto); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_ALWAYS_INLINE( uint32_t ips_gpu_next_window(uint32_t max_window, uint32_t offset, uint32_t len)) diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_connect.h b/prov/psm3/psm3/ptl_ips/ips_proto_connect.h index 51f1f9affcb..a586f9dff1a 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_connect.h +++ b/prov/psm3/psm3/ptl_ips/ips_proto_connect.h @@ -123,15 +123,21 @@ struct ips_connect_reqrep { uint8_t reserved[16]; // 64b aligned // fields below can be zero depending on rdmamode - // TBD - we could combine the RDMA=1 and RDMA=2,3 - // sets of fields below into a union and save space - // or make room for more reserved space - - // For rndv module connection establishment, PSM3_RDMA=1 - // zero if no rndv mod RDMA - union ibv_gid gid; // sender's gid - uint32_t rv_index; // senders process index - uint32_t resv; // alignment 
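
Taken together, the new verbs fields above (recv_seq_mr, remote_recv_seq_addr/rkey, remote_recv_psn, remote_seq_outstanding) and the recv_addr/recv_rkey exchanged at connect time let a PSM3_RDMA=3 sender learn the peer's flow->recv_seq_num via RDMA Read and throttle on a PSN window rather than per-packet credit bookkeeping. The following is a minimal sketch of that pause/resume rule only; flow_credit_state, psn_distance and flow_can_send are hypothetical names, not the provider's structures, and 24-bit PSN arithmetic is assumed.

/* Sketch only: gate sending on the gap between our next send PSN and the
 * peer's last observed recv_seq_num (refreshed via RDMA Read). Limits
 * follow the PSM3_FLOW_CREDITS min:max pattern; 768:960 are the new RC
 * defaults added in this patch. */
#include <stdint.h>

struct flow_credit_state {
	uint32_t next_send_psn;   /* PSN of the next packet we would post */
	uint32_t remote_recv_psn; /* latest peer recv_seq_num we have seen */
	uint32_t credits_min;     /* resume threshold, e.g. 768 for RC */
	uint32_t credits_max;     /* pause threshold, e.g. 960 for RC */
	int      paused;
};

static inline uint32_t psn_distance(uint32_t a, uint32_t b)
{
	return (a - b) & 0xffffff;   /* 24-bit PSN space, wrap-safe */
}

static inline int flow_can_send(struct flow_credit_state *s)
{
	uint32_t inflight = psn_distance(s->next_send_psn, s->remote_recv_psn);

	if (inflight >= s->credits_max)
		s->paused = 1;           /* too many unacked packets: pause */
	else if (inflight <= s->credits_min)
		s->paused = 0;           /* backlog drained below min: resume */
	return !s->paused;
}

With the RC defaults above, sending would pause once roughly 960 packets are unacked and resume once the backlog drains below 768, matching the behavior described in the PSM3_FLOW_CREDITS help text for PSM3_RDMA=3.
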
+ union { + struct { + // For rndv module connection establishment, PSM3_RDMA=1 + // zero if no rndv mod RDMA + union ibv_gid gid; // sender's gid + uint32_t rv_index; // senders process index + uint32_t resv; // alignment + } rv; + struct { + // For PSM3_RDMA=3 only + uint64_t recv_addr; + uint32_t recv_rkey; + uint8_t resv[12]; + } urc; // user space RC + }; // For user space RC QP connection establishment // only set for USE_RC with PSM3_RDMA=2 or 3 diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_expected.c b/prov/psm3/psm3/ptl_ips/ips_proto_expected.c index 4cc1ebc701b..8b69b9d34d8 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_expected.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto_expected.c @@ -102,7 +102,7 @@ static psm2_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc); #endif // PSM_HAVE_RDMA static psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static void psmi_gpu_run_prefetcher(struct ips_protoexp *protoexp, struct ips_tid_send_desc *tidsendc); @@ -252,8 +252,8 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, #endif #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (PSMI_IS_GPU_ENABLED) { +#ifdef PSM_HAVE_GPU + if (PSM3_GPU_IS_ENABLED) { struct psmi_rlimit_mpool rlim = GPU_HOSTBUFFER_LIMITS; uint32_t maxsz, chunksz, max_elements; uint32_t pool_num_obj_max_total; @@ -323,7 +323,7 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, return err; fail: -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (protoexp != NULL && protoexp->gpu_hostbuf_pool_recv != NULL) psm3_mpool_destroy(protoexp->gpu_hostbuf_pool_recv); if (protoexp != NULL && protoexp->gpu_hostbuf_pool_small_recv != NULL) @@ -346,9 +346,8 @@ psm2_error_t psm3_ips_protoexp_fini(struct ips_protoexp *protoexp) { psm2_error_t err = PSM2_OK; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if(PSMI_IS_GPU_ENABLED && - !(protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) { +#ifdef PSM_HAVE_GPU + if (PSM3_GPU_IS_ENABLED) { psm3_mpool_destroy(protoexp->gpu_hostbuf_pool_small_recv); psm3_mpool_destroy(protoexp->gpu_hostbuf_pool_recv); PSM3_GPU_SHUTDOWN_HTOD_MEMCPYS(protoexp); @@ -483,12 +482,12 @@ psm3_ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp, getreq->tidgr_bytesdone = 0; getreq->tidgr_flags = flags; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if ((req->is_buf_gpu_mem && !(protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) || ((req->is_buf_gpu_mem && (protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV) && - (length > gpudirect_rdma_recv_limit + (length > psm3_gpu_gpudirect_rdma_recv_limit || length & 0x03 || (uintptr_t)buf & 0x03 )))) { getreq->gpu_hostbuf_used = 1; @@ -505,14 +504,14 @@ psm3_ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp, #endif protoexp->proto->strat_stats.rndv_rdma_cpu_recv++; protoexp->proto->strat_stats.rndv_rdma_cpu_recv_bytes += length; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU } } #endif /* nbytes is the bytes each channel should transfer. 
*/ count = ((ips_epaddr_t *) epaddr)->msgctl->ipsaddr_count; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (req->is_buf_gpu_mem) nbytes = PSMI_ALIGNUP((length + count - 1) / count, PSMI_GPU_PAGESIZE); else @@ -632,7 +631,7 @@ psm3_ips_protoexp_send_tid_grant(struct ips_tid_recv_desc *tidrecvc) } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU void psm3_ips_deallocate_send_chb(struct ips_gpu_hostbuf* chb, int reset) { if (chb->is_tempbuf) { @@ -672,7 +671,7 @@ ips_protoexp_tidsendc_complete(struct ips_tid_send_desc *tidsendc) } #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (req->gpu_hostbuf_used) { if (tidsendc->gpu_num_buf == 1) { tidsendc->gpu_hostbuf[0]->bytes_read += @@ -1237,7 +1236,7 @@ int ips_protoexp_handle_immed_data(struct ips_proto *proto, uint64_t conn_ref, } #endif if (_HFI_PDBG_ON) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (tidrecvc->is_ptr_gpu_backed) _HFI_PDBG_DUMP_GPU_ALWAYS(tidrecvc->buffer, len); else @@ -1269,7 +1268,7 @@ int ips_protoexp_handle_immed_data(struct ips_proto *proto, uint64_t conn_ref, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static psm2_error_t psmi_gpu_reclaim_hostbufs(struct ips_tid_get_request *getreq) @@ -1565,7 +1564,7 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, _HFI_MMDBG("tidsendc created userbuf %p buffer %p length %u\n", tidsendc->userbuf, tidsendc->buffer, tidsendc->length); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* Matching on previous prefetches and initiating next prefetch */ struct ips_gpu_hostbuf *chb = NULL, *chb_next = NULL; psm2_chb_match_type_t rc = PSMI_GPU_CONTINUE; @@ -1638,7 +1637,7 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, protoexp->proto->strat_stats.rndv_rdma_gdr_send++; protoexp->proto->strat_stats.rndv_rdma_gdr_send_bytes += tid_list->tsess_length; } else -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ { protoexp->proto->strat_stats.rndv_rdma_cpu_send++; protoexp->proto->strat_stats.rndv_rdma_cpu_send_bytes += tid_list->tsess_length; @@ -1716,7 +1715,7 @@ psm2_error_t ips_tid_issue_rdma_write(struct ips_tid_send_desc *tidsendc) // no need to register again err = PSM2_OK; } else if ( -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU ! 
tidsendc->mqreq->gpu_hostbuf_used && #endif // separate MR cache's per EP, so this confirms we have the same EP @@ -1730,7 +1729,7 @@ psm2_error_t ips_tid_issue_rdma_write(struct ips_tid_send_desc *tidsendc) _HFI_MMDBG("CTS send chunk register send: %p %u bytes\n", tidsendc->buffer , tidsendc->length); tidsendc->mr = psm3_verbs_reg_mr(proto->mr_cache, 1, tidsendc->buffer, tidsendc->length, IBV_ACCESS_RDMA -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU | (PSM3_GPU_ADDR_SEND_MR(tidsendc->mqreq) ?IBV_ACCESS_IS_GPU_ADDR:0) #endif @@ -1775,7 +1774,7 @@ psm2_error_t ips_tid_issue_rdma_write(struct ips_tid_send_desc *tidsendc) } if (err == PSM2_OK) { if (_HFI_PDBG_ON) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (tidsendc->mqreq->is_buf_gpu_mem && !tidsendc->mqreq->gpu_hostbuf_used) _HFI_PDBG_DUMP_GPU_ALWAYS(tidsendc->buffer, tidsendc->tid_list.tsess_length); else @@ -1803,7 +1802,7 @@ psm2_error_t ips_tid_issue_rdma_write(struct ips_tid_send_desc *tidsendc) } if (err == PSM2_OK) { if (_HFI_PDBG_ON) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (tidsendc->mqreq->is_buf_gpu_mem && !tidsendc->mqreq->gpu_hostbuf_used) _HFI_PDBG_DUMP_GPU_ALWAYS(tidsendc->buffer, tidsendc->tid_list.tsess_length); else @@ -1840,12 +1839,12 @@ static psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc) { psm2_error_t err = PSM2_OK; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU struct ips_protoexp *protoexp = tidsendc->protoexp; #endif _HFI_MMDBG("ips_tid_send_exp\n"); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU struct ips_gpu_hostbuf *chb, *chb_next; uint32_t offset_in_chb, i; // wait for async copies into needed prefetcher chb's to finish @@ -2005,7 +2004,7 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp, ips_scb_t *grantscb; #ifdef PSM_VERBS psm2_mq_req_t req = getreq->tidgr_req; -#elif defined(PSM_CUDA) || defined(PSM_ONEAPI) +#elif defined(PSM_HAVE_GPU) psm2_mq_req_t req = getreq->tidgr_req; #endif #if defined(PSM_VERBS) @@ -2046,7 +2045,7 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp, tidrecvc->mr = NULL; // be safe,but should be NULL since clear on release #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (req->is_buf_gpu_mem) tidrecvc->is_ptr_gpu_backed = !getreq->gpu_hostbuf_used; else @@ -2095,17 +2094,17 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp, getreq->tidgr_offset); tidrecvc->gpu_hostbuf = NULL; } -#else // PSM_CUDA || PSM_ONEAPI +#else /* PSM_HAVE_GPU */ tidrecvc->buffer = (void *)((uintptr_t) getreq->tidgr_lbuf + getreq->tidgr_offset); -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ #if defined(PSM_SOCKETS) && PSMI_HAL_INST_CNT == 1 psmi_assert_always(0); // should not get here #elif defined(PSM_VERBS) // separate MR cache's per EP, so this confirms we have the same EP if ( -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU ! 
getreq->gpu_hostbuf_used && #endif req->mr && req->mr->cache == proto->mr_cache) { @@ -2115,12 +2114,12 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp, _HFI_MMDBG("CTS chunk register recv: %p %u bytes\n", tidrecvc->buffer, nbytes_this); tidrecvc->mr = psm3_verbs_reg_mr(proto->mr_cache, 1, tidrecvc->buffer, nbytes_this, IBV_ACCESS_RDMA|IBV_ACCESS_REMOTE_WRITE -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - | (PSM3_GPU_ADDR_RECV_MR(tidrecvc, getreq)?IBV_ACCESS_IS_GPU_ADDR:0) +#ifdef PSM_HAVE_GPU + | (PSM3_GPU_ADDR_RECV_MR(tidrecvc, getreq->gpu_hostbuf_used)?IBV_ACCESS_IS_GPU_ADDR:0) #endif ); if (! tidrecvc->mr) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (chb) psm3_mpool_put(chb); #endif @@ -2220,7 +2219,7 @@ ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current) #endif #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* due to unaligned recv using hostbuf, must always do this */ { /* Before processing pending TID requests, first try to free up @@ -2289,7 +2288,7 @@ ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current) } } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (getreq->gpu_hostbuf_used) { /* If this is a large transfer, we may be able to * start reclaiming before all of the data is sent. */ @@ -2322,7 +2321,7 @@ ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current) * async cuda copies to fill it, so the extra CTS is minimal * impact to the sender. */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU psm2_mq_req_t req = getreq->tidgr_req; if (req->is_buf_gpu_mem){ if (((getreq->tidgr_offset + nbytes_this) < @@ -2392,7 +2391,7 @@ ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current) getreq->tidgr_length); if (getreq->tidgr_offset == getreq->tidgr_length) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (getreq->gpu_hostbuf_used) { /* this completes the tid xfer setup. 
move to the pending cuda ops queue, @@ -2446,7 +2445,7 @@ ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current) } #ifdef PSM_HAVE_RDMA -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static void psmi_cudamemcpy_tid_to_device(struct ips_tid_recv_desc *tidrecvc) { @@ -2463,7 +2462,7 @@ void psmi_cudamemcpy_tid_to_device(struct ips_tid_recv_desc *tidrecvc) tidrecvc->gpu_hostbuf = NULL; ips_tid_pendtids_timer_callback(&tidrecvc->getreq->tidgr_protoexp->timer_getreqs,0); } -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ #endif // PSM_HAVE_RDMA #ifdef PSM_HAVE_RDMA @@ -2479,7 +2478,7 @@ psm2_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc) psmi_assert(getreq != NULL); psmi_assert(tidrecvc->state == TIDRECVC_STATE_BUSY); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (tidrecvc->gpu_hostbuf) psmi_cudamemcpy_tid_to_device(tidrecvc); #endif @@ -2502,7 +2501,7 @@ psm2_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc) psm3_ips_tf_deallocate(&protoexp->tfc, tidrecvc->rdescid._desc_idx, 1); if (getreq->tidgr_bytesdone == getreq->tidgr_length) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* if cuda, we handle callbacks when the cuda xfer is done */ if (!getreq->gpu_hostbuf_used) { if (getreq->tidgr_callback) diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_mq.c b/prov/psm3/psm3/ptl_ips/ips_proto_mq.c index a4f71ab8e5e..286b59507ca 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_mq.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto_mq.c @@ -149,7 +149,7 @@ int ips_proto_mq_eager_complete(void *reqp, uint32_t nbytes) * completion notification sent to the sender, this is the only place * where send side chb's can be freed and put back into the mpool. */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU struct ips_gpu_hostbuf *chb; if (req->gpu_hostbuf_used) { while (!STAILQ_EMPTY(&req->sendreq_prefetch)) { @@ -202,8 +202,8 @@ ips_shortcpy(void *vdest, const void *vsrc, uint32_t nchars)) unsigned char *dest = vdest; const unsigned char *src = vsrc; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (PSMI_IS_GPU_ENABLED && (PSMI_IS_GPU_MEM(vdest) || PSMI_IS_GPU_MEM(vsrc))) { +#ifdef PSM_HAVE_GPU + if ((PSM3_IS_GPU_MEM(vdest) || PSM3_IS_GPU_MEM(vsrc))) { PSM3_GPU_MEMCPY(vdest, vsrc, nchars); return; } @@ -223,7 +223,7 @@ ips_shortcpy(void *vdest, const void *vsrc, uint32_t nchars)) return; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_ALWAYS_INLINE( void ips_shortcpy_host_mem(void *vdest, const void *vsrc, uint32_t nchars)) @@ -356,7 +356,7 @@ ips_ptl_mq_eager(struct ips_proto *proto, psm2_mq_req_t req, } #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (req->is_buf_gpu_mem) { // flags will get handled in pio transfer_frame // but use cuMemcpy instead of GDRCopy @@ -369,7 +369,7 @@ ips_ptl_mq_eager(struct ips_proto *proto, psm2_mq_req_t req, // TBD USER_BUF_GPU only useful for RTS ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; } -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ buf += pktlen; offset += pktlen; @@ -463,7 +463,7 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, // small synchronous payload is sent in RTS itself // CTS becomes the synchronous ACK if (len <= flow->frag_size && -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU !req->is_buf_gpu_mem && #endif (psmi_hal_has_cap(PSM_HAL_CAP_NON_DW_PKT_SIZE) || !(len & 0x3))) { @@ -476,7 +476,7 @@ ips_ptl_mq_rndv(struct ips_proto 
*proto, psm2_mq_req_t req, req->send_msgoff = 0; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* Used to indicate to the receiver that the send * is issued on a device buffer. This helps the * receiver select TID instead of using eager buffers. @@ -492,7 +492,7 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, (len > GPUDIRECT_THRESH_RV)) || ((proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND) && req->is_buf_gpu_mem && - (len > gpudirect_rdma_send_limit))) { + (len > psm3_gpu_gpudirect_rdma_send_limit))) { /* send from intermediate host buffer */ _HFI_VDBG("send from intermediate host buffer\n"); struct ips_gpu_hostbuf *chb; @@ -566,7 +566,7 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, // length, etc below) // // register buffer we will use as source for RDMA Write - // for PSM_CUDA/PSM_ONEAPI, a group of host bounce buffers may be used above + // for GPU, a group of host bounce buffers may be used above // ips_scb_buffer catches when RTS contains the data, in which case no // need for memory registration. While unlkely we also skip // registration for zero length sync messages @@ -576,14 +576,14 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, && proto->protoexp /* expected tid recieve enabled */ && ips_epaddr_rdma_connected(ipsaddr) && !req->mr -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - && (!PSMI_IS_GPU_ENABLED || len > GPUDIRECT_THRESH_RV) +#ifdef PSM_HAVE_GPU + && (!PSM3_GPU_IS_ENABLED || len > GPUDIRECT_THRESH_RV) && ! req->gpu_hostbuf_used #endif ) { req->mr = psm3_verbs_reg_mr(proto->mr_cache, 0, req->req_data.buf, req->req_data.send_msglen, IBV_ACCESS_RDMA -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU | (req->is_buf_gpu_mem?IBV_ACCESS_IS_GPU_ADDR:0) #endif ); @@ -610,7 +610,7 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, return err; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static inline int psm3_is_needed_rendezvous(struct ips_proto *proto, uint32_t len, uint32_t flags_user) @@ -623,7 +623,7 @@ int psm3_is_needed_rendezvous(struct ips_proto *proto, uint32_t len, return 0; } -#endif //PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ psm2_error_t @@ -637,9 +637,9 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user ips_epaddr_t *ipsaddr; ips_scb_t *scb; psm2_mq_req_t req; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU int gpu_mem = 0; -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ req = psm3_mq_req_alloc(mq, MQE_TYPE_SEND); if_pf(req == NULL) @@ -666,12 +666,12 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user req->req_data.tag = *tag; req->req_data.context = context; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU req->is_buf_gpu_mem = PSM3_IS_BUFFER_GPU_MEM(ubuf, len); req->gpu_hostbuf_used = 0; if (req->is_buf_gpu_mem) { gpu_mem = 1; - PSM3_MARK_BUF_SYNCHRONOUS(ubuf); + PSM3_GPU_MARK_BUF_SYNCHRONOUS(ubuf); if (psm3_is_needed_rendezvous(proto, len, 0)) goto do_rendezvous; } @@ -692,7 +692,7 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag); const void *user_buffer = ubuf; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (!req->is_buf_gpu_mem) { mq_copy_tiny_host_mem((uint32_t *) &scb->ips_lrh.hdr_data, (uint32_t *) user_buffer, len); @@ -707,7 +707,7 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t 
flags_user * memcpy to move data between HFI resources * and the GPU */ - if (len <= gdr_copy_limit_send && + if (len <= psm3_gpu_gdr_copy_limit_send && NULL != (user_buffer = psmi_hal_gdr_convert_gpu_to_host_addr( (unsigned long)ubuf, len, 0, proto->ep))) { mq_copy_tiny_host_mem((uint32_t *) &scb->ips_lrh.hdr_data, @@ -763,11 +763,11 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user #ifdef PSM_HAVE_REG_MR int used_send_dma = 0; #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (req->is_buf_gpu_mem) { // TBD USER_BUF_GPU only useful for RTS ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; - if (len <= gdr_copy_limit_send && + if (len <= psm3_gpu_gdr_copy_limit_send && NULL != (user_buffer = psmi_hal_gdr_convert_gpu_to_host_addr( (unsigned long)ubuf, len , 0, proto->ep))) { /* init req so ips_proto_mq_eager_complete can unmap */ @@ -802,7 +802,7 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user } } } else -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ { #ifdef PSM_HAVE_REG_MR if (len > proto->iovec_thresh_eager) { @@ -894,7 +894,7 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user } else if (len <= mq->rndv_nic_thresh) { req->send_msgoff = 0; req->rts_peer = (psm2_epaddr_t) ipsaddr; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (req->is_buf_gpu_mem) { #ifdef PSM_HAVE_REG_MR // TBD - no upper bound for send DMA here @@ -913,7 +913,7 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user proto->strat_stats.eager_cuCopy_isend_bytes += len; } } else -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ { #ifdef PSM_HAVE_REG_MR // TBD - no upper bound for send DMA here @@ -943,7 +943,7 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user len, tag->tag[0], tag->tag[1], tag->tag[2], req); } else { /* skip eager accounting below */ do_rendezvous: -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (gpu_mem) { proto->strat_stats.rndv_gpu_isend++; proto->strat_stats.rndv_gpu_isend_bytes += len; @@ -969,7 +969,7 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user mq->stats.tx_num++; mq->stats.tx_eager_num++; mq->stats.tx_eager_bytes += len; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (gpu_mem) { mq->stats.tx_eager_gpu_num++; mq->stats.tx_eager_gpu_bytes += len; @@ -992,7 +992,7 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, ips_epaddr_t *ipsaddr; ips_scb_t *scb; -#if defined(PSM_CUDA) || defined (PSM_ONEAPI) +#ifdef PSM_HAVE_GPU int gpu_mem = 0; #endif @@ -1010,10 +1010,10 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, psmi_assert(proto->msgflowid < EP_FLOW_LAST); -#if defined(PSM_CUDA) || defined (PSM_ONEAPI) +#ifdef PSM_HAVE_GPU gpu_mem = PSM3_IS_BUFFER_GPU_MEM(ubuf, len); if (gpu_mem) { - PSM3_MARK_BUF_SYNCHRONOUS(ubuf); + PSM3_GPU_MARK_BUF_SYNCHRONOUS(ubuf); if (psm3_is_needed_rendezvous(proto, len, flags)) goto do_rendezvous; } @@ -1033,7 +1033,7 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, ipsaddr->msgctl->mq_send_seqnum); ipsaddr->msgctl->mq_send_seqnum++; ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU const void *user_buffer = ubuf; if (!gpu_mem) { mq_copy_tiny_host_mem((uint32_t *) &scb->ips_lrh.hdr_data, @@ -1049,7 +1049,7 @@ 
psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, * memcpy to move data between HFI resources * and the GPU */ - if (len <= gdr_copy_limit_send && + if (len <= psm3_gpu_gdr_copy_limit_send && NULL != (user_buffer = psmi_hal_gdr_convert_gpu_to_host_addr( (unsigned long)ubuf, len, 0, proto->ep))) { mq_copy_tiny_host_mem((uint32_t *) &scb->ips_lrh.hdr_data, @@ -1091,13 +1091,13 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag); const void * user_buffer = ubuf; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU int converted = 0; if (gpu_mem) { // TBD USER_BUF_GPU only useful for RTS ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; /* will use PIO */ - if (len <= gdr_copy_limit_send && + if (len <= psm3_gpu_gdr_copy_limit_send && NULL != (user_buffer = psmi_hal_gdr_convert_gpu_to_host_addr( (unsigned long)ubuf, len, 0, proto->ep))) { converted = 1; @@ -1131,7 +1131,7 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, } } } else -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ { #ifdef PSM_HAVE_REG_MR if (len > proto->iovec_thresh_eager_blocking @@ -1209,7 +1209,7 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, psmi_assert(flow->transfer == PSM_TRANSFER_PIO); /* PIO and now have a bounce buffer */ /* copy to bounce buffer */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (!gpu_mem || converted) { // host address ips_shortcpy_host_mem @@ -1245,7 +1245,7 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, if (err > PSM2_OK_NO_PROGRESS) return err; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU req->gpu_hostbuf_used = 0; if (gpu_mem) { req->is_buf_gpu_mem = 1; @@ -1269,7 +1269,7 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, req->is_buf_gpu_mem = 0; #else { -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ #ifdef PSM_HAVE_REG_MR // TBD - no upper bound for send DMA here // non-priority MR and will fallback if can't register @@ -1319,7 +1319,7 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, req->flags_user = flags; req->flags_internal |= PSMI_REQ_FLAG_IS_INTERNAL; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (gpu_mem) { req->is_buf_gpu_mem = 1; proto->strat_stats.rndv_gpu_send++; @@ -1348,7 +1348,7 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, mq->stats.tx_num++; mq->stats.tx_eager_num++; mq->stats.tx_eager_bytes += len; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (gpu_mem) { mq->stats.tx_eager_gpu_num++; mq->stats.tx_eager_gpu_bytes += len; @@ -1379,7 +1379,7 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) psmi_assert(req->req_data.recv_msglen == req->req_data.send_msglen); req->mq->stats.rx_user_num++; req->mq->stats.rx_user_bytes += req->req_data.recv_msglen; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* Cases where we do not use TIDs: * 0) Received full message as payload to RTS, CTS is just an ack * 1) Recv on a host buffer, Send on a gpu buffer and len is <= 3 bytes @@ -1398,7 +1398,7 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) || ! 
ips_epaddr_rdma_connected((ips_epaddr_t *) epaddr) #endif ) { -#else // PSM_CUDA || PSM_ONEAPI +#else /* PSM_HAVE_GPU */ if (req->recv_msgoff >= req->req_data.recv_msglen || proto->protoexp == NULL /* no expected tid recieve */ #ifdef PSM_HAVE_REG_MR @@ -1406,7 +1406,7 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) #endif || req->req_data.recv_msglen <= proto->mq->rndv_nic_thresh /* less rv theshold */ ) { /* no expected tid recieve */ -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ #ifdef PSM_HAVE_REG_MR //do_long_data: #endif @@ -1415,7 +1415,7 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) /* there is no order requirement, try to push CTS request * directly, if fails, then queue it for later try. */ _HFI_VDBG("pushing CTS recv off %u len %u" -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU " rGPU %u sGPU %u" #endif " rv thresh %u" @@ -1424,7 +1424,7 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) #endif " epaddr %p RDMA %u\n", req->recv_msgoff, req->req_data.recv_msglen, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU req->is_buf_gpu_mem, req->is_sendbuf_gpu_mem, #endif proto->mq->rndv_nic_thresh, @@ -1435,7 +1435,7 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) if (req->recv_msgoff < req->req_data.recv_msglen) { // RTS did not have the message as payload -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (req->is_buf_gpu_mem) { proto->strat_stats.rndv_long_gpu_recv++; proto->strat_stats.rndv_long_gpu_recv_bytes += req->req_data.recv_msglen; @@ -1443,7 +1443,7 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) #endif proto->strat_stats.rndv_long_cpu_recv++; proto->strat_stats.rndv_long_cpu_recv_bytes += req->req_data.recv_msglen; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU } #endif } @@ -1483,7 +1483,7 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) // various sized messages which may arrive in the buffer #ifdef PSM_HAVE_REG_MR psmi_assert(req->req_data.send_msglen); // 0 len uses LONG_DATA above -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // for GPU receive buffer we need to sort things out at a lower level // since may use a host bounce buffer for RDMA and need to register it if (! 
req->is_buf_gpu_mem) { @@ -1571,7 +1571,7 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) uint32_t nbytes_left = req->req_data.send_msglen - req->recv_msgoff; uint32_t nbytes_this, chunk_size; uint32_t frag_size, unaligned_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU int converted = 0; #endif struct ips_flow *flow; @@ -1585,7 +1585,7 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) frag_size = flow->frag_size; chunk_size = min(proto->ep->chunk_max_segs*frag_size, proto->ep->chunk_max_size); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (req->is_buf_gpu_mem) { #ifdef PSM_HAVE_REG_MR // rare, but when RV connection not available, we @@ -1607,7 +1607,7 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) // for GPU send buffer <= 3, receiver can select // LONG DATA and we can use GDRCopy // must repin per attempt - if (req->req_data.send_msglen <= gdr_copy_limit_send && + if (req->req_data.send_msglen <= psm3_gpu_gdr_copy_limit_send && 0 != (buf = (uintptr_t)psmi_hal_gdr_convert_gpu_to_host_addr( (unsigned long)req->req_data.buf, req->req_data.send_msglen, 0, proto->ep))) { @@ -1620,7 +1620,7 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) proto->strat_stats.rndv_long_cuCopy_send_bytes += dostats*req->req_data.send_msglen; } } else { -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ #ifdef PSM_HAVE_REG_MR // TBD - no upper bound for send DMA here // non-priority MR and will fallback if can't register @@ -1636,9 +1636,9 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) #endif /* PSM_HAVE_REG_MR */ { proto->strat_stats.rndv_long_copy_cpu_send += dostats; - proto->strat_stats.rndv_long_copy_cpu_send_bytes += (uint64_t)dostats*req->req_data.send_msglen; + proto->strat_stats.rndv_long_copy_cpu_send_bytes += dostats*(uint64_t)req->req_data.send_msglen; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU } #endif @@ -1673,7 +1673,7 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) /* attached unaligned bytes into packet header */ unaligned_bytes = nbytes_left & 0x3; if (unaligned_bytes) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (!req->is_buf_gpu_mem || converted) mq_copy_tiny_host_mem((uint32_t *)&scb->ips_lrh.mdata, (uint32_t *)buf, unaligned_bytes); @@ -1700,7 +1700,7 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) ips_scb_flags(scb) |= IPS_SEND_FLAG_SEND_MR; } #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // SDMA identifies GPU buffers itself. 
But PIO path needs flags if (req->is_buf_gpu_mem) { #ifdef PSM_HAVE_REG_MR @@ -1712,7 +1712,7 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) // TBD USER_BUF_GPU only useful for RTS ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ scb->frag_size = frag_size; nbytes_this = min(chunk_size, nbytes_left); @@ -1799,7 +1799,7 @@ psm3_ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev) p_hdr->data[1].u32w0); proto->epaddr_stats.cts_rdma_recv++; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU psmi_assert(p_hdr->data[1].u32w1 > min(psm3_gpu_thresh_rndv, mq->rndv_nic_thresh)); // msglen #else psmi_assert(p_hdr->data[1].u32w1 > mq->rndv_nic_thresh); // msglen @@ -1815,7 +1815,7 @@ psm3_ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev) #ifdef PSM_HAVE_REG_MR if (! req->mr -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU && ! req->gpu_hostbuf_used #endif ) { @@ -1823,7 +1823,7 @@ psm3_ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev) // or we failed to register memory previously. req->mr = psm3_verbs_reg_mr(proto->mr_cache, 0, req->req_data.buf, req->req_data.send_msglen, IBV_ACCESS_RDMA -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU | (req->is_buf_gpu_mem?IBV_ACCESS_IS_GPU_ADDR:0) #endif ); @@ -1870,7 +1870,7 @@ psm3_ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev) // for send DMA if req->mr != NULL. if (req->mr && (!psm3_verbs_user_space_mr(req->mr) -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU || (req->is_buf_gpu_mem && req->req_data.send_msglen <= proto->iovec_gpu_thresh_eager) || (!req->is_buf_gpu_mem && req->req_data.send_msglen <= proto->iovec_thresh_eager) #else @@ -2000,7 +2000,7 @@ psm3_ips_proto_mq_handle_rts(struct ips_recvhdrq_event *rcv_ev) if (p_hdr->flags & IPS_SEND_FLAG_BLOCKING) req->type |= MQE_TYPE_WAITING_PEER; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (p_hdr->flags & IPS_SEND_FLAG_USER_BUF_GPU) req->is_sendbuf_gpu_mem = 1; else @@ -2256,7 +2256,7 @@ psm3_ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev) */ if (req) { //u32w0 is offset - only cnt recv msgs on 1st pkt in msg -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU int use_gdrcopy = 0; if (!req->is_buf_gpu_mem) { if (req->state == MQ_STATE_UNEXP) { @@ -2287,7 +2287,7 @@ psm3_ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev) } psm3_mq_handle_data(mq, req, p_hdr->data[1].u32w0, payload, paylen); -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ if (msgorder == IPS_MSG_ORDER_FUTURE_RECV) ret = IPS_RECVHDRQ_BREAK; @@ -2403,10 +2403,10 @@ psm3_ips_proto_mq_handle_data(struct ips_recvhdrq_event *rcv_ev) psm2_mq_req_t req; struct ips_flow *flow; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU int use_gdrcopy = 0; struct ips_proto *proto = rcv_ev->proto; -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ psmi_copy_tiny_fn_t psmi_copy_tiny_fn = mq_copy_tiny; @@ -2426,7 +2426,7 @@ psm3_ips_proto_mq_handle_data(struct ips_recvhdrq_event *rcv_ev) paylen = ips_recvhdrq_event_paylen(rcv_ev); psmi_assert(paylen == 0 || payload); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // cpu stats already tracked when sent CTS if (req->is_buf_gpu_mem) { req->req_data.buf = req->user_gpu_buffer; @@ -2470,7 +2470,7 @@ psm3_ips_proto_mq_handle_data(struct ips_recvhdrq_event *rcv_ev) } psm3_mq_handle_data(mq, req, 
p_hdr->data[1].u32w0, payload, paylen -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , use_gdrcopy, rcv_ev->proto->ep); #else ); diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_params.h b/prov/psm3/psm3/ptl_ips/ips_proto_params.h index f288d6c54a1..fce2435f259 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_params.h +++ b/prov/psm3/psm3/ptl_ips/ips_proto_params.h @@ -141,7 +141,7 @@ #define IPS_SEND_FLAG_PKTCKSUM 0x02 /* Has packet checksum */ #define IPS_SEND_FLAG_AMISTINY 0x04 /* AM is tiny, exclusive */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* This flag is used to indicate to the reciever when * the send is issued on a device buffer. This helps in * selecting TID path on the recieve side regardless of @@ -159,7 +159,7 @@ #define IPS_SEND_FLAG_PERSISTENT 0x0200 #define IPS_SEND_FLAG_NO_LMC 0x0400 -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* This flag is used to indicate if the send is on * a GPU buffer. This helps PIO/SDMA paths to detect * if payload is GPU buffer without having to call @@ -219,7 +219,7 @@ #define IPS_PROTO_FLAG_PPOLICY_STATIC 0x1c00 -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* Use RNDV (TID) for all message sizes */ //#define IPS_PROTO_FLAG_ALWAYS_RNDV 0x10000 // unused /* Use GPUDirect RDMA for SDMA */ @@ -246,6 +246,7 @@ #define IPS_PROTOEXP_FLAG_RDMA_KERNEL 0x01 /* kernel RV module RDMA */ #define IPS_PROTOEXP_FLAG_RDMA_USER 0x02 /* user RC QP for RDMA only */ #define IPS_PROTOEXP_FLAG_RDMA_USER_RC 0x03 /* user RC QP eager & RDMA */ +#define IPS_PROTOEXP_FLAG_RDMA_QP(flag) ((flag)&IPS_PROTOEXP_FLAG_RDMA_MASK) /* QP RDMA mode */ #define IPS_PROTOEXP_FLAG_USER_RC_QP(flag) ((flag)&0x02) /* either RC QP mode */ #define IPS_PROTOEXP_FLAG_KERNEL_QP(flag) \ (((flag)&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_KERNEL) diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_recv.c b/prov/psm3/psm3/ptl_ips/ips_proto_recv.c index 2fbc0a0773b..4aa0fc476fa 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_recv.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto_recv.c @@ -315,6 +315,9 @@ psm3_ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev) ips_scb_t *scb; ack_seq_num.psn_num = p_hdr->ack_seq_num; +#ifdef USE_RC + ipsaddr->verbs.remote_recv_psn = ack_seq_num.psn_num; +#endif // check actual psn acked (ack_seq_num-1), we only want to process acks // for packets we never got an ack for if ((flowid = ips_proto_flowid(p_hdr)) < EP_NUM_FLOW_ENTRIES) { diff --git a/prov/psm3/psm3/ptl_ips/ips_scb.c b/prov/psm3/psm3/ptl_ips/ips_scb.c index 05aead8cc33..a3149c2455a 100644 --- a/prov/psm3/psm3/ptl_ips/ips_scb.c +++ b/prov/psm3/psm3/ptl_ips/ips_scb.c @@ -276,7 +276,7 @@ ips_scb_t *MOCKABLE(psm3_ips_scbctrl_alloc)(struct ips_scbctrl *scbc, int scbnum scb->nfrag = 1; scb->frag_size = 0; scb->chunk_size = 0; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU scb->mq_req = NULL; #endif #ifdef PSM_HAVE_REG_MR @@ -346,7 +346,7 @@ ips_scb_t *MOCKABLE(psm3_ips_scbctrl_alloc_tiny)(struct ips_scbctrl *scbc) scb->nfrag = 1; scb->frag_size = 0; scb->chunk_size = 0; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU scb->mq_req = NULL; #endif #ifdef PSM_HAVE_REG_MR diff --git a/prov/psm3/psm3/ptl_ips/ips_scb.h b/prov/psm3/psm3/ptl_ips/ips_scb.h index 97670116fdf..6345830b632 100644 --- a/prov/psm3/psm3/ptl_ips/ips_scb.h +++ b/prov/psm3/psm3/ptl_ips/ips_scb.h @@ -185,16 +185,16 @@ struct ips_scb { psm2_am_completion_fn_t completion_am; }; void *cb_param; -#if defined(PSM_CUDA) || 
defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU psm2_mq_req_t mq_req; /* back pointer to original request */ -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ struct { struct ips_message_header ips_lrh; } PSMI_CACHEALIGN; }; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #define IS_TRANSFER_BUF_GPU_MEM(scb) (ips_scb_flags(scb) & IPS_SEND_FLAG_PAYLOAD_BUF_GPU) #endif diff --git a/prov/psm3/psm3/ptl_ips/ptl.c b/prov/psm3/psm3/ptl_ips/ptl.c index 9878713a37c..3f416231783 100644 --- a/prov/psm3/psm3/ptl_ips/ptl.c +++ b/prov/psm3/psm3/ptl_ips/ptl.c @@ -560,14 +560,15 @@ psm3_ips_ptl_disconnect(ptl_t *ptl_gen, int force, int numep, } /* Only symbol we expose out of here */ -struct ptl_ctl_init -psm3_ptl_ips = { - ips_ptl_sizeof, ips_ptl_init, ips_ptl_fini, ips_ptl_setopt, - ips_ptl_getopt +struct ptl_ctl_init psm3_ptl_ips = { + .sizeof_ptl = ips_ptl_sizeof, + .init = ips_ptl_init, + .fini = ips_ptl_fini, + .setopt = ips_ptl_setopt, + .getopt = ips_ptl_getopt, }; -struct ptl_ctl_rcvthread -psm3_ptl_ips_rcvthread = { - ips_ptl_rcvthread_is_enabled, - psm3_ips_ptl_rcvthread_transfer_ownership, +struct ptl_ctl_rcvthread psm3_ptl_ips_rcvthread = { + .is_enabled = ips_ptl_rcvthread_is_enabled, + .transfer_ownership = psm3_ips_ptl_rcvthread_transfer_ownership, }; diff --git a/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c b/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c index 562721a0b37..cac70401242 100644 --- a/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c +++ b/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c @@ -97,14 +97,6 @@ struct ptl_rcvthread { pthread_t psm3_rcv_threadid; #endif -#ifdef PSM_CUDA -/* This is a global cuda context (extern declaration in psm_user.h) - * stored to provide hints during a cuda failure - * due to a null cuda context. - */ -CUcontext cu_ctxt; -#endif - // for psm3_wait and psm3_wake static pthread_mutex_t wait_mutex; static pthread_cond_t wait_condvar; @@ -144,15 +136,16 @@ psm2_error_t psm3_ips_ptl_rcvthread_init(ptl_t *ptl_gen, struct ips_recvhdrq *re rcvc->ptl = ptl_gen; rcvc->t_start_cyc = get_cycles(); -#ifdef PSM_CUDA - if (PSMI_IS_GPU_ENABLED) - PSMI_CUDA_CALL(cuCtxGetCurrent, &cu_ctxt); -#endif + PSM3_GPU_FETCH_CTXT(); if (psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD) && (!psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED))){ - pthread_cond_init(&wait_condvar, NULL); + pthread_condattr_t attr; + pthread_condattr_init(&attr); + pthread_condattr_setclock(&attr, CLOCK_MONOTONIC); + pthread_cond_init(&wait_condvar, &attr); + pthread_mutex_init(&wait_mutex, NULL); wait_signalled = 0; @@ -375,12 +368,12 @@ psm2_error_t rcvthread_initsched(struct ptl_rcvthread *rcvc) // means migrating to a singleton model and properly fixing/removing the // transfer_ownership of rcvc when EPs are destroyed so can ensure // poll_type properly maintained on all affected EPs. -psm2_error_t psm3_wait(int timeout) +psm2_error_t psm3_wait(int timeout_ms) { psm2_ep_t ep; psm2_error_t ret = PSM2_OK; - _HFI_VDBG("Wait for event. timeout=%d\n", timeout); + _HFI_VDBG("Wait for event. timeout=%d ms\n", timeout_ms); // TBD - while psm3_wait is active, we would like a quick poll() timeout // because it is our only checking for PSM protocol timeouts. However // poll() has probably already started, so too late to change it now. 
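The hunk above binds wait_condvar to CLOCK_MONOTONIC via a condattr, and the hunk below converts the relative timeout_ms into an absolute deadline on that same clock (with nsec-to-sec carry) before calling pthread_cond_timedwait(). A minimal standalone sketch of that timed-wait pattern follows; it assumes a non-negative timeout_ms (the infinite-timeout case uses pthread_cond_wait() instead), and the demo_* names and DEMO_* constants are invented for illustration only and are not part of PSM3.

#include <pthread.h>
#include <time.h>

#define DEMO_MSEC_PER_SEC  1000
#define DEMO_NSEC_PER_MSEC 1000000L
#define DEMO_NSEC_PER_SEC  1000000000L

static pthread_mutex_t demo_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t demo_cond;
static int demo_signalled;

static void demo_wait_init(void)
{
        pthread_condattr_t attr;

        pthread_condattr_init(&attr);
        /* condvar and the deadline computed below must share one clock */
        pthread_condattr_setclock(&attr, CLOCK_MONOTONIC);
        pthread_cond_init(&demo_cond, &attr);
        pthread_condattr_destroy(&attr);
}

/* returns 0 if woken by demo_wake(), ETIMEDOUT if timeout_ms expired */
static int demo_wait(int timeout_ms)
{
        struct timespec deadline;       /* absolute timestamp */
        int rc = 0;

        clock_gettime(CLOCK_MONOTONIC, &deadline);      /* current time */
        deadline.tv_sec += timeout_ms / DEMO_MSEC_PER_SEC;
        deadline.tv_nsec += (timeout_ms % DEMO_MSEC_PER_SEC) * DEMO_NSEC_PER_MSEC;
        if (deadline.tv_nsec >= DEMO_NSEC_PER_SEC) {    /* carry nsec into sec */
                deadline.tv_sec++;
                deadline.tv_nsec -= DEMO_NSEC_PER_SEC;
        }

        pthread_mutex_lock(&demo_mutex);
        while (!demo_signalled && rc == 0)
                rc = pthread_cond_timedwait(&demo_cond, &demo_mutex, &deadline);
        if (demo_signalled) {           /* consume the wakeup */
                demo_signalled = 0;
                rc = 0;
        }
        pthread_mutex_unlock(&demo_mutex);
        return rc;
}

static void demo_wake(void)
{
        pthread_mutex_lock(&demo_mutex);
        demo_signalled = 1;
        pthread_cond_signal(&demo_cond);
        pthread_mutex_unlock(&demo_mutex);
}
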
@@ -460,20 +453,20 @@ psm2_error_t psm3_wait(int timeout) wait_signalled = 0; wait_nosleep_signalled_count++; _HFI_VDBG("found already signaled, no sleep\n"); - } else if (timeout < 0) { // infinite timeout + } else if (timeout_ms < 0) { // infinite timeout // Wait for condition variable to be signaled or broadcast. pthread_cond_wait(&wait_condvar, &wait_mutex); wait_signalled = 0; wait_sleep_til_signal_count++; _HFI_VDBG("slept, infinite timeout\n"); } else { - struct timespec wait_time; + struct timespec wait_time; // absolute timestamp clock_gettime(CLOCK_MONOTONIC, &wait_time); // current time - wait_time.tv_sec += timeout / 1000; - wait_time.tv_nsec += (timeout % 1000) * 1000; - if (wait_time.tv_nsec > 1000000000) { // handle carry from nsec to sec - wait_time.tv_sec++; - wait_time.tv_nsec -= 1000000000; + wait_time.tv_sec += timeout_ms / MSEC_PER_SEC; + wait_time.tv_nsec += (timeout_ms % MSEC_PER_SEC) * NSEC_PER_MSEC; + if (wait_time.tv_nsec >= NSEC_PER_SEC) { // handle carry from nsec to sec + wait_time.tv_sec++; + wait_time.tv_nsec -= NSEC_PER_SEC; } if (0 > pthread_cond_timedwait(&wait_condvar, &wait_mutex, &wait_time)) { _HFI_VDBG("slept, timeout\n"); @@ -486,7 +479,7 @@ psm2_error_t psm3_wait(int timeout) wait_sleep_signal_count++; } } - pthread_mutex_unlock( &wait_mutex ); + pthread_mutex_unlock(&wait_mutex); // TBD if ret == PSM2_OK we could use ipeek to see if any real progress // was made and loop back to start to wait again if not. For now we // leave that to our caller @@ -564,10 +557,7 @@ void *ips_ptl_pollintr(void *rcvthreadc) int next_timeout = rcvc->last_timeout; psm2_error_t err; -#ifdef PSM_CUDA - if (PSMI_IS_GPU_ENABLED && cu_ctxt != NULL) - PSMI_CUDA_CALL(cuCtxSetCurrent, cu_ctxt); -#endif + PSM3_GPU_REFRESH_CTXT(); PSM2_LOG_MSG("entering"); /* No reason to have many of these, keep this as a backup in case the diff --git a/prov/psm3/psm3/ptl_self/ptl.c b/prov/psm3/psm3/ptl_self/ptl.c index 19231015d9b..31eeb1a85ae 100644 --- a/prov/psm3/psm3/ptl_self/ptl.c +++ b/prov/psm3/psm3/ptl_self/ptl.c @@ -158,25 +158,16 @@ self_mq_isend(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags_user, if_pf(send_req == NULL) return PSM2_NO_MEMORY; -#ifdef PSM_CUDA +#ifdef PSM_HAVE_GPU // we technically don't need to set is_buf_gpu_mem because psm3_mq_mtucpy // will be used to copy the data to the destination or a sysbuf and it will // check if the buffer is GPU memory. 
But we do need the sync_memops() - if (len && PSMI_IS_GPU_ENABLED && PSMI_IS_GPU_MEM(ubuf)) { - psmi_cuda_set_attr_sync_memops(ubuf); + if (len && PSM3_IS_GPU_MEM(ubuf)) { + PSM3_GPU_MARK_BUF_SYNCHRONOUS(ubuf); send_req->is_buf_gpu_mem = 1; } else send_req->is_buf_gpu_mem = 0; #endif -#ifdef PSM_ONEAPI - // we don't need to set is_buf_gpu_mem because psm3_mq_mtucpy will be - // used to copy the data to the destination or a sysbuf and it will - // check if the buffer is a GPU memory - //if (len && PSMI_IS_GPU_ENABLED && PSMI_IS_GPU_MEM(ubuf)) { - // send_req->is_buf_gpu_mem = 1; - //} else - // send_req->is_buf_gpu_mem = 0; -#endif mq->stats.tx_num++; mq->stats.tx_rndv_num++; @@ -441,8 +432,10 @@ self_ptl_getopt(const void *component_obj, int optname, } /* Only symbol we expose out of here */ -struct ptl_ctl_init -psm3_ptl_self = { - self_ptl_sizeof, self_ptl_init, self_ptl_fini, self_ptl_setopt, - self_ptl_getopt +struct ptl_ctl_init psm3_ptl_self = { + .sizeof_ptl = self_ptl_sizeof, + .init = self_ptl_init, + .fini = self_ptl_fini, + .setopt = self_ptl_setopt, + .getopt = self_ptl_getopt, }; diff --git a/prov/psm3/psm3/utils/utils_debug.c b/prov/psm3/psm3/utils/utils_debug.c index e218f3bd12f..97fbd9de585 100644 --- a/prov/psm3/psm3/utils/utils_debug.c +++ b/prov/psm3/psm3/utils/utils_debug.c @@ -514,7 +514,7 @@ void psm3_dump_buf(uint8_t *buf, uint32_t len) } } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU void psm3_dump_gpu_buf(uint8_t *buf, uint32_t len) { int i, j, print_len; diff --git a/prov/psm3/psm3/utils/utils_dsa.c b/prov/psm3/psm3/utils/utils_dsa.c index a990babb208..9dfe3368200 100644 --- a/prov/psm3/psm3/utils/utils_dsa.c +++ b/prov/psm3/psm3/utils/utils_dsa.c @@ -293,8 +293,8 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, int copied_chunks = 0; uint32_t dsa_cp_len; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (n && PSMI_IS_GPU_ENABLED && (PSMI_IS_GPU_MEM(dest) || PSMI_IS_GPU_MEM((void *) src))) { +#ifdef PSM_HAVE_GPU + if (n && (PSM3_IS_GPU_MEM(dest) || PSM3_IS_GPU_MEM((void *) src))) { _HFI_VDBG("GPU copy from %p to %p for %u\n", src, dest, n); PSM3_GPU_MEMCPY(dest, src, n); return; diff --git a/prov/psm3/psm3/utils/utils_dwordcpy-x86_64.c b/prov/psm3/psm3/utils/utils_dwordcpy-x86_64.c index 6929bc200a6..818e6d9ea56 100644 --- a/prov/psm3/psm3/utils/utils_dwordcpy-x86_64.c +++ b/prov/psm3/psm3/utils/utils_dwordcpy-x86_64.c @@ -169,8 +169,8 @@ void psm3_qwordcpy(volatile uint64_t *dest, const uint64_t *src, uint32_t nqword void MOCKABLE(psm3_mq_mtucpy)(void *vdest, const void *vsrc, uint32_t nchars) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (nchars && PSMI_IS_GPU_ENABLED && (PSMI_IS_GPU_MEM(vdest) || PSMI_IS_GPU_MEM((void *) vsrc))) { +#ifdef PSM_HAVE_GPU + if (nchars && (PSM3_IS_GPU_MEM(vdest) || PSM3_IS_GPU_MEM((void *) vsrc))) { PSM3_GPU_MEMCPY(vdest, vsrc, nchars); return; } diff --git a/prov/psm3/src/psm3_revision.c.in b/prov/psm3/src/psm3_revision.c.in index 6c02d64ec9e..936e7474a3d 100644 --- a/prov/psm3/src/psm3_revision.c.in +++ b/prov/psm3/src/psm3_revision.c.in @@ -17,10 +17,6 @@ #define PSMX3_GIT_CHECKSUM "@PSM3_GIT_HASH@" #endif -#ifndef PSM3_MARCH -#define PSM3_MARCH "@PSM3_MARCH@" -#endif - char psm3_IEFS_version[] = PSMX3_IEFS_VERSION; char psm3_build_timestamp[] = PSMX3_BUILD_TIMESTAMP; char psm3_sources_checksum[] = PSMX3_SRC_CHECKSUM; diff --git a/prov/psm3/src/psmx3.h b/prov/psm3/src/psmx3.h index 6edfa308338..7a126099b71 100644 --- a/prov/psm3/src/psmx3.h +++ b/prov/psm3/src/psmx3.h @@ 
-853,6 +853,7 @@ struct psmx3_env { char *tag_layout; #endif int yield_mode; + int wait_enable; }; #define PSMX3_MAX_UNITS PSMI_MAX_RAILS /* from psm_config.h */ diff --git a/prov/psm3/src/psmx3_atomic.c b/prov/psm3/src/psmx3_atomic.c index b05416c5e07..b518f2a81f5 100644 --- a/prov/psm3/src/psmx3_atomic.c +++ b/prov/psm3/src/psmx3_atomic.c @@ -182,7 +182,7 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, #define PSMX3_BXOR(dst,src) (dst) ^= (src) #define PSMX3_COPY(dst,src) (dst) = (src) -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* res is always CPU address, dst could be GPU address */ #define PSMX3_ATOMIC_READ(dst,res,cnt,TYPE) \ do { \ @@ -195,7 +195,7 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, psm3_memcpy(r, d, sizeof(TYPE)*cnt); \ psmx3_unlock(&psmx3_atomic_lock, 1); \ } while (0) -#else /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#else /* PSM_HAVE_GPU */ #define PSMX3_ATOMIC_READ(dst,res,cnt,TYPE) \ do { \ int i; \ @@ -206,9 +206,9 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, r[i] = d[i]; \ psmx3_unlock(&psmx3_atomic_lock, 1); \ } while (0) -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* src is always CPU address, dst could be GPU address */ #define PSMX3_ATOMIC_WRITE(dst,src,cnt,OP,TYPE) \ do { \ @@ -228,7 +228,7 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, } \ psmx3_unlock(&psmx3_atomic_lock, 1); \ } while (0) -#else /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#else /* PSM_HAVE_GPU */ #define PSMX3_ATOMIC_WRITE(dst,src,cnt,OP,TYPE) \ do { \ int i; \ @@ -239,9 +239,9 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, OP(d[i],s[i]); \ psmx3_unlock(&psmx3_atomic_lock, 1); \ } while (0) -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* src is always CPU address, dst could be GPU address */ // optimized to avoid unnecessary read and compare, OP==PSMX3_COPY and not used #define PSMX3_ATOMIC_WRITE_COPY(dst,src,cnt,OP,TYPE) \ @@ -255,12 +255,12 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, psm3_memcpy(d, s, sizeof(TYPE)*cnt); \ psmx3_unlock(&psmx3_atomic_lock, 1); \ } while (0) -#else /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#else /* PSM_HAVE_GPU */ #define PSMX3_ATOMIC_WRITE_COPY(dst,src,cnt,OP,TYPE) \ PSMX3_ATOMIC_WRITE(dst,src,cnt,OP,TYPE) -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* src, res are always CPU address, dst could be GPU address */ #define PSMX3_ATOMIC_READWRITE(dst,src,res,cnt,OP,TYPE) \ do { \ @@ -281,7 +281,7 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, } \ psmx3_unlock(&psmx3_atomic_lock, 1); \ } while (0) -#else /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#else /* PSM_HAVE_GPU */ #define PSMX3_ATOMIC_READWRITE(dst,src,res,cnt,OP,TYPE) \ do { \ int i; \ @@ -295,9 +295,9 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, } \ psmx3_unlock(&psmx3_atomic_lock, 1); \ } while (0) -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* src, cmp, res are always CPU address, dst 
could be GPU address */ #define PSMX3_ATOMIC_CSWAP(dst,src,cmp,res,cnt,CMP_OP,TYPE) \ do { \ @@ -320,7 +320,7 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, } \ psmx3_unlock(&psmx3_atomic_lock, 1); \ } while (0) -#else /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#else /* PSM_HAVE_GPU */ #define PSMX3_ATOMIC_CSWAP(dst,src,cmp,res,cnt,CMP_OP,TYPE) \ do { \ int i; \ @@ -336,9 +336,9 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, } \ psmx3_unlock(&psmx3_atomic_lock, 1); \ } while (0) -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* src, cmp, res are always CPU address, dst could be GPU address */ #define PSMX3_ATOMIC_MSWAP(dst,src,cmp,res,cnt,TYPE) \ do { \ @@ -359,7 +359,7 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, } \ psmx3_unlock(&psmx3_atomic_lock, 1); \ } while (0) -#else /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#else /* PSM_HAVE_GPU */ #define PSMX3_ATOMIC_MSWAP(dst,src,cmp,res,cnt,TYPE) \ do { \ int i; \ @@ -374,7 +374,7 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, } \ psmx3_unlock(&psmx3_atomic_lock, 1); \ } while (0) -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ static int psmx3_atomic_do_write(void *dest, void *src, int datatype, int op, int count) diff --git a/prov/psm3/src/psmx3_attr.c b/prov/psm3/src/psmx3_attr.c index 402253fba59..34f9b671071 100644 --- a/prov/psm3/src/psmx3_attr.c +++ b/prov/psm3/src/psmx3_attr.c @@ -263,7 +263,7 @@ static struct fi_info *psmx3_dupinfo(const struct fi_info *info) #endif /* HAVE_PSM3_DL */ static uint64_t psmx3_check_fi_hmem_cap(void) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* if parses as empty or invalid use default of 0 */ /* psm3 below us will provide warning as needed when it parses it */ int gpu = 0; @@ -278,7 +278,7 @@ static uint64_t psmx3_check_fi_hmem_cap(void) { 0, UINT_MAX); if ((gpu || gpudirect) && !ofi_hmem_p2p_disabled()) return FI_HMEM; -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ return 0; } @@ -319,28 +319,30 @@ static uint64_t get_max_inject_size(void) { thresh_rv = temp; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (psmx3_prov_info.caps & FI_HMEM) { if (have_nic) { // GPU ips rendezvous threshold - // sockets HAL avoids rendezvous, so this may be overly restrictive - temp = PSM3_GPU_THRESH_RNDV; - // PSM3_CUDA_THRESH_RNDV depricated, use PSM3_GPU_THRESH_RNDV if set - psm3_parse_str_uint(psm3_env_get("PSM3_CUDA_THRESH_RNDV"), &temp, - 0, UINT_MAX); - psm3_parse_str_uint(psm3_env_get("PSM3_GPU_THRESH_RNDV"), &temp, - 0, UINT_MAX); - if (thresh_rv > temp) - thresh_rv = temp; + uint32_t out; + if (psm3_info_query(PSM2_INFO_QUERY_GPU_THRESH_RNDV, &out, 0, NULL)) { + PSMX3_WARN(&psmx3_prov, FI_LOG_CORE, + "Unable to get PSM3_GPU_THRESH_RNDV.\n"); + } else if (thresh_rv > out) { + thresh_rv = out; + } } if (have_shm) { // GPU shm rendezvous threshold - temp = PSM3_MQ_RNDV_SHM_GPU_THRESH; - psm3_parse_str_uint(psm3_env_get("PSM3_MQ_RNDV_SHM_GPU_THRESH"), &temp, - 0, UINT_MAX); - if (thresh_rv > temp) - thresh_rv = temp; + // we only have default, real value may be overriden at MQ init + // when open PSM3 endpoint + uint32_t out; + if (psm3_info_query(PSM2_INFO_QUERY_MQ_RNDV_SHM_GPU_THRESH_DEFAULT, &out, 0, NULL)) { + PSMX3_WARN(&psmx3_prov, FI_LOG_CORE, + "Unable to get 
PSM3_MQ_RNDV_SHM_GPU_THRESH default.\n"); + } else if (thresh_rv > out) { + thresh_rv = out; + } } } #endif diff --git a/prov/psm3/src/psmx3_ep.c b/prov/psm3/src/psmx3_ep.c index 62042023c65..47400d93298 100644 --- a/prov/psm3/src/psmx3_ep.c +++ b/prov/psm3/src/psmx3_ep.c @@ -147,6 +147,46 @@ STATIC ssize_t psmx3_ep_cancel(fid_t fid, void *context) return psmx3_errno(err); } +STATIC int psmx3_ep_getopt_cuda_api_permitted( + struct psmx3_fid_ep *ep, bool *value) +{ + // invariant: if both rx and tx are set, then they are expected to be + // the same internal PSM endpoint + assert(!ep->tx || !ep->rx || ep->tx->psm2_ep == ep->rx->psm2_ep); + + uint64_t size = (uint64_t)sizeof(*value); + + psm2_error_t err = psm3_getopt( + PSM2_COMPONENT_CORE, + ep->tx ? ep->tx->psm2_ep : ep->rx->psm2_ep, + PSM2_CORE_OPT_EP_CUDA_PERMITTED, + value, + &size); + if (err) + return -FI_EINVAL; + + return 0; +} + +STATIC int psmx3_ep_setopt_cuda_api_permitted( + struct psmx3_fid_ep *ep, const bool *value) +{ + // invariant: if both rx and tx are set, then they are expected to be + // the same internal PSM endpoint + assert(!ep->tx || !ep->rx || ep->tx->psm2_ep == ep->rx->psm2_ep); + + psm2_error_t err = psm3_setopt( + PSM2_COMPONENT_CORE, + ep->tx ? ep->tx->psm2_ep : ep->rx->psm2_ep, + PSM2_CORE_OPT_EP_CUDA_PERMITTED, + value, + sizeof(*value)); + if (err) + return -FI_EINVAL; + + return 0; +} + DIRECT_FN STATIC int psmx3_ep_getopt(fid_t fid, int level, int optname, void *optval, size_t *optlen) @@ -164,6 +204,11 @@ STATIC int psmx3_ep_getopt(fid_t fid, int level, int optname, *optlen = sizeof(size_t); break; + case FI_OPT_CUDA_API_PERMITTED: + if (!optlen || *optlen != sizeof(bool)) + return -FI_EINVAL; + return psmx3_ep_getopt_cuda_api_permitted(ep, (bool *)optval); + default: return -FI_ENOPROTOOPT; } @@ -187,6 +232,11 @@ STATIC int psmx3_ep_setopt(fid_t fid, int level, int optname, ep->min_multi_recv = *(size_t *)optval; break; + case FI_OPT_CUDA_API_PERMITTED: + if (optlen != sizeof(bool)) + return -FI_EINVAL; + return psmx3_ep_setopt_cuda_api_permitted(ep, (const bool *)optval); + default: return -FI_ENOPROTOOPT; } diff --git a/prov/psm3/src/psmx3_init.c b/prov/psm3/src/psmx3_init.c index 29359d3ea34..3408cf2ec41 100644 --- a/prov/psm3/src/psmx3_init.c +++ b/prov/psm3/src/psmx3_init.c @@ -45,7 +45,7 @@ static const char* FI_PSM3_NAME_SERVER_HELP = "Whether to turn on the name server or not (default: yes)"; static const char* FI_PSM3_TAGGED_RMA_HELP = "Whether to use tagged messages for large size RMA or not " \ - "(default: yes)"; + "(default: no)"; static const char* FI_PSM3_UUID_HELP = "Unique Job ID required by the fabric"; static const char* FI_PSM3_DELAY_HELP = @@ -81,6 +81,8 @@ static const char* FI_PSM3_TAG_LAYOUT_HELP = #endif static const char* FI_PSM3_YIELD_MODE_HELP = "Enabled interrupt driven operation with fi_wait. (default: no)."; +static const char* FI_PSM3_WAIT_ENABLE_HELP = + "Enabled use of wait semantics outside of yield mode. 
(default: no)."; #define FI_PSM3_PREFIX "FI_PSM3_" #define FI_PSM3_PREFIX_LEN strlen(FI_PSM3_PREFIX) @@ -132,7 +134,7 @@ int psmx3_param_get_str(struct fi_provider *provider, const char *env_var_name, struct psmx3_env psmx3_env = { .name_server = 1, - .tagged_rma = 1, + .tagged_rma = 0, .uuid = PSMX3_DEFAULT_UUID, .uuid_override = 0, .delay = 0, @@ -149,6 +151,7 @@ struct psmx3_env psmx3_env = { .tag_layout = "auto", #endif .yield_mode = 0, + .wait_enable = 0, }; #if (PSMX3_TAG_LAYOUT == PSMX3_TAG_LAYOUT_RUNTIME) @@ -253,6 +256,8 @@ static void psmx3_init_env(void) //fi_param_get_bool(&psmx3_prov, "yield_mode", &psmx3_env.yield_mode); psmx3_param_get_bool(&psmx3_prov, "FI_PSM3_YIELD_MODE", FI_PSM3_YIELD_MODE_HELP, 0, &psmx3_env.yield_mode); + psmx3_param_get_bool(&psmx3_prov, "FI_PSM3_WAIT_ENABLE", + FI_PSM3_WAIT_ENABLE_HELP, 0, &psmx3_env.wait_enable); } void psmx3_init_tag_layout(struct fi_info *info) @@ -680,18 +685,6 @@ static int psmx3_getinfo(uint32_t api_version, const char *node, PSMX3_INFO(&psmx3_prov, FI_LOG_CORE,"\n"); - __builtin_cpu_init(); - if (!__builtin_cpu_supports(PSM3_MARCH)) { - PSMX3_INFO(&psmx3_prov, FI_LOG_CORE, - "CPU does not support '%s'.\n", PSM3_MARCH); - OFI_INFO_STR(&psmx3_prov, - (__builtin_cpu_supports("avx2") ? "AVX2" : - (__builtin_cpu_supports("avx") ? "AVX" : - (__builtin_cpu_supports("sse4.2") ? "SSE4.2" : "unknown"))), - PSM3_MARCH, "CPU Supports", "PSM3 Built With"); - goto err_out; - } - if (psmx3_init_prov_info(hints, &prov_info)) goto err_out; @@ -946,6 +939,8 @@ PROVIDER_INI #endif fi_param_define(&psmx3_prov, "yield_mode", FI_PARAM_BOOL, FI_PSM3_YIELD_MODE_HELP); + fi_param_define(&psmx3_prov, "wait_enable", FI_PARAM_BOOL, + FI_PSM3_WAIT_ENABLE_HELP); psmx3_init_env(); diff --git a/prov/psm3/src/psmx3_rma.c b/prov/psm3/src/psmx3_rma.c index e76491c9878..f8c88f628f2 100644 --- a/prov/psm3/src/psmx3_rma.c +++ b/prov/psm3/src/psmx3_rma.c @@ -64,31 +64,33 @@ static inline void psmx3_iov_copy(struct iovec *iov, size_t count, /* RMA protocol: * * Write REQ: - * args[0].u32w0 cmd, flag - * args[0].u32w1 len - * args[1].u64 req - * args[2].u64 addr - * args[3].u64 key - * args[4].u64 data (optional) + * args[0].u32w0 : cmd, flag + * args[0].u32w1 : req len + * args[1].u64 : req + * args[2].u64 : target base address + * args[3].u64 : target mr key + * args[4].u64 : cq data (optional) + * args[5].u32w0 : target base offset (optional; unused for long protocol) + * args[5].u32w1 : reserved * * Write REP: - * args[0].u32w0 cmd, flag - * args[0].u32w1 error - * args[1].u64 req + * args[0].u32w0 : cmd, flag + * args[0].u32w1 : error + * args[1].u64 : req * * Read REQ: - * args[0].u32w0 cmd, flag - * args[0].u32w1 len - * args[1].u64 req - * args[2].u64 addr - * args[3].u64 key - * args[4].u64 offset / unused for long protocol + * args[0].u32w0 : cmd, flag + * args[0].u32w1 : len + * args[1].u64 : req + * args[2].u64 : addr + * args[3].u64 : key + * args[4].u64 : offset / unused for long protocol * * Read REP: - * args[0].u32w0 cmd, flag - * args[0].u32w1 error - * args[1].u64 req - * args[2].u64 offset + * args[0].u32w0 : cmd, flag + * args[0].u32w1 : error + * args[1].u64 : req + * args[2].u64 : offset */ int psmx3_am_rma_handler(psm2_am_token_t token, psm2_amarg_t *args, @@ -98,6 +100,8 @@ int psmx3_am_rma_handler(psm2_am_token_t token, psm2_amarg_t *args, psm2_amarg_t rep_args[8]; uint8_t *rma_addr; ssize_t rma_len; + size_t rma_offset; + uint32_t cq_data; uint64_t key; int err = 0; int op_error = 0; @@ -123,23 +127,23 @@ int 
psmx3_am_rma_handler(psm2_am_token_t token, psm2_amarg_t *args, rma_len = args[0].u32w1; rma_addr = (uint8_t *)(uintptr_t)args[2].u64; key = args[3].u64; + cq_data = args[4].u64; + rma_offset = args[5].u32w0; mr = psmx3_mr_get(rx->domain, key); op_error = mr ? - psmx3_mr_validate(mr, (uint64_t)rma_addr, len, FI_REMOTE_WRITE) : + psmx3_mr_validate(mr, (uint64_t)rma_addr + rma_offset, len, FI_REMOTE_WRITE) : -FI_EINVAL; if (!op_error) { - rma_addr += mr->offset; - psm3_memcpy(rma_addr, src, len); + psm3_memcpy(rma_addr + mr->offset + rma_offset, src, len); if (eom) { if (rx->ep->recv_cq && has_data) { - /* TODO: report the addr/len of the whole write */ event = psmx3_cq_create_event( rx->ep->recv_cq, 0, /* context */ rma_addr, FI_REMOTE_WRITE | FI_RMA | FI_REMOTE_CQ_DATA, - rma_len, - args[4].u64, + rma_offset + rma_len, + cq_data, 0, /* tag */ 0, /* olen */ 0); @@ -409,6 +413,7 @@ static ssize_t psmx3_rma_self(int am_cmd, struct psmx3_fid_cntr *cntr = NULL; struct psmx3_fid_cntr *mr_cntr = NULL; struct psmx3_fid_cq *cq = NULL; + psm2_ep_t psm_ep = ep->tx ? ep->tx->psm2_ep : ep->rx->psm2_ep; int no_event; int err = 0; int op_error = 0; @@ -458,7 +463,7 @@ static ssize_t psmx3_rma_self(int am_cmd, cq = ep->recv_cq; if (mr->cntr != cntr) mr_cntr = mr->cntr; - psm3_memcpy((void *)addr, buf, len); + psm3_ep_memcpy(psm_ep, (void *)addr, buf, len); break; case PSMX3_AM_REQ_WRITEV: @@ -470,14 +475,14 @@ static ssize_t psmx3_rma_self(int am_cmd, dst = (void *)addr; for (i=0; iremote_read_cntr; - psm3_memcpy(buf, (void *)addr, len); + psm3_ep_memcpy(psm_ep, buf, (void *)addr, len); break; case PSMX3_AM_REQ_READV: @@ -485,7 +490,7 @@ static ssize_t psmx3_rma_self(int am_cmd, src = (void *)addr; for (i=0; i chunk_size) { args[0].u32w1 = chunk_size; args[1].u64 = (uint64_t)(uintptr_t)req; args[2].u64 = addr; args[3].u64 = key; + args[5].u32w0 = offset; err = psm3_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER, args, nargs, (void *)buf, chunk_size, am_flags, NULL, NULL); @@ -1068,8 +1076,8 @@ ssize_t psmx3_write_generic(struct fid_ep *ep, const void *buf, size_t len, } psmx3_am_poll(ep_priv->tx); buf = (const uint8_t *)buf + chunk_size; - addr += chunk_size; len -= chunk_size; + offset += chunk_size; req_refcnt++; } @@ -1077,10 +1085,10 @@ ssize_t psmx3_write_generic(struct fid_ep *ep, const void *buf, size_t len, args[1].u64 = (uint64_t)(uintptr_t)req; args[2].u64 = addr; args[3].u64 = key; + args[5].u32w0 = offset; if (flags & FI_REMOTE_CQ_DATA) { PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_DATA | PSMX3_AM_EOM); args[4].u64 = data; - nargs++; } else { PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_EOM); } @@ -1184,19 +1192,20 @@ ssize_t psmx3_writev_generic(struct fid_ep *ep, const struct iovec *iov, PSMX3_CTXT_USER(&req->fi_context) = context; PSMX3_CTXT_EP(&req->fi_context) = ep_priv; + nargs = 6; args[0].u32w0 = 0; PSMX3_AM_SET_OP(args[0].u32w0, PSMX3_AM_REQ_WRITE); args[0].u32w1 = len; args[1].u64 = (uint64_t)(uintptr_t)req; args[2].u64 = addr; args[3].u64 = key; - nargs = 4; + args[5].u32w0 = 0; if (flags & FI_REMOTE_CQ_DATA) { PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_DATA | PSMX3_AM_EOM); args[4].u64 = data; - nargs++; } else { PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_EOM); + args[4].u64 = 0; } err = psm3_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER, args, nargs, (void *)buf, len, @@ -1285,7 +1294,8 @@ ssize_t psmx3_writev_generic(struct fid_ep *ep, const struct iovec *iov, /* Case 2.2: use short protocol all other segments */ PSMX3_AM_SET_OP(args[0].u32w0, PSMX3_AM_REQ_WRITE); - nargs = 4; 
+ nargs = 6; + args[4].u64 = 0; /* cq_data always zero when !EOM */ buf = iov[i].iov_base; len = iov[i].iov_len; while (len > chunk_size) { @@ -1293,6 +1303,7 @@ ssize_t psmx3_writev_generic(struct fid_ep *ep, const struct iovec *iov, args[1].u64 = (uint64_t)(uintptr_t)req; args[2].u64 = addr; args[3].u64 = key; + args[5].u32w0 = len_sent; err = psm3_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER, args, nargs, (void *)buf, chunk_size, am_flags, @@ -1304,7 +1315,6 @@ ssize_t psmx3_writev_generic(struct fid_ep *ep, const struct iovec *iov, } psmx3_am_poll(ep_priv->tx); buf += chunk_size; - addr += chunk_size; len -= chunk_size; len_sent += chunk_size; req_refcnt++; @@ -1314,11 +1324,11 @@ ssize_t psmx3_writev_generic(struct fid_ep *ep, const struct iovec *iov, args[1].u64 = (uint64_t)(uintptr_t)req; args[2].u64 = addr; args[3].u64 = key; + args[5].u32w0 = len_sent; if (len_sent + len == total_len) { if (flags & FI_REMOTE_CQ_DATA) { PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_DATA | PSMX3_AM_EOM); args[4].u64 = data; - nargs++; } else { PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_EOM); } diff --git a/prov/psm3/src/psmx3_wait.c b/prov/psm3/src/psmx3_wait.c index 7f798a60c66..3ffc118e663 100644 --- a/prov/psm3/src/psmx3_wait.c +++ b/prov/psm3/src/psmx3_wait.c @@ -195,7 +195,7 @@ STATIC int psmx3_wait_wait(struct fid_wait *wait, int timeout) struct util_wait *wait_priv; struct psmx3_fid_fabric *fabric; int err; - + wait_priv = container_of(wait, struct util_wait, wait_fid); fabric = container_of(wait_priv->fabric, struct psmx3_fid_fabric, util_fabric); @@ -213,8 +213,6 @@ STATIC int psmx3_wait_wait(struct fid_wait *wait, int timeout) // don't seem extensible and they have no way to determine if they are empty // so we depend on FI_PSM3_YIELD_MODE=1 to disable normal waitset handling // and allow this simplified use to meet Intel MPI needs. - //if (wait_priv->pollset is empty && wait_priv->wait_obj == FI_WAIT_YIELD) - // psm3_wait(); if (psmx3_env.yield_mode) { switch (psm3_wait(timeout)) { case PSM2_OK: @@ -226,6 +224,16 @@ STATIC int psmx3_wait_wait(struct fid_wait *wait, int timeout) } } + /* outside of YIELD_MODE, one must explicitly enable support for + * fi_wait(). user beware... this is not supported by PSM3 proper, + * but instead only plumbed within the PSMX3 provider shim. + */ + if (!psmx3_env.wait_enable) { + PSMX3_WARN(fabric->util_fabric.prov, FI_LOG_FABRIC, + "fi_wait() not enabled (see FI_PSM3_WAIT_ENABLE)\n"); + return -FI_ENOSYS; + } + psmx3_wait_start_progress(fabric); err = psmx3_wait_wait_wait(wait, timeout); @@ -239,20 +247,61 @@ DIRECT_FN int psmx3_wait_open(struct fid_fabric *fabric, struct fi_wait_attr *attr, struct fid_wait **waitset) { + struct util_fabric *util_fabric = container_of(fabric, struct util_fabric, fabric_fid); struct fid_wait *wait; int err; - if (psmx3_env.yield_mode && attr->wait_obj == FI_WAIT_YIELD) { - // CQ and CNTR won't be allowed to be added to waitset, so - // we simply create an UNSPEC fd waitset for simplicity here - // It should not actually be used - struct fi_wait_attr tmp = *attr; - tmp.wait_obj = FI_WAIT_UNSPEC; - err = ofi_wait_fd_open(fabric, &tmp, &wait); - } else { + + switch (attr->wait_obj) { + case FI_WAIT_UNSPEC: + case FI_WAIT_FD: + case FI_WAIT_POLLFD: err = ofi_wait_fd_open(fabric, attr, &wait); + if (err) + return err; + + break; + + case FI_WAIT_YIELD: + // NOTE: we use the YIELD type only as a special indicator for + // the Intel MPI yield mode global wait set. 
it is otherwise + // unsupported + if (!psmx3_env.yield_mode) { + PSMX3_WARN(util_fabric->prov, FI_LOG_FABRIC, + "wait object %u not supported outside of yield mode\n", + attr->wait_obj); + return -FI_ENOSYS; + } + + // if not in YIELD_MODE, create a yield wait object. + // + // if in YIELD_MODE, we want callers to only ever wait by + // invoking a top-level fi_wait(), since YIELD_MODE will turn + // all waits into a global wait. can also use a yield wait + // object for this, justified by: + // + // - it is not valid to call fi_control(...GETWAIT...) on a + // YIELD object, which is what we want to force callers to + // wait via fi_wait(). + // - CQ and CNTR won't be allowed to be added to waitset by + // an explicit yield mode check in their open() calls, so the + // object type shouldn't matter. + // - in yield mode, fi_wait() will never interact with the wait + // set directly, but instead just delegate to psm3_wait(), so + // the underlying wait set type doesn't matter. + // + err = ofi_wait_yield_open(fabric, attr, &wait); + if (err) + return err; + + break; + + case FI_WAIT_MUTEX_COND: + default: + PSMX3_WARN(util_fabric->prov, FI_LOG_FABRIC, + "wait object %u not supported\n", + attr->wait_obj); + return -FI_ENOSYS; } - if (err) - return err; psmx3_wait_ops_save = wait->ops; psmx3_wait_ops = *psmx3_wait_ops_save;
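
The FI_OPT_CUDA_API_PERMITTED plumbing added to psmx3_ep_getopt()/psmx3_ep_setopt() earlier in this patch is driven from the application side through the generic fi_setopt()/fi_getopt() calls at FI_OPT_ENDPOINT level. The sketch below is a hedged illustration of that usage, not code from the patch: the endpoint ep is assumed to already exist, and error handling is abbreviated.

#include <stdbool.h>
#include <stdio.h>
#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>

/* Forbid CUDA API use on one endpoint, then read the setting back.
 * With this patch, PSMX3 forwards the option to the underlying PSM3
 * endpoint as PSM2_CORE_OPT_EP_CUDA_PERMITTED. */
static int restrict_cuda_api(struct fid_ep *ep)
{
        bool permitted = false;
        size_t len = sizeof(permitted);
        int ret;

        ret = fi_setopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_CUDA_API_PERMITTED,
                        &permitted, sizeof(permitted));
        if (ret)
                return ret;             /* e.g. -FI_EINVAL if rejected */

        ret = fi_getopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_CUDA_API_PERMITTED,
                        &permitted, &len);
        if (!ret)
                printf("CUDA API permitted on this EP: %s\n",
                       permitted ? "yes" : "no");
        return ret;
}

Both handlers insist on an option length of sizeof(bool), so any other size is rejected with -FI_EINVAL. Blocking in fi_wait() on a PSMX3 waitset is a separate knob: with this patch it additionally requires FI_PSM3_WAIT_ENABLE=1 (or yield mode), as enforced in psmx3_wait_wait().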