diff --git a/prov/psm3/Makefile.am b/prov/psm3/Makefile.am index a6d3fbc68ed..cec9bddede3 100644 --- a/prov/psm3/Makefile.am +++ b/prov/psm3/Makefile.am @@ -30,9 +30,9 @@ ACLOCAL_AMFLAGS = -I config AM_CFLAGS = -Wall if HAVE_LD_VERSION_SCRIPT - libpsm3_fi_version_script = -Wl,--version-script=$(builddir)/libpsm3-fi.map + libpsm3_fi_version_script = -Wl,--version-script=$(builddir)/libpsm3-fi.map else !HAVE_LD_VERSION_SCRIPT - libpsm3_fi_version_script = + libpsm3_fi_version_script = endif !HAVE_LD_VERSION_SCRIPT # rdmaincludedir = $(includedir)/rdma @@ -51,6 +51,8 @@ common_srcs = \ shared/hmem_neuron.c \ shared/hmem_synapseai.c \ shared/hmem_ipc_cache.c \ + shared/xpmem.c \ + shared/xpmem_cache.c \ shared/common.c \ shared/enosys.c \ shared/rbtree.c \ @@ -78,13 +80,22 @@ common_srcs = \ util/src/util_ns.c \ util/src/util_pep.c \ util/src/util_poll.c \ + util/src/util_profile.c \ + util/src/util_srx.c \ util/src/util_wait.c \ util/src/rxm_av.c \ util/src/cuda_mem_monitor.c \ util/src/cuda_ipc_monitor.c \ util/src/rocr_mem_monitor.c \ util/src/rocr_ipc_monitor.c \ - util/src/ze_mem_monitor.c + util/src/ze_mem_monitor.c \ + util/src/xpmem_monitor.c \ + shared/fabric.c \ + shared/fi_tostr.c \ + shared/perf.c \ + shared/log.c \ + shared/var.c \ + shared/abi_1_0.c if MACOS common_srcs += shared/osx/osd.c @@ -103,9 +114,7 @@ if LINUX common_srcs += shared/unix/osd.c common_srcs += shared/linux/osd.c if HAVE_LINUX_PERF_RDPMC -if !HAVE_PSM3_SRC -common_srcs += shared/linux/rdpmc.c #seems to be a copy of psm3/psm_perf.c -endif +common_srcs += shared/linux/rdpmc.c endif common_srcs += inc/linux/rdpmc.h common_srcs += inc/linux/osd.h @@ -120,6 +129,8 @@ bin_SCRIPTS = nodist_src_libpsm3_fi_la_SOURCES = src_libpsm3_fi_la_SOURCES = \ inc/ofi_hmem.h \ + inc/ofi_cma.h \ + inc/ofi_xpmem.h \ inc/ofi.h \ inc/ofi_abi.h \ inc/ofi_atom.h \ @@ -137,7 +148,7 @@ src_libpsm3_fi_la_SOURCES = \ inc/ofi_proto.h \ inc/ofi_recvwin.h \ inc/ofi_rbuf.h \ - inc/ofi_shm.h \ + inc/ofi_shm_p2p.h \ inc/ofi_signal.h \ inc/ofi_epoll.h \ inc/ofi_tree.h \ @@ -148,10 +159,12 @@ src_libpsm3_fi_la_SOURCES = \ inc/ofi_net.h \ inc/ofi_perf.h \ inc/ofi_coll.h \ + inc/ofi_mb.h \ inc/fasthash.h \ inc/rbtree.h \ inc/uthash.h \ inc/ofi_prov.h \ + inc/ofi_profile.h \ inc/rdma/providers/fi_log.h \ inc/rdma/providers/fi_prov.h \ inc/rdma/providers/fi_peer.h \ @@ -167,6 +180,7 @@ src_libpsm3_fi_la_SOURCES = \ inc/rdma/fi_errno.h \ inc/rdma/fi_tagged.h \ inc/rdma/fi_trigger.h \ + inc/rdma/fi_profile.h \ src/psmx3.h \ src/psmx3_am.c \ src/psmx3_atomic.c \ @@ -216,7 +230,7 @@ src_libpsm3_fi_la_LDFLAGS += -lpsm2 endif !HAVE_PSM3_SRC if !EMBEDDED -src_libpsm3_fi_la_LDFLAGS += -version-info 22:0:21 +src_libpsm3_fi_la_LDFLAGS += -version-info 24:0:23 endif prov_install_man_pages = man/man7/fi_psm3.7 @@ -249,8 +263,8 @@ src/psm3_src_chksum.h: Makefile $(chksum_srcs) nroff: @for file in $(prov_install_man_pages); do \ - source=`echo $$file | sed -e 's@/man[0-9]@@'`; \ - perl $(top_srcdir)/config/md2nroff.pl --source=$(top_srcdir)/$$source.md; \ + source=`echo $$file | sed -e 's@/man[0-9]@@'`; \ + perl $(top_srcdir)/config/md2nroff.pl --source=$(top_srcdir)/$$source.md; \ done dist-hook: libpsm3-fi.spec diff --git a/prov/psm3/Makefile.include b/prov/psm3/Makefile.include index 4716706b0e0..47424fc2caf 100644 --- a/prov/psm3/Makefile.include +++ b/prov/psm3/Makefile.include @@ -220,6 +220,8 @@ prov_psm3_psm3_libpsm3i_la_SOURCES = \ prov/psm3/psm3/psm_mq_recv.c \ prov/psm3/psm3/psm_mq_utils.c \ prov/psm3/psm3/psm_netutils.h \ + 
prov/psm3/psm3/psm_nic_select.c \ + prov/psm3/psm3/psm_nic_select.h \ prov/psm3/psm3/psm_oneapi_ze.c \ prov/psm3/psm3/psm_perf.c \ prov/psm3/psm3/psm_perf.h \ diff --git a/prov/psm3/VERSION b/prov/psm3/VERSION index ef63cfba3ce..144229f3d51 100644 --- a/prov/psm3/VERSION +++ b/prov/psm3/VERSION @@ -1 +1 @@ -3_5_1_1 +3_6_0_1 diff --git a/prov/psm3/configure.ac b/prov/psm3/configure.ac index b680fddcdb9..a985fc05b85 100644 --- a/prov/psm3/configure.ac +++ b/prov/psm3/configure.ac @@ -58,7 +58,7 @@ AC_DEFINE_UNQUOTED([BUILD_ID],["$with_build_id"], # Override autoconf default CFLAG settings (e.g. "-g -O2") while still # allowing the user to explicitly set CFLAGS="" -: ${CFLAGS="-fvisibility=hidden ${base_c_warn_flags}"} +: ${CFLAGS="${base_c_warn_flags}"} # AM_PROG_AS would set CFLAGS="-g -O2" by default if not set already so it # should not be called earlier @@ -242,6 +242,35 @@ AS_IF([test x"$enable_debug" != x"no"], AC_DEFINE_UNQUOTED([ENABLE_DEBUG],[$dbg], [defined to 1 if libfabric was configured with --enable-debug, 0 otherwise]) +AC_ARG_ENABLE([profile], + [AS_HELP_STRING([--enable-profile], + [Enable profiling @<:@default=no@:>@])], + [], + [enable_profile=no]) + +AS_IF([test x"$enable_profile" != x"no"], + [AC_DEFINE([HAVE_FABRIC_PROFILE], [1], + [defined to 1 if libfabric was configured with --enable-profile, 0 otherwise]) +]) + +AC_DEFUN([FI_ARG_ENABLE_SANITIZER],[ + AC_ARG_ENABLE([$1], + [AS_HELP_STRING([--enable-$1], + [Enable $3Sanitizer @<:@default=no@:>@]) + ], + [], + [enable_$1=no]) + AS_IF([test x"$enable_$1" != x"no"], + [CFLAGS="-fsanitize=$2 $CFLAGS"]) +]) + +m4_map([FI_ARG_ENABLE_SANITIZER],[ + [asan, address, Address], + [lsan, leak, Leak], + [tsan, thread, Thread], + [ubsan, undefined, UndefinedBehavior] +]) + dnl Checks for header files. 
AC_HEADER_STDC @@ -463,7 +492,9 @@ AC_LINK_IFELSE([AC_LANG_PROGRAM([[__asm__(".symver main_, main@ABIVER_1.0");]], ]) dnl AS_IF icc_symver_hack -AC_DEFINE_UNQUOTED([HAVE_SYMVER_SUPPORT], [$ac_asm_symver_support], +dnl Disable in psm3 to include all symbols without symver +dnl AC_DEFINE_UNQUOTED([HAVE_SYMVER_SUPPORT], [$ac_asm_symver_support], +AC_DEFINE_UNQUOTED([HAVE_SYMVER_SUPPORT], [0], [Define to 1 if compiler/linker support symbol versioning.]) AC_MSG_CHECKING(for __alias__ attribute support) @@ -478,8 +509,9 @@ AC_LINK_IFELSE([AC_LANG_PROGRAM([[ AC_MSG_RESULT(no) ac_prog_cc_alias_symbols=0 ]) - -AC_DEFINE_UNQUOTED([HAVE_ALIAS_ATTRIBUTE], [$ac_prog_cc_alias_symbols], +dnl Disable in psm3 to include all symbols without symver +dnl AC_DEFINE_UNQUOTED([HAVE_ALIAS_ATTRIBUTE], [$ac_prog_cc_alias_symbols], +AC_DEFINE_UNQUOTED([HAVE_ALIAS_ATTRIBUTE], [0], [Define to 1 if the linker supports alias attribute.]) AC_CHECK_FUNCS([getifaddrs]) @@ -772,6 +804,37 @@ AS_IF([test "x$enable_psm3_umr_cache" != "xno"], ]) ]) +dnl ------------- hwloc +AC_ARG_ENABLE([psm3-hwloc], + [AS_HELP_STRING([--enable-psm3-hwloc], + [Enable PSM3 use of hwloc for NIC affinity selections @<:@default=check@:>@])], + [], [enable_psm3_hwloc=check]) +psm3_hwloc_happy=0 +AS_IF([test "x$enable_psm3_hwloc" != "xno"], + [ + FI_CHECK_PACKAGE([psm3_hwloc], + [hwloc.h], + [hwloc], + [hwloc_topology_init], + [], + [$psm3_PREFIX], + [$psm3_LIBDIR], + [psm3_hwloc_found=1], + [psm3_hwloc_found=0]) + AS_IF([test $psm3_hwloc_found -ne 1 && test "x$enable_psm3_hwloc" = "xyes"], + [ + psm3_happy=0 + AC_MSG_ERROR([hwloc support requested but hwloc headers and/or library not found.]) + ]) + AS_IF([test "$psm3_hwloc_found" -eq 1], + [ + psm3_hwloc_happy=1 + CPPFLAGS="$CPPFLAGS $psm3_hwloc_CPPFLAGS -DPSM_USE_HWLOC" + LDFLAGS="$LDFLAGS $psm3_hwloc_LDFLAGS" + LIBS="$LIBS $psm3_hwloc_LIBS" + ]) + ]) + dnl ------------- Driver Modules psm3_rv_happy=0 AC_ARG_WITH([psm3-rv], @@ -852,6 +915,9 @@ AC_DEFINE_UNQUOTED([PSM3_MARCH], ["$PSM3_MARCH"], [PSM3 built with instruction s AS_IF([test ! -z "$PSM_CPPFLAGS"], [CPPFLAGS="$CPPFLAGS $PSM_CPPFLAGS"], []) AS_IF([test !
-z "$PSM_CFLAGS"], [CFLAGS="$CFLAGS $PSM_CFLAGS"], []) +dnl Workaround for including fabric.c +AC_DEFINE([HOOK_NOOP_INIT], NULL, [Ignore HOOK_NOOP_INIT]) +AC_DEFINE([COLL_INIT], NULL, [Ignore COLL_INIT]) dnl Defines not used in PSM3 provider AC_DEFINE([HAVE_DMABUF_PEER_MEM], 0, [Ignore HAVE_DMABUF_PEER_MEM]) AC_DEFINE([HAVE_GDRCOPY], 0, [Ignore HAVE_GDRCOPY]) @@ -862,10 +928,16 @@ AC_DEFINE([HAVE_NEURON], 0, [Ignore HAVE_NEURON]) AC_DEFINE([HAVE_ROCR], 0, [Ignore HAVE_ROCR]) AC_DEFINE([HAVE_SYNAPSEAI], 0, [Ignore HAVE_SYNAPSEAI]) AC_DEFINE([HAVE_UFFD_MONITOR], 0, [Ignore HAVE_UFFD_MONITOR]) +AC_DEFINE([HAVE_XPMEM], 0, [Ignore HAVE_XPMEM]) + dnl Provider-specific checks dnl FI_PROVIDER_INIT +AC_DEFINE([HAVE_BGQ], 0, [Ignore HAVE_BGQ]) +AC_DEFINE([HAVE_BGQ_DL], 0, [Ignore HAVE_BGQ_DL]) AC_DEFINE([HAVE_EFA], 0, [Ignore HAVE_EFA]) AC_DEFINE([HAVE_EFA_DL], 0, [Ignore HAVE_EFA_DL]) +AC_DEFINE([HAVE_GNI], 0, [Ignore HAVE_GNI]) +AC_DEFINE([HAVE_GNI_DL], 0, [Ignore HAVE_GNI_DL]) AC_DEFINE([HAVE_MRAIL], 0, [Ignore HAVE_MRAIL]) AC_DEFINE([HAVE_MRAIL_DL], 0, [Ignore HAVE_MRAIL_DL]) AC_DEFINE([HAVE_NET], 0, [Ignore HAVE_NET]) @@ -878,6 +950,8 @@ AC_DEFINE([HAVE_PSM2_DL], 0, [Ignore HAVE_PSM2_DL]) dnl FI_PROVIDER_SETUP([psm3]) AC_DEFINE([HAVE_OPX], 0, [Ignore HAVE_OPX]) AC_DEFINE([HAVE_OPX_DL], 0, [Ignore HAVE_OPX_DL]) +AC_DEFINE([HAVE_RSTREAM], 0, [Ignore HAVE_RSTREAM]) +AC_DEFINE([HAVE_RSTREAM_DL], 0, [Ignore HAVE_RSTREAM_DL]) AC_DEFINE([HAVE_RXD], 0, [Ignore HAVE_RXD]) AC_DEFINE([HAVE_RXD_DL], 0, [Ignore HAVE_RXD_DL]) AC_DEFINE([HAVE_RXM], 0, [Ignore HAVE_RXM]) @@ -980,6 +1054,9 @@ fi if test $psm3_dsa_happy -eq 1; then afeatures="$afeatures, Intel DSA" fi +if test $psm3_hwloc_happy -eq 1; then + afeatures="$afeatures, hwloc" +fi if test "x$enable_psm3_udp" = "xyes"; then afeatures="$afeatures, UDP" fi diff --git a/prov/psm3/configure.m4 b/prov/psm3/configure.m4 index 6ae917558e8..25aea136db6 100644 --- a/prov/psm3/configure.m4 +++ b/prov/psm3/configure.m4 @@ -371,6 +371,28 @@ AC_DEFUN([FI_PSM3_CONFIGURE],[ ]) ]) + AS_IF([test "x$enable_psm3_hwloc" != "xno"], + [ + FI_CHECK_PACKAGE([psm3_hwloc], + [hwloc.h], + [hwloc], + [hwloc_topology_init], + [], + [$psm3_PREFIX], + [$psm3_LIBDIR], + [psm3_hwloc_found=1], + [psm3_hwloc_found=0]) + AS_IF([test $psm3_hwloc_found -ne 1 && test "x$enable_psm3_hwloc" == "xyes"], + [ + psm3_happy=0 + AC_MSG_ERROR([hwloc Support requested but hwloc headers and/or library not found.]) + ]) + AS_IF([test "$psm3_hwloc_found" -eq 1], + [ + psm3_CPPFLAGS="$psm3_CPPFLAGS -DPSM_USE_HWLOC" + ]) + ]) + AS_IF([test $psm3_happy -eq 1], [ AC_CONFIG_FILES([prov/psm3/psm3/psm2_hal_inlines_i.h \ prov/psm3/psm3/psm2_hal_inlines_d.h \ @@ -381,9 +403,9 @@ AC_DEFUN([FI_PSM3_CONFIGURE],[ AS_IF([test $psm3_happy -eq 1], [$1], [$2]) psm3_ARCH_CFLAGS="$PSM3_ARCH_CFLAGS" - psm3_CPPFLAGS="$psm3_CPPFLAGS $psm3_rt_CPPFLAGS $psm3_dl_CPPFLAGS $psm3_numa_CPPFLAGS $psm3_ibv_CPPFLAGS $psm3_uuid_CPPFLAGS" - psm3_LDFLAGS="$psm3_LDFLAGS $psm3_rt_LDFLAGS $psm3_dl_LDFLAGS $psm3_numa_LDFLAGS $psm3_ibv_LDFLAGS $psm3_uuid_LDFLAGS" - psm3_LIBS="$psm3_LIBS $psm3_rt_LIBS $psm3_dl_LIBS $psm3_numa_LIBS $psm3_ibv_LIBS $psm3_uuid_LIBS" + psm3_CPPFLAGS="$psm3_CPPFLAGS $psm3_rt_CPPFLAGS $psm3_dl_CPPFLAGS $psm3_numa_CPPFLAGS $psm3_ibv_CPPFLAGS $psm3_uuid_CPPFLAGS $psm3_hwloc_CPPFLAGS" + psm3_LDFLAGS="$psm3_LDFLAGS $psm3_rt_LDFLAGS $psm3_dl_LDFLAGS $psm3_numa_LDFLAGS $psm3_ibv_LDFLAGS $psm3_uuid_LDFLAGS $psm3_hwloc_LDFLAGS" + psm3_LIBS="$psm3_LIBS $psm3_rt_LIBS $psm3_dl_LIBS $psm3_numa_LIBS $psm3_ibv_LIBS $psm3_uuid_LIBS 
$psm3_hwloc_LIBS" AC_SUBST(psm3_CFLAGS) AC_SUBST(psm3_ARCH_CFLAGS) AC_SUBST(psm3_CPPFLAGS) @@ -448,4 +470,9 @@ AC_ARG_ENABLE([psm3-umr-cache], [Enable support for Userspace Memory Region (UMR) Caching @<:@default=check@:>@])], [], [enable_psm3_umr_cache=check]) +AC_ARG_ENABLE([psm3-hwloc], + [AS_HELP_STRING([--enable-psm3-hwloc], + [Enable PSM3 use of hwloc for NIC affinity selections @<:@default=check@:>@])], + [], + [enable_psm3_hwloc=check]) dnl vim: set ts=4 sw=4 tw=0 et : diff --git a/prov/psm3/debian/changelog b/prov/psm3/debian/changelog index 7eaab218a3a..0b1b356686f 100644 --- a/prov/psm3/debian/changelog +++ b/prov/psm3/debian/changelog @@ -1,4 +1,4 @@ -libpsm3-fi (11.5.1.1-1) unstable; urgency=medium +libpsm3-fi (11.6.0.0-231) unstable; urgency=medium * Initial release diff --git a/prov/psm3/debian/control b/prov/psm3/debian/control index 40dd0224032..43e38c07d02 100644 --- a/prov/psm3/debian/control +++ b/prov/psm3/debian/control @@ -2,7 +2,7 @@ Source: libpsm3-fi Section: libs Priority: optional Maintainer: https://www.intel.com/content/www/us/en/support.html -Build-Depends: debhelper (>= 12~), uuid-dev, libnuma-dev, libibverbs-dev, librdmacm-dev +Build-Depends: debhelper (>= 12~), uuid-dev, libnuma-dev, libibverbs-dev, librdmacm-dev, libhwloc-dev Standards-Version: 4.5.1 Rules-Requires-Root: no diff --git a/prov/psm3/libpsm3-fi.spec.in b/prov/psm3/libpsm3-fi.spec.in index a5cbce1be15..b24d4c13a63 100644 --- a/prov/psm3/libpsm3-fi.spec.in +++ b/prov/psm3/libpsm3-fi.spec.in @@ -1,6 +1,8 @@ %{!?configopts: %global configopts LDFLAGS=-Wl,--build-id} %{!?provider: %define provider psm3} %{!?provider_formal: %define provider_formal PSM3} +# Disable setting SOURCE_DATE_EPOCH from changelog +%define source_date_epoch_from_changelog 0 Name: lib%{provider}-fi Version: @VERSION@ @@ -18,6 +20,7 @@ Provides: lib${provider}-fi1 = %{version}-%{release} BuildRequires: libuuid-devel BuildRequires: rdma-core-devel +BuildRequires: hwloc-devel %if 0%{?suse_version} >= 1 BuildRequires: glibc-devel BuildRequires: libnuma-devel diff --git a/prov/psm3/psm3/Makefile.include b/prov/psm3/psm3/Makefile.include index 3cd1eff52ff..cc52b8f1868 100644 --- a/prov/psm3/psm3/Makefile.include +++ b/prov/psm3/psm3/Makefile.include @@ -185,6 +185,8 @@ psm3_libpsm3i_la_SOURCES = \ psm3/psm_mq_recv.c \ psm3/psm_mq_utils.c \ psm3/psm_netutils.h \ + psm3/psm_nic_select.c \ + psm3/psm_nic_select.h \ psm3/psm_oneapi_ze.c \ psm3/psm_perf.c \ psm3/psm_perf.h \ @@ -196,13 +198,13 @@ psm3_libpsm3i_la_SOURCES = \ psm3/psm_sysbuf.h \ psm3/psm_timer.c \ psm3/psm_timer.h \ + psm3/psm_uffd.c \ + psm3/psm_uffd.h \ psm3/psm_user.h \ psm3/psm_utils.c \ psm3/psm_utils.h \ psm3/psm_verbs_mr.c \ psm3/psm_verbs_mr.h \ - psm3/psm_verbs_umrc.c \ - psm3/psm_verbs_umrc.h \ psm3/psmi_wrappers.c \ psm3/psmi_wrappers.h \ psm3/psm2.h \ diff --git a/prov/psm3/psm3/hal_sockets/sockets_ep.c b/prov/psm3/psm3/hal_sockets/sockets_ep.c index 8e095b71315..27b98631508 100755 --- a/prov/psm3/psm3/hal_sockets/sockets_ep.c +++ b/prov/psm3/psm3/hal_sockets/sockets_ep.c @@ -159,11 +159,16 @@ psm3_ep_open_udp_internal(psm2_ep_t ep, int unit, int port, } if (!is_aux) { - psm3_getenv("PSM3_UDP_GSO", - "Enable UDP GSO Segmentation Offload (0 disables GSO)", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)1, &env_gso); - ep->sockets_ep.udp_gso = env_gso.e_int; + psm3_getenv_range("PSM3_UDP_GSO", + "Enable UDP GSO Segmentation Offload", + "(0 disables GSO, 1 sets max chunk to 65536, >1 specifies max chunk)", + PSMI_ENVVAR_LEVEL_USER, 
PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)UINT16_MAX, + (union psmi_envvar_val)0, (union psmi_envvar_val)UINT16_MAX, + NULL, NULL, &env_gso); + ep->sockets_ep.udp_gso = env_gso.e_uint; + if (ep->sockets_ep.udp_gso == 1) + ep->sockets_ep.udp_gso = UINT16_MAX; if (ep->sockets_ep.udp_gso) { int gso; socklen_t optlen = sizeof(gso); @@ -553,6 +558,57 @@ psm2_error_t psm3_tune_tcp_socket(const char *sck_name, psm2_ep_t ep, int fd) return PSM2_INTERNAL_ERR; } +/* parse TCP port range for PSM3_TCP_PORT_RANGE + * format is low:high + * low must be <= high and each must be <= UINT16_MAX. + * Either field can be omitted in which case default (input tvals) is used + * for given field. + * 0 - successfully parsed, tvals updated + * -1 - str empty, tvals unchanged + * -2 - syntax error, tvals may have been changed + */ +static int parse_tcp_port_range(const char *str, + size_t errstr_size, char errstr[], + int tvals[2]) +{ + psmi_assert(tvals); + int ret = psm3_parse_str_tuples(str, 2, tvals); + if (ret < 0) + return ret; + if (tvals[0] > UINT16_MAX || tvals[1] > UINT16_MAX) { + if (errstr_size) + snprintf(errstr, errstr_size, " Max allowed is %u", UINT16_MAX); + return -2; + } + if (tvals[0] < 0 || tvals[1] < 0) { + if (errstr_size) + snprintf(errstr, errstr_size, " Negative values not allowed"); + return -2; + } + if ((tvals[0] == TCP_PORT_AUTODETECT && tvals[1] != TCP_PORT_AUTODETECT) + || (tvals[0] != TCP_PORT_AUTODETECT && tvals[1] == TCP_PORT_AUTODETECT)) { + if (errstr_size) + snprintf(errstr, errstr_size, " low of %d only allowed with high of %d", TCP_PORT_AUTODETECT, TCP_PORT_AUTODETECT); + return -2; + } + if (tvals[0] > tvals[1]) { + if (errstr_size) + snprintf(errstr, errstr_size, " low (%d) > high (%d)", tvals[0], tvals[1]); + return -2; + } + return 0; +} + +static int parse_check_tcp_port_range(int type, + const union psmi_envvar_val val, void *ptr, + size_t errstr_size, char errstr[]) +{ + // parser will set tvals to result, use a copy to protect input of defaults + int tvals[2] = { ((int*)ptr)[0], ((int*)ptr)[1] }; + psmi_assert(type == PSMI_ENVVAR_TYPE_STR_TUPLES); + return parse_tcp_port_range(val.e_str, errstr_size, errstr, tvals); +} + static __inline__ psm2_error_t listen_to_port(psm2_ep_t ep, int sockfd, psm3_sockaddr_in_t *addr, @@ -567,12 +623,16 @@ psm2_error_t listen_to_port(psm2_ep_t ep, int sockfd, char range_def[32]; snprintf(range_def, sizeof(range_def), "%d:%d", tvals[0], tvals[1]); - if (!psm3_getenv("PSM3_TCP_PORT_RANGE", - "Set the TCP listener port range . The listener will bind to a random port in the range. '0:0'=let OS pick.", + (void)psm3_getenv_range("PSM3_TCP_PORT_RANGE", + "Set the TCP listener port range.", + "The listener will bind to a random port in the range.
'0:0'=let OS pick.", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR_TUPLES, - (union psmi_envvar_val) range_def, &env_val)) { - /* not using default values */ - (void)psm3_parse_str_tuples(env_val.e_str, 2, tvals); + (union psmi_envvar_val) range_def, + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + parse_check_tcp_port_range, tvals, &env_val); + if (parse_tcp_port_range(env_val.e_str, 0, NULL, tvals) < 0) { + // already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); } _HFI_DBG("PSM3_TCP_PORT_RANGE = %d:%d\n", tvals[0], tvals[1]); @@ -583,17 +643,14 @@ psm2_error_t listen_to_port(psm2_ep_t ep, int sockfd, start = 0; end = 0; _HFI_DBG("Binding to OS provided port\n"); - } else if (tvals[0] > 0 && tvals[0] <= tvals[1] && tvals[1] <= UINT16_MAX) { + } else { + psmi_assert(tvals[0] > 0); // start with a random port, find the first available one. port = psm3_rand((long int) getpid()); port = port % (tvals[1] + 1 - tvals[0]) + tvals[0]; start = (uint16_t)tvals[0]; end = (uint16_t)tvals[1]; _HFI_DBG("Binding to port in range [%" PRIu16 ":%" PRIu16 "], starting from %ld\n", start, end, port); - } else { - // high < low or only set one - _HFI_ERROR("Invalid TCP port range [%d:%d]\n", tvals[0], tvals[1]); - return PSM2_INTERNAL_ERR; } psm3_getenv("PSM3_TCP_BACKLOG", @@ -637,6 +694,46 @@ psm2_error_t listen_to_port(psm2_ep_t ep, int sockfd, return PSM2_INTERNAL_ERR; } +/* parse TCP skip poll counts for PSM3_TCP_SKIPPOLL_COUNT + * format is inactive_polls:active_polls + * inactive_polls must be >= active_polls + * Either field can be omitted in which case default (input tvals) is used + * for given field. + * 0 - successfully parsed, tvals updated + * -1 - str empty, tvals unchanged + * -2 - syntax error, tvals may have been changed + */ +static int parse_tcp_skippoll_count(const char *str, + size_t errstr_size, char errstr[], + int tvals[2]) +{ + psmi_assert(tvals); + int ret = psm3_parse_str_tuples(str, 2, tvals); + if (ret < 0) + return ret; + if (tvals[0] < 0 || tvals[1] < 0) { + if (errstr_size) + snprintf(errstr, errstr_size, " Negative values not allowed"); + return -2; + } + if (tvals[0] < tvals[1]) { + if (errstr_size) + snprintf(errstr, errstr_size, " inactive_polls (%d) must be >= active_polls (%d)", tvals[0], tvals[1]); + return -2; + } + return 0; +} + +static int parse_check_tcp_skippoll_count(int type, + const union psmi_envvar_val val, void *ptr, + size_t errstr_size, char errstr[]) +{ + // parser will set tvals to result, use a copy to protect input of defaults + int tvals[2] = { ((int*)ptr)[0], ((int*)ptr)[1] }; + psmi_assert(type == PSMI_ENVVAR_TYPE_STR_TUPLES); + return parse_tcp_skippoll_count(val.e_str, errstr_size, errstr, tvals); +} + psm2_error_t psm3_ep_open_tcp_internal(psm2_ep_t ep, int unit, int port, psm2_uuid_t const job_key) @@ -772,21 +869,16 @@ psm3_ep_open_tcp_internal(psm2_ep_t ep, int unit, int port, char buf[32]; snprintf(buf, sizeof(buf), "%d:%d", TCP_INACT_SKIP_POLLS, TCP_ACT_SKIP_POLLS); int tvals[2] = {TCP_INACT_SKIP_POLLS, TCP_ACT_SKIP_POLLS}; - if (!psm3_getenv("PSM3_TCP_SKIPPOLL_COUNT", - "Polls to skip under inactive and active connections " + (void)psm3_getenv_range("PSM3_TCP_SKIPPOLL_COUNT", + "Polls to skip under inactive and active connections ", "where inactive_polls >= active_polls.", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR_TUPLES, - (union psmi_envvar_val) buf, &env_val)) { - (void)psm3_parse_str_tuples(env_val.e_str, 2, tvals); - if (tvals[0] < 0) { - tvals[0] = TCP_INACT_SKIP_POLLS; - } - if (tvals[1] < 0) 
{ - tvals[1] = TCP_ACT_SKIP_POLLS; - } - if (tvals[1] > tvals[0]) { - tvals[1] = tvals[0]; - } + (union psmi_envvar_val) buf, + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + parse_check_tcp_skippoll_count, tvals, &env_val); + if (parse_tcp_skippoll_count(env_val.e_str, 0, NULL, tvals) < 0) { + // already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); } ep->sockets_ep.inactive_skip_polls = tvals[0]; ep->sockets_ep.active_skip_polls_offset = tvals[0] - tvals[1]; @@ -1084,10 +1176,11 @@ psm3_sockets_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) if (ep->sockets_ep.udp_gso) { // set upper bounds for GSO segmentation - // OS limitation of 64K (UINT16_MAX) + // OS limitation of 64K (UINT16_MAX) and UDP_MAX_SEGMENTS (64) ep->chunk_max_segs = min(UINT16_MAX / (ep->mtu + sizeof(struct ips_message_header)), UDP_MAX_SEGMENTS); - ep->chunk_max_size = ep->mq->hfi_base_window_rv; - // for acks to pipeline well need to limit max_nsegs to + ep->chunk_max_size = ep->sockets_ep.udp_gso; + + // for acks to pipeline we'll need to limit max_nsegs to // < flow_credits/2 and max_size to < flow_credit_bytes/2 // (ideally 1/4, but that makes GSO too small and is worse) ep->chunk_max_segs = min(ep->chunk_max_segs, proto->flow_credits/2); diff --git a/prov/psm3/psm3/hal_sockets/sockets_ep.h b/prov/psm3/psm3/hal_sockets/sockets_ep.h index 5bfc3ffdb82..51fcd06f792 100644 --- a/prov/psm3/psm3/hal_sockets/sockets_ep.h +++ b/prov/psm3/psm3/hal_sockets/sockets_ep.h @@ -185,7 +185,7 @@ struct psm3_sockets_ep { int active_skip_polls_offset; // tailored for internal use. it's inactive_skip_polls - active_skip_polls struct msghdr snd_msg; // struct used for sendmsg /* fields specific to UDP */ - int udp_gso; // is GSO enabled for UDP + unsigned udp_gso; // is GSO enabled for UDP, max chunk_size uint8_t *sbuf_udp_gso; // buffer to compose UDP GSO packet sequence int udp_gso_zerocopy; // is UDP GSO Zero copy option enabled int udp_gro; // will be used later diff --git a/prov/psm3/psm3/hal_sockets/sockets_hal.c b/prov/psm3/psm3/hal_sockets/sockets_hal.c index 0c8087450b3..8d4527bdd64 100644 --- a/prov/psm3/psm3/hal_sockets/sockets_hal.c +++ b/prov/psm3/psm3/hal_sockets/sockets_hal.c @@ -175,15 +175,15 @@ static void psm3_hfp_sockets_mq_init_defaults(struct psm2_mq *mq) * corresponding PSM3_* env variables. * Otherwise these defaults are used. */ - mq->hfi_thresh_rv = 64000; - mq->hfi_base_window_rv = 131072; + mq->hfi_thresh_rv = PSM_MQ_NIC_RNDV_THRESH; + mq->ips_cpu_window_rv_str = PSM_CPU_NIC_RNDV_WINDOW_STR; // Even without RDMA do we want to disable rendezvous? // even without RDMA, the receiver controlled pacing helps scalability mq->hfi_thresh_rv = (~(uint32_t)0); // disable rendezvous mq->hfi_thresh_tiny = PSM_MQ_NIC_MAX_TINY; #if defined(PSM_CUDA) || defined(PSM_ONEAPI) if (PSMI_IS_GPU_ENABLED) - mq->hfi_base_window_rv = 2097152; + mq->ips_gpu_window_rv_str = PSM_GPU_NIC_RNDV_WINDOW_STR; #endif // we parse inet and rv_gpu_cache_size here so we can cache it // once per EP open, even if multi-rail or multi-QP diff --git a/prov/psm3/psm3/hal_verbs/verbs_ep.c b/prov/psm3/psm3/hal_verbs/verbs_ep.c index 979787b7af6..10a4e845e4b 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_ep.c +++ b/prov/psm3/psm3/hal_verbs/verbs_ep.c @@ -113,7 +113,7 @@ psm3_ep_open_verbs(psm2_ep_t ep, int unit, int port, int addr_index, psm2_uuid_t // make sure all fields are empty. 
memset(&ep->verbs_ep,0,sizeof(ep->verbs_ep)); - ep->verbs_ep.qkey = *(uint32_t*)job_key; // use 1st 32 bits of job_key + ep->verbs_ep.qkey = (*(uint32_t*)job_key) & 0x7FFFFFFF; // use 1st 31 bits of job_key (MSB is reserved) if (_HFI_PRDBG_ON) { char uuid_str[64]; @@ -180,12 +180,48 @@ psm3_ep_open_verbs(psm2_ep_t ep, int unit, int port, int addr_index, psm2_uuid_t ep->dev_name, strerror(errno)); goto fail; } - // this gets done by psm3_verbs_poll_type - //if (ibv_req_notify_cq(ep->verbs_ep.recv_cq, 0)) { - // _HFI_ERROR("Can't request RQ events from %s: %s\n", - // ep->dev_name, strerror(errno)); - // goto fail; - //} + +#ifdef USE_RC + if (IPS_PROTOEXP_FLAG_USER_RC_QP(ep->rdmamode)) { + // SRQ improves scalability + struct ibv_device_attr dev_attr; + union psmi_envvar_val envvar_val; + + // get RDMA capabilities of device + if (ibv_query_device(ep->verbs_ep.context, &dev_attr)) { + _HFI_ERROR("Unable to query device %s: %s\n", ep->dev_name, + strerror(errno)); + goto fail; + } + _HFI_DBG("max_srq=%d\n", dev_attr.max_srq); + if (dev_attr.max_srq) { + psm3_getenv("PSM3_USE_SRQ", + "If device supports SRQ, use it [1=yes, 0=no] [1]", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)1, &envvar_val); + if (envvar_val.e_uint) { + struct ibv_srq_init_attr attr = { 0 }; + attr.srq_context = ep; // our own pointer + attr.attr.max_wr = ep->verbs_ep.hfi_num_recv_wqes; + attr.attr.max_sge = 1; + + ep->verbs_ep.srq = ibv_create_srq(ep->verbs_ep.pd, &attr); + if (ep->verbs_ep.srq == NULL) { + _HFI_ERROR( "Unable to create SRQ on %s: %s\n", + ep->dev_name, strerror(errno)); + if (errno == ENOMEM) { + _HFI_ERROR( "Requested SRQ size might be too big. Try reducing RX depth.\n"); + _HFI_ERROR( "Requested RX depth was %u .\n", + ep->verbs_ep.hfi_num_recv_wqes); + } + goto fail; + } + _HFI_DBG("created SRQ\n"); + ep->addl_nic_info = " SRQ"; + } + } + } +#endif /* USE_RC */ // TBD - should we pick an EQ number // we use ep as the cq_context (would be in callbacks if any) @@ -194,13 +230,20 @@ psm3_ep_open_verbs(psm2_ep_t ep, int unit, int port, int addr_index, psm2_uuid_t // so CQ only needs a little headroom to be safe (1000) // HFI_TF_NFLOWS (32) limits receiver side concurrent tidflows (aka inbound // RDMA w/immed). - // For USER RC Eager we can have num_recv_wqes/FRACTION per QP - // in which case theoretical need could be huge. We add 4000 as a + // For USER RC Eager without SRQ we can have num_recv_wqes/FRACTION per + // QP in which case theoretical need could be huge. We add 4000 as a swag to cover most cases and user can always tune higher as needed + // For USER RC Eager with SRQ worst case is num_recv_wqes so we + // add that to allow up to num_recv_wqes on UD QP and SRQ each and keep + // the HFI_TF_NFLOWS+1000 as headroom. if (!
ep->verbs_ep.hfi_num_recv_cqes) { ep->verbs_ep.hfi_num_recv_cqes = ep->verbs_ep.hfi_num_recv_wqes+HFI_TF_NFLOWS+1000; - if ((ep->rdmamode&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) - ep->verbs_ep.hfi_num_recv_cqes += 4000; + if ((ep->rdmamode&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) { + if (ep->verbs_ep.srq) + ep->verbs_ep.hfi_num_recv_cqes += ep->verbs_ep.hfi_num_recv_wqes; + else + ep->verbs_ep.hfi_num_recv_cqes += 4000; + } } ep->verbs_ep.recv_cq = ibv_create_cq(ep->verbs_ep.context, ep->verbs_ep.hfi_num_recv_cqes, @@ -211,12 +254,16 @@ psm3_ep_open_verbs(psm2_ep_t ep, int unit, int port, int addr_index, psm2_uuid_t strerror(errno)); goto fail; } + // this gets done by psm3_verbs_poll_type + //if (ibv_req_notify_cq(ep->verbs_ep.recv_cq, 0)) { + // _HFI_ERROR("Can't request RQ events from %s: %s\n", + // ep->dev_name, strerror(errno)); + // goto fail; + //} ep->verbs_ep.qp = ud_qp_create(ep); - if (! ep->verbs_ep.qp) { - _HFI_ERROR( "Unable to create UD QP on %s\n", ep->dev_name); + if (! ep->verbs_ep.qp) goto fail; - } psmi_assert_always (ep->verbs_ep.context); @@ -306,7 +353,8 @@ psm3_verbs_parse_params(psm2_ep_t ep) psm3_getenv("PSM3_NUM_RECV_CQES", "Number of recv CQEs to allocate\n" "(0 will calculate as PSM3_NUM_RECV_WQES+1032 for PSM3_RDMA=0-2\n" - "and 4000 more than that for PSM3_RDMA=3]) [0]", + "for PSM3_RDMA=3 with SRQ, allow an additional PSM3_NUM_RECV_WQES\n" + "for PSM3_RDMA=3 without SRQ, allow an additional 4000) [0]", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)0, &envvar_val); @@ -343,11 +391,12 @@ psm3_verbs_parse_params(psm2_ep_t ep) * otherwise ignored */ // RV defaults are sufficient for default PSM parameters - // but if user adjusts ep->hfi_num_send_rdma or mq->hfi_base_window_rv + // but if user adjusts ep->hfi_num_send_rdma or mq->ips_cpu_window_rv // they also need to increase the cache size. psm3_verbs_alloc_mr_cache // will verify cache size is sufficient. // min size is (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) * - // chunk size (mq->hfi_base_window_rv after psm3_mq_initialize_params) + // chunk size (psm3_mq_max_window_rv(mq, 0) after + // psm3_mq_initialize_params) // for OPA native, actual window_rv may be smaller, but for UD it // is not reduced psm3_getenv("PSM3_RV_MR_CACHE_SIZE", @@ -358,12 +407,14 @@ psm3_verbs_parse_params(psm2_ep_t ep) (union psmi_envvar_val)0, &envvar_val); ep->rv_mr_cache_size = envvar_val.e_uint; // TBD - we could check cache_size >= minimum based on: - // (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) * mq->hfi_base_window_rv + // (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) + // * psm3_mq_max_window_rv(mq, 0) // and automatically increase with warning if not? #if defined(PSM_CUDA) || defined(PSM_ONEAPI) ep->rv_gpu_cache_size = psmi_parse_gpudirect_rv_gpu_cache_size(0); // TBD - we could check gpu_cache_size >= minimum based on: - // (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) * mq->hfi_base_window_rv + // (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) + // * psm3_mq_max_window_rv(mq, 1) // and automatically increase with warning if not? 
#endif @@ -464,7 +515,7 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) ep->verbs_ep.send_reap_thresh = min(ep->verbs_ep.hfi_send_reap_thresh, ep->verbs_ep.send_pool.send_total/2); _HFI_PRDBG("reaping when %u posted.\n", ep->verbs_ep.send_reap_thresh); - if (PSM2_OK != psm_verbs_alloc_recv_pool(ep, ep->verbs_ep.qp, &ep->verbs_ep.recv_pool, + if (PSM2_OK != psm_verbs_alloc_recv_pool(ep, 0, ep->verbs_ep.qp, &ep->verbs_ep.recv_pool, min(ep->verbs_ep.hfi_num_recv_wqes, ep->verbs_ep.qp_cap.max_recv_wr), // want to end up with multiple of cache line (64) // ep->mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU @@ -474,6 +525,25 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) _HFI_ERROR( "Unable to allocate UD recv buffer pool\n"); goto fail; } +#ifdef USE_RC + if (ep->verbs_ep.srq) { + if (PSM2_OK != psm_verbs_alloc_recv_pool(ep, 1, ep->verbs_ep.srq, &ep->verbs_ep.srq_recv_pool, + ep->verbs_ep.hfi_num_recv_wqes, + (proto->ep->rdmamode == IPS_PROTOEXP_FLAG_RDMA_USER)? 0 + // want to end up with multiple of cache line (64) + // ep->mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU + // be conservative (+BUFFER_HEADROOM) + : (ep->mtu + MAX_PSM_HEADER + BUFFER_HEADROOM) + )) { + _HFI_ERROR( "Unable to allocate SRQ recv buffer pool\n"); + goto fail; + } + if (PSM2_OK != psm3_ep_verbs_prepost_recv(&ep->verbs_ep.srq_recv_pool)) { + _HFI_ERROR( "Unable to prepost recv buffers on SRQ for %s port %u\n", ep->dev_name, ep->portnum); + goto fail; + } + } +#endif /* USE_RC */ // no send segmentation, max_segs will constrain ep->chunk_max_segs = 1; @@ -515,6 +585,9 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) return PSM2_OK; fail: +#ifdef USE_RC + psm_verbs_free_recv_pool(&ep->verbs_ep.srq_recv_pool); +#endif psm_verbs_free_send_pool(&ep->verbs_ep.send_pool); psm_verbs_free_recv_pool(&ep->verbs_ep.recv_pool); return PSM2_INTERNAL_ERR; @@ -756,6 +829,13 @@ void psm3_ep_free_verbs(psm2_ep_t ep) psm3_rv_close(ep->rv); ep->rv = NULL; } +#endif +#ifdef USE_RC + if (ep->verbs_ep.srq) { + ibv_destroy_srq(ep->verbs_ep.srq); + ep->verbs_ep.srq = NULL; + } + psm_verbs_free_recv_pool(&ep->verbs_ep.srq_recv_pool); #endif if (ep->verbs_ep.pd) { ibv_dealloc_pd(ep->verbs_ep.pd); @@ -796,6 +876,16 @@ psm2_error_t psm_verbs_alloc_send_pool(psm2_ep_t ep, struct ibv_pd *pd, _HFI_ERROR( "can't alloc send buffers"); goto fail; } +#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) + // By registering memory with Cuda, we make + // cuMemcpy run faster for copies from + // GPU to the send buffer. + if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt()) + PSMI_CUDA_CALL(cuMemHostRegister, + pool->send_buffers, + pool->send_total*pool->send_buffer_size, + CU_MEMHOSTALLOC_PORTABLE); +#endif #if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) // By registering memory with Level Zero, we make // zeCommandListAppendMemoryCopy run faster for copies from @@ -860,13 +950,22 @@ extern psm2_error_t psm_verbs_init_send_allocator( // which are tracked in other structures but still part of the ep's memory stats // For RC QPs receiving only RDMA Write with immediate, no buffer space is // needed. Caller will specify recv_buffer_size==0 with a recv_total. 
-psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, struct ibv_qp *qp, - psm3_verbs_recv_pool_t pool, +psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, uint32_t for_srq, + void *qp_srq, psm3_verbs_recv_pool_t pool, uint32_t recv_total, uint32_t recv_buffer_size) { memset(pool,0,sizeof(*pool)); - pool->qp = qp; // save a reference +#ifdef USE_RC + pool->for_srq = for_srq; + if (for_srq) + pool->srq = (struct ibv_srq *)qp_srq; // save a reference + else +#endif + pool->qp = (struct ibv_qp *)qp_srq; // save a reference +#ifndef USE_RC + psmi_assert(! for_srq); +#endif pool->ep = ep; pool->recv_total = recv_total; @@ -878,7 +977,11 @@ psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, struct ibv_qp *qp, // allocate recv buffers pool->recv_buffer_size = recv_buffer_size; // beginning of UD QP Recv Buf always consumed with space for IB GRH - if (qp->qp_type == IBV_QPT_UD) { + if ( +#ifdef USE_RC + ! pool->for_srq && +#endif + pool->qp->qp_type == IBV_QPT_UD) { // round up UD_ADDITION (40) to multiple of 64 for better // cache alignment of buffers pool->recv_buffer_size += ROUNDUP(UD_ADDITION, 64); @@ -892,6 +995,16 @@ psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, struct ibv_qp *qp, _HFI_ERROR( "can't alloc recv buffers"); goto fail; } +#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) + // By registering memory with Cuda, we make + // cuMemcpy run faster for copies from + // recv buffer to GPU + if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt()) + PSMI_CUDA_CALL(cuMemHostRegister, + pool->recv_buffers, + pool->recv_total*pool->recv_buffer_size, + CU_MEMHOSTALLOC_PORTABLE); +#endif #if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) // By registering memory with Level Zero, we make // zeCommandListAppendMemoryCopy run faster for copies from @@ -921,7 +1034,11 @@ psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, struct ibv_qp *qp, // UD doesn't support RDMA, so we just need local NIC to be able to // access our buffers with kernel bypass (IBV_ACCESS_LOCAL_WRITE) pool->recv_buffer_mr = ibv_reg_mr( - qp->pd, pool->recv_buffers, +#ifdef USE_RC + for_srq?pool->srq->pd: +#endif + pool->qp->pd, + pool->recv_buffers, pool->recv_total*pool->recv_buffer_size, IBV_ACCESS_LOCAL_WRITE); if (! pool->recv_buffer_mr) { @@ -932,7 +1049,7 @@ psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, struct ibv_qp *qp, } else { #ifdef USE_RC // we want a pool for RDMA Write w/immediate recv. 
No buffers - psmi_assert(qp->qp_type != IBV_QPT_UD); + psmi_assert(for_srq || pool->qp->qp_type != IBV_QPT_UD); // we use exactly 1 rbuf so wr_id can lead us to pool and qp pool->recv_bufs = (struct verbs_rbuf *)psmi_calloc(ep, NETWORK_BUFFERS, sizeof(struct verbs_rbuf), 1); @@ -989,10 +1106,37 @@ void psm_verbs_free_send_pool(psm3_verbs_send_pool_t pool) pool->send_bufs = NULL; } if (pool->send_buffers) { +#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) + if (PSMI_IS_GPU_ENABLED && cu_ctxt) { + /* ignore NOT_REGISTERED in case cuda initialized late */ + /* ignore other errors as context could be destroyed before this */ + CUresult cudaerr; + //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, + // cuMemHostUnregister, pool->send_buffers); + psmi_count_cuMemHostUnregister++; + cudaerr = psmi_cuMemHostUnregister(pool->send_buffers); + if (cudaerr) { + const char *pStr = NULL; + psmi_count_cuGetErrorString++; + psmi_cuGetErrorString(cudaerr, &pStr); + _HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n", + cudaerr, pStr?pStr:"Unknown"); + } + + } +#endif #if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - if (PSMI_IS_GPU_ENABLED) - PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, - ze_driver, pool->send_buffers); + if (PSMI_IS_GPU_ENABLED) { + ze_result_t result; + //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, + // ze_driver, pool->send_buffers); + psmi_count_zexDriverReleaseImportedPointer++; + result = psmi_zexDriverReleaseImportedPointer(ze_driver, + pool->send_buffers); + if (result != ZE_RESULT_SUCCESS) { + _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); + } + } #endif psmi_free(pool->send_buffers); pool->send_buffers = NULL; @@ -1014,10 +1158,36 @@ void psm_verbs_free_recv_pool(psm3_verbs_recv_pool_t pool) } #endif if (pool->recv_buffers) { +#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) + if (PSMI_IS_GPU_ENABLED && cu_ctxt) { + /* ignore NOT_REGISTERED in case cuda initialized late */ + /* ignore other errors as context could be destroyed before this */ + CUresult cudaerr; + //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, + // cuMemHostUnregister, pool->recv_buffers); + psmi_count_cuMemHostUnregister++; + cudaerr = psmi_cuMemHostUnregister(pool->recv_buffers); + if (cudaerr) { + const char *pStr = NULL; + psmi_count_cuGetErrorString++; + psmi_cuGetErrorString(cudaerr, &pStr); + _HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n", + cudaerr, pStr?pStr:"Unknown"); + } + } +#endif #if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - if (PSMI_IS_GPU_ENABLED) - PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, - ze_driver, pool->recv_buffers); + if (PSMI_IS_GPU_ENABLED) { + ze_result_t result; + //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, + // ze_driver, pool->recv_buffers); + psmi_count_zexDriverReleaseImportedPointer++; + result = psmi_zexDriverReleaseImportedPointer(ze_driver, + pool->recv_buffers); + if (result != ZE_RESULT_SUCCESS) { + _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); + } + } #endif psmi_free(pool->recv_buffers); pool->recv_buffers = NULL; @@ -1181,27 +1351,44 @@ psm2_error_t psm3_ep_verbs_post_recv( PSM3_FAULTINJ_STATIC_DECL(fi_rq_lkey, "rq_lkey", "post UD " #ifdef USE_RC - "or RC " + "or RC or SRQ " #endif "RQ WQE with bad lkey", 0, IPS_FAULTINJ_RQ_LKEY); - 
if_pf(PSM3_FAULTINJ_IS_FAULT(fi_rq_lkey, pool->ep, " QP %u", pool->qp->qp_num)) + // SRQ has no number but need consistency in fmt and number of args + if_pf(PSM3_FAULTINJ_IS_FAULT(fi_rq_lkey, pool->ep, +#ifdef USE_RC + "%s %u", pool->for_srq?"SRQ":"QP", pool->for_srq?0:pool->qp->qp_num)) +#else + " QP %u", pool->qp->qp_num)) +#endif wr->sg_list->lkey = 55; } else wr->sg_list->lkey = pool->recv_buffer_mr->lkey; #endif // PSM_FI if_pf (++pool->next_recv_wqe >= VERBS_RECV_QP_COALLESCE) { // we have a batch ready to post - if_pf (ibv_post_recv(pool->qp, pool->recv_wr_list, &bad_wr)) { - _HFI_ERROR("failed to post RQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); - return PSM2_INTERNAL_ERR; +#ifdef USE_RC + if (pool->for_srq) { + if_pf (ibv_post_srq_recv(pool->srq, pool->recv_wr_list, &bad_wr)) { + _HFI_ERROR("failed to post SRQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted SRQ, including buffer %u\n", index); + } else +#endif + { + if_pf (ibv_post_recv(pool->qp, pool->recv_wr_list, &bad_wr)) { + _HFI_ERROR("failed to post RQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted RQ, including buffer %u\n", index); } - //_HFI_VDBG("posted RQ, including buffer %u\n", index); pool->next_recv_wqe = 0; } else { //_HFI_VDBG("preped RQE, buffer %u\n", index); } -#else +#else /* VERBS_RECV_QP_COALLESCE > 1 */ list.addr = (uintptr_t)rbuf_to_buffer(buf); list.length = pool->recv_buffer_size; list.lkey = pool->recv_buffer_mr->lkey; @@ -1210,11 +1397,17 @@ psm2_error_t psm3_ep_verbs_post_recv( PSM3_FAULTINJ_STATIC_DECL(fi_rq_lkey, "rq_lkey", "post UD " #ifdef USE_RC - "or RC " + "or RC or SRQ " #endif "RQ WQE with bad lkey", 0, IPS_FAULTINJ_RQ_LKEY); - if_pf(PSM3_FAULTINJ_IS_FAULT(fi_rq_lkey, pool->ep, " QP %u", pool->qp->qp_num)) + // SRQ has no number but need consistency in fmt and number of args + if_pf(PSM3_FAULTINJ_IS_FAULT(fi_rq_lkey, pool->ep, +#ifdef USE_RC + "%s %u", pool->for_srq?"SRQ":"QP", pool->for_srq?0:pool->qp->qp_num)) +#else + " QP %u", pool->qp->qp_num)) +#endif list.lkey = 55; } #endif // PSM_FI @@ -1223,12 +1416,23 @@ psm2_error_t psm3_ep_verbs_post_recv( wr.sg_list = &list; wr.num_sge = 1; // size of sg_list - if_pf (ibv_post_recv(pool->qp, &wr, &bad_wr)) { - _HFI_ERROR("failed to post RQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); - return PSM2_INTERNAL_ERR; - } - //_HFI_VDBG("posted RQ, buffer %u\n", index); +#ifdef USE_RC + if (pool->for_srq) { + if_pf (ibv_post_srq_recv(pool->srq, &wr, &bad_wr)) { + _HFI_ERROR("failed to post SRQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted SRQ, buffer %u\n", index); + } else #endif + { + if_pf (ibv_post_recv(pool->qp, &wr, &bad_wr)) { + _HFI_ERROR("failed to post RQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted RQ, buffer %u\n", index); + } +#endif /* VERBS_RECV_QP_COALLESCE > 1 */ #ifdef USE_RC } else { #if VERBS_RECV_QP_COALLESCE > 1 @@ -1238,27 +1442,43 @@ psm2_error_t psm3_ep_verbs_post_recv( wr->wr_id = (uintptr_t)buf; // we'll get this back in completion if_pf (++pool->next_recv_wqe >= VERBS_RECV_QP_COALLESCE) { // we have a batch ready to post - if_pf (ibv_post_recv(pool->qp, pool->recv_wr_list, &bad_wr)) { - _HFI_ERROR("failed to post RQ on %s on
port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); - return PSM2_INTERNAL_ERR; + if (pool->for_srq) { + if_pf (ibv_post_srq_recv(pool->srq, pool->recv_wr_list, &bad_wr)) { + _HFI_ERROR("failed to post SRQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted SRQ\n"); + } else { + if_pf (ibv_post_recv(pool->qp, pool->recv_wr_list, &bad_wr)) { + _HFI_ERROR("failed to post RQ on %s on port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted RQ\n"); } - //_HFI_VDBG("posted RQ\n"); pool->next_recv_wqe = 0; } else { //_HFI_VDBG("preped RQE\n"); } -#else +#else /* VERBS_RECV_QP_COALLESCE > 1 */ wr.next = NULL; // just post 1 wr.wr_id = (uintptr_t)buf; // we'll get this back in completion wr.sg_list = NULL; wr.num_sge = 0; // size of sg_list - if_pf (ibv_post_recv(pool->qp, &wr, &bad_wr)) { - _HFI_ERROR("failed to post RQ on %s on port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); - return PSM2_INTERNAL_ERR; + if (pool->for_srq) { + if_pf (ibv_post_srq_recv(pool->srq, &wr, &bad_wr)) { + _HFI_ERROR("failed to post SRQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted SRQ\n"); + } else { + if_pf (ibv_post_recv(pool->qp, &wr, &bad_wr)) { + _HFI_ERROR("failed to post RQ on %s on port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted RQ\n"); } - //_HFI_VDBG("posted RQ\n"); -#endif +#endif /* VERBS_RECV_QP_COALLESCE > 1 */ #endif // USE_RC } return PSM2_OK; @@ -2333,12 +2553,15 @@ static struct ibv_qp* ud_qp_create(psm2_ep_t ep) attr.qp_type = IBV_QPT_UD; qp = ibv_create_qp(ep->verbs_ep.pd, &attr); - if (qp == NULL && errno == ENOMEM) { + if (qp == NULL) { _HFI_ERROR( "Unable to create UD QP on %s: %s\n", ep->dev_name, strerror(errno)); - _HFI_ERROR( "Requested QP size might be too big. Try reducing TX depth and/or inline size.\n"); - _HFI_ERROR( "Requested TX depth was %u and RX depth was %u .\n", + if (errno == ENOMEM) { + _HFI_ERROR( "Requested QP size might be too big. 
Try reducing TX depth and/or inline size.\n"); + _HFI_ERROR( "Requested TX depth was %u and RX depth was %u .\n", ep->verbs_ep.hfi_num_send_wqes+1, ep->verbs_ep.hfi_num_recv_wqes); + } + return NULL; } // attr reports what we got, double check and react in case @@ -2437,7 +2660,7 @@ struct ibv_qp* rc_qp_create(psm2_ep_t ep, void *context, struct ibv_qp_cap *cap) attr.qp_context = context; attr.send_cq = ep->verbs_ep.send_cq; attr.recv_cq = ep->verbs_ep.recv_cq; - attr.srq = NULL; + attr.srq = ep->verbs_ep.srq; // one extra WQE to be safe in case verbs needs a spare WQE if ((ep->rdmamode&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) { // need to be prepared in case all sends posted to same RC QP, so @@ -2445,10 +2668,9 @@ struct ibv_qp* rc_qp_create(psm2_ep_t ep, void *context, struct ibv_qp_cap *cap) attr.cap.max_send_wr = ep->verbs_ep.hfi_num_send_wqes+ep->hfi_num_send_rdma+1; attr.cap.max_send_sge = 2; // inline data helps latency and message rate for small sends - // Later we may explore use of - // send SGEs pointing to application buffers, somewhat like WFR send DMA attr.cap.max_inline_data = ep->hfi_imm_size; - attr.cap.max_recv_wr = ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION;// TBD + attr.cap.max_recv_wr = ep->verbs_ep.srq?0 + :(ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION);// TBD attr.cap.max_recv_sge = 1; } else { // only RDMA Write w/immediate @@ -2456,7 +2678,7 @@ struct ibv_qp* rc_qp_create(psm2_ep_t ep, void *context, struct ibv_qp_cap *cap) attr.cap.max_send_sge = 1; attr.cap.max_inline_data = 0; // incoming Write w/immediate consumes a RQ WQE but no buffer needed - attr.cap.max_recv_wr = HFI_TF_NFLOWS+1; + attr.cap.max_recv_wr = ep->verbs_ep.srq?0:(HFI_TF_NFLOWS+1); attr.cap.max_recv_sge = 0; } @@ -2467,9 +2689,16 @@ struct ibv_qp* rc_qp_create(psm2_ep_t ep, void *context, struct ibv_qp_cap *cap) _HFI_ERROR( "Unable to create RC QP on %s: %s\n", ep->dev_name, strerror(errno)); _HFI_ERROR( "Requested QP size might be too big. Try reducing TX depth and/or inline size.\n"); - _HFI_ERROR( "Requested TX depth was %u and RX depth was %u .\n", - ep->verbs_ep.hfi_num_send_wqes+1, - ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION); + if ((ep->rdmamode&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) { + _HFI_ERROR( "Requested TX depth was %u and RX depth was %u .\n", + ep->verbs_ep.hfi_num_send_wqes+ep->hfi_num_send_rdma+1, + ep->verbs_ep.srq?0 + :(ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION)); + } else { + _HFI_ERROR( "Requested TX depth was %u and RX depth was %u .\n", + ep->hfi_num_send_rdma+1, + ep->verbs_ep.srq?0:(HFI_TF_NFLOWS+1)); + } return NULL; } @@ -2492,7 +2721,8 @@ struct ibv_qp* rc_qp_create(psm2_ep_t ep, void *context, struct ibv_qp_cap *cap) _HFI_PRDBG( "Limited to %d SQ SGEs\n", attr.cap.max_send_sge); } - if (ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION > attr.cap.max_recv_wr) { + if (! ep->verbs_ep.srq + && ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION > attr.cap.max_recv_wr) { _HFI_PRDBG( "Limited to %d RQ WQEs, requested %u\n", attr.cap.max_recv_wr, ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION); } else { @@ -2514,7 +2744,8 @@ struct ibv_qp* rc_qp_create(psm2_ep_t ep, void *context, struct ibv_qp_cap *cap) _HFI_PRDBG( "Limited to %d SQ SGEs\n", attr.cap.max_send_sge); } - if (HFI_TF_NFLOWS+1 > attr.cap.max_recv_wr) { + if (! 
ep->verbs_ep.srq + && HFI_TF_NFLOWS+1 > attr.cap.max_recv_wr) { _HFI_PRDBG( "Limited to %d RQ WQEs, requested %u\n", attr.cap.max_recv_wr, HFI_TF_NFLOWS+1); } else { @@ -2848,7 +3079,7 @@ psm3_dump_verbs_qp(struct ibv_qp *qp) printf("QP %p (%u), type %u state %u PkeyIndx %u Port %u draining %u\n", qp, qp->qp_num, qp->qp_type, attr.qp_state, attr.pkey_index, attr.port_num, attr.sq_draining); - printf(" send: wr %u sge %u inline %u recv: wr %u sqe %u\n", + printf(" send: wr %u sge %u inline %u recv: wr %u sge %u\n", attr.cap.max_send_wr, attr.cap.max_send_sge, attr.cap.max_inline_data, attr.cap.max_recv_wr, attr.cap.max_recv_sge); printf(" context %p send_cq %p recv_cq %p srq %p sg_sig_all %u\n", @@ -2906,6 +3137,7 @@ static enum psm3_ibv_rate verbs_get_rate(uint8_t width, uint8_t speed) case 16: return PSM3_IBV_RATE_14_GBPS; case 32: return PSM3_IBV_RATE_25_GBPS; case 64: return PSM3_IBV_RATE_50_GBPS; + case 128: return PSM3_IBV_RATE_100_GBPS; default: _HFI_ERROR( "unknown link speed 0x%x\n", speed); return PSM3_IBV_RATE_100_GBPS; @@ -2919,6 +3151,7 @@ static enum psm3_ibv_rate verbs_get_rate(uint8_t width, uint8_t speed) case 16: return PSM3_IBV_RATE_56_GBPS; case 32: return PSM3_IBV_RATE_100_GBPS; case 64: return PSM3_IBV_RATE_200_GBPS; + case 128: return PSM3_IBV_RATE_400_GBPS; default: _HFI_ERROR( "unknown link speed 0x%x\n", speed); return PSM3_IBV_RATE_100_GBPS; @@ -2932,6 +3165,7 @@ static enum psm3_ibv_rate verbs_get_rate(uint8_t width, uint8_t speed) case 16: return PSM3_IBV_RATE_112_GBPS; case 32: return PSM3_IBV_RATE_200_GBPS; case 64: return PSM3_IBV_RATE_400_GBPS; + case 128: return PSM3_IBV_RATE_800_GBPS; default: _HFI_ERROR( "unknown link speed 0x%x\n", speed); return PSM3_IBV_RATE_100_GBPS; @@ -2945,6 +3179,7 @@ static enum psm3_ibv_rate verbs_get_rate(uint8_t width, uint8_t speed) case 16: return PSM3_IBV_RATE_168_GBPS; case 32: return PSM3_IBV_RATE_300_GBPS; case 64: return PSM3_IBV_RATE_600_GBPS; + case 128: return PSM3_IBV_RATE_1200_GBPS; default: _HFI_ERROR( "unknown link speed 0x%x\n", speed); return PSM3_IBV_RATE_100_GBPS; @@ -2958,6 +3193,7 @@ static enum psm3_ibv_rate verbs_get_rate(uint8_t width, uint8_t speed) case 16: return PSM3_IBV_RATE_28_GBPS; case 32: return PSM3_IBV_RATE_50_GBPS; case 64: return PSM3_IBV_RATE_100_GBPS; + case 128: return PSM3_IBV_RATE_200_GBPS; default: _HFI_ERROR( "unknown link speed 0x%x\n", speed); return PSM3_IBV_RATE_100_GBPS; diff --git a/prov/psm3/psm3/hal_verbs/verbs_ep.h b/prov/psm3/psm3/hal_verbs/verbs_ep.h index 8874831f3b5..c1da6b73e53 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_ep.h +++ b/prov/psm3/psm3/hal_verbs/verbs_ep.h @@ -161,12 +161,14 @@ struct verbs_rbuf { typedef struct verbs_rbuf *rbuf_t; #define rbuf_to_buffer(buf) ((buf)->buffer) #define rbuf_addition(buf) ((buf)->pool->addition) -#define rbuf_qp(ep, buf) ((buf)->pool->qp) +#define rbuf_qp_context(ep, buf) ((buf)->pool->for_srq?NULL:(buf)->pool->qp->qp_context) +#define rbuf_qp_type_str(ep, buf) ((buf)->pool->for_srq?"SRQ":qp_type_str((buf)->pool->qp)) #else typedef uint8_t *rbuf_t; #define rbuf_to_buffer(buf) (buf) #define rbuf_addition(buf) (UD_ADDITION) -#define rbuf_qp(ep, buf) ((ep)->verbs_ep.recv_pool.qp) +#define rbuf_qp_context(ep, buf) ((ep)->verbs_ep.recv_pool.qp->qp_context) +#define rbuf_qp_type_str(ep, buf) (qp_type_str((ep)->verbs_ep.recv_pool.qp)) #endif static inline const char*qp_type_str(struct ibv_qp *qp) { @@ -255,7 +257,12 @@ typedef struct psm3_verbs_send_allocator *psm3_verbs_send_allocator_t; // but sizes may differ // when USE_RC, we need a 
separate recv pool per QP so we can prepost bufs. struct psm3_verbs_recv_pool { - struct ibv_qp *qp; // secondary reference to QP these buffers are for + union { // secondary reference to QP or SRQ these buffers are for + struct ibv_qp *qp; // when ! for_srq +#ifdef USE_RC + struct ibv_srq *srq; // when for_srq +#endif + }; psm2_ep_t ep; // our preregistered recv buffers uint32_t recv_buffer_size; @@ -264,6 +271,7 @@ struct psm3_verbs_recv_pool { struct ibv_mr *recv_buffer_mr; #ifdef USE_RC uint32_t addition; // UD_ADDITION for UD QP, 0 for RC QP + uint32_t for_srq; // is this for an SRQ or a QP? #endif #if VERBS_RECV_QP_COALLESCE > 1 // list of ready to post WQEs and SGEs @@ -296,6 +304,9 @@ struct psm3_verbs_ep { struct ibv_cq *recv_cq; struct ibv_qp *qp; struct ibv_qp_cap qp_cap; // capabilities of QP we got +#ifdef USE_RC + struct ibv_srq *srq; +#endif uint32_t qkey; //uint8_t link_layer; // IBV_LINK_LAYER_ETHERNET or other uint8_t active_rate; @@ -309,6 +320,9 @@ struct psm3_verbs_ep { int recv_wc_count; // number left in recv_wc_list int recv_wc_next; // next index #else +#ifdef USE_RC + struct psm3_verbs_recv_pool srq_recv_pool; +#endif // if asked to revisit a packet we save it here rbuf_t revisit_buf; uint32_t revisit_payload_size; @@ -385,8 +399,8 @@ extern psm2_error_t psm_verbs_alloc_send_pool(psm2_ep_t ep, struct ibv_pd *pd, extern psm2_error_t psm_verbs_init_send_allocator( psm3_verbs_send_allocator_t allocator, psm3_verbs_send_pool_t pool); -extern psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, struct ibv_qp *qp, - psm3_verbs_recv_pool_t pool, +extern psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, uint32_t for_srq, + void *qp_srq, psm3_verbs_recv_pool_t pool, uint32_t recv_total, uint32_t recv_buffer_size); extern void psm_verbs_free_send_pool(psm3_verbs_send_pool_t pool); extern void psm_verbs_free_recv_pool(psm3_verbs_recv_pool_t pool); diff --git a/prov/psm3/psm3/hal_verbs/verbs_hal.c b/prov/psm3/psm3/hal_verbs/verbs_hal.c index 4f6bfb742ef..9575b316ff2 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_hal.c +++ b/prov/psm3/psm3/hal_verbs/verbs_hal.c @@ -166,21 +166,17 @@ static void psm3_hfp_verbs_mq_init_defaults(struct psm2_mq *mq) * Otherwise these defaults are used. */ unsigned rdmamode = psm3_verbs_parse_rdmamode(1); - mq->hfi_thresh_rv = 64000; - mq->hfi_base_window_rv = 131072; + mq->hfi_thresh_rv = PSM_MQ_NIC_RNDV_THRESH; + mq->ips_cpu_window_rv_str = PSM_CPU_NIC_RNDV_WINDOW_STR; if (! (rdmamode & IPS_PROTOEXP_FLAG_ENABLED)) { // TBD - when RDMA is disabled do we want to disable rendezvous?
// even without RDMA, the receiver controlled pacing helps scalability mq->hfi_thresh_rv = (~(uint32_t)0); // disable rendezvous } mq->hfi_thresh_tiny = PSM_MQ_NIC_MAX_TINY; -#ifdef PSM_CUDA - if (PSMI_IS_GPU_ENABLED) - mq->hfi_base_window_rv = 2097152; -#endif -#ifdef PSM_ONEAPI +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) if (PSMI_IS_GPU_ENABLED) - mq->hfi_base_window_rv = 512*1024; + mq->ips_gpu_window_rv_str = PSM_GPU_NIC_RNDV_WINDOW_STR; #endif // we parse mr_cache_mode and rv_gpu_cache_size here so we can cache it // once per EP open, even if multi-rail or multi-QP diff --git a/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h b/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h index 4f2df710571..2ba92503e9f 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h +++ b/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h @@ -287,29 +287,33 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_ips_ipsaddr_set_req_params( //ipsaddr->verbs.rc_qp = NULL; } else { // we got a REQ or a REP, we can move to RTR - // if we are only doing RDMA, we don't need any buffers, but we need a - // pool object for RQ coallesce, so we create a pool with 0 size buffers - if (PSM2_OK != psm_verbs_alloc_recv_pool(proto->ep, ipsaddr->verbs.rc_qp, &ipsaddr->verbs.recv_pool, - min(proto->ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION, ipsaddr->verbs.rc_qp_max_recv_wr), - (proto->ep->rdmamode == IPS_PROTOEXP_FLAG_RDMA_USER)? 0 - // want to end up with multiple of cache line (64) - // pr_mtu is negotiated max PSM payload, not including hdrs - // pr_mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU - // be conservative (+BUFFER_HEADROOM) - : ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY]->pr_mtu - + MAX_PSM_HEADER + BUFFER_HEADROOM - )) { - _HFI_ERROR("failed to alloc RC recv buffers\n"); - return PSM2_INTERNAL_ERR; + if (! proto->ep->verbs_ep.srq) { + // if we are only doing RDMA, we don't need any buffers, but we need a + // pool object for RQ coallesce, so we create a pool with 0 size buffers + if (PSM2_OK != psm_verbs_alloc_recv_pool(proto->ep, 0, ipsaddr->verbs.rc_qp, &ipsaddr->verbs.recv_pool, + min(proto->ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION, ipsaddr->verbs.rc_qp_max_recv_wr), + (proto->ep->rdmamode == IPS_PROTOEXP_FLAG_RDMA_USER)? 0 + // want to end up with multiple of cache line (64) + // pr_mtu is negotiated max PSM payload, not including hdrs + // pr_mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU + // be conservative (+BUFFER_HEADROOM) + : ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY]->pr_mtu + + MAX_PSM_HEADER + BUFFER_HEADROOM + )) { + _HFI_ERROR("failed to alloc RC recv buffers\n"); + return PSM2_INTERNAL_ERR; + } } if (modify_rc_qp_to_init(proto->ep, ipsaddr->verbs.rc_qp)) { _HFI_ERROR("qp_to_init failed\n"); return PSM2_INTERNAL_ERR; } - if (PSM2_OK != psm3_ep_verbs_prepost_recv(&ipsaddr->verbs.recv_pool)) { - _HFI_ERROR("prepost failed\n"); - return PSM2_INTERNAL_ERR; + if (! 
proto->ep->verbs_ep.srq) { + if (PSM2_OK != psm3_ep_verbs_prepost_recv(&ipsaddr->verbs.recv_pool)) { + _HFI_ERROR("prepost failed\n"); + return PSM2_INTERNAL_ERR; + } } // RC QP MTU will be set to min of req->verbs.qp_attr and pr_mtu // TBD - we already factored in req vs pr to update pr no need diff --git a/prov/psm3/psm3/hal_verbs/verbs_recvhdrq.c b/prov/psm3/psm3/hal_verbs/verbs_recvhdrq.c index eebcac2e5da..f38aa505fc8 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_recvhdrq.c +++ b/prov/psm3/psm3/hal_verbs/verbs_recvhdrq.c @@ -278,7 +278,7 @@ psm2_error_t psm3_verbs_recvhdrq_progress(struct ips_recvhdrq *recvq) // wc.byte_len is len of inbound rdma write not including immed // wc.qp_num - local QP ips_protoexp_handle_immed_data(rcv_ev.proto, - (uint64_t)(rbuf_qp(ep, buf)->qp_context), + (uint64_t)(rbuf_qp_context(ep, buf)), RDMA_IMMED_USER_RC, WC(imm_data), WC(byte_len)); goto repost; break; @@ -310,7 +310,7 @@ psm2_error_t psm3_verbs_recvhdrq_progress(struct ips_recvhdrq *recvq) } rcv_ev.p_hdr = (struct ips_message_header *)(rbuf_to_buffer(buf)+rbuf_addition(buf)); rcv_ev.payload = (rbuf_to_buffer(buf) + rbuf_addition(buf) + sizeof(struct ips_message_header)); - _HFI_VDBG("%s receive - opcode %x\n", qp_type_str(rbuf_qp(ep, buf)), + _HFI_VDBG("%s receive - opcode %x\n", rbuf_qp_type_str(ep, buf), _get_proto_hfi_opcode(rcv_ev.p_hdr)); PSM2_LOG_PKT_STRM(PSM2_LOG_RX,rcv_ev.p_hdr,"PKT_STRM:"); diff --git a/prov/psm3/psm3/include/utils_debug.h b/prov/psm3/psm3/include/utils_debug.h index 499f1a41699..b7b6655f2e6 100644 --- a/prov/psm3/psm3/include/utils_debug.h +++ b/prov/psm3/psm3/include/utils_debug.h @@ -202,6 +202,14 @@ extern void psm3_dump_gpu_buf(uint8_t *buf, uint32_t len); } \ } while (0) +#define _HFI_ENV_ERROR(fmt, ...) \ + do { \ + _Pragma_unlikely \ + if (unlikely(psm3_dbgmask&__HFI_INFO)) { \ + printf("%s: env " fmt, psm3_mylabel, ##__VA_ARGS__); \ + } \ + } while (0) + #define __HFI_PKTDBG_ON unlikely(psm3_dbgmask & __HFI_PKTDBG) #define __HFI_DBG_WHICH(which, fmt, ...) \ @@ -218,8 +226,7 @@ extern void psm3_dump_gpu_buf(uint8_t *buf, uint32_t len); do { \ _Pragma_unlikely \ if (unlikely(psm3_dbgmask&(which))) { \ - PSM3_GETTIME \ - fprintf(psm3_dbgout, PSM3_TIME_FMT "%s: " fmt, PSM3_TIME_ARG, psm3_mylabel, \ + fprintf(psm3_dbgout, "%s: " fmt, psm3_mylabel, \ ##__VA_ARGS__); \ } \ } while (0) @@ -291,6 +298,8 @@ extern void psm3_dump_gpu_buf(uint8_t *buf, uint32_t len); #define _HFI_INFO(fmt, ...) +#define _HFI_ENV_ERROR(fmt, ...) + #define __HFI_PKTDBG_ON 0 #define _HFI_DBG(fmt, ...) diff --git a/prov/psm3/psm3/include/utils_env.h b/prov/psm3/psm3/include/utils_env.h index 5e18975a36b..d95660f6e01 100644 --- a/prov/psm3/psm3/include/utils_env.h +++ b/prov/psm3/psm3/include/utils_env.h @@ -55,6 +55,7 @@ #define UTILS_ENV_H #include "psm2_mock_testing.h" +#include "fnmatch.h" /* we can only include low level headers here because this is * #included by utils_sysfs.c. 
Can't pull in HAL headers or heap debug macros @@ -81,21 +82,37 @@ union psmi_envvar_val { unsigned long long e_ulonglong; }; -#define PSMI_ENVVAR_LEVEL_USER 1 -#define PSMI_ENVVAR_LEVEL_HIDDEN 2 -#define PSMI_ENVVAR_LEVEL_NEVER_PRINT 4 - -#define PSMI_ENVVAR_TYPE_YESNO 0 -#define PSMI_ENVVAR_TYPE_STR 1 -#define PSMI_ENVVAR_TYPE_INT 2 -#define PSMI_ENVVAR_TYPE_UINT 3 -#define PSMI_ENVVAR_TYPE_UINT_FLAGS 4 -#define PSMI_ENVVAR_TYPE_LONG 5 -#define PSMI_ENVVAR_TYPE_ULONG 6 -#define PSMI_ENVVAR_TYPE_ULONG_FLAGS 7 -#define PSMI_ENVVAR_TYPE_ULONG_ULONG 8 -#define PSMI_ENVVAR_TYPE_STR_VAL_PAT 9 -#define PSMI_ENVVAR_TYPE_STR_TUPLES 10 +// psm3_getenv only expects LEVEL +// psm3_getenv_range accepts LEVEL and FLAGs +// MIN/MAX N/A to TYPEs: YESNO, STR, STR_VAL_PAT_*, STR_TUPLES +// 'min' and 'max' only allowed as input when the corresponding +// range check is enabled +// FLAG_FATAL will cause a fatal error on invalid input +// (syntax error, out-of-range value, or check function failure). When +// FLAG_FATAL is not set, invalid input falls back to the default with a message. +#define PSMI_ENVVAR_LEVEL_USER 1 // show in user help +#define PSMI_ENVVAR_LEVEL_HIDDEN 2 // hidden from user help +#define PSMI_ENVVAR_LEVEL_NEVER_PRINT 4 // a bit flag, never show in help +#define PSMI_ENVVAR_LEVEL_MASK 0x07 // mask for getting level +#define PSMI_ENVVAR_FLAG_NOMIN 0x10 // no min check +#define PSMI_ENVVAR_FLAG_NOMAX 0x20 // no max check +#define PSMI_ENVVAR_FLAG_NOABBREV 0x40 // no 'min' or 'max' as input +#define PSMI_ENVVAR_FLAG_NOMIN_NOMAX 0x70 // no min, no max, no abbrev +#define PSMI_ENVVAR_FLAG_FATAL 0x80 // invalid input is fatal + +#define PSMI_ENVVAR_TYPE_YESNO 0 +#define PSMI_ENVVAR_TYPE_STR 1 +#define PSMI_ENVVAR_TYPE_INT 2 +#define PSMI_ENVVAR_TYPE_UINT 3 +#define PSMI_ENVVAR_TYPE_UINT_FLAGS 4 +#define PSMI_ENVVAR_TYPE_LONG 5 +#define PSMI_ENVVAR_TYPE_ULONG 6 +#define PSMI_ENVVAR_TYPE_ULONG_FLAGS 7 +#define PSMI_ENVVAR_TYPE_ULONG_ULONG 8 +#define PSMI_ENVVAR_TYPE_STR_VAL_PAT_INT 9 +#define PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT 10 +#define PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT_FLAGS 11 +#define PSMI_ENVVAR_TYPE_STR_TUPLES 12 #define PSMI_ENVVAR_VAL_YES ((union psmi_envvar_val) 1) #define PSMI_ENVVAR_VAL_NO ((union psmi_envvar_val) 0) @@ -105,43 +122,82 @@ void psm3_env_print_val(FILE *f, const char *name, int type, int psm3_env_snprint_val(char *buf, size_t size, const char *name, int type, union psmi_envvar_val val); +// psm3_getenv_check_t is optional in psm3_getenv_range +// to confirm the resulting value is valid (return of 0). +// On error (return != 0), errstr[errstr_size] is filled in with a +// '\0' terminated string with more information about the error. +// +// This may be used for any envvar type to do further checks of the value +// such as integers which may need to be a power of 2, or parse checking +// of strings. +// For strings the parsed value(s) are not returned, so the caller will need +// to parse again, but this allows better error reporting during env variable get. +// +// ptr is caller-specific and can pass additional input information which may +// assist in verification of values. ptr should be used as input only +// because the check function is only called by psm3_getenv_range when +// otherwise valid input is supplied. 
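To make this contract concrete, here is a minimal sketch (hypothetical, not part of this patch) of a check function that accepts only power-of-two values; it could be passed as the `check` argument of `psm3_getenv_range`, with `ptr` unused:

```c
/* Hypothetical psm3_getenv_check_t: accept only power-of-two values.
 * Assumes a PSMI_ENVVAR_TYPE_UINT variable whose parsed value arrives
 * in val.e_uint; in a real source file this would follow
 * #include "utils_env.h" (snprintf comes from <stdio.h>). */
#include <stdio.h>

static int check_pow2(int type, const union psmi_envvar_val val,
		      void *ptr, size_t errstr_size, char errstr[])
{
	(void)ptr;	/* no extra caller context needed for this check */
	if (type != PSMI_ENVVAR_TYPE_UINT
	    || val.e_uint == 0 || (val.e_uint & (val.e_uint - 1))) {
		if (errstr_size)
			snprintf(errstr, errstr_size, " must be a power of 2");
		return -2;	/* nonzero: reject the value */
	}
	return 0;		/* 0: value is valid */
}
```

On a nonzero return, psm3_getenv_range falls back to the default with a message, or treats the input as fatal when PSMI_ENVVAR_FLAG_FATAL is set.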
+typedef int (*psm3_getenv_check_t)(int type, const union psmi_envvar_val val, + void *ptr, size_t errstr_size, char errstr[]); + int MOCKABLE(psm3_getenv)(const char *name, const char *descr, int level, int type, union psmi_envvar_val defval, union psmi_envvar_val *newval); MOCK_DCL_EPILOGUE(psm3_getenv); -/* - * Parsing int and unsigned int parameters - * 0 -> ok, *val updated - * -1 -> empty string - * -2 -> parse error - */ -int psm3_parse_str_int(const char *string, int *val); -int psm3_parse_str_uint(const char *string, unsigned int *val); +int MOCKABLE(psm3_getenv_range)(const char *name, const char *descr, + const char *help, unsigned level_flags, + int type, union psmi_envvar_val defval, union psmi_envvar_val min, + union psmi_envvar_val max, psm3_getenv_check_t check, void *ptr, + union psmi_envvar_val *newval); +MOCK_DCL_EPILOGUE(psm3_getenv_range); /* - * Parse long parameters - * -1 -> empty string - * -2 -> parse error + * Parsing int, unsigned int and long parameters + * 0 -> ok, *val updated + * -1 -> empty string, *val not updated + * -2 -> parse error, *val not updated */ -long psm3_parse_str_long(const char *str); +int psm3_parse_str_int(const char *string, int *val, int min, int max); +int psm3_parse_str_uint(const char *string, unsigned int *val, + unsigned int min, unsigned int max); +int psm3_parse_str_long(const char *str, long *val, long min, long max); /* * Parsing yesno parameters * allows: yes/no, true/false, on/off, 1/0 - * -1 -> empty string - * -2 -> parse error + * 0 -> ok, *val updated + * -1 -> empty string, *val not updated + * -2 -> parse error, *val not updated */ -int psm3_parse_str_yesno(const char *str); +int psm3_parse_str_yesno(const char *str, int *val); /* * Parsing int parameters set in string tuples. + * Returns: + * 0 - parsed with no errors, vals[] updated + * -1 - empty or NULL string, vals[] unchanged + * -2 - syntax error in one or more of the parameters + * parameters with syntax errors are unchanged, others without + * syntax errors are updated in vals[] */ int psm3_parse_str_tuples(const char *str, int ntup, int *vals); -/* parse env of the form 'val' or 'val:' or 'val:pattern' */ -int psm3_parse_val_pattern(const char *env, unsigned def, unsigned *val); +/* parse env of the form 'val' or 'val:' or 'val:pattern' + * Returns: + * 0 - parsed and matches current process, *val set to parsed val + * 0 - parsed and doesn't match current process, *val set to def + * -1 - nothing provided, *val set to def + * -2 - syntax error, *val set to def + * flags PSMI_ENVVAR_FLAG_NOMIN, PSMI_ENVVAR_FLAG_NOMAX and + * PSMI_ENVVAR_FLAG_NOABBREV control whether 'min', 'minimum', 'max' or 'maximum' + * are allowed as input and indicate if min and/or max supplied. 
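A short usage sketch of the reworked parser (hypothetical call site; the variable name and bounds are illustrative, and out-of-range input is assumed to be reported as a parse error):

```c
/* Keep the default when parsing fails: *val is only written on rc == 0. */
unsigned int num_wqes = 4080;
int rc = psm3_parse_str_uint(psm3_env_get("PSM3_NUM_RECV_WQES"),
			     &num_wqes, 1, 32768);	/* range check [1, 32768] */
if (rc == -2)
	_HFI_INFO("invalid PSM3_NUM_RECV_WQES, using default %u\n", num_wqes);
```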
+ */ +int psm3_parse_val_pattern_int(const char *env, int def, int *val, + unsigned flags, int min, int max); +int psm3_parse_val_pattern_uint(const char *env, unsigned def, unsigned *val, + unsigned flags, unsigned min, unsigned max); #if defined(PSM_VERBS) || defined(PSM_SOCKETS) // return forced speed in mbps or 0 if not forced diff --git a/prov/psm3/psm3/psm.c b/prov/psm3/psm3/psm.c index 40826d38c1c..df138dd8a2f 100644 --- a/prov/psm3/psm3/psm.c +++ b/prov/psm3/psm3/psm.c @@ -97,6 +97,7 @@ sem_t *psm3_sem_affinity_shm_rw = NULL; int psm3_affinity_shared_file_opened = 0; char *psm3_affinity_shm_name; uint64_t *psm3_shared_affinity_ptr; +uint64_t *psm3_shared_affinity_nic_refcount_ptr; uint32_t psm3_cpu_model; @@ -164,6 +165,8 @@ CUresult (*psmi_cuEventRecord)(CUevent hEvent, CUstream hStream); CUresult (*psmi_cuEventSynchronize)(CUevent hEvent); CUresult (*psmi_cuMemHostAlloc)(void** pp, size_t bytesize, unsigned int Flags); CUresult (*psmi_cuMemFreeHost)(void* p); +CUresult (*psmi_cuMemHostRegister)(void* p, size_t bytesize, unsigned int Flags); +CUresult (*psmi_cuMemHostUnregister)(void* p); CUresult (*psmi_cuMemcpy)(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); CUresult (*psmi_cuMemcpyDtoD)(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); CUresult (*psmi_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); @@ -202,6 +205,8 @@ uint64_t psmi_count_cuEventRecord; uint64_t psmi_count_cuEventSynchronize; uint64_t psmi_count_cuMemHostAlloc; uint64_t psmi_count_cuMemFreeHost; +uint64_t psmi_count_cuMemHostRegister; +uint64_t psmi_count_cuMemHostUnregister; uint64_t psmi_count_cuMemcpy; uint64_t psmi_count_cuMemcpyDtoD; uint64_t psmi_count_cuMemcpyDtoH; @@ -225,7 +230,7 @@ int psmi_cuda_lib_load() char *dlerr; PSM2_LOG_MSG("entering"); - _HFI_VDBG("Loading CUDA library.\n"); + _HFI_DBG("Loading CUDA library.\n"); psmi_cuda_lib = dlopen("libcuda.so.1", RTLD_LAZY); if (!psmi_cuda_lib) { @@ -270,6 +275,8 @@ int psmi_cuda_lib_load() PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventSynchronize); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemHostAlloc); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemFreeHost); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemHostRegister); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemHostUnregister); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpy); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyDtoD); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyDtoH); @@ -333,6 +340,8 @@ static void psmi_cuda_stats_register() PSMI_CUDA_COUNT_DECLU64(cuEventSynchronize), PSMI_CUDA_COUNT_DECLU64(cuMemHostAlloc), PSMI_CUDA_COUNT_DECLU64(cuMemFreeHost), + PSMI_CUDA_COUNT_DECLU64(cuMemHostRegister), + PSMI_CUDA_COUNT_DECLU64(cuMemHostUnregister), PSMI_CUDA_COUNT_DECLU64(cuMemcpy), PSMI_CUDA_COUNT_DECLU64(cuMemcpyDtoD), PSMI_CUDA_COUNT_DECLU64(cuMemcpyDtoH), @@ -366,6 +375,7 @@ static void psmi_cuda_stats_register() ze_result_t (*psmi_zeInit)(ze_init_flags_t flags); ze_result_t (*psmi_zeDriverGet)(uint32_t *pCount, ze_driver_handle_t *phDrivers); ze_result_t (*psmi_zeDeviceGet)(ze_driver_handle_t hDriver, uint32_t *pCount, ze_device_handle_t *phDevices); +ze_result_t (*psmi_zeDevicePciGetPropertiesExt)(ze_device_handle_t hDevice, ze_pci_ext_properties_t *pPciProperties); #ifndef PSM3_NO_ONEAPI_IMPORT ze_result_t (*psmi_zeDriverGetExtensionFunctionAddress)(ze_driver_handle_t hDriver, const char *name, void **ppFunctionAddress); ze_result_t (*psmi_zexDriverImportExternalPointer)(ze_driver_handle_t hDriver, void *ptr, size_t size); @@ -411,6 +421,7 @@ ze_result_t (*psmi_zelLoaderGetVersions)(size_t *num_elems, 
zel_component_versio uint64_t psmi_count_zeInit; uint64_t psmi_count_zeDriverGet; uint64_t psmi_count_zeDeviceGet; +uint64_t psmi_count_zeDevicePciGetPropertiesExt; #ifndef PSM3_NO_ONEAPI_IMPORT uint64_t psmi_count_zeDriverGetExtensionFunctionAddress; uint64_t psmi_count_zexDriverImportExternalPointer; @@ -473,6 +484,7 @@ int psmi_oneapi_ze_load() PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeInit); PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDriverGet); PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDeviceGet); + PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDevicePciGetPropertiesExt); #ifndef PSM3_NO_ONEAPI_IMPORT PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDriverGetExtensionFunctionAddress); #endif @@ -535,6 +547,7 @@ static void psmi_oneapi_ze_stats_register() PSMI_ONEAPI_ZE_COUNT_DECLU64(zeInit), PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDriverGet), PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDeviceGet), + PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDevicePciGetPropertiesExt), #ifndef PSM3_NO_ONEAPI_IMPORT PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDriverGetExtensionFunctionAddress), PSMI_ONEAPI_ZE_COUNT_DECLU64(zexDriverImportExternalPointer), @@ -637,11 +650,13 @@ static void psmi_gpu_init(void) is_gdr_copy_enabled = env_enable_gdr_copy.e_int; union psmi_envvar_val env_gpu_thresh_rndv; - ret = psm3_getenv("PSM3_GPU_THRESH_RNDV", + ret = psm3_getenv_range("PSM3_GPU_THRESH_RNDV", "RNDV protocol is used for GPU send message sizes greater than the threshold", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)gpu_thresh_rndv, &env_gpu_thresh_rndv); - if (ret) + NULL, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)gpu_thresh_rndv, + (union psmi_envvar_val)0, (union psmi_envvar_val)UINT32_MAX, + NULL, NULL, &env_gpu_thresh_rndv); + if (ret > 0) /* * For backward compatibility, check if the old variable name is set. * Priority order: New name > old name > default value. @@ -693,7 +708,7 @@ int psmi_cuda_initialize() psm2_error_t err = PSM2_OK; PSM2_LOG_MSG("entering"); - _HFI_VDBG("Enabling CUDA support.\n"); + _HFI_DBG("Enabling CUDA support.\n"); psmi_cuda_stats_register(); @@ -727,6 +742,7 @@ static void psmi_oneapi_find_copy_only_engine(ze_device_handle_t dev, uint32_t count = 0; ze_command_queue_group_properties_t *props = NULL; int i; + int done = 0; /* Set the default */ ctxt->ordinal = 0; @@ -742,15 +758,27 @@ static void psmi_oneapi_find_copy_only_engine(ze_device_handle_t dev, PSMI_ONEAPI_ZE_CALL(zeDeviceGetCommandQueueGroupProperties, dev, &count, props); - /* Select the first copy-only engine group if possible */ + // pick the last command queue group which supports copy but not compute. + // For PVC this will be the xeLink copy engine which will also + // have numQueues >1 (TBD - perhaps only select if it has numQueues>1). + // This ordinal is then supplied to create Command Queues and Command Lists. for (i = count - 1; i >= 0; i--) { - if ((props[i].flags & + _HFI_DBG("GPU Queue Group %d: copy=%d Compute=%d num_queues=%d\n", i, + (props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) != 0, + (props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) != 0, + (int)props[i].numQueues); + if (! 
done && (props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) && !(props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)) { ctxt->ordinal = i; ctxt->num_queues = props[i].numQueues; - break; + done = 1; + if (_HFI_DBG_ON) { + _HFI_DBG_ALWAYS("Selected GPU copy engine %d\n", i); + } else { + break; + } } } psmi_free(props); @@ -789,6 +817,35 @@ static void psmi_oneapi_cmd_create(ze_device_handle_t dev, struct ze_dev_ctxt *c dev, &ze_cl_desc, &ctxt->cl); } ctxt->dev = dev; + + if (psm3_oneapi_parallel_dtod_copy_thresh < UINT_MAX) { + // create resources for dual copy mechanism + ze_event_pool_desc_t pool_desc = { + .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, + .flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE, + .count = 2 + }; + ze_event_desc_t event_desc = { + .stype = ZE_STRUCTURE_TYPE_EVENT_DESC, + .signal = ZE_EVENT_SCOPE_FLAG_HOST, + .wait = ZE_EVENT_SCOPE_FLAG_HOST, + }; + PSMI_ONEAPI_ZE_CALL(zeEventPoolCreate, + ze_context, &pool_desc, 0, NULL, &ctxt->event_pool); + + event_desc.index = 0; + PSMI_ONEAPI_ZE_CALL(zeEventCreate, ctxt->event_pool, &event_desc, + &ctxt->copy_status0); + + event_desc.index = 1; + PSMI_ONEAPI_ZE_CALL(zeEventCreate, ctxt->event_pool, &event_desc, + &ctxt->copy_status1); + + psmi_oneapi_async_cmd_create(ctxt, &ctxt->async_cq0, + &ctxt->async_cl0); + psmi_oneapi_async_cmd_create(ctxt, &ctxt->async_cq1, + &ctxt->async_cl1); + } } void psmi_oneapi_cmd_create_all(void) @@ -804,8 +861,11 @@ void psmi_oneapi_cmd_create_all(void) for (i = 0; i < num_ze_devices; i++) { ctxt = &ze_devices[i]; - if (!ctxt->cl) + if (!ctxt->cl) { psmi_oneapi_cmd_create(ctxt->dev, ctxt); + _HFI_DBG("Initialized cmd queues for ze_device[%d] %p\n", + i, ctxt->dev); + } } if (num_ze_devices > 0) cur_ze_dev = &ze_devices[0]; @@ -819,6 +879,34 @@ void psmi_oneapi_cmd_destroy_all(void) for (i = 0; i < num_ze_devices; i++) { ctxt = &ze_devices[i]; + if (ctxt->async_cl1 != NULL) { + PSMI_ONEAPI_ZE_CALL(zeCommandListDestroy, ctxt->async_cl1); + ctxt->async_cl1 = NULL; + } + if (ctxt->async_cq1 != NULL) { + PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, ctxt->async_cq1); + ctxt->async_cq1 = NULL; + } + if (ctxt->async_cl0 != NULL) { + PSMI_ONEAPI_ZE_CALL(zeCommandListDestroy, ctxt->async_cl0); + ctxt->async_cl0 = NULL; + } + if (ctxt->async_cq0 != NULL) { + PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, ctxt->async_cq0); + ctxt->async_cq0 = NULL; + } + if (ctxt->copy_status1 != NULL) { + PSMI_ONEAPI_ZE_CALL(zeEventDestroy, ctxt->copy_status1); + ctxt->copy_status1 = NULL; + } + if (ctxt->copy_status0 != NULL) { + PSMI_ONEAPI_ZE_CALL(zeEventDestroy, ctxt->copy_status0); + ctxt->copy_status0 = NULL; + } + if (ctxt->event_pool != NULL) { + PSMI_ONEAPI_ZE_CALL(zeEventPoolDestroy, ctxt->event_pool); + ctxt->event_pool = NULL; + } if (ctxt->cl) { PSMI_ONEAPI_ZE_CALL(zeCommandListDestroy, ctxt->cl); ctxt->cl = NULL; @@ -849,7 +937,7 @@ int psmi_oneapi_ze_initialize() union psmi_envvar_val env; PSM2_LOG_MSG("entering"); - _HFI_VDBG("Init Level Zero library.\n"); + _HFI_DBG("Init Level Zero library.\n"); psmi_oneapi_ze_stats_register(); err = psmi_oneapi_ze_load(); @@ -868,6 +956,13 @@ int psmi_oneapi_ze_initialize() (union psmi_envvar_val)1, &env); psm3_oneapi_immed_async_copy = env.e_int; + psm3_getenv("PSM3_ONEAPI_PARALLEL_DTOD_COPY_THRESH", + "Use parallel CommandLists for GPU to GPU copy larger than threshold", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)(256*1024-1), &env); + // no benefit below 128K-1, plus the copy is split at a 64K boundary
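The dual-copy resources created above (two async queues/lists and two host-visible events) could be used as sketched below. This is a hypothetical sketch, not the patch's code path; it assumes async_cl0/async_cl1 are regular (non-immediate) command lists and uses the diff's PSMI_ONEAPI_ZE_CALL wrapper around standard Level Zero entry points:

```c
/* Sketch: split one large device-to-device copy at a 64K-aligned point
 * and run the halves on the two async queues concurrently, waiting on
 * the two events from ctxt->event_pool. */
static void parallel_dtod_copy_sketch(struct ze_dev_ctxt *ctxt,
				      void *dst, const void *src, size_t len)
{
	size_t half = (len / 2) & ~(size_t)0xffff;	/* 64K boundary */

	PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl0,
			    dst, src, half, ctxt->copy_status0, 0, NULL);
	PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl1,
			    (char *)dst + half, (const char *)src + half,
			    len - half, ctxt->copy_status1, 0, NULL);
	PSMI_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->async_cl0);
	PSMI_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->async_cl1);
	PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->async_cq0,
			    1, &ctxt->async_cl0, NULL);
	PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->async_cq1,
			    1, &ctxt->async_cl1, NULL);
	PSMI_ONEAPI_ZE_CALL(zeEventHostSynchronize, ctxt->copy_status0, UINT64_MAX);
	PSMI_ONEAPI_ZE_CALL(zeEventHostSynchronize, ctxt->copy_status1, UINT64_MAX);
	/* events and command lists would be reset before the next copy */
}
```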
psm3_oneapi_parallel_dtod_copy_thresh = max(128*1024-1, env.e_uint); + PSMI_ONEAPI_ZE_CALL(zeInit, ZE_INIT_FLAG_GPU_ONLY); @@ -911,11 +1006,15 @@ int psmi_oneapi_ze_initialize() ze_context_desc_t ctxtDesc = { ZE_STRUCTURE_TYPE_CONTEXT_DESC, NULL, 0 }; PSMI_ONEAPI_ZE_CALL(zeContextCreate, ze_driver, &ctxtDesc, &ze_context); - _HFI_VDBG("ze_driver %p first device %p ze_context %p\n", - ze_driver, &devices[0], ze_context); + _HFI_DBG("ze_driver %p %u devices first device %p ze_context %p\n", + ze_driver, ze_device_count, devices[0], ze_context); - for (i = 0; i < ze_device_count; i++) + for (i = 0; i < ze_device_count; i++) { + ze_devices[i].dev_index = i; psmi_oneapi_cmd_create(devices[i], &ze_devices[i]); + _HFI_DBG("Initialized cmd queues for ze_device[%d] %p\n", + i, ze_devices[i].dev); + } num_ze_devices = ze_device_count; if (num_ze_devices > 0) @@ -1014,7 +1113,11 @@ void psmi_parse_nic_var() { union psmi_envvar_val env_nic; psm3_getenv("PSM3_NIC", - "Device Unit number or name or wildcard (-1 or 'any' autodetects)", + "Device(s) to consider for use. By name (" +#ifdef FNM_EXTMATCH + "extended " +#endif + "glob pattern), unit number or 'any'", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val)"any", &env_nic); //autodetect @@ -1064,6 +1167,11 @@ static int psm3_parse_no_warn(void) } #endif +int init_cache_on = 1; +void psm3_turn_off_init_cache() { + init_cache_on = 0; +} + psm2_error_t psm3_init(int *major, int *minor) { psm2_error_t err = PSM2_OK; @@ -1177,10 +1285,10 @@ psm2_error_t psm3_init(int *major, int *minor) psm3_getenv("PSM3_TRACEMASK", "Mask flags for tracing", PSMI_ENVVAR_LEVEL_USER, - PSMI_ENVVAR_TYPE_STR_VAL_PAT, + PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT_FLAGS, (union psmi_envvar_val)__HFI_DEBUG_DEFAULT_STR, &env_tmask); - (void)psm3_parse_val_pattern(env_tmask.e_str, __HFI_DEBUG_DEFAULT, - &psm3_dbgmask); + (void)psm3_parse_val_pattern_uint(env_tmask.e_str, __HFI_DEBUG_DEFAULT, + &psm3_dbgmask, PSMI_ENVVAR_FLAG_NOMIN_NOMAX, 0, UINT_MAX); /* The "real thing" is done in utils_mallopt.c as constructor function, but * we getenv it here to report what we're doing with the setting */ @@ -1319,6 +1427,10 @@ psm2_error_t psm3_init(int *major, int *minor) goto fail_epid; } + if (psm3_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { + psm3_hwloc_topology_init(); + } + #ifdef PSM_DSA if (psm3_device_is_enabled(devid_enabled, PTL_DEVID_AMSH)) { if (psm3_dsa_init()) { @@ -1352,7 +1464,8 @@ psm2_error_t psm3_init(int *major, int *minor) * want it to appear in PSM3_VERBOSE_ENV help text */ int enable_cuda = 0; - if (psm3_parse_str_int(psm3_env_get("PSM3_CUDA"), &enable_cuda) == -2 + if (psm3_parse_str_int(psm3_env_get("PSM3_CUDA"), &enable_cuda, + INT_MIN, INT_MAX) == -2 || enable_cuda) { _HFI_INFO("WARNING: PSM built without CUDA enabled, PSM3_CUDA unavailable\n"); } @@ -1382,7 +1495,8 @@ psm2_error_t psm3_init(int *major, int *minor) * want it to appear in PSM3_VERBOSE_ENV help text */ int enable_oneapi = 0; - if (psm3_parse_str_int(psm3_env_get("PSM3_ONEAPI_ZE"), &enable_oneapi) == -2 + if (psm3_parse_str_int(psm3_env_get("PSM3_ONEAPI_ZE"), &enable_oneapi, + INT_MIN, INT_MAX) == -2 || enable_oneapi) { _HFI_INFO("WARNING: PSM built without ONEAPI_ZE enabled, PSM3_ONEAPI_ZE unavailable\n"); } @@ -1399,7 +1513,8 @@ psm2_error_t psm3_init(int *major, int *minor) * get the behavior they expected */ unsigned int gpudirect = 0; - if (psm3_parse_str_uint(psm3_env_get("PSM3_GPUDIRECT"), &gpudirect) == -2 + if (psm3_parse_str_uint(psm3_env_get("PSM3_GPUDIRECT"), &gpudirect, + 
0, UINT_MAX) == -2 || gpudirect) { _HFI_INFO("WARNING: PSM built with neither ONEAPI_ZE nor CUDA enabled, PSM3_GPUDIRECT unavailable\n"); } @@ -1420,6 +1535,7 @@ psm2_error_t psm3_init(int *major, int *minor) #endif #if defined(PSM_DSA) || defined(PSM_CUDA) || defined(PSM_ONEAPI) fail_hal: + psm3_hwloc_topology_destroy(); // always safe to call psm3_hal_finalize(); #endif fail_epid: @@ -1450,6 +1566,7 @@ static inline psm2_error_t unit_query_ret_to_err(int ret) } } +static uint64_t nics_max_speed; psm2_error_t psm3_info_query(psm2_info_query_t q, void *out, size_t nargs, psm2_info_query_arg_t args[]) { @@ -1606,6 +1723,11 @@ psm2_error_t psm3_info_query(psm2_info_query_t q, void *out, if (port == 0) port = 1; /* VERBS_PORT */ if (unit == -1) { + if (init_cache_on && nics_max_speed) { + *speed = nics_max_speed; + rv = PSM2_OK; + break; + } // query for unit -1 returns max speed of all candidate NICs *speed = 0; for (unit = 0; unit < psmi_hal_get_num_units_(); unit++) { @@ -1615,7 +1737,12 @@ psm2_error_t psm3_info_query(psm2_info_query_t q, void *out, if (0 <= psmi_hal_get_port_speed(unit, port, &unit_speed)) *speed = max(*speed, unit_speed); } - rv = (*speed) ? PSM2_OK : PSM2_EP_NO_DEVICE; + if (*speed) { + nics_max_speed = *speed; + rv = PSM2_OK; + } else { + rv = PSM2_EP_NO_DEVICE; + } } else { if (psmi_hal_get_port_active(unit, port) <= 0) break; @@ -1749,7 +1876,9 @@ psm2_error_t psm3_finalize(void) * Start critical section to decrement ref count and unlink * affinity shm file. */ - psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); + if (psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name)) { + _HFI_ERROR("unable to get NIC affinity semaphore, proceeding anyway\n"); + } psm3_shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] -= 1; if (psm3_shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] <= 0) { @@ -1767,6 +1896,7 @@ psm2_error_t psm3_finalize(void) munmap(psm3_shared_affinity_ptr, PSMI_PAGESIZE); psm3_shared_affinity_ptr = NULL; + psm3_shared_affinity_nic_refcount_ptr = NULL; psmi_free(psm3_affinity_shm_name); psm3_affinity_shm_name = NULL; psm3_affinity_shared_file_opened = 0; @@ -1782,6 +1912,7 @@ psm2_error_t psm3_finalize(void) psm3_affinity_semaphore_open = 0; } + psm3_hwloc_topology_destroy(); // always safe to call psm3_hal_finalize(); #ifdef PSM_CUDA if (PSMI_IS_GPU_ENABLED) diff --git a/prov/psm3/psm3/psm2.h b/prov/psm3/psm3/psm2.h index fe76b5fe4b8..b9ff1c598d1 100644 --- a/prov/psm3/psm3/psm2.h +++ b/prov/psm3/psm3/psm2.h @@ -1376,6 +1376,16 @@ void *psm3_epaddr_getctxt(psm2_epaddr_t epaddr); * option value: Deprecated; this option has no effect. */ +#define PSM2_MQ_OPT_GPU_RNDV_SHM_SZ 0x304 +#define PSM2_MQ_GPU_RNDV_SHM_SZ PSM2_MQ_OPT_GPU_RNDV_SHM_SZ + /**< [@b uint32_t ] Size at which to start enabling + * rendezvous messaging for shared memory (intra-node) GPU messages (if + * unset, defaults to 127 bytes for both Intel and NVIDIA GPUs). + * + * component object: PSM2 Matched Queue (@ref psm2_mq_t). + * option value: Size at which to switch to rendezvous protocol for GPU send. 
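A hypothetical sketch of driving this option at runtime through the MQ options API (psm3_mq_setopt/psm3_mq_getopt, whose prototypes appear later in this patch); mq is assumed to be an open psm2_mq_t:

```c
/* Raise the intra-node GPU rendezvous cutoff to 4 KiB (illustrative). */
uint32_t gpu_shm_thresh = 4096;
psm2_error_t rc = psm3_mq_setopt(mq, PSM2_MQ_GPU_RNDV_SHM_SZ,
				 &gpu_shm_thresh);
if (rc == PSM2_OK)	/* read back the value actually in effect */
	rc = psm3_mq_getopt(mq, PSM2_MQ_GPU_RNDV_SHM_SZ, &gpu_shm_thresh);
```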
+ */ + /* PSM2_COMPONENT_AM options */ #define PSM2_AM_OPT_FRAG_SZ 0x401 #define PSM2_AM_MAX_FRAG_SZ PSM2_AM_OPT_FRAG_SZ @@ -1802,10 +1812,10 @@ char* psm3_env_get(const char *name); * * @param[in] const char *str parameter value * @retval 0 The string was valid, *val has value - * -1 The string was empty or NULL - * -2 The string had invalid syntax + * -1 The string was empty or NULL, *val not updated + * -2 The string had invalid syntax, *val not updated */ -int psm3_parse_str_int(const char *string, int *val); +int psm3_parse_str_int(const char *string, int *val, int min, int max); /** @brief PSM2 unsigned int parameter parsing * * @param[in] const char *str parameter value * @retval 0 The string was valid, *val has value - * -1 The string was empty or NULL - * -2 The string had invalid syntax + * -1 The string was empty or NULL, *val not updated + * -2 The string had invalid syntax, *val not updated */ -int psm3_parse_str_uint(const char *string, unsigned int *val); +int psm3_parse_str_uint(const char *string, unsigned int *val, + unsigned int min, unsigned int max); /** @brief PSM2 yesno parameter parsing * * Function that parses a string yesno parameter * * @param[in] const char *str parameter value - * @retval -1 The string was empty or NULL - * -2 The string had invalid syntax + * @retval 0 The string was valid, *val has value + * -1 The string was empty or NULL, *val not updated + * -2 The string had invalid syntax, *val not updated + * @param[out] int *val * 0 The string was No, False, Off or 0 * 1 The string was Yes, True, On or 1 */ -int psm3_parse_str_yesno(const char *str); +int psm3_parse_str_yesno(const char *str, int *val); + +// for the purposes of psmx3 accessing PSM3_DEVICES config, these +// interfaces are defined here. Not for general consumption +/* We currently have 3 PTLs, 0 is reserved. */ +#define PTL_DEVID_IPS 1 // ips aka nic, network inter-node +#define PTL_DEVID_AMSH 2 // shm, intra-node, scale-up +#define PTL_DEVID_SELF 3 // self + +/* We can currently initialize up to 3 PTLs */ +#define PTL_MAX_INIT 3 + +/** @brief PSM2 devices parameter parsing + * + * Function that gets and parses the PSM3_DEVICES string parameter + * + * @param[out] array of devices + * @retval PSM2_OK - devices successfully returned + * other (PSM2_PARAM_ERR) - error parsing devices + */ +psm2_error_t psm3_parse_devices(int devices[PTL_MAX_INIT]); + +/** @brief PSM2 devices list search + * + * Function that searches devid_enabled for a specific device + * + * @param[in] array of devices from psm3_parse_devices + * @param[in] devid: PTL_DEVID_IPS, PTL_DEVID_AMSH, or PTL_DEVID_SELF + * @retval 1 - given devid is enabled in devices[] + * 0 - given devid is disabled in devices[] + */ +int psm3_device_is_enabled(const int devices[PTL_MAX_INIT], int devid); /** @brief PSM2 env finalize * @@ -1872,6 +1916,8 @@ void psm3_memcpy(void *dest, const void *src, uint32_t len); /*! @} */ +void psm3_turn_off_init_cache(); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/prov/psm3/psm3/psm2_hal.c b/prov/psm3/psm3/psm2_hal.c index 058a6c26034..0c347ce2160 100644 --- a/prov/psm3/psm3/psm2_hal.c +++ b/prov/psm3/psm3/psm2_hal.c @@ -230,6 +230,69 @@ int psm3_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...)
rv = -1; } break; + case psmi_hal_pre_init_cache_func_get_port_speed: + { + int unit = va_arg(ap,int); + + if ((unit >= 0) && (unit < p->params.num_units)) + { + int port = va_arg(ap,int); + if ((port >= 1) && (port <= p->params.num_ports)) + { + int i = unit * (p->params.num_ports+1) + port; + // only cache during PSM3 init + if (!init_cache_on || !p->params.port_speed_valid[i]) { + rv = p->hfp_get_port_speed(unit,port,&p->params.port_speed[i]); + p->params.port_speed_valid[i] = rv == 0 ? 1 : -1; + } + rv = (p->params.port_speed_valid[i] == 1)? 0: -1; + if (rv == 0) { + uint64_t *speed = va_arg(ap, uint64_t*); + if (speed) *speed = p->params.port_speed[i]; + } + } + else + rv = -1; + } + else + rv = -1; + } + break; + case psmi_hal_pre_init_cache_func_get_port_lid: + { + int unit = va_arg(ap,int); + + if ((unit >= 0) && (unit < p->params.num_units)) + { + int port = va_arg(ap,int); + if ((port >= 1) && (port <= p->params.num_ports)) + { + int addr_index = va_arg(ap,int); + if (addr_index >= 0 && addr_index < psm3_addr_per_nic) + { + int i = unit * ((p->params.num_ports+1) * psm3_addr_per_nic) + port * psm3_addr_per_nic + addr_index; + // only cache during PSM3 init + if (!init_cache_on || !p->params.port_lid_valid[i]) { + rv = p->hfp_get_port_lid(unit,port,addr_index); + if (rv > 0) { + p->params.port_lid_valid[i] = 1; + p->params.port_lid[i] = rv; + } else { + p->params.port_lid_valid[i] = -1; + rv = -1; + } + break; + } + rv = p->params.port_lid_valid[i] == -1 ? -1 : p->params.port_lid[i]; + } + } + else + rv = -1; + } + else + rv = -1; + } + break; case psmi_hal_pre_init_cache_func_get_num_contexts: { int unit = va_arg(ap,int); @@ -310,6 +373,51 @@ int psm3_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...) rv = -1; } break; + case psmi_hal_pre_init_cache_func_get_port_subnet_name: + { + int unit = va_arg(ap,int); + + if ((unit >= 0) && (unit < p->params.num_units)) + { + int port = va_arg(ap,int); + if ((port >= 1) && (port <= p->params.num_ports)) + { + int addr_index = va_arg(ap,int); + if (addr_index >= 0 && addr_index < psm3_addr_per_nic) + { + int i = unit * ((p->params.num_ports+1) * psm3_addr_per_nic) + port * psm3_addr_per_nic + addr_index; + // only cache during PSM3 init + if (!init_cache_on || !p->params.port_subnet_name[i]) { + char buffer[PATH_MAX] = {}; + rv = p->hfp_get_port_subnet_name(unit, port, addr_index, buffer, sizeof(buffer)); + if (p->params.port_subnet_name[i]) { + psmi_free(p->params.port_subnet_name[i]); + } + if (rv == 0) { + p->params.port_subnet_name[i] = psmi_strdup(PSMI_EP_NONE, buffer); + } else { + p->params.port_subnet_name[i] = NULL; + rv = -1; + break; + } + } + char *buf = va_arg(ap, char*); + size_t bufsize = va_arg(ap, size_t); + rv = p->params.port_subnet_name[i] ? 
0 : -1; + if (rv == 0 && buf) { + (void)snprintf(buf, bufsize, "%s", p->params.port_subnet_name[i]); + } + } + else + rv = -1; + } + else + rv = -1; + } + else + rv = -1; + } + break; case psmi_hal_pre_init_cache_func_get_unit_pci_bus: { int unit = va_arg(ap,int); @@ -469,6 +577,10 @@ static void psm3_hal_free_cache(struct _psmi_hal_instance *p) FREE_HAL_CACHE(unit_active_valid); FREE_HAL_CACHE(port_active); FREE_HAL_CACHE(port_active_valid); + FREE_HAL_CACHE(port_speed); + FREE_HAL_CACHE(port_speed_valid); + FREE_HAL_CACHE(port_lid); + FREE_HAL_CACHE(port_lid_valid); FREE_HAL_CACHE(num_contexts); FREE_HAL_CACHE(num_contexts_valid); FREE_HAL_CACHE(num_free_contexts); @@ -478,6 +590,7 @@ static void psm3_hal_free_cache(struct _psmi_hal_instance *p) FREE_HAL_CACHE(port_subnet_addr); FREE_HAL_CACHE(port_subnet_idx); FREE_HAL_CACHE(port_subnet_gid); + FREE_HAL_CACHE_ARRAY(port_subnet_name, p->params.num_units * p->params.num_ports * psm3_addr_per_nic); FREE_HAL_CACHE(unit_pci_bus_valid); FREE_HAL_CACHE(unit_pci_bus_domain); @@ -521,6 +634,10 @@ static psmi_hal_instance_t *psm3_hal_select_hal(psmi_hal_instance_t *p, ALLOC_HAL_CACHE(unit_active_valid, int8_t, nunits); ALLOC_HAL_CACHE(port_active, int8_t, nunits*(nports+1)); ALLOC_HAL_CACHE(port_active_valid, int8_t, nunits*(nports+1)); + ALLOC_HAL_CACHE(port_speed, uint64_t, nunits*(nports+1)); + ALLOC_HAL_CACHE(port_speed_valid, int8_t, nunits*(nports+1)); + ALLOC_HAL_CACHE(port_lid, int, nunits*(nports+1)*psm3_addr_per_nic); + ALLOC_HAL_CACHE(port_lid_valid, int8_t, nunits*(nports+1)*psm3_addr_per_nic); ALLOC_HAL_CACHE(num_contexts, uint16_t, nunits); ALLOC_HAL_CACHE(num_contexts_valid, uint16_t, nunits); ALLOC_HAL_CACHE(num_free_contexts, uint16_t, nunits); @@ -530,6 +647,7 @@ static psmi_hal_instance_t *psm3_hal_select_hal(psmi_hal_instance_t *p, ALLOC_HAL_CACHE(port_subnet_addr, psmi_naddr128_t, nunits*(nports+1)*psm3_addr_per_nic); ALLOC_HAL_CACHE(port_subnet_idx, int, nunits*(nports+1)*psm3_addr_per_nic); ALLOC_HAL_CACHE(port_subnet_gid, psmi_gid128_t, nunits*(nports+1)*psm3_addr_per_nic); + ALLOC_HAL_CACHE_ARRAY(port_subnet_name, char, nunits*(nports+1)*psm3_addr_per_nic); ALLOC_HAL_CACHE(unit_pci_bus_valid, int8_t, nunits); ALLOC_HAL_CACHE(unit_pci_bus_domain, uint32_t, nunits); @@ -557,6 +675,72 @@ static psmi_hal_instance_t *psm3_hal_select_hal(psmi_hal_instance_t *p, return NULL; } +/* check syntax of pattern and confirm it matches at least 1 HAL + * returns: + * 0 - valid + * -1 - empty string + * -2 - invalid syntax + */ +static int parse_check_hal(int type, const union psmi_envvar_val val, void *ptr, + size_t errstr_size, char errstr[]) +{ + int i; + int ret; + + psmi_assert(type == PSMI_ENVVAR_TYPE_STR); + if (! val.e_str || ! 
*val.e_str) + return -1; + // use fnmatch to check syntax of pattern + // reviewing fnmatch source it only returns 0 or FNM_NOMATCH, but be + // safe and match fnmatch documentation that other values indicate error + ret = fnmatch(val.e_str, "dontcare", 0 +#ifdef FNM_EXTMATCH + | FNM_EXTMATCH +#endif + ); + if (ret && ret != FNM_NOMATCH) { + if (errstr_size) + snprintf(errstr, errstr_size, " invalid " +#ifdef FNM_EXTMATCH + "extended " +#endif + "glob pattern"); + return -2; + } + // we check for at least 1 matching HAL, but purposely do + // not check for active NICs within the HAL + // We allow any valid HAL, even if not included in the build + // This avoids surprises if user or middleware uses PSM3_HAL to limit + // PSM3 to a specific HAL, but the PSM3 build found lacks that HAL + ret = -2; // assume no matching HAL found + for (i=0; i <= PSM_HAL_INDEX_MAX; i++) + { + if (i == PSM_HAL_INDEX_LOOPBACK) + continue; + if (0 == strcmp("unknown", psm3_hal_index_to_str(i))) + continue; + + if (0 == strcmp(val.e_str, "any") || + 0 == fnmatch(val.e_str, psm3_hal_index_to_str(i), 0 +#ifdef FNM_EXTMATCH + | FNM_EXTMATCH +#endif + )) + { + ret = 0; + break; + } + } + if (ret == -2) { + if (errstr_size) + snprintf(errstr, errstr_size, " no matching HAL found"); + return -2; + } + return 0; +} + +static char hal_help[512] = ""; + static struct _psmi_hal_instance *psm3_hal_get_pi_inst(void) { int i; @@ -584,11 +768,12 @@ static struct _psmi_hal_instance *psm3_hal_get_pi_inst(void) */ union psmi_envvar_val env_hal; /* HAL instance preference */ - psm3_getenv("PSM3_HAL", - "Hardware Abstraction Layer to use (Default is first HAL" - " to find a valid, unfiltered NIC [any])", + psm3_getenv_range("PSM3_HAL", + "Hardware Abstraction Layer to use", hal_help, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, - (union psmi_envvar_val)"any", &env_hal); + (union psmi_envvar_val)"any", + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + parse_check_hal, NULL, &env_hal); for (i=0; i <= PSM_HAL_INDEX_MAX; i++) { @@ -651,6 +836,36 @@ int psm3_hal_initialize(int devid_enabled[PTL_MAX_INIT]) PSMI_HAL_INI(); if (! psm3_hal_current_hal_instance) { + int i; + char valid_hal_list[80]; + int valid_len = 0; + char avail_hal_list[80]; + int avail_len = 0; + + valid_hal_list[0] = '\0'; + avail_hal_list[0] = '\0'; + for (i=0; i <= PSM_HAL_INDEX_MAX; i++) + { + if (i == PSM_HAL_INDEX_LOOPBACK) + continue; + if (0 == strcmp("unknown", psm3_hal_index_to_str(i))) + continue; + + snprintf(&valid_hal_list[valid_len], + sizeof(valid_hal_list)-valid_len, "%s'%s'", + valid_hal_list[0]?", ":"", psm3_hal_index_to_str(i)); + valid_len = strlen(valid_hal_list); + if (psm3_hal_table[i]) { + snprintf(&avail_hal_list[avail_len], + sizeof(avail_hal_list)-avail_len, "%s'%s'", + avail_hal_list[0]?", ":"", psm3_hal_index_to_str(i)); + avail_len = strlen(avail_hal_list); + } + } + snprintf(hal_help, sizeof(hal_help), + " 'any' - use first HAL which finds a valid, unfiltered NIC (default)\n" + " valid HALs: %s\n" + " available HALs: %s", valid_hal_list, avail_hal_list); if (! psm3_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { // register the loopback HAL and select it. 
Unlike normal HALs // we don't call psm3_hal_register_instance because it would enforce diff --git a/prov/psm3/psm3/psm2_hal.h b/prov/psm3/psm3/psm2_hal.h index d5658221c0c..055261da6c4 100644 --- a/prov/psm3/psm3/psm2_hal.h +++ b/prov/psm3/psm3/psm2_hal.h @@ -228,6 +228,10 @@ typedef struct _psmi_hal_params uint16_t default_pkey; int8_t *unit_active,*unit_active_valid; int8_t *port_active,*port_active_valid; + uint64_t *port_speed; + int8_t *port_speed_valid; + int *port_lid; + int8_t *port_lid_valid; uint16_t *num_contexts,*num_contexts_valid; uint16_t *num_free_contexts,*num_free_contexts_valid; // information from port_get_subnet @@ -237,6 +241,7 @@ typedef struct _psmi_hal_params psmi_naddr128_t *port_subnet_addr; int *port_subnet_idx; psmi_gid128_t *port_subnet_gid; + char **port_subnet_name; int8_t *unit_pci_bus_valid; uint32_t *unit_pci_bus_domain; @@ -254,6 +259,10 @@ typedef struct _psmi_hal_params #define PSM_HAL_ALG_ACROSS 0 #define PSM_HAL_ALG_WITHIN 1 #define PSM_HAL_ALG_ACROSS_ALL 2 +#define PSM_HAL_ALG_CPU_CENTRIC 3 +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY +#define PSM_HAL_ALG_GPU_CENTRIC 4 +#endif typedef enum { @@ -499,16 +508,22 @@ int psm3_hal_initialize(int devid_enabled[PTL_MAX_INIT]); int psm3_hal_finalize(void); +// indicate whether we cache data during PSM3 init +extern int init_cache_on; + enum psmi_hal_pre_init_cache_func_krnls { psmi_hal_pre_init_cache_func_get_num_units, psmi_hal_pre_init_cache_func_get_num_ports, psmi_hal_pre_init_cache_func_get_unit_active, psmi_hal_pre_init_cache_func_get_port_active, + psmi_hal_pre_init_cache_func_get_port_speed, + psmi_hal_pre_init_cache_func_get_port_lid, psmi_hal_pre_init_cache_func_get_num_contexts, psmi_hal_pre_init_cache_func_get_num_free_contexts, psmi_hal_pre_init_cache_func_get_default_pkey, psmi_hal_pre_init_cache_func_get_port_subnet, + psmi_hal_pre_init_cache_func_get_port_subnet_name, psmi_hal_pre_init_cache_func_get_unit_pci_bus, psmi_hal_pre_init_cache_func_get_unit_device_id, psmi_hal_pre_init_cache_func_get_unit_device_version, @@ -549,9 +564,6 @@ int psm3_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...) /* DISPATCH_FUNC */ #define psmi_hal_get_unit_name(...) PSMI_HAL_DISPATCH_FUNC(get_unit_name,__VA_ARGS__) -#define psmi_hal_get_port_subnet_name(...) PSMI_HAL_DISPATCH_FUNC(get_port_subnet_name,__VA_ARGS__) -#define psmi_hal_get_port_speed(...) PSMI_HAL_DISPATCH_FUNC(get_port_speed,__VA_ARGS__) -#define psmi_hal_get_port_lid(...) PSMI_HAL_DISPATCH_FUNC(get_port_lid,__VA_ARGS__) #define psmi_hal_mq_init_defaults(...) PSMI_HAL_DISPATCH_FUNC(mq_init_defaults,__VA_ARGS__) #define psmi_hal_ep_open_opts_get_defaults(...) PSMI_HAL_DISPATCH_FUNC(ep_open_opts_get_defaults,__VA_ARGS__) #define psmi_hal_context_initstats(...) PSMI_HAL_DISPATCH_FUNC(context_initstats,__VA_ARGS__) @@ -566,10 +578,13 @@ int psm3_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...) #define psmi_hal_get_num_ports_(...) PSMI_HAL_DISPATCH_PI(get_num_ports,##__VA_ARGS__) #define psmi_hal_get_unit_active(...) PSMI_HAL_DISPATCH_PI(get_unit_active,__VA_ARGS__) #define psmi_hal_get_port_active(...) PSMI_HAL_DISPATCH_PI(get_port_active,__VA_ARGS__) +#define psmi_hal_get_port_speed(...) PSMI_HAL_DISPATCH_PI(get_port_speed,__VA_ARGS__) +#define psmi_hal_get_port_lid(...) PSMI_HAL_DISPATCH_PI(get_port_lid,__VA_ARGS__) #define psmi_hal_get_num_contexts(...) PSMI_HAL_DISPATCH_PI(get_num_contexts,__VA_ARGS__) #define psmi_hal_get_num_free_contexts(...) 
PSMI_HAL_DISPATCH_PI(get_num_free_contexts,__VA_ARGS__) #define psmi_hal_get_default_pkey(...) PSMI_HAL_DISPATCH_PI(get_default_pkey,##__VA_ARGS__) #define psmi_hal_get_port_subnet(...) PSMI_HAL_DISPATCH_PI(get_port_subnet,__VA_ARGS__) +#define psmi_hal_get_port_subnet_name(...) PSMI_HAL_DISPATCH_PI(get_port_subnet_name,__VA_ARGS__) #define psmi_hal_get_unit_pci_bus(...) PSMI_HAL_DISPATCH_PI(get_unit_pci_bus,__VA_ARGS__) #define psmi_hal_get_unit_device_id(...) PSMI_HAL_DISPATCH_PI(get_unit_device_id,__VA_ARGS__) #define psmi_hal_get_unit_device_version(...) PSMI_HAL_DISPATCH_PI(get_unit_device_version,__VA_ARGS__) diff --git a/prov/psm3/psm3/psm2_hal_loopback.c b/prov/psm3/psm3/psm2_hal_loopback.c index cf78a99b2ee..913a45dec78 100644 --- a/prov/psm3/psm3/psm2_hal_loopback.c +++ b/prov/psm3/psm3/psm2_hal_loopback.c @@ -209,8 +209,10 @@ static int psm3_hfp_loopback_get_port_lid(int unit, int port, int addr_index) // also prior to the EP being opened static void psm3_hfp_loopback_mq_init_defaults(struct psm2_mq *mq) { - /* these are only used by ptl_ips */ - mq->hfi_base_window_rv = (~(uint32_t)0); // no rendezvous + mq->ips_cpu_window_rv_str = NULL; // no rendezvous +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + mq->ips_gpu_window_rv_str = NULL; // no rendezvous +#endif mq->hfi_thresh_rv = (~(uint32_t)0); // disable rendezvous mq->hfi_thresh_tiny = PSM_MQ_NIC_MAX_TINY; // RDMA and MR cache N/A, leave ep->rdmamode, ep->mr_cache_mode and diff --git a/prov/psm3/psm3/psm2_mq.h b/prov/psm3/psm3/psm2_mq.h index b32c5126ba8..517b4802d5b 100644 --- a/prov/psm3/psm3/psm2_mq.h +++ b/prov/psm3/psm3/psm2_mq.h @@ -173,7 +173,8 @@ extern "C" { * @li If and when possible, receive buffers should be posted as early as * possible and ideally before calling into the progress engine. * @li Use of rendezvous messaging that can be controlled with - * @ref PSM2_MQ_RNDV_HFI_SZ and @ref PSM2_MQ_RNDV_SHM_SZ options. These + * @ref PSM2_MQ_RNDV_HFI_SZ, @ref PSM2_MQ_RNDV_SHM_SZ and + * PSM2_MQ_GPU_RNDV_SHM_SZ options. These * options default to values determined to make effective use of * bandwidth and are hence not advisable for all communication message * sizes, but rendezvous messages inherently prevent unexpected @@ -477,6 +478,7 @@ struct psm2_mq_req_user { * @param[in] option Index of option to retrieve. Possible values are: * @li @ref PSM2_MQ_RNDV_HFI_SZ * @li @ref PSM2_MQ_RNDV_SHM_SZ + * @li @ref PSM2_MQ_GPU_RNDV_SHM_SZ * @li @ref PSM2_MQ_MAX_SYSBUF_MBYTES * * @param[in] value Pointer to storage that can be used to store the value of @@ -498,6 +500,7 @@ psm2_error_t psm3_mq_getopt(psm2_mq_t mq, int option, void *value); * @param[in] option Index of option to retrieve. Possible values are: * @li @ref PSM2_MQ_RNDV_HFI_SZ * @li @ref PSM2_MQ_RNDV_SHM_SZ + * @li @ref PSM2_MQ_GPU_RNDV_SHM_SZ * @li @ref PSM2_MQ_MAX_SYSBUF_MBYTES * * @param[in] value Pointer to storage that contains the value to be updated @@ -519,6 +522,9 @@ psm2_error_t psm3_mq_setopt(psm2_mq_t mq, int option, const void *value); #define PSM2_MQ_FLAG_SENDSYNC 0x01 /**< MQ Send Force synchronous send */ +#define PSM2_MQ_FLAG_INJECT 0x02 + /**< MQ Send Force bounce buffer for */ + /* FI_INJECT/fi_inject behavior */ #define PSM2_MQ_REQINVALID ((psm2_mq_req_t)(NULL)) /**< MQ request completion value */ @@ -710,6 +716,9 @@ psm3_mq_imrecv(psm2_mq_t mq, uint32_t flags, void *buf, uint32_t len, * synchronously, meaning that the message will not be sent until * the receiver acknowledges that it has matched the send with a * receive buffer. 
+ * @li PSM2_MQ_FLAG_INJECT tells PSM to consume the send buffer + * immediately to comply with FI_INJECT/fi_inject behavior, + * cannot be used in conjunction with PSM2_MQ_FLAG_SENDSYNC. * @param[in] stag Message Send Tag * @param[in] buf Source buffer pointer * @param[in] len Length of message starting at @c buf. @@ -742,6 +751,9 @@ psm3_mq_send(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag, * synchronously, meaning that the message will not be sent until * the receiver acknowledges that it has matched the send with a * receive buffer. + * @li PSM2_MQ_FLAG_INJECT tells PSM to consume the send buffer + * immediately to comply with FI_INJECT/fi_inject behavior, + * cannot be used in conjunction with PSM2_MQ_FLAG_SENDSYNC. * @param[in] stag Message Send Tag * @param[in] buf Source buffer pointer * @param[in] len Length of message starting at @c buf. @@ -776,6 +788,9 @@ psm3_mq_send2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, * synchronously, meaning that the message will not be sent until * the receiver acknowledges that it has matched the send with a * receive buffer. + * @li PSM2_MQ_FLAG_INJECT tells PSM to consume the send buffer + * immediately to comply with FI_INJECT/fi_inject behavior, + * cannot be used in conjunction with PSM2_MQ_FLAG_SENDSYNC. * @param[in] stag Message Send Tag * @param[in] buf Source buffer pointer * @param[in] len Length of message starting at @c buf. @@ -841,6 +856,9 @@ psm3_mq_isend(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag, * synchronously, meaning that the message will not be sent until * the receiver acknowledges that it has matched the send with a * receive buffer. + * @li PSM2_MQ_FLAG_INJECT tells PSM to consume the send buffer + * immediately to comply with FI_INJECT/fi_inject behavior, + * cannot be used in conjunction with PSM2_MQ_FLAG_SENDSYNC. * @param[in] stag Message Send Tag, array of three 32-bit values. * @param[in] buf Source buffer pointer * @param[in] len Length of message starting at @c buf. diff --git a/prov/psm3/psm3/psm_config.h b/prov/psm3/psm3/psm_config.h index 479b6f9d732..4ce7de78157 100644 --- a/prov/psm3/psm3/psm_config.h +++ b/prov/psm3/psm3/psm_config.h @@ -99,6 +99,11 @@ /* #define PSM_PROFILE */ #endif +// If defined, for FI_INJECT Send DMA will be avoided +#ifndef PSM_INJECT_NOSDMA +/* #define PSM_INJECT_NOSDMA */ +#endif + #define PSMI_MIN_EP_CONNECT_TIMEOUT (2 * SEC_ULL) #define PSMI_MIN_EP_CLOSE_TIMEOUT (1 * SEC_ULL) #define PSMI_MAX_EP_CLOSE_TIMEOUT (2 * SEC_ULL) @@ -174,9 +179,21 @@ #define PSM_MQ_NIC_MAX_TINY 8 /* max TINY payload allowed */ +#define PSM_MQ_NIC_RNDV_THRESH 64000 +#define PSM_CPU_NIC_RNDV_WINDOW_STR "131072" +#ifdef PSM_CUDA +#define PSM_GPU_NIC_RNDV_WINDOW_STR "2097152" +#elif defined(PSM_ONEAPI) +#define PSM_GPU_NIC_RNDV_WINDOW_STR "131072:524287,262144:1048575,524288" +#endif #define PSM_MQ_NIC_MAX_RNDV_WINDOW (4 * 1024 * 1024) /* max rndv window */ #define MQ_SHM_THRESH_RNDV 16000 +#if defined(PSM_CUDA) +#define MQ_SHM_GPU_THRESH_RNDV 127 +#elif defined(PSM_ONEAPI) +#define MQ_SHM_GPU_THRESH_RNDV 127 +#endif // LEARN_HASH_SELECTOR has PSM3 dynamically learn the combinations // of src_addr presence and tagsel used by a given middleware. 
This diff --git a/prov/psm3/psm3/psm_context.c b/prov/psm3/psm3/psm_context.c index 047cfbc38a3..35477d69f2f 100644 --- a/prov/psm3/psm3/psm_context.c +++ b/prov/psm3/psm3/psm_context.c @@ -58,7 +58,6 @@ #include "psm_user.h" #include "psm2_hal.h" -static int psmi_parse_nic_selection_algorithm(void); static psm2_error_t psm3_ep_verify_pkey(psm2_ep_t ep, uint16_t pkey, uint16_t *opkey, uint16_t* oindex); @@ -92,481 +91,6 @@ int psm3_context_interrupt_isenabled(psm2_ep_t ep) return psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED); } - -/* returns the 8-bit hash value of an uuid. */ -static inline -uint8_t -psm3_get_uuid_hash(psm2_uuid_t const uuid) -{ - int i; - uint8_t hashed_uuid = 0; - - for (i=0; i < sizeof(psm2_uuid_t); ++i) - hashed_uuid ^= *((uint8_t const *)uuid + i); - - return hashed_uuid; -} - -int psm3_get_current_proc_location() -{ - int core_id, node_id; - - core_id = sched_getcpu(); - if (core_id < 0) - return -EINVAL; - - node_id = numa_node_of_cpu(core_id); - if (node_id < 0) - return -EINVAL; - - return node_id; -} - -// print a bitmask in condensed form at _HFI_VDBG level -// condensed form consolidates sequential numbers such as: "0-43,88-131" -static void vdbg_print_bitmask(const char* prefix, struct bitmask *bmp) -{ - if (_HFI_VDBG_ON) { - int i, len; - char buf[1024]; - int last=-1; - int first=-1; - int max = numa_num_possible_nodes(); - - snprintf(buf, sizeof(buf), "%s", prefix); - len = strlen(buf); - for (i=0; i<max; i++) { - if (! numa_bitmask_isbitset(bmp, i)) - continue; - if (first < 0 || i - last > 1) { - if (first == last) { - // first in a possible sequence - snprintf(&buf[len], sizeof(buf)-len, ",%d", i); - } else { - // complete prior sequence, first in a new sequence - snprintf(&buf[len], sizeof(buf)-len, "-%d,%d", last, i); - } - first = i; - last = first; - } else { - last = i; - } - len = strlen(buf); - } - // complete prior sequence as needed - if (first>=0 && first != last) - snprintf(&buf[len], sizeof(buf)-len, "-%d", last); - _HFI_VDBG("%s\n", buf); - } -} - -// return the largest possible numa ID of a CPU in this system -int psm3_get_max_cpu_numa() -{ - static int max_cpu_numa = -1; - struct bitmask *cpumask, *empty_cpumask; - int i; - - if (max_cpu_numa >= 0) - return max_cpu_numa; - - // we don't depend on numa_num_configured_nodes since in theory there - // could be non-CPU memory NUMA nodes. We only need to know the - // largest possible value for a CPU numa node ID - - // numa_max_node - largest NUMA node which is not disabled - // numa_node_to_cpus - given a NUMA node, create list of CPUs - // numa_node_of_cpu - cpu ID to NUMA (or error if invalid CPU) - // numa_node_to_cpus - cpumask of CPUs on given NUMA node - - max_cpu_numa = -1; - empty_cpumask = numa_allocate_cpumask(); - numa_bitmask_clearall(empty_cpumask); - //vdbg_print_bitmask("empty_cpumask: ", empty_cpumask); - - cpumask = numa_allocate_cpumask(); - _HFI_VDBG("numa_max_node=%d\n", numa_max_node()); - for (i=numa_max_node(); i >= 0; i--) { - numa_bitmask_clearall(cpumask); - int ret = numa_node_to_cpus(i, cpumask); - _HFI_VDBG("i=%d node_to_cpus ret=%d\n", i, ret); - vdbg_print_bitmask("cpumask: ", cpumask); - if (ret >= 0 && ! numa_bitmask_equal(cpumask, empty_cpumask)) { - max_cpu_numa = i; - break; - } - } - numa_free_cpumask(cpumask); - numa_free_cpumask(empty_cpumask); - psmi_assert_always(max_cpu_numa >= 0); - return max_cpu_numa; -} - -/* search the list of all units for those which are active - * and optionally match the given NUMA node_id (when node_id >= 0) - * returns the number of active units found. 
- * Note get_unit_active tests for active ports, valid addresses and - * performs filtering as done in get_port_subnets - */ -static int -hfi_find_active_hfis(int nunits, int node_id, int *saved_hfis) -{ - int found = 0, unit_id; - - for (unit_id = 0; unit_id < nunits; unit_id++) { - int node_id_i; - - if (psmi_hal_get_unit_active(unit_id) <= 0) - continue; - - if (node_id < 0) { - saved_hfis[found++] = unit_id; - _HFI_DBG("RoundRobinAll Found NIC unit= %d, local rank=%d.\n", - unit_id, psm3_get_mylocalrank()); - } else if (!psmi_hal_get_node_id(unit_id, &node_id_i) - && node_id_i == node_id) { - saved_hfis[found++] = unit_id; - _HFI_DBG("RoundRobin Found NIC unit= %d, node = %d, local rank=%d.\n", - unit_id, node_id, psm3_get_mylocalrank()); - } - } - return found; -} - -static void -psmi_spread_nic_selection(psm2_uuid_t const job_key, long *unit_start, - long *unit_end, int nunits) -{ - { - int found, saved_hfis[nunits]; - - /* else, we are going to look at: - (a hash of the job key plus the local rank id) mod nunits. */ - found = hfi_find_active_hfis(nunits, -1, saved_hfis); - if (found) - *unit_start = saved_hfis[((psm3_get_mylocalrank()+1) + - psm3_get_uuid_hash(job_key)) % found]; - else - *unit_start = 0; // caller will fail - /* just in case, caller will check all other units, with wrap */ - if (*unit_start > 0) - *unit_end = *unit_start - 1; - else - *unit_end = nunits-1; - } - _HFI_DBG("RoundRobinAll Will select 1st viable NIC unit= %ld to %ld.\n", - *unit_start, *unit_end); -} - -static int -psm3_create_and_open_affinity_shm(psm2_uuid_t const job_key) -{ - int shm_fd, ret; - int first_to_create = 0; - size_t shm_name_len = 256; - - psmi_assert_always(psm3_affinity_semaphore_open); - if (psm3_affinity_shared_file_opened) { - /* opened and have our reference counted in shm */ - psmi_assert_always(psm3_affinity_shm_name != NULL); - psmi_assert_always(psm3_shared_affinity_ptr != NULL); - return 0; - } - - psm3_shared_affinity_ptr = NULL; - psm3_affinity_shm_name = (char *) psmi_malloc(PSMI_EP_NONE, UNDEFINED, shm_name_len); - - psmi_assert_always(psm3_affinity_shm_name != NULL); - snprintf(psm3_affinity_shm_name, shm_name_len, - AFFINITY_SHM_BASENAME".%d", - psm3_get_uuid_hash(job_key)); - shm_fd = shm_open(psm3_affinity_shm_name, O_RDWR | O_CREAT | O_EXCL, - S_IRUSR | S_IWUSR); - if ((shm_fd < 0) && (errno == EEXIST)) { - shm_fd = shm_open(psm3_affinity_shm_name, O_RDWR, S_IRUSR | S_IWUSR); - if (shm_fd < 0) { - _HFI_VDBG("Cannot open affinity shared mem fd:%s, errno=%d\n", - psm3_affinity_shm_name, errno); - goto free_name; - } - } else if (shm_fd >= 0) { - first_to_create = 1; - } else { - _HFI_VDBG("Cannot create affinity shared mem fd:%s, errno=%d\n", - psm3_affinity_shm_name, errno); - goto free_name; - } - - ret = ftruncate(shm_fd, PSMI_PAGESIZE); - if ( ret < 0 ) { - _HFI_VDBG("Cannot truncate affinity shared mem fd:%s, errno=%d\n", - psm3_affinity_shm_name, errno); - goto close_shm; - } - - psm3_shared_affinity_ptr = (uint64_t *) mmap(NULL, PSMI_PAGESIZE, PROT_READ | PROT_WRITE, - MAP_SHARED, shm_fd, 0); - if (psm3_shared_affinity_ptr == MAP_FAILED) { - _HFI_VDBG("Cannot mmap affinity shared memory: %s, errno=%d\n", - psm3_affinity_shm_name, errno); - goto close_shm; - } - close(shm_fd); - shm_fd = -1; - - if (first_to_create) { - _HFI_VDBG("Initializing shm to store NIC affinity per socket: %s\n", psm3_affinity_shm_name); - - memset(psm3_shared_affinity_ptr, 0, PSMI_PAGESIZE); - - /* - * Once shm object is initialized, unlock others to be able to - * use it. 
- */ - psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); - } else { - _HFI_VDBG("Opened shm object to read/write NIC affinity per socket: %s\n", psm3_affinity_shm_name); - } - - /* - * Start critical section to increment reference count when creating - * or opening shm object. Decrement of ref count will be done before - * closing the shm. - */ - if (psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name)) { - _HFI_VDBG("Could not enter critical section to update shm refcount\n"); - goto unmap_shm; - } - - psm3_shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] += 1; - _HFI_VDBG("shm refcount = %"PRId64"\n", psm3_shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION]); - - /* End critical section */ - psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); - - psm3_affinity_shared_file_opened = 1; - - return 0; - -unmap_shm: - munmap(psm3_shared_affinity_ptr, PSMI_PAGESIZE); - psm3_shared_affinity_ptr = NULL; -close_shm: - if (shm_fd >= 0) close(shm_fd); -free_name: - psmi_free(psm3_affinity_shm_name); - psm3_affinity_shm_name = NULL; - return -1; -} - -/* - * Spread HFI selection between units if we find more than one within a socket. - */ -static void -psmi_spread_hfi_within_socket(long *unit_start, long *unit_end, int node_id, - int *saved_hfis, int found, psm2_uuid_t const job_key) -{ - int ret, shm_location; - - /* - * Take affinity lock and open shared memory region to be able to - * accurately determine which HFI to pick for this process. If any - * issues, bail by picking first known HFI. - */ - if (!psm3_affinity_semaphore_open) - goto spread_hfi_fallback; - - ret = psm3_create_and_open_affinity_shm(job_key); - if (ret < 0) - goto spread_hfi_fallback; - - shm_location = AFFINITY_SHM_HFI_INDEX_LOCATION + node_id; - if (shm_location > PSMI_PAGESIZE) - goto spread_hfi_fallback; - - /* Start critical section to read/write shm object */ - if (psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name)) { - _HFI_VDBG("Could not enter critical section to update NIC index\n"); - goto spread_hfi_fallback; - } - - *unit_start = *unit_end = saved_hfis[psm3_shared_affinity_ptr[shm_location]]; - psm3_shared_affinity_ptr[shm_location] = - (psm3_shared_affinity_ptr[shm_location] + 1) % found; - _HFI_DBG("RoundRobin Selected NIC unit= %ld, Next NIC=%ld, node = %d, local rank=%d, found=%d.\n", - *unit_start, psm3_shared_affinity_ptr[shm_location], node_id, - psm3_get_mylocalrank(), found); - - /* End Critical Section */ - psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); - - return; - -spread_hfi_fallback: - *unit_start = *unit_end = saved_hfis[0]; -} - -static void -psm3_create_affinity_semaphores(psm2_uuid_t const job_key) -{ - int ret; - size_t sem_len = 256; - - /* - * If already opened, no need to do anything else. - * This could be true for Multi-EP cases where a different thread has - * already created the semaphores. 
We don't need separate locks here as - * we are protected by the overall "psm3_creation_lock" which each - * thread will take in psm3_ep_open() - */ - if (psm3_affinity_semaphore_open) - return; - - psm3_sem_affinity_shm_rw_name = (char *) psmi_malloc(PSMI_EP_NONE, UNDEFINED, sem_len); - psmi_assert_always(psm3_sem_affinity_shm_rw_name != NULL); - snprintf(psm3_sem_affinity_shm_rw_name, sem_len, - SEM_AFFINITY_SHM_RW_BASENAME".%d", - psm3_get_uuid_hash(job_key)); - - ret = psmi_init_semaphore(&psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name, - S_IRUSR | S_IWUSR, 0); - if (ret) { - _HFI_VDBG("Cannot initialize semaphore: %s for read-write access to shm object.\n", - psm3_sem_affinity_shm_rw_name); - if (psm3_sem_affinity_shm_rw) - sem_close(psm3_sem_affinity_shm_rw); - psmi_free(psm3_sem_affinity_shm_rw_name); - psm3_sem_affinity_shm_rw_name = NULL; - return; - } - - _HFI_VDBG("Semaphore: %s created for read-write access to shm object.\n", - psm3_sem_affinity_shm_rw_name); - - psm3_affinity_semaphore_open = 1; - - return; -} - -// return set of units to consider and which to start at. -// caller will use 1st active unit which can be opened. -// caller will wrap around so it's valid for start > end -// Note: When using multiple rails per PSM process, higher level code will -// walk through desired units and unit_param will specify a specific unit -static -psm2_error_t -psmi_compute_start_and_end_unit(long unit_param, long addr_index, - int nunitsactive,int nunits, - psm2_uuid_t const job_key, - long *unit_start,long *unit_end) -{ - unsigned short nic_sel_alg = PSMI_UNIT_SEL_ALG_ACROSS; - int node_id, found = 0; - int saved_hfis[nunits]; - - /* if the user did not set PSM3_NIC then ... */ - if (unit_param == PSM3_NIC_ANY) - { - if (nunitsactive > 1) { - // if NICs are on different planes (non-routed subnets) - // we need to have all ranks default to the same plane - // so force 1st active NIC in that case - int have_subnet = 0, unit_id; - psmi_subnet128_t got_subnet = { }; - for (unit_id = 0; unit_id < nunits; unit_id++) { - psmi_subnet128_t subnet; - if (psmi_hal_get_unit_active(unit_id) <= 0) - continue; - if (0 != psmi_hal_get_port_subnet(unit_id, 1 /* VERBS_PORT*/, - addr_index>0?addr_index:0, - &subnet, NULL, NULL, NULL)) - continue; // can't access NIC - if (! have_subnet) { - have_subnet = 1; - got_subnet = subnet; - } else if (! psm3_subnets_match(got_subnet, - subnet)) { - // active units have different tech - // (IB/OPA vs Eth) or different subnets - // caller will pick 1st active unit - *unit_start = 0; - *unit_end = nunits - 1; - _HFI_DBG("Multi-Plane config: Will select 1st viable NIC unit= %ld to %ld.\n", - *unit_start, *unit_end); - return PSM2_OK; - } - } - } - - /* Get the actual selection algorithm from the environment: */ - nic_sel_alg = psmi_parse_nic_selection_algorithm(); - /* If round-robin is selection algorithm and ... */ - if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS) && - /* there are more than 1 active units then ... */ - (nunitsactive > 1)) - { - /* - * Pick first HFI we find on same root complex - * as current task. If none found, fall back to - * RoundRobinAll load-balancing algorithm. 
- */ - node_id = psm3_get_current_proc_location(); - if (node_id >= 0) { - found = hfi_find_active_hfis(nunits, node_id, - saved_hfis); - if (found > 1) { - psm3_create_affinity_semaphores(job_key); - psmi_spread_hfi_within_socket(unit_start, unit_end, - node_id, saved_hfis, - found, job_key); - } else if (found == 1) { - *unit_start = *unit_end = saved_hfis[0]; - _HFI_DBG("RoundRobin Selected NIC unit= %ld, node = %d, local rank=%d, found=%d.\n", - *unit_start, node_id, - psm3_get_mylocalrank(), found); - } - } - - if (node_id < 0 || !found) { - _HFI_DBG("RoundRobin No local NIC found, using RoundRobinAll, node = %d, local rank=%d, found=%d.\n", - node_id, - psm3_get_mylocalrank(), found); - psmi_spread_nic_selection(job_key, unit_start, - unit_end, nunits); - } - } else if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS_ALL) && - (nunitsactive > 1)) { - psmi_spread_nic_selection(job_key, unit_start, - unit_end, nunits); - } - else { // PSMI_UNIT_SEL_ALG_WITHIN or only 1 active unit - // caller will pick 1st active unit - *unit_start = 0; - *unit_end = nunits - 1; - _HFI_DBG("%s: Will select 1st viable NIC unit= %ld to %ld.\n", - (nic_sel_alg == PSMI_UNIT_SEL_ALG_WITHIN) - ?"Packed":"Only 1 viable NIC", - *unit_start, *unit_end); - } - } else if (unit_param >= 0) { - /* the user specified PSM3_NIC, we use it. */ - *unit_start = *unit_end = unit_param; - _HFI_DBG("Caller selected NIC %ld.\n", *unit_start); - } else { - psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3 can't open unit: %ld for reading and writing", - unit_param); - return PSM2_EP_DEVICE_FAILURE; - } - - return PSM2_OK; -} - static int psmi_hash_addr_index(long unit, long port, long addr_index) { /* if the user did not set addr_index, then use a hash */ @@ -578,6 +102,9 @@ static int psmi_hash_addr_index(long unit, long port, long addr_index) return addr_index; } +// Open a single NIC. +// if unit_param is PSM3_NIC_ANY, the chosen PSM3_NIC_SELECTION_ALG will be +// used to pick a single active NIC psm2_error_t psm3_context_open(const psm2_ep_t ep, long unit_param, long port, long addr_index, psm2_uuid_t const job_key, uint16_t network_pkey, @@ -620,15 +147,15 @@ psm3_context_open(const psm2_ep_t ep, long unit_param, long port, long addr_inde unit_start = 0; unit_end = nunits - 1; - err = psmi_compute_start_and_end_unit(unit_param, addr_index, + err = psm3_compute_start_and_end_unit(unit_param, addr_index, nunitsactive, nunits, job_key, &unit_start, &unit_end); if (err != PSM2_OK) goto ret; - /* this is the start of a loop that starts at unit_start and goes to unit_end. - but note that the way the loop computes the loop control variable is by - an expression involving the mod operator. */ + /* Loop from unit_start to unit_end inclusive and pick 1st active found + * As needed wrap, so it's valid for unit_start >= unit_end + */ int success = 0; unit_id_prev = unit_id = unit_start; do @@ -645,6 +172,10 @@ psm3_context_open(const psm2_ep_t ep, long unit_param, long port, long addr_inde psmi_hash_addr_index(unit_id, port, addr_index), open_timeout, ep, job_key, HAL_CONTEXT_OPEN_RETRY_MAX)) { + // in modes where we refcount NIC use, + // psm3_compute_start_and_end_unit will have returned exactly + // 1 NIC and refcount'ed it, so we dec refcount here + psm3_dec_nic_refcount(unit_id); /* go to next unit if failed to open. 
 */
 			unit_id_prev = unit_id;
 			unit_id = (unit_id + 1) % nunits;
@@ -709,6 +240,7 @@ psm3_context_open(const psm2_ep_t ep, long unit_param, long port, long addr_inde
 
 close:
 	psmi_hal_close_context(ep);
+	psm3_dec_nic_refcount(ep->unit_id);
 bail:
 	_HFI_PRDBG("open failed: unit_id: %ld, err: %d (%s)\n", unit_id, err, strerror(errno));
 ret:
@@ -720,16 +252,21 @@ psm3_context_open(const psm2_ep_t ep, long unit_param, long port, long addr_inde
 psm2_error_t psm3_context_close(psm2_ep_t ep)
 {
 	psmi_hal_close_context(ep);
+	psm3_dec_nic_refcount(ep->unit_id);
 	return PSM2_OK;
 }
 
+// up to 4 digits per CPU number, plus a comma or dash
+#define MAX_CPU_AFFINITY_STRING (CPU_SETSIZE * 5)
+
 static inline char * _dump_cpu_affinity(char *buf, size_t buf_size, cpu_set_t * cpuset) {
 	int i;
-	int isfirst = 1;
-	char tmp[25]; //%d = 10 :: 10 + '-' + 10 + ',' + '\0' = 23
+	char tmp[25]; //%d, = 10+','+'\0' or %d-%d, = 10 + '-' + 10 + ',' + '\0' = 23
 	int first = -1, last = -1;
+	int len = 0;
+	*buf = '\0';
 	for (i = 0; i < CPU_SETSIZE; i++) {
 		if (CPU_ISSET(i, cpuset)) {
 			if (first == -1) {
@@ -745,13 +282,8 @@ static inline char * _dump_cpu_affinity(char *buf, size_t buf_size, cpu_set_t *
 			}
 
 			first = last = -1;
-			if (isfirst) {
-				strncpy(buf, tmp, buf_size-1);
-				isfirst=0;
-			} else {
-				strncat(buf, tmp, buf_size-1);
-			}
-			buf[buf_size-1] = '\0';
+			snprintf(&buf[len], buf_size-len,"%s", tmp);
+			len = strlen(buf);
 		}
 	}
@@ -761,26 +293,48 @@ static inline char * _dump_cpu_affinity(char *buf, size_t buf_size, cpu_set_t *
 		} else {
 			snprintf(tmp, sizeof(tmp), "%d-%d,", first, last);
 		}
-		if (isfirst) {
-			strncpy(buf, tmp, buf_size-1);
-		} else {
-			strncat(buf, tmp, buf_size-1);
-		}
-		buf[buf_size-1] = '\0';
+		snprintf(&buf[len], buf_size-len,"%s", tmp);
+		len = strlen(buf);
 	}
 
-	char *comma = strrchr(buf, ',');
-	if (comma) comma[0] = '\0';
+	if (len)
+		buf[len-1] = '\0'; // eliminate trailing comma
 	return buf;
 }
 
-// called by HAL context_open to set affinity consistent with
-// NIC NUMA location when NIC NUMA location is a superset of thread CPU set
-// TBD unclear when this provides value.
+// called by HAL context_open to narrow CPU affinity so it is consistent
+// with NIC NUMA location
+// Intel MPI sets PSM3_NO_CPUAFFINITY to disable this function
+// Suspect this is not effective or has bugs. For Omni-Path the NIC
+// driver set affinity before this was called, so this was likely a noop there.
+// This is a noop if:
+// - the NIC is not NUMA local to any of the CPUs in the existing affinity
+// - the existing affinity selects more cores than those local to the NIC,
+//   even if that set incompletely overlaps the NIC local core set
+//   (suspect this is a bug and the test should be the opposite, or should
+//   just test for overlap)
+// If the NIC is NUMA local to the CPU and the NIC core list is larger than
+// the existing affinity, this will limit the scope of affinity to cores
+// NUMA local to the NIC.
+// - does not consider the full set of selected NICs when multirail is enabled
+// - may only provide value if the CPU set from the caller is small but spans
+//   more than 1 CPU NUMA domain, in which case this will reduce it to a
+//   single CPU NUMA domain matching the NIC's NUMA location.
+//
+// By default this is enabled, but two undocumented variables
+// PSM3_FORCE_CPUAFFINITY and PSM3_NO_CPUAFFINITY can control this
+// as well as the ep_open skip_affinity flag. 
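+//
+// Illustrative example (an assumption for clarity, not part of the original
+// commentary): if the thread arrives bound to cores 0-15 spanning two
+// sockets and the selected NIC is local to cores 0-7, the overlap is
+// non-empty and affinity may be narrowed to cores 0-7; if the thread was
+// bound only to cores 8-15, the overlap is empty and affinity is left
+// untouched.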
+// // May be better if we analyzed NIC NUMA location and various other // process and thread locations when NIC NUMA is a subset of CPU affinity // and guide a good choice for CPU affinity, but that would require // intra-node process coordination to avoid duplicate CPU selections +// +// TBD for GPU affinity this may not make sense. Also PSM3 can't force a GPU +// selection for an app. +// +// TBD when PSM3 is using multiple NICs (PSM3_MULTIRAIL > 0) this should +// be enhanced to attempt to select a CPU based on location of all NICs being +// used, not just a single NIC. int psm3_context_set_affinity(psm2_ep_t ep, int unit) { @@ -796,8 +350,9 @@ psm3_context_set_affinity(psm2_ep_t ep, int unit) } if (_HFI_DBG_ON) { - char cpu_buf[128] = {0}; - _HFI_DBG_ALWAYS( "CPU affinity Before set: %s\n", _dump_cpu_affinity(cpu_buf, 128, &cpuset)); + char cpu_buf[MAX_CPU_AFFINITY_STRING] = {0}; + _HFI_DBG_ALWAYS( "CPU affinity Before set: %s\n", + _dump_cpu_affinity(cpu_buf, MAX_CPU_AFFINITY_STRING, &cpuset)); } /* @@ -837,10 +392,11 @@ psm3_context_set_affinity(psm2_ep_t ep, int unit) //err = -PSM_HAL_ERROR_GENERAL_ERROR; goto bail; } else if (cpu_and_count == 0 && _HFI_DBG_ON) { - char buf1[128] = {0}; - char buf2[128] = {0}; + char buf1[MAX_CPU_AFFINITY_STRING] = {0}; + char buf2[MAX_CPU_AFFINITY_STRING] = {0}; _HFI_DBG_ALWAYS( "CPU affinity not set, NIC selected is not on the same socket as thread (\"%s\" & \"%s\" == 0).\n", - _dump_cpu_affinity(buf1, 128, &nic_cpuset), _dump_cpu_affinity(buf2, 128, &cpuset)); + _dump_cpu_affinity(buf1, MAX_CPU_AFFINITY_STRING, &nic_cpuset), + _dump_cpu_affinity(buf2, MAX_CPU_AFFINITY_STRING, &cpuset)); } } skip_affinity: @@ -852,8 +408,9 @@ psm3_context_set_affinity(psm2_ep_t ep, int unit) "Can't get CPU affinity: %s\n", strerror(errno)); goto bail; } - char cpu_buf[128] = {0}; - _HFI_DBG_ALWAYS( "CPU affinity After set: %s\n", _dump_cpu_affinity(cpu_buf, 128, &cpuset)); + char cpu_buf[MAX_CPU_AFFINITY_STRING] = {0}; + _HFI_DBG_ALWAYS( "CPU affinity After set: %s\n", + _dump_cpu_affinity(cpu_buf, MAX_CPU_AFFINITY_STRING, &cpuset)); } return 0; @@ -904,39 +461,3 @@ psm3_ep_verify_pkey(psm2_ep_t ep, uint16_t pkey, uint16_t *opkey, uint16_t* oind return PSM2_OK; } - -static -int psmi_parse_nic_selection_algorithm(void) -{ - union psmi_envvar_val env_nic_alg; - int nic_alg = PSMI_UNIT_SEL_ALG_ACROSS; - - const char* PSM3_NIC_SELECTION_ALG_HELP = - "NIC Device Selection Algorithm to use. Round Robin[RoundRobin or rr] (Default) " - ", Packed[p] or Round Robin All[RoundRobinAll or rra]."; - - /* If a specific unit is set in the environment, use that one. 
*/ - psm3_getenv("PSM3_NIC_SELECTION_ALG", PSM3_NIC_SELECTION_ALG_HELP, - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, - (union psmi_envvar_val)"rr", &env_nic_alg); - - if (!strcasecmp(env_nic_alg.e_str, "Round Robin") - || !strcasecmp(env_nic_alg.e_str, "RoundRobin") - || !strcasecmp(env_nic_alg.e_str, "rr")) - nic_alg = PSMI_UNIT_SEL_ALG_ACROSS; - else if (!strcasecmp(env_nic_alg.e_str, "Packed") - || !strcasecmp(env_nic_alg.e_str, "p")) - nic_alg = PSMI_UNIT_SEL_ALG_WITHIN; - else if (!strcasecmp(env_nic_alg.e_str, "Round Robin All") - || !strcasecmp(env_nic_alg.e_str, "RoundRobinAll") - || !strcasecmp(env_nic_alg.e_str, "rra")) - nic_alg = PSMI_UNIT_SEL_ALG_ACROSS_ALL; - else { - _HFI_INFO( - "Invalid value for PSM3_NIC_SELECTION_ALG ('%s') %-40s Using: %s\n", - env_nic_alg.e_str, PSM3_NIC_SELECTION_ALG_HELP, "RoundRobin"); - nic_alg = PSMI_UNIT_SEL_ALG_ACROSS; - } - - return nic_alg; -} diff --git a/prov/psm3/psm3/psm_context.h b/prov/psm3/psm3/psm_context.h index 188e1284cc4..28339284bcf 100644 --- a/prov/psm3/psm3/psm_context.h +++ b/prov/psm3/psm3/psm_context.h @@ -76,21 +76,4 @@ psm3_context_set_affinity(psm2_ep_t ep, int unit); psm2_error_t psm3_context_interrupt_set(psm2_ep_t ep, int enable); int psm3_context_interrupt_isenabled(psm2_ep_t ep); -/* - * round robin contexts across HFIs, then - * ports; this is the default. - * This option spreads the HFI selection within the local socket. - * If it is preferred to spread job over over entire set of - * HFIs within the system, see ALG_ACROSS_ALL below. - */ -#define PSMI_UNIT_SEL_ALG_ACROSS PSM_HAL_ALG_ACROSS - -#define PSMI_UNIT_SEL_ALG_ACROSS_ALL PSM_HAL_ALG_ACROSS_ALL - -/* - * use all contexts on an HFI (round robin - * active ports within), then next HFI - */ -#define PSMI_UNIT_SEL_ALG_WITHIN PSM_HAL_ALG_WITHIN - #endif /* PSM_CONTEXT_H */ diff --git a/prov/psm3/psm3/psm_ep.c b/prov/psm3/psm3/psm_ep.c index 9e31af3e65c..36dbf40abfa 100644 --- a/prov/psm3/psm3/psm_ep.c +++ b/prov/psm3/psm3/psm_ep.c @@ -119,385 +119,6 @@ psm2_error_t psm3_ep_num_devunits(uint32_t *num_units_o) return PSM2_OK; } -struct rail_info { - psmi_subnet128_t subnet; - unsigned unit; - unsigned port; - unsigned addr_index; -}; - -static int cmpfunc(const void *p1, const void *p2) -{ - struct rail_info *a = ((struct rail_info *) p1); - struct rail_info *b = ((struct rail_info *) p2); - int ret; - - ret = psmi_subnet128_cmp(a->subnet, b->subnet); - if (ret == 0) { - if (a->addr_index < b->addr_index) - return -1; - else if (a->addr_index > b->addr_index) - return 1; - } - return ret; -} - -// process PSM3_MULTIRAIL and PSM3_MULTIRAIL_MAP and return the -// list of unit/port/addr_index in unit[0-(*num_rails-1)], -// port[0-(*num_rails-1)] and addr_index[0-(*num_rails-1)] -// When *num_rails is returned as 0, multirail is not enabled and -// other mechanisms (PSM3_NIC, PSM3_NIC_SELECTION_ALG) must be -// used by the caller to select a single NIC for the process -static psm2_error_t -psm3_ep_multirail(int *num_rails, uint32_t *unit, uint16_t *port, int *addr_index) -{ - uint32_t num_units = 0; - psmi_subnet128_t subnet; - unsigned i, j, k, count = 0; - int ret; - psm2_error_t err = PSM2_OK; - struct rail_info rail_info[PSMI_MAX_RAILS]; - union psmi_envvar_val env_multirail; - union psmi_envvar_val env_multirail_map; - int multirail_within_socket_used = 0; - int node_id = -1, found = 0; - - psm3_getenv("PSM3_MULTIRAIL", - "Use all available NICs in the system for communication.\n" - "-1: No NIC autoselection,\n" - "0: Disabled (default),\n" - "1: Enable 
multirail across all available NICs,\n" - "2: Enable multirail within socket.\n" - "\t For multirail within a socket, we try to find at\n" - "\t least one NIC on the same socket as current task.\n" - "\t If none found, we continue to use other NICs within\n" - "\t the system.", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)0, - &env_multirail); - if (env_multirail.e_int <= 0) { - *num_rails = 0; - return PSM2_OK; - } - - if (env_multirail.e_int == 2) - multirail_within_socket_used = 1; - -/* - * map is in format: unit:port-addr_index,unit:port-addr_index,... - * where :port is optional (default of 1) and unit can be name or number - * -addr_index is also optionall and defaults to "all" - * addr_index can be an integer between 0 and PSM3_ADDR_PER_NIC-1 - * or "any" or "all". "any" selects a single address using the hash and - * "all" setups a rail for each address. - */ -#define MAX_MAP_LEN (PSMI_MAX_RAILS*128) - if (!psm3_getenv("PSM3_MULTIRAIL_MAP", - "NIC selections for each rail in format:\n" - " rail,rail,...\n" -#if 0 - "Where rail can be: unit:port-addr_index or unit\n" -#else - "Where rail can be: unit-addr_index or unit\n" -#endif - "unit can be device name or unit number\n" -#if 0 - "where :port is optional (default of 1)\n" -#endif - "addr_index can be 0 to PSM3_ADDR_PER_NIC-1, or 'any' or 'all'\n" - "When addr_index is omitted, it defaults to 'all'\n" - "default autoselects", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, - (union psmi_envvar_val)"", &env_multirail_map)) { - - char temp[MAX_MAP_LEN+1]; - char *s; - char *delim; - - strncpy(temp, env_multirail_map.e_str, MAX_MAP_LEN); - if (temp[MAX_MAP_LEN-1] != 0) - return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP too long: '%s'", - env_multirail_map.e_str); - s = temp; - psmi_assert(*s); - do { - int u, p = 1; - int skip_port = 0; - int skip_addr_index = 0; - int a_index = PSM3_ADDR_INDEX_ALL; - - if (! *s) // trailing ',' on 2nd or later loop - break; - if (count >= PSMI_MAX_RAILS) - return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP exceeds %u rails: '%s'", - PSMI_MAX_RAILS, env_multirail_map.e_str); - - // find end of unit field and put in \0 as needed - delim = strpbrk(s, ":-,"); - if (!delim || *delim == ',') { - skip_port = 1; skip_addr_index = 1; - } else if (*delim == '-') { - skip_port = 1; - } - if (delim) - *delim = '\0'; - // parse unit - u = psm3_sysfs_find_unit(s); - if (u < 0) - return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP invalid unit: '%s'", s); - // find next field - if (delim) - s = delim+1; - if (! skip_port) { - // find end of port field and put in \0 as needed - delim = strpbrk(s, "-,"); - if (!delim || *delim == ',') - skip_addr_index = 1; - if (delim) - *delim = '\0'; - // parse port - p = psm3_parse_str_long(s); - if (p < 0) - return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP invalid port: '%s'", s); - // find next field - if (delim) - s = delim+1; - } - if (! 
skip_addr_index) { - // find end of addr_index field and put in \0 as needed - delim = strchr(s, ','); - if (delim) - *delim = '\0'; - // parse addr_index - if (0 == strcmp(s, "all")) - a_index = PSM3_ADDR_INDEX_ALL; // we will loop below - else if (0 == strcmp(s, "any")) - a_index = PSM3_ADDR_INDEX_ANY; // caller will pick - else { - a_index = psm3_parse_str_long(s); - if (a_index < 0 || a_index >= psm3_addr_per_nic) - return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP invalid addr index: '%s'", s); - } - // find next field - if (delim) - s = delim+1; - } - - if (a_index == PSM3_ADDR_INDEX_ALL) { // all - for (a_index = 0; a_index < psm3_addr_per_nic; a_index++) { - if (count >= PSMI_MAX_RAILS) - return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP exceeds %u rails: '%s' due to multi-ip", - PSMI_MAX_RAILS, env_multirail_map.e_str); - unit[count] = u; - port[count] = p; - addr_index[count] = a_index; - count++; - } - } else { - unit[count] = u; - port[count] = p; - addr_index[count] = a_index; - count++; - } - } while (delim); - *num_rails = count; - -/* - * Check if any of the port is not usable. Just use addr_index 0 for check - */ - for (i = 0; i < count; i++) { - _HFI_VDBG("rail %d: %u(%s) %u\n", i, - unit[i], psm3_sysfs_unit_dev_name(unit[i]), port[i]); - ret = psmi_hal_get_port_active(unit[i], port[i]); - if (ret <= 0) - return psm3_handle_error(NULL, - PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP: Unit/port: %d(%s):%d is not active.", - unit[i], psm3_sysfs_unit_dev_name(unit[i]), - port[i]); - ret = psmi_hal_get_port_lid(unit[i], port[i], 0 /* addr_index*/); - if (ret <= 0) - return psm3_handle_error(NULL, - PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP: unit %d(%s):%d was filtered out, unable to use", - unit[i], psm3_sysfs_unit_dev_name(unit[i]), - port[i]); - ret = psmi_hal_get_port_subnet(unit[i], port[i], 0 /* addr_index*/, NULL, NULL, NULL, NULL); - if (ret == -1) - return psm3_handle_error(NULL, - PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP: Couldn't get subnet for unit %d(%s):%d", - unit[i], psm3_sysfs_unit_dev_name(unit[i]), - port[i]); - } - return PSM2_OK; - } - - if ((err = psm3_ep_num_devunits(&num_units))) { - return err; - } - if (num_units > PSMI_MAX_RAILS) { - _HFI_INFO - ("Found %d units, max %d units are supported, use %d\n", - num_units, PSMI_MAX_RAILS, PSMI_MAX_RAILS); - num_units = PSMI_MAX_RAILS; - } - - /* - * PSM3_MULTIRAIL=2 functionality- - * - Try to find at least find one HFI in the same root - * complex. If none found, continue to run and - * use remaining HFIs in the system. - * - If we do find at least one HFI in same root complex, we - * go ahead and add to list. - */ - if (multirail_within_socket_used) { - node_id = psm3_get_current_proc_location(); - for (i = 0; i < num_units; i++) { - if (psmi_hal_get_unit_active(i) <= 0) - continue; - int node_id_i; - - if (!psmi_hal_get_node_id(i, &node_id_i)) { - if (node_id_i == node_id) { - found = 1; - break; - } - } - } - } -/* - * Get all the ports and addr_index with a valid lid and gid, one port per unit. 
- * but up to PSM3_ADDR_PER_NIC addresses - */ - for (i = 0; i < num_units; i++) { - int node_id_i; - - if (!psmi_hal_get_node_id(i, &node_id_i)) - { - if (multirail_within_socket_used && - found && (node_id_i != node_id)) - continue; - } - - for (j = PSM3_NIC_MIN_PORT; j <= PSM3_NIC_MAX_PORT; j++) { - int got_port = 0; - for (k = 0; k < psm3_addr_per_nic; k++) { - ret = psmi_hal_get_port_lid(i, j, k); - if (ret <= 0) - continue; - ret = psmi_hal_get_port_subnet(i, j, k, &subnet, NULL, NULL, NULL); - if (ret == -1) - continue; - - rail_info[count].subnet = subnet; - rail_info[count].unit = i; - rail_info[count].port = j; - rail_info[count].addr_index = k; - got_port = 1; - count++; - } - if (got_port) // one port per unit - break; - } - } - -/* - * Sort all the ports within rail_info from small to big. - * This is for multiple fabrics, and we use fabric with the - * smallest subnet to make the master connection. - */ - qsort(rail_info, count, sizeof(rail_info[0]), cmpfunc); - - for (i = 0; i < count; i++) { - unit[i] = rail_info[i].unit; - port[i] = rail_info[i].port; - addr_index[i] = rail_info[i].addr_index; - } - *num_rails = count; - return PSM2_OK; -} - -// this is used to find devices with the same address as another process, -// implying intra-node comms. -// we poplate hfi_nids and nnids with the set of network ids (NID) for -// all the local NICs. -// The caller will see if any of these NIDs match the NID of the remote process. -// Note that NIDs are globally unique and include both subnet and NIC address -// information, so we can compare them regardless of their subnet. -// NIDs which are not on the same subnet will not match. -// NIDs on the same subnet only match if they are the same NIC. -// Two local NICs with the same subnet and same address is an unexpected -// invalid config, and will silently match the two NICs. 
-#define MAX_GID_IDX 31
-static psm2_error_t
-psm3_ep_devnids(psm2_nid_t **nids, uint32_t *num_nids_o)
-{
-	uint32_t num_units = 0;
-	int i;
-	psm2_error_t err = PSM2_OK;
-
-	PSMI_ERR_UNLESS_INITIALIZED(NULL);
-
-	if (hfi_nids == NULL) {
-		if ((err = psm3_ep_num_devunits(&num_units)))
-			goto fail;
-		hfi_nids = (psm2_nid_t *)
-		    psmi_calloc(PSMI_EP_NONE, UNDEFINED,
-				num_units * psmi_hal_get_num_ports()*psm3_addr_per_nic, sizeof(*hfi_nids));
-		if (hfi_nids == NULL) {
-			err = psm3_handle_error(NULL, PSM2_NO_MEMORY,
-						"Couldn't allocate memory for dev_nids structure");
-			goto fail;
-		}
-
-		for (i = 0; i < num_units; i++) {
-			int j;
-			for (j = PSM3_NIC_MIN_PORT; j <= PSM3_NIC_MAX_PORT; j++) {
-				int k;
-				for (k = 0; k < psm3_addr_per_nic; k++) {
-					int lid = psmi_hal_get_port_lid(i, j, k);
-					int ret, idx = 0;
-					psmi_subnet128_t subnet = { };
-					psmi_naddr128_t addr = { };
-					psmi_gid128_t gid = { };
-
-					// skip ports which aren't ready for use
-					if (lid <= 0)
-						continue;
-					ret = psmi_hal_get_port_subnet(i, j, k, &subnet, &addr, &idx, &gid);
-					if (ret == -1)
-						continue;
-					hfi_nids[nnids] = psm3_build_nid(i, addr, lid);
-					_HFI_VDBG("NIC unit %d, port %d addr_index %d, found %s "
-						"GID[%d] %s subnet %s\n",
-						i, j, k,
-						psm3_nid_fmt(hfi_nids[nnids], 0),
-						idx, psm3_gid128_fmt(gid, 1),
-						psm3_subnet128_fmt(subnet, 2));
-					nnids++;
-				}
-			}
-		}
-		if (nnids == 0) {
-			err = psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
-						"Couldn't find any unfiltered units");
-			goto fail;
-		}
-	}
-	*nids = hfi_nids;
-	*num_nids_o = nnids;
-
-fail:
-	return err;
-}
-
 psm2_error_t psm3_ep_query(int *num_of_epinfo, psm2_epinfo_t *array_of_epinfo)
 {
 	psm2_error_t err = PSM2_OK;
@@ -632,6 +253,80 @@ psm2_error_t psm3_epaddr_to_epid(psm2_epaddr_t epaddr, psm2_epid_t *epid)
 	return err;
 }
 
+// this is used to find devices with the same address as another process,
+// implying intra-node comms.
+// we populate hfi_nids and nnids with the set of network ids (NID) for
+// all the local NICs.
+// The caller will see if any of these NIDs match the NID of the remote process.
+// Note that NIDs are globally unique and include both subnet and NIC address
+// information, so we can compare them regardless of their subnet.
+// NIDs which are not on the same subnet will not match.
+// NIDs on the same subnet only match if they are the same NIC.
+// Two local NICs with the same subnet and same address are an unexpected,
+// invalid config, and will silently match the two NICs. 
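+// For example (illustrative), if this host's NICs have NIDs {A, B} and a
+// peer process advertises NID A, that peer resolves as intra-node and shm
+// comms can be used to reach it.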
+#define MAX_GID_IDX 31 +static psm2_error_t +psm3_ep_devnids(psm2_nid_t **nids, uint32_t *num_nids_o) +{ + uint32_t num_units = 0; + int i; + psm2_error_t err = PSM2_OK; + + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + if (hfi_nids == NULL) { + if ((err = psm3_ep_num_devunits(&num_units))) + goto fail; + hfi_nids = (psm2_nid_t *) + psmi_calloc(PSMI_EP_NONE, UNDEFINED, + num_units * psmi_hal_get_num_ports()*psm3_addr_per_nic, sizeof(*hfi_nids)); + if (hfi_nids == NULL) { + err = psm3_handle_error(NULL, PSM2_NO_MEMORY, + "Couldn't allocate memory for dev_nids structure"); + goto fail; + } + + for (i = 0; i < num_units; i++) { + int j; + for (j = PSM3_NIC_MIN_PORT; j <= PSM3_NIC_MAX_PORT; j++) { + int k; + for (k = 0; k < psm3_addr_per_nic; k++) { + int lid = psmi_hal_get_port_lid(i, j, k); + int ret, idx = 0; + psmi_subnet128_t subnet = { }; + psmi_naddr128_t addr = { }; + psmi_gid128_t gid = { }; + + // skip ports which aren't ready for use + if (lid <= 0) + continue; + ret = psmi_hal_get_port_subnet(i, j, k, &subnet, &addr, &idx, &gid); + if (ret == -1) + continue; + hfi_nids[nnids] = psm3_build_nid(i, addr, lid); + _HFI_VDBG("NIC unit %d, port %d addr_index %d, found %s " + "GID[%d] %s subnet %s\n", + i, j, k, + psm3_nid_fmt(hfi_nids[nnids], 0), + idx, psm3_gid128_fmt(gid, 1), + psm3_subnet128_fmt(subnet, 2)); + nnids++; + } + } + } + if (nnids == 0) { + err = psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "Couldn't find any unfiltered units"); + goto fail; + } + } + *nids = hfi_nids; + *num_nids_o = nnids; + +fail: + return err; +} + // Indicate if the given epid is a local process. // In which case we can use intra-node shared memory comms with it. psm2_error_t @@ -714,6 +409,12 @@ psm2_error_t psm3_ep_open_opts_get_defaults(struct psm3_ep_open_opts *opts) psm2_error_t psm3_poll_noop(ptl_t *ptl, int replyonly, bool force); +// open a single internal EP for a single NIC +// For 1st internal EP opts may indicate PSM3_NIC_ANY in which case +// psm3_ep_open_device will let psm3_context_open pick the NIC based on +// PSM3_NIC_SELECTION_ALG. +// For multirail and when opening additional QPs for the NIC, opts will +// select a specific NIC. psm2_error_t psm3_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, struct psm3_ep_open_opts const *opts_i, psm2_mq_t mq, @@ -821,11 +522,13 @@ psm3_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, /* Get immediate data size - transfers less than immediate data size do * not consume a send buffer and require just a send descriptor. */ - if (!psm3_getenv("PSM3_SEND_IMMEDIATE_SIZE", - "Immediate data send size not requiring a buffer [128]", - PSMI_ENVVAR_LEVEL_HIDDEN, - PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)128, &envvar_val)) { + if (!psm3_getenv_range("PSM3_SEND_IMMEDIATE_SIZE", + "Immediate data send size not requiring a buffer. 
Default 128.", + "Actual permitted upper limit is NIC dependent.", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)128, + (union psmi_envvar_val)0, (union psmi_envvar_val)1024, + NULL, NULL, &envvar_val)) { opts.imm_size = envvar_val.e_uint; } @@ -1075,12 +778,10 @@ psm3_ep_open(psm2_uuid_t const unique_job_key, psm2_mq_t mq; psm2_epid_t epid; psm2_ep_t ep, tmp; - uint32_t units[PSMI_MAX_QPS]; - uint16_t ports[PSMI_MAX_QPS]; - int addr_indexes[PSMI_MAX_QPS]; - int i, num_rails = 0; + int i; int devid_enabled[PTL_MAX_INIT]; struct psm3_ep_open_opts opts = *opts_i; + struct multirail_config multirail_config = { 0 }; PSM2_LOG_MSG("entering"); PSMI_ERR_UNLESS_INITIALIZED(NULL); @@ -1127,15 +828,15 @@ psm3_ep_open(psm2_uuid_t const unique_job_key, goto fail; if (psm3_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { - err = psm3_ep_multirail(&num_rails, units, ports, addr_indexes); + err = psm3_ep_multirail(&multirail_config); if (err != PSM2_OK) goto fail; /* If multi-rail is used, set the first ep unit/port */ - if (num_rails > 0) { - opts.unit = units[0]; - opts.port = ports[0]; - opts.addr_index = addr_indexes[0]; + if (multirail_config.num_rails > 0) { + opts.unit = multirail_config.units[0]; + opts.port = multirail_config.ports[0]; + opts.addr_index = multirail_config.addr_indexes[0]; } } #if defined(PSM_CUDA) || defined(PSM_ONEAPI) @@ -1183,13 +884,13 @@ psm3_ep_open(psm2_uuid_t const unique_job_key, psmi_hal_context_initstats(ep); union psmi_envvar_val envvar_val; - if (num_rails <= 0) { + if (multirail_config.num_rails <= 0) { // the NIC has now been selected for our process // use the same NIC for any additional QPs below - num_rails = 1; - units[0] = ep->unit_id; - ports[0] = ep->portnum; - addr_indexes[0] = ep->addr_index; + multirail_config.num_rails = 1; + multirail_config.units[0] = ep->unit_id; + multirail_config.ports[0] = ep->portnum; + multirail_config.addr_indexes[0] = ep->addr_index; } // When QP_PER_NIC >1, creates more than 1 QP on each NIC and then // uses the multi-rail algorithms to spread the traffic across QPs @@ -1204,22 +905,28 @@ psm3_ep_open(psm2_uuid_t const unique_job_key, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)1, &envvar_val); - if ((num_rails * envvar_val.e_uint) > PSMI_MAX_QPS) { + if ((multirail_config.num_rails * envvar_val.e_uint) > PSMI_MAX_QPS) { err = psm3_handle_error(NULL, PSM2_TOO_MANY_ENDPOINTS, "PSM3_QP_PER_NIC (%u) * num_rails (%d) > Max Support QPs (%u)", - envvar_val.e_uint, num_rails, PSMI_MAX_QPS); + envvar_val.e_uint, multirail_config.num_rails, PSMI_MAX_QPS); goto fail; } for (j= 0; j< envvar_val.e_uint; j++) { - for (i = 0; i < num_rails; i++) { - _HFI_VDBG("rail %d unit %u port %u addr_index %d\n", i, units[i], ports[i], addr_indexes[i]); + // loop will open additional internal EPs for all + // the additional QPs on 1st rail and for all the + // additional rails and all the QPs on those rails + for (i = 0; i < multirail_config.num_rails; i++) { + _HFI_VDBG("rail %d unit %u port %u addr_index %d\n", i, + multirail_config.units[i], + multirail_config.ports[i], + multirail_config.addr_indexes[i]); // did 0, 0 already above if (i == 0 && j== 0) continue; - opts.unit = units[i]; - opts.port = ports[i]; - opts.addr_index = addr_indexes[i]; + opts.unit = multirail_config.units[i]; + opts.port = multirail_config.ports[i]; + opts.addr_index = multirail_config.addr_indexes[i]; /* Create secondary EP */ err = psm3_ep_open_internal(unique_job_key, @@ -1542,6 +1249,15 @@ psm3_parse_devices(int devices[PTL_MAX_INIT]) 
 	int len;
 	int i = 0;
 	union psmi_envvar_val devs;
+	static int have_value = 0;
+	static int saved[PTL_MAX_INIT];
+
+	// only parse once so it doesn't appear in PSM3_VERBOSE_ENV multiple times
+	if (have_value) {
+		for (i=0; i < PTL_MAX_INIT; i++)
+			devices[i] = saved[i];
+		return PSM2_OK;
+	}
 
 	/* See which ptl devices we want to use for this ep to be opened */
 	psm3_getenv("PSM3_DEVICES",
@@ -1605,6 +1321,9 @@ psm3_parse_devices(int devices[PTL_MAX_INIT])
 	*(b_new - 1) = '\0';
 	_HFI_PRDBG("PSM Device allocation order: %s\n", devstr);
+	for (i=0; i < PTL_MAX_INIT; i++)
+		saved[i] = devices[i];
+	have_value = 1;
 fail:
 	if (devstr != NULL)
 		psmi_free(devstr);
diff --git a/prov/psm3/psm3/psm_ep.h b/prov/psm3/psm3/psm_ep.h
index 609c75ea8b6..c1ec006eff9 100644
--- a/prov/psm3/psm3/psm_ep.h
+++ b/prov/psm3/psm3/psm_ep.h
@@ -123,6 +123,7 @@ struct psm2_ep {
 	uint16_t network_pkey_index;	/**> Pkey index */
 	int did_syslog;
 	const char *dev_name;	/* just for logging */
+	const char *addl_nic_info;	/* just for logging */
 	psm2_uuid_t uuid;
 	uint16_t jkey;
 	uint64_t service_id;	/* OPA service ID */
@@ -271,8 +272,6 @@ struct psm2_epaddr {
 		PSMI_PROFILE_UNBLOCK();		\
 	} while (0)
 
-psm2_error_t psm3_parse_devices(int devices[PTL_MAX_INIT]);
-int psm3_device_is_enabled(const int devices[PTL_MAX_INIT], int devid);
 int psm3_ep_device_is_enabled(const psm2_ep_t ep, int devid);
 
 #ifdef PSM_HAVE_RNDV_MOD
diff --git a/prov/psm3/psm3/psm_mpool.c b/prov/psm3/psm3/psm_mpool.c
index d6b6445a154..6bf33b7d74a 100644
--- a/prov/psm3/psm3/psm_mpool.c
+++ b/prov/psm3/psm3/psm_mpool.c
@@ -470,8 +470,10 @@ void MOCKABLE(psm3_mpool_get_obj_info)(mpool_t mp,
 				       uint32_t *num_obj_per_chunk,
 				       uint32_t *num_obj_max_total)
 {
-	*num_obj_per_chunk = mp->mp_num_obj_per_chunk;
-	*num_obj_max_total = mp->mp_num_obj_max_total;
+	if (num_obj_per_chunk)
+		*num_obj_per_chunk = mp->mp_num_obj_per_chunk;
+	if (num_obj_max_total)
+		*num_obj_max_total = mp->mp_num_obj_max_total;
 	return;
 }
 MOCK_DEF_EPILOGUE(psm3_mpool_get_obj_info);
diff --git a/prov/psm3/psm3/psm_mq.c b/prov/psm3/psm3/psm_mq.c
index ca6cd100b7c..5203715fff8 100644
--- a/prov/psm3/psm3/psm_mq.c
+++ b/prov/psm3/psm3/psm_mq.c
@@ -1445,6 +1445,18 @@ psm2_error_t psm3_mqopt_ctl(psm2_mq_t mq, uint32_t key, void *value, int get)
 		_HFI_VDBG("RNDV_SHM_SZ = %d (%s)\n",
 			mq->shm_thresh_rv, get ? "GET" : "SET");
 		break;
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+	case PSM2_MQ_GPU_RNDV_SHM_SZ:
+		if (get)
+			*((uint32_t *) value) = mq->shm_gpu_thresh_rv;
+		else {
+			val32 = *((uint32_t *) value);
+			mq->shm_gpu_thresh_rv = val32;
+		}
+		_HFI_VDBG("RNDV_GPU_SHM_SZ = %d (%s)\n",
+			mq->shm_gpu_thresh_rv, get ? "GET" : "SET");
+		break;
+#endif
 	case PSM2_MQ_MAX_SYSBUF_MBYTES:
 		/* Deprecated: this option no longer does anything. */
 		break;
@@ -1597,6 +1609,169 @@ psm3_mq_print_stats_finalize(psm2_mq_t mq)
 	}
 }
 
+/* parse a list of window_rv:limit values for
+ * PSM3_RNDV_NIC_WINDOW and PSM3_GPU_RNDV_NIC_WINDOW
+ * format is window:limit,window:limit,window
+ * limit values must be increasing; the limit for the last entry is optional
+ * and will be UINT32_MAX even if a value is specified. 
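+ * e.g. "131072:524288,4194304" yields a 128KB window for messages up to
+ * 524288 bytes and a 4MB window for all larger messages (the final entry's
+ * limit is forced to UINT32_MAX).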
+ * 0 - successfully parsed, *list points to malloced list + * -1 - str empty, *list unchanged + * -2 - syntax error, *list unchanged + */ +static int psm3_mq_parse_window_rv(const char *str, + size_t errstr_size, char errstr[], + struct psm3_mq_window_rv_entry **list) +{ +#define MAX_WINDOW_STR_LEN 1024 + char temp[MAX_WINDOW_STR_LEN+1]; + char *s; + char *delim; + struct psm3_mq_window_rv_entry *ret = NULL; + int i; + unsigned int win, limit; + int skip_limit; + + if (!str || ! *str) + return -1; + + strncpy(temp, str, MAX_WINDOW_STR_LEN); + if (temp[MAX_WINDOW_STR_LEN-1] != 0) { + // string too long + if (errstr_size) + snprintf(errstr, errstr_size, + " Value too long, limit %u characters", + MAX_WINDOW_STR_LEN-1); + return -2; + } + + s = temp; + i = 0; + do { + if (! *s) // trailing ',' on 2nd or later loop + break; + // find end of window field and put in \0 as needed + delim = strpbrk(s, ":,"); + skip_limit = (!delim || *delim == ','); + if (delim) + *delim = '\0'; + // parse window + if (psm3_parse_str_uint(s, &win, 1, PSM_MQ_NIC_MAX_RNDV_WINDOW)) { + if (errstr_size) + snprintf(errstr, errstr_size, " Invalid window_rv: %s", s); + goto fail; + } + // find next field + if (delim) + s = delim+1; + if (skip_limit) { + limit = UINT32_MAX; + } else { + delim = strpbrk(s, ","); + if (delim) + *delim = '\0'; + //parse limit + if (!strcasecmp(s, "max") || !strcasecmp(s, "maximum")) { + limit = UINT32_MAX; + } else { + if (psm3_parse_str_uint(s, &limit, 1, UINT32_MAX)) { + if (errstr_size) + snprintf(errstr, errstr_size, " Invalid limit: %s", s); + goto fail; + } + } + // find next field + if (delim) + s = delim+1; + } + if (i && ret[i-1].limit >= limit) { + if (errstr_size) + snprintf(errstr, errstr_size, " Limit not increasing: %u", limit); + goto fail; + } + + ret = (struct psm3_mq_window_rv_entry*)psmi_realloc(PSMI_EP_NONE, + UNDEFINED, ret, sizeof(struct psm3_mq_window_rv_entry)*(i+1)); + if (! ret) // keep scans happy + return -2; + ret[i].window_rv = ROUNDUP(win, PSMI_PAGESIZE); + ret[i].limit = limit; + i++; + } while (delim); + if (! i) + return -1; + // force last entry limit to UINT32_MAX so used for all remaining lengths + ret[i-1].limit = UINT32_MAX; + if (list) + *list = ret; + else + psmi_free(ret); + return 0; + +fail: + psmi_free(ret); + return -2; +} + +static int psm3_mq_parse_check_window_rv(int type, + const union psmi_envvar_val val, + void * ptr, + size_t errstr_size, char errstr[]) +{ + psmi_assert(type == PSMI_ENVVAR_TYPE_STR); + return psm3_mq_parse_window_rv(val.e_str, errstr_size, errstr, NULL); +} + +PSMI_ALWAYS_INLINE(uint32_t search_window(struct psm3_mq_window_rv_entry *e, + uint32_t len)) +{ + for (; len > e->limit; e++) + ; + return e->window_rv; +} + +// for CPU build, gpu argument ignored, but avoids needing ifdef in callers +uint32_t psm3_mq_max_window_rv(psm2_mq_t mq, int gpu) +{ + // must do search since window_rv may not be increasing (but usually is) + uint32_t ret = 0; + struct psm3_mq_window_rv_entry *e; +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + if (gpu) + e = mq->ips_gpu_window_rv; + else +#endif + e = mq->ips_cpu_window_rv; + do { + ret = max(ret, e->window_rv); + } while ((e++)->limit < UINT32_MAX); + return ret; +} + +uint32_t psm3_mq_get_window_rv(psm2_mq_req_t req) +{ + if (! 
req->window_rv) {
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+		if (req->is_buf_gpu_mem) {
+			req->window_rv = search_window(
+					req->mq->ips_gpu_window_rv,
+					req->req_data.send_msglen);
+		} else
+#endif /* PSM_CUDA || PSM_ONEAPI */
+			req->window_rv = search_window(req->mq->ips_cpu_window_rv,
+					req->req_data.send_msglen);
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+		_HFI_VDBG("Selected Window of %u for %u byte %s msg\n",
+			req->window_rv,
+			req->req_data.send_msglen,
+			req->is_buf_gpu_mem?"GPU":"CPU");
+#else
+		_HFI_VDBG("Selected Window of %u for %u byte msg\n",
+			req->window_rv, req->req_data.send_msglen);
+#endif
+	}
+	return req->window_rv;
+}
+
 /*
  * This is the API for the user. We actually allocate the MQ much earlier, but
  * the user can set options after obtaining an endpoint
@@ -2402,6 +2577,9 @@ psm2_error_t psm3_mq_malloc(psm2_mq_t *mqo)
 	// shm_thresh_rv is N/A to NIC and HAL, so we set this here and let
 	// HAL set the rest of the defaults
 	mq->shm_thresh_rv = MQ_SHM_THRESH_RNDV;
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+	mq->shm_gpu_thresh_rv = MQ_SHM_GPU_THRESH_RNDV;
+#endif
 
 	psmi_hal_mq_init_defaults(mq);
 
@@ -2426,6 +2604,9 @@ psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq)
 {
 	union psmi_envvar_val env_hfitiny, env_rvwin, env_hfirv,
 		env_shmrv, env_hash, env_stats;
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+	union psmi_envvar_val env_shmgpurv;
+#endif
 
 	// a limit of PSM_MQ_MAX_TINY bytes is hardcoded into the PSM protocol
 	psm3_getenv("PSM3_MQ_TINY_NIC_LIMIT",
@@ -2440,11 +2621,66 @@ psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq)
 		(union psmi_envvar_val)mq->hfi_thresh_rv, &env_hfirv);
 	mq->hfi_thresh_rv = env_hfirv.e_uint;
 
-	psm3_getenv("PSM3_MQ_RNDV_NIC_WINDOW",
-		"NIC rendezvous window size, max 4M",
-		PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
-		(union psmi_envvar_val)mq->hfi_base_window_rv, &env_rvwin);
-	mq->hfi_base_window_rv = min(PSM_MQ_NIC_MAX_RNDV_WINDOW, env_rvwin.e_uint);
+#define WINDOW_SYNTAX "Specified as window_size:limit,window_size:limit, ...\nwhere limit is the largest message size the window_size is applicable to.\nThe last window_size in the list will be used for all remaining message\nsizes (e.g. its limit is optional and ignored).\nwindow_size must be <= 4194304 and the limit in each entry must be larger\nthan the prior entry."
+
+	// for loopback, no ips so no window_rv
+	if (mq->ips_cpu_window_rv_str) {
+		int got_depwin = 0;	// using deprecated PSM3_MQ_RNDV_NIC_WINDOW
+
+		// PSM3_RNDV_NIC_WINDOW overrides deprecated PSM3_MQ_RNDV_NIC_WINDOW.
+		// only parse PSM3_MQ_RNDV_NIC_WINDOW if PSM3_RNDV_NIC_WINDOW
+		// was not specified and its default was used. 
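+		// Illustrative precedence: PSM3_RNDV_NIC_WINDOW=262144 wins over
+		// PSM3_MQ_RNDV_NIC_WINDOW=131072; with neither set, the HAL
+		// default in ips_cpu_window_rv_str is used.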
+		if (psm3_getenv_range("PSM3_RNDV_NIC_WINDOW",
+			"List of NIC rendezvous window sizes for messages to and from a CPU buffer.",
+			WINDOW_SYNTAX,
+			PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+			(union psmi_envvar_val)(char*)(mq->ips_cpu_window_rv_str),
+			(union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL,
+			psm3_mq_parse_check_window_rv, NULL, &env_rvwin) > 0) {
+			// new syntax is superset of old
+			got_depwin = (0 == psm3_getenv_range("PSM3_MQ_RNDV_NIC_WINDOW",
+				"[Deprecated, use PSM3_RNDV_NIC_WINDOW and PSM3_GPU_RNDV_NIC_WINDOW]",
+				"NIC rendezvous window size, max 4194304",
+				PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR,
+				(union psmi_envvar_val)(char*)(mq->ips_cpu_window_rv_str),
+				(union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL,
+				psm3_mq_parse_check_window_rv, NULL, &env_rvwin));
+		}
+		if (psm3_mq_parse_window_rv(env_rvwin.e_str, 0, NULL,
+				&mq->ips_cpu_window_rv) < 0) {
+			// already checked, shouldn't get parse errors nor empty strings
+			psmi_assert(0);
+		}
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+		if (PSMI_IS_GPU_ENABLED && mq->ips_gpu_window_rv_str) {
+			union psmi_envvar_val env_gpurvwin;
+			char *env;
+
+			env = psm3_env_get("PSM3_GPU_RNDV_NIC_WINDOW");
+			if (env && *env)
+				got_depwin = 0;	// use new default as default
+			// PSM3_GPU_RNDV_NIC_WINDOW overrides deprecated
+			// PSM3_MQ_RNDV_NIC_WINDOW.
+			// If PSM3_GPU_RNDV_NIC_WINDOW not specified and user specified
+			// PSM3_MQ_RNDV_NIC_WINDOW, use it for GPU too.
+			(void)psm3_getenv_range("PSM3_GPU_RNDV_NIC_WINDOW",
+				"List of NIC rendezvous window sizes for messages to or from a GPU buffer.",
+				WINDOW_SYNTAX,
+				PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+				got_depwin?env_rvwin:
+					(union psmi_envvar_val)(char*)(mq->ips_gpu_window_rv_str),
+				(union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL,
+				psm3_mq_parse_check_window_rv, NULL, &env_gpurvwin);
+			if (psm3_mq_parse_window_rv(env_gpurvwin.e_str, 0, NULL,
+					&mq->ips_gpu_window_rv)< 0) {
+				// already checked, shouldn't get parse errors nor empty strings
+				psmi_assert(0);
+			}
+		}
+#else
+		(void)got_depwin;	// keep compiler happy
+#endif /* PSM_CUDA || PSM_ONEAPI */
+	}
 
 	/* Re-evaluate this since it may have changed after initializing the shm
 	 * device */
@@ -2455,6 +2691,17 @@ psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq)
 		(union psmi_envvar_val)mq->shm_thresh_rv, &env_shmrv);
 	mq->shm_thresh_rv = env_shmrv.e_uint;
 
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+	if (PSMI_IS_GPU_ENABLED) {
+		mq->shm_gpu_thresh_rv = psm3_shm_mq_gpu_rv_thresh;
+		psm3_getenv("PSM3_MQ_RNDV_SHM_GPU_THRESH",
+			"shm eager-to-rendezvous switchover for GPU send",
+			PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+			(union psmi_envvar_val)mq->shm_gpu_thresh_rv, &env_shmgpurv);
+		mq->shm_gpu_thresh_rv = env_shmgpurv.e_uint;
+	}
+#endif
+
 	psm3_getenv("PSM3_MQ_HASH_THRESH",
 		"linear list to hash tag matching switchover",
 		PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
@@ -2486,6 +2733,10 @@ psm2_error_t MOCKABLE(psm3_mq_free)(psm2_mq_t mq)
 	psm3_mq_req_fini(mq);
 	psm3_mq_sysbuf_fini(mq);
 	psm3_stats_deregister_type(PSMI_STATSTYPE_MQ, mq);
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+	psmi_free(mq->ips_gpu_window_rv);
+#endif
+	psmi_free(mq->ips_cpu_window_rv);
 	psmi_free(mq);
 	return PSM2_OK;
 }
diff --git a/prov/psm3/psm3/psm_mq_internal.h b/prov/psm3/psm3/psm_mq_internal.h
index f83e50bbffd..6c7127b0245 100644
--- a/prov/psm3/psm3/psm_mq_internal.h
+++ b/prov/psm3/psm3/psm_mq_internal.h
@@ -85,6 +85,11 @@ struct psm2_mq_perf_data
 	int perf_print_stats;
 };
 
+struct psm3_mq_window_rv_entry 
{ + uint32_t window_rv; + uint32_t limit; +}; + #ifdef LEARN_HASH_SELECTOR // When transition back to nohash mode, should the prior // learned table_sel be retained for use next time transition to hash mode. @@ -175,9 +180,15 @@ struct psm2_mq { uint32_t hfi_thresh_tiny; uint32_t hfi_thresh_rv; uint32_t shm_thresh_rv; - uint32_t hfi_base_window_rv; /**> this is a base rndv window size, - will be further trimmed down per-connection based - on the peer's MTU */ +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + uint32_t shm_gpu_thresh_rv; +#endif + const char *ips_cpu_window_rv_str; // default input to parser + struct psm3_mq_window_rv_entry *ips_cpu_window_rv; +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + const char *ips_gpu_window_rv_str; // default input to parser + struct psm3_mq_window_rv_entry *ips_gpu_window_rv; +#endif uint32_t hash_thresh; int memmode; @@ -313,6 +324,7 @@ struct psm2_mq_req { mq_rts_callback_fn_t rts_callback; psm2_epaddr_t rts_peer; uintptr_t rts_sbuf; + uint32_t window_rv; // window size chosen by receiver or GPU send prefetcher #ifdef PSM_HAVE_REG_MR psm3_verbs_mr_t mr; // local registered memory for app buffer @@ -752,6 +764,9 @@ psm2_mq_req_t psm3_mq_req_match(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t * psm2_error_t psm3_mq_malloc(psm2_mq_t *mqo); psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq); psm2_error_t psm3_mq_initstats(psm2_mq_t mq, psm2_epid_t epid); +extern uint32_t psm3_mq_max_window_rv(psm2_mq_t mq, int gpu); +uint32_t psm3_mq_get_window_rv(psm2_mq_req_t req); + psm2_error_t MOCKABLE(psm3_mq_free)(psm2_mq_t mq); MOCK_DCL_EPILOGUE(psm3_mq_free); diff --git a/prov/psm3/psm3/psm_mq_recv.c b/prov/psm3/psm3/psm_mq_recv.c index bc90d07c5cf..7b481351843 100644 --- a/prov/psm3/psm3/psm_mq_recv.c +++ b/prov/psm3/psm3/psm_mq_recv.c @@ -199,11 +199,13 @@ psm3_mq_req_copy(psm2_mq_req_t req, } if (msgptr != buf) { #if defined(PSM_CUDA) || defined(PSM_ONEAPI) + // for loopback HAL, invalid to call psm3_mq_get_window_rv() + // however, for loopback HAL, gdr copy is disabled if (use_gdrcopy) psm3_mq_req_gpu_copy((uint64_t)req->req_data.buf, req->req_data.recv_msglen, (uint64_t)msgptr, msglen_this, - req->mq->hfi_base_window_rv, buf, + psm3_mq_get_window_rv(req), buf, ep); else #endif diff --git a/prov/psm3/psm3/psm_mq_utils.c b/prov/psm3/psm3/psm_mq_utils.c index af2988f64f1..7e80739373a 100644 --- a/prov/psm3/psm3/psm_mq_utils.c +++ b/prov/psm3/psm3/psm_mq_utils.c @@ -82,9 +82,9 @@ psm2_mq_req_t MOCKABLE(psm3_mq_req_alloc)(psm2_mq_t mq, uint32_t type) return req; } else { /* we're out of reqs */ int issend = (type == MQE_TYPE_SEND); - uint32_t reqmax, reqchunk; + uint32_t reqmax; psm3_mpool_get_obj_info(issend ? mq->sreq_pool : mq->rreq_pool, - &reqchunk, &reqmax); + NULL, &reqmax); psm3_handle_error(PSMI_EP_NORETURN, PSM2_PARAM_ERR, "Exhausted %d MQ %s request descriptors, which usually indicates " diff --git a/prov/psm3/psm3/psm_nic_select.c b/prov/psm3/psm3/psm_nic_select.c new file mode 100644 index 00000000000..1a451f5eb67 --- /dev/null +++ b/prov/psm3/psm3/psm_nic_select.c @@ -0,0 +1,2098 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2024 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. 
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2024 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2024 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "psm_user.h"
+#include "psm2_hal.h"
+#ifdef PSM_USE_HWLOC
+#include <hwloc.h>
+#include <hwloc/glibc-sched.h>
+#endif
+
+#define MAX_MAP_LEN (PSMI_MAX_RAILS*128)
+
+// sanity check, psm_user.h should ensure this, unless user tried to
+// manually set PSM_HAVE_GPU_CENTRIC_AFFINITY
+#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY
+#ifndef PSM_USE_HWLOC
+#error "PSM_HAVE_GPU_CENTRIC_AFFINITY set without PSM_USE_HWLOC"
+#endif
+#endif
+
+// subnuma is risky right now, so disable and explore in future
+//#ifdef PSM_USE_HWLOC
+//#define PSM3_HAVE_CPU_SUBNUMA
+//#endif
+#undef PSM3_HAVE_CPU_SUBNUMA
+
+#if defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA)
+struct pci_addr {
+	uint32_t domain;
+	uint32_t bus;
+	uint32_t dev;
+	uint32_t func;
+};
+#endif
+
+// table of refcount per unit_id counting references by endpoints within
+// local process
+// protected by psm3_creation_lock (held in psm_ep.c during EP open and close)
+static uint64_t psm3_nic_refcount[PSMI_MAX_RAILS];
+
+// psm3_shared_affinity_nic_refcount_ptr is the pointer to table of refcount
+// per unit_id counting references by all processes within node. 
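+// (references are taken when psm3_compute_start_and_end_unit selects a NIC
+// in refcounting modes and released via psm3_dec_nic_refcount on endpoint
+// close or failed open; see psm3_context_open/psm3_context_close)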
+// protected by psm3_sem_affinity_shm_rw semaphore
+
+static int psmi_parse_nic_selection_algorithm(void);
+
+#if defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA)
+static hwloc_topology_t psm3_hwloc_topology;
+static int psm3_hwloc_topology_initialized;
+static int psm3_hwloc_topology_init_failed;
+static void psm3_deferred_hwloc_topology_init();
+#endif
+
+#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY
+static int psm3_get_distance_between_pcis(const struct pci_addr *pci_addr_1,
+				const struct pci_addr *pci_addr_2);
+#endif
+#ifdef PSM3_HAVE_CPU_SUBNUMA
+static hwloc_obj_t psm3_get_non_io_ancestor_obj(
+				const struct pci_addr *pci_addr);
+#endif
+
+// As we consider and select NICs, we fill in additional information
+// or set filtered to exclude the NIC from further consideration.
+// The use of filtered avoids the cost of repeatedly compressing the list.
+struct nic_info {
+	uint8_t filtered;	// has NIC been filtered out from possible selection
+	psmi_subnet128_t subnet;
+	unsigned unit;
+	unsigned port;
+	unsigned addr_index;
+	int numa_id;	// CPU NUMA location of NIC
+#if defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA)
+	struct pci_addr pci_addr;
+#endif
+#ifdef PSM3_HAVE_CPU_SUBNUMA
+	int cpu_close;	// is CPU sub-numa close to NIC
+#endif
+#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY
+	int gpu_distance;
+#endif
+};
+
+
+/* returns the 8-bit hash value of a uuid. */
+static inline
+uint8_t
+psm3_get_uuid_hash(psm2_uuid_t const uuid)
+{
+	int i;
+	uint8_t hashed_uuid = 0;
+
+	for (i=0; i < sizeof(psm2_uuid_t); ++i)
+		hashed_uuid ^= *((uint8_t const *)uuid + i);
+
+	return hashed_uuid;
+}
+
+int psm3_get_current_proc_location()
+{
+	int core_id, node_id;
+
+	core_id = sched_getcpu();
+	if (core_id < 0)
+		return -EINVAL;
+
+	node_id = numa_node_of_cpu(core_id);
+	if (node_id < 0)
+		return -EINVAL;
+
+	return node_id;
+}
+
+// print a bitmask in condensed form at _HFI_VDBG level
+// condensed form consolidates sequential numbers such as: "0-43,88-131"
+static void vdbg_print_bitmask(const char* prefix, struct bitmask *bmp)
+{
+	if (_HFI_VDBG_ON) {
+		int i, len;
+		char buf[1024];
+		int last=-1;
+		int first=-1;
+		int max = numa_num_possible_nodes();
+
+		snprintf(buf, sizeof(buf), "%s", prefix);
+		len = strlen(buf);
+		for (i=0; i<max; i++) {
+			if (! numa_bitmask_isbitset(bmp, i))
+				continue;
+			if (last == -1 || i - last > 1) {
+				if (first == last) {
+					// first in a possible sequence
+					snprintf(&buf[len], sizeof(buf)-len, ",%d", i);
+				} else {
+					// complete prior sequence, first in a new sequence
+					snprintf(&buf[len], sizeof(buf)-len, "-%d,%d", last, i);
+				}
+				first = i;
+				last = first;
+			} else {
+				last = i;
+			}
+			len = strlen(buf);
+		}
+		// complete prior sequence as needed
+		if (first>=0 && first != last)
+			snprintf(&buf[len], sizeof(buf)-len, "-%d", last);
+		_HFI_VDBG("%s\n", buf);
+	}
+}
+
+// return the largest possible numa ID of a CPU in this system
+int psm3_get_max_cpu_numa()
+{
+	static int max_cpu_numa = -1;
+	struct bitmask *cpumask, *empty_cpumask;
+	int i;
+
+	if (max_cpu_numa >= 0)
+		return max_cpu_numa;
+
+	// we don't depend on numa_num_configured_nodes since in theory there
+	// could be non-CPU memory NUMA nodes. 
We only need to know the
+	// largest possible value for a CPU numa node ID
+
+	// numa_max_node - largest NUMA node which is not disabled
+	// numa_node_to_cpus - given a NUMA node, returns the cpumask of CPUs on it
+	// numa_node_of_cpu - cpu ID to NUMA (or error if invalid CPU)
+
+	max_cpu_numa = -1;
+	empty_cpumask = numa_allocate_cpumask();
+	numa_bitmask_clearall(empty_cpumask);
+	//vdbg_print_bitmask("empty_cpumask: ", empty_cpumask);
+
+	cpumask = numa_allocate_cpumask();
+	_HFI_VDBG("numa_max_node=%d\n", numa_max_node());
+	for (i=numa_max_node(); i >= 0; i--) {
+		numa_bitmask_clearall(cpumask);
+		int ret = numa_node_to_cpus(i, cpumask);
+		_HFI_VDBG("i=%d node_to_cpus ret=%d\n", i, ret);
+		vdbg_print_bitmask("cpumask: ", cpumask);
+		if (ret >= 0 && ! numa_bitmask_equal(cpumask, empty_cpumask)) {
+			max_cpu_numa = i;
+			break;
+		}
+	}
+	numa_free_cpumask(cpumask);
+	numa_free_cpumask(empty_cpumask);
+	psmi_assert_always(max_cpu_numa >= 0);
+	return max_cpu_numa;
+}
+
+/* search the list of all units for those which are active
+ * and optionally match the given NUMA node_id (when node_id >= 0)
+ * returns the number of active units found.
+ * Note get_unit_active tests for active ports, valid addresses and
+ * performs filtering as done in get_port_subnets
+ */
+static int
+hfi_find_active_hfis(int nunits, int node_id, int *saved_hfis)
+{
+	int found = 0, unit_id;
+
+	for (unit_id = 0; unit_id < nunits; unit_id++) {
+		int node_id_i;
+
+		if (psmi_hal_get_unit_active(unit_id) <= 0)
+			continue;
+
+		if (node_id < 0) {
+			saved_hfis[found++] = unit_id;
+			_HFI_DBG("RoundRobinAll Found NIC unit= %d, local rank=%d.\n",
+				unit_id, psm3_get_mylocalrank());
+		} else if (!psmi_hal_get_node_id(unit_id, &node_id_i)
+				&& node_id_i == node_id) {
+			saved_hfis[found++] = unit_id;
+			_HFI_DBG("RoundRobin Found NIC unit= %d, node = %d, local rank=%d.\n",
+				unit_id, node_id, psm3_get_mylocalrank());
+		}
+	}
+	return found;
+}
+
+// select NIC across all NICs, use a hash of job_id and local rank to
+// distribute local ranks across NICs and to attempt to distribute
+// jobs across NICs.
+// TBD - if we know we'll never have >1 job per node, we could ignore job_id,
+// perhaps with an env to exclude job_id from the hash so NIC selection is
+// deterministic
+static void
+psmi_spread_nic_selection(psm2_uuid_t const job_key, long *unit_start,
+			long *unit_end, int nunits)
+{
+	{
+		int found, saved_hfis[nunits];
+
+		/* else, we are going to look at:
+		   (a hash of the job key plus the local rank id) mod nunits. 
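e.g. with found=4 active NICs, local rank 2, and a job key hash of 9, this picks saved_hfis[((2+1) + 9) % 4] = saved_hfis[0]. 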
*/ + found = hfi_find_active_hfis(nunits, -1, saved_hfis); + if (found) + *unit_start = saved_hfis[((psm3_get_mylocalrank()+1) + + psm3_get_uuid_hash(job_key)) % found]; + else + // none found, caller will fail, start is a don't care + *unit_start = 0; + /* just in case, caller will check all other units, with wrap */ + if (*unit_start > 0) + *unit_end = *unit_start - 1; + else + *unit_end = nunits-1; + } + _HFI_DBG("RoundRobinAll Will select 1st viable NIC unit= %ld to %ld.\n", + *unit_start, *unit_end); +} + +static int +psm3_create_and_open_affinity_shm(psm2_uuid_t const job_key) +{ + int shm_fd, ret; + int first_to_create = 0; + size_t shm_name_len = 256; + + psmi_assert_always(psm3_affinity_semaphore_open); + if (psm3_affinity_shared_file_opened) { + /* opened and have our reference counted in shm */ + psmi_assert_always(psm3_affinity_shm_name != NULL); + psmi_assert_always(psm3_shared_affinity_ptr != NULL); + return 0; + } + + psm3_shared_affinity_ptr = NULL; + psm3_affinity_shm_name = (char *) psmi_malloc(PSMI_EP_NONE, UNDEFINED, shm_name_len); + + psmi_assert_always(psm3_affinity_shm_name != NULL); + snprintf(psm3_affinity_shm_name, shm_name_len, + AFFINITY_SHM_BASENAME".%d", + psm3_get_uuid_hash(job_key)); + shm_fd = shm_open(psm3_affinity_shm_name, O_RDWR | O_CREAT | O_EXCL, + S_IRUSR | S_IWUSR); + if ((shm_fd < 0) && (errno == EEXIST)) { + shm_fd = shm_open(psm3_affinity_shm_name, O_RDWR, S_IRUSR | S_IWUSR); + if (shm_fd < 0) { + _HFI_VDBG("Cannot open affinity shared mem fd:%s, errno=%d\n", + psm3_affinity_shm_name, errno); + goto free_name; + } + } else if (shm_fd >= 0) { + first_to_create = 1; + } else { + _HFI_VDBG("Cannot create affinity shared mem fd:%s, errno=%d\n", + psm3_affinity_shm_name, errno); + goto free_name; + } + + ret = ftruncate(shm_fd, PSMI_PAGESIZE); + if ( ret < 0 ) { + _HFI_VDBG("Cannot truncate affinity shared mem fd:%s, errno=%d\n", + psm3_affinity_shm_name, errno); + goto close_shm; + } + + psm3_shared_affinity_ptr = (uint64_t *) mmap(NULL, PSMI_PAGESIZE, PROT_READ | PROT_WRITE, + MAP_SHARED, shm_fd, 0); + if (psm3_shared_affinity_ptr == MAP_FAILED) { + _HFI_VDBG("Cannot mmap affinity shared memory: %s, errno=%d\n", + psm3_affinity_shm_name, errno); + goto close_shm; + } + close(shm_fd); + shm_fd = -1; + + if (first_to_create) { + _HFI_VDBG("Initializing shm to store NIC affinity per socket: %s\n", psm3_affinity_shm_name); + + memset(psm3_shared_affinity_ptr, 0, PSMI_PAGESIZE); + + /* + * Once shm object is initialized, unlock others to be able to + * use it. + */ + psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); + } else { + _HFI_VDBG("Opened shm object to read/write NIC affinity per socket: %s\n", psm3_affinity_shm_name); + } + + /* + * Start critical section to increment reference count when creating + * or opening shm object. Decrement of ref count will be done before + * closing the shm. 
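+	 * For illustration (hypothetical sequence): if 3 local processes
+	 * open this shm object, the refcount word holds 3 once all of them
+	 * have passed this section, and each process decrements it again
+	 * when it closes the shm.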
+ */ + if (psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name)) { + _HFI_VDBG("Could not enter critical section to update shm refcount\n"); + goto unmap_shm; + } + + psm3_shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] += 1; + _HFI_VDBG("shm refcount = %"PRId64"\n", psm3_shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION]); + + /* End critical section */ + psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); + + psm3_affinity_shared_file_opened = 1; + + return 0; + +unmap_shm: + munmap(psm3_shared_affinity_ptr, PSMI_PAGESIZE); + psm3_shared_affinity_ptr = NULL; +close_shm: + if (shm_fd >= 0) close(shm_fd); +free_name: + psmi_free(psm3_affinity_shm_name); + psm3_affinity_shm_name = NULL; + return -1; +} + +/* + * Spread HFI selection between units if we find more than one within a socket. + */ +static void +psmi_spread_hfi_within_socket(long *unit_start, long *unit_end, int node_id, + int *saved_hfis, int found, psm2_uuid_t const job_key) +{ + int ret, shm_location; + + /* + * Take affinity lock and open shared memory region to be able to + * accurately determine which HFI to pick for this process. If any + * issues, bail by picking first known HFI. + */ + if (!psm3_affinity_semaphore_open) + goto spread_hfi_fallback; + + ret = psm3_create_and_open_affinity_shm(job_key); + if (ret < 0) + goto spread_hfi_fallback; + + // one shm entry per CPU NUMA domain + // The entry contains the next round robin NIC to use + // in the form of a index into saved_hfis + // saved_hfis has a list of all the NUMA local active NICs + shm_location = AFFINITY_SHM_HFI_INDEX_LOCATION + node_id; + if (shm_location > PSMI_PAGESIZE) + goto spread_hfi_fallback; + + /* Start critical section to read/write shm object */ + if (psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name)) { + _HFI_VDBG("Could not enter critical section to update NIC index\n"); + goto spread_hfi_fallback; + } + + *unit_start = *unit_end = saved_hfis[psm3_shared_affinity_ptr[shm_location]]; + psm3_shared_affinity_ptr[shm_location] = + (psm3_shared_affinity_ptr[shm_location] + 1) % found; + _HFI_DBG("RoundRobin Selected NIC unit= %ld, Next NIC=%ld, node = %d, local rank=%d, found=%d.\n", + *unit_start, psm3_shared_affinity_ptr[shm_location], node_id, + psm3_get_mylocalrank(), found); + + /* End Critical Section */ + psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); + + return; + +spread_hfi_fallback: + *unit_start = *unit_end = saved_hfis[0]; +} + +static void +psm3_create_affinity_semaphores(psm2_uuid_t const job_key) +{ + int ret; + size_t sem_len = 256; + + /* + * If already opened, no need to do anything else. + * This could be true for Multi-EP cases where a different thread has + * already created the semaphores. 
We don't need separate locks here as
+	 * we are protected by the overall "psm3_creation_lock" which each
+	 * thread will take in psm3_ep_open()
+	 */
+	if (psm3_affinity_semaphore_open)
+		return;
+
+	psm3_sem_affinity_shm_rw_name = (char *) psmi_malloc(PSMI_EP_NONE, UNDEFINED, sem_len);
+	psmi_assert_always(psm3_sem_affinity_shm_rw_name != NULL);
+	snprintf(psm3_sem_affinity_shm_rw_name, sem_len,
+		SEM_AFFINITY_SHM_RW_BASENAME".%d",
+		psm3_get_uuid_hash(job_key));
+
+	ret = psmi_init_semaphore(&psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name,
+		S_IRUSR | S_IWUSR, 0);
+	if (ret) {
+		_HFI_VDBG("Cannot initialize semaphore: %s for read-write access to shm object.\n",
+			psm3_sem_affinity_shm_rw_name);
+		if (psm3_sem_affinity_shm_rw)
+			sem_close(psm3_sem_affinity_shm_rw);
+		psmi_free(psm3_sem_affinity_shm_rw_name);
+		psm3_sem_affinity_shm_rw_name = NULL;
+		return;
+	}
+
+	_HFI_VDBG("Semaphore: %s created for read-write access to shm object.\n",
+		psm3_sem_affinity_shm_rw_name);
+
+	psm3_affinity_semaphore_open = 1;
+
+	return;
+}
+
+/*
+ * Get all the ports and optionally addr_indexes with a valid lid and gid,
+ * one port per unit but up to PSM3_ADDR_PER_NIC addresses.
+ *
+ * Returns count of entries put in nic_info
+ *
+ * When per_addr_index is set, there will be up to psm3_addr_per_nic entries
+ * per active unit (one entry per unit otherwise), all for the same port
+ * within the unit
+ */
+unsigned nic_info_init(struct nic_info *nic_info, unsigned nunits, int per_addr_index)
+{
+	unsigned unit, port, addr_index;
+	unsigned num_addr_index = per_addr_index ? psm3_addr_per_nic : 1;
+	int ret;
+	unsigned count = 0;
+
+	for (unit = 0; unit < nunits; unit++) {
+		// get_unit_active is redundant since it loops on all ports and
+		// confirms at least 1 port has a valid lid. We test that below.
+		//if (psmi_hal_get_unit_active(unit) <= 0)
+		//	continue;
+		for (port = PSM3_NIC_MIN_PORT; port <= PSM3_NIC_MAX_PORT; port++) {
+			int got_port = 0;
+			for (addr_index = 0; addr_index < num_addr_index; addr_index++) {
+				psmi_subnet128_t subnet;
+				ret = psmi_hal_get_port_lid(unit, port, addr_index);
+				if (ret <= 0)
+					continue;
+				ret = psmi_hal_get_port_subnet(unit, port, addr_index, &subnet, NULL, NULL, NULL);
+				if (ret == -1)
+					continue;
+
+				nic_info[count].filtered = 0;
+				nic_info[count].subnet = subnet;
+				nic_info[count].unit = unit;
+				nic_info[count].port = port;
+				nic_info[count].addr_index = addr_index;
+#if defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA)
+				nic_info[count].pci_addr.domain = UINT32_MAX;
+#endif
+				got_port = 1;
+				count++;
+			}
+			if (got_port) // one port per unit
+				break;
+		}
+	}
+	return count;
+}
+
+/* If at least 1 NIC matches the current CPU's NUMA id,
+ * filter out all NICs which do not match.
+ * If none match, noop.
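+ * For illustration (hypothetical values): with NIC units 0,1 on NUMA 0
+ * and units 2,3 on NUMA 1, a process on NUMA 0 keeps units 0,1 and
+ * filters units 2,3, while a process on a NUMA with no NICs keeps all 4.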
+ * Also initializes nic_info.numa_id
+ */
+void nic_info_filter_numa(struct nic_info *nic_info, unsigned ninfo)
+{
+	unsigned i;
+	int found = 0;
+
+	int cpu_numa_id = psm3_get_current_proc_location();
+	if (cpu_numa_id < 0) {
+		_HFI_DBG("Unable to determine CPU NUMA location, skipping filter of NIC CPU NUMA location\n");
+		return;
+	}
+
+	for (i=0; i < ninfo; i++)
+	{
+		if (nic_info[i].filtered)
+			continue;
+
+		if (psmi_hal_get_node_id(nic_info[i].unit, &nic_info[i].numa_id) != 0) {
+			// assume match (don't filter this NIC)
+			_HFI_DBG("Unable to determine NIC NUMA location for unit %d (%s), assuming local to CPU NUMA (%d)\n",
+				nic_info[i].unit, psm3_sysfs_unit_dev_name(nic_info[i].unit),
+				cpu_numa_id);
+			nic_info[i].numa_id = cpu_numa_id;
+		} else {
+			_HFI_DBG("NIC NUMA location for unit %d (%s) is %d\n",
+				nic_info[i].unit, psm3_sysfs_unit_dev_name(nic_info[i].unit),
+				nic_info[i].numa_id);
+		}
+		found += (nic_info[i].numa_id == cpu_numa_id);
+	}
+	if (found) {
+		_HFI_DBG("Found %d unfiltered NUMA local NICs for CPU NUMA id = %d\n",
+			found, cpu_numa_id);
+		// filter out NICs not in cpu_numa_id
+		for (i=0; i < ninfo; i++)
+		{
+			if (nic_info[i].filtered)
+				continue;
+			nic_info[i].filtered = (nic_info[i].numa_id != cpu_numa_id);
+		}
+	} else {
+		_HFI_DBG("No NUMA local NIC found, CPU NUMA id = %d\n", cpu_numa_id);
+	}
+}
+
+/* If at least 1 NIC matches the current CPU's sub-NUMA group,
+ * filter out all NICs which do not match.
+ * If none match, noop.
+ * Also initializes nic_info.pci_addr and nic_info.cpu_close
+ */
+void nic_info_filter_sub_numa(struct nic_info *nic_info, unsigned ninfo)
+{
+#ifdef PSM3_HAVE_CPU_SUBNUMA
+	unsigned i;
+	int found = 0;
+	hwloc_cpuset_t cpu_bind_set;
+
+	psm3_deferred_hwloc_topology_init();
+	if (psm3_hwloc_topology_init_failed)
+		return;	// hwloc incorrect version
+	psmi_assert(psm3_hwloc_topology_initialized);
+
+	// here we use the entire CPU bind set (should match pthread_getaffinity_np)
+	// as opposed to just the current process location.
+	cpu_bind_set = hwloc_bitmap_alloc();
+	if (! cpu_bind_set) {
+		_HFI_DBG("Unable to allocate CPU set, skipping filter of CPU sub-NUMA location\n");
+		return;
+	}
+#if 0
+	// use current process affinity
+	if (hwloc_get_cpubind(psm3_hwloc_topology, cpu_bind_set,
+				HWLOC_CPUBIND_PROCESS)) {
+		_HFI_DBG("Unable to determine process CPU binding, skipping filter of CPU sub-NUMA location\n");
+		goto fail;
+	}
+#else
+	// use current thread affinity
+	pthread_t mythread = pthread_self();
+	if (hwloc_get_thread_cpubind(psm3_hwloc_topology, mythread,
+				cpu_bind_set, HWLOC_CPUBIND_THREAD)) {
+		_HFI_DBG("Unable to determine thread CPU binding, skipping filter of CPU sub-NUMA location\n");
+		goto fail;
+	}
+#endif
+
+	for (i=0; i < ninfo; i++)
+	{
+		if (nic_info[i].filtered)
+			continue;
+		if (nic_info[i].pci_addr.domain == UINT32_MAX
+			&& psmi_hal_get_unit_pci_bus(nic_info[i].unit,
+				&nic_info[i].pci_addr.domain, &nic_info[i].pci_addr.bus,
+				&nic_info[i].pci_addr.dev, &nic_info[i].pci_addr.func)) {
+			_HFI_DBG("Unable to get NIC PCIe address for unit %d (%s)\n",
+				nic_info[i].unit, psm3_sysfs_unit_dev_name(nic_info[i].unit));
+			// can't filter out NIC because if all fail we won't have any.
+			// Unsure how to rank this NIC vs others, so assume not close
+			nic_info[i].cpu_close = 0;
+			continue;
+		}
+
+		hwloc_obj_t ancestor = psm3_get_non_io_ancestor_obj(
+						&nic_info[i].pci_addr);
+		if (!
ancestor) {
+			_HFI_DBG("Unable to determine NIC ancestor for unit %d (%s) at PCIe %04x:%02x:%02x.%x\n",
+				nic_info[i].unit, psm3_sysfs_unit_dev_name(nic_info[i].unit),
+				nic_info[i].pci_addr.domain, nic_info[i].pci_addr.bus,
+				nic_info[i].pci_addr.dev, nic_info[i].pci_addr.func);
+			// can't filter out NIC because if all fail we won't have any.
+			// Unsure how to rank this NIC vs others, so assume not close
+			nic_info[i].cpu_close = 0;
+			continue;
+		}
+
+		// If either CPU set fully contains the other, consider the NIC
+		// close to the CPU
+		nic_info[i].cpu_close =
+			hwloc_bitmap_isincluded(cpu_bind_set, ancestor->cpuset)
+			|| hwloc_bitmap_isincluded(ancestor->cpuset, cpu_bind_set);
+
+		if (_HFI_DBG_ON) {
+			char buf[256] = {0};
+			hwloc_bitmap_list_snprintf(buf, sizeof(buf), ancestor->cpuset);
+			buf[sizeof(buf)-1] = '\0';	// paranoid, hwloc doc not clear
+			_HFI_DBG_ALWAYS("NIC closeness to CPU for unit %d (%s) at %u:%u:%u:%u is %d, NIC close to CPUs: %s\n",
+				nic_info[i].unit, psm3_sysfs_unit_dev_name(nic_info[i].unit),
+				nic_info[i].pci_addr.domain, nic_info[i].pci_addr.bus,
+				nic_info[i].pci_addr.dev, nic_info[i].pci_addr.func,
+				nic_info[i].cpu_close, buf);
+		}
+		found += nic_info[i].cpu_close;
+	}
+	if (found) {
+		if (_HFI_DBG_ON) {
+			char buf[256] = {0};
+			hwloc_bitmap_list_snprintf(buf, sizeof(buf), cpu_bind_set);
+			buf[sizeof(buf)-1] = '\0';	// paranoid, hwloc doc not clear
+			_HFI_DBG_ALWAYS("Found %d unfiltered NICs close to CPUs: %s\n", found, buf);
+		}
+		// filter out NICs not close
+		for (i=0; i < ninfo; i++)
+		{
+			if (nic_info[i].filtered)
+				continue;
+			nic_info[i].filtered = ! nic_info[i].cpu_close;
+		}
+	} else {
+		if (_HFI_DBG_ON) {
+			char buf[256] = {0};
+			hwloc_bitmap_list_snprintf(buf, sizeof(buf), cpu_bind_set);
+			buf[sizeof(buf)-1] = '\0';	// paranoid, hwloc doc not clear
+			_HFI_DBG_ALWAYS("No NICs found close to CPUs: %s\n", buf);
+		}
+	}
+fail:
+	hwloc_bitmap_free(cpu_bind_set);
+#else
+	//_HFI_DBG("Filtering based on CPU closeness to NIC disabled\n");
+#endif
+}
+
+#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY
+/* Find the closest NIC to the current GPU and then filter out all NICs
+ * which are further from the GPU than that closest NIC.
+ * If no GPU for the process yet, or PSM3 GPU support not enabled, noop.
+ * Also initializes nic_info.pci_addr and nic_info.gpu_distance
+ */
+void nic_info_filter_gpu_distance(struct nic_info *nic_info, unsigned ninfo)
+{
+	unsigned i;
+	int min_distance = INT_MAX;	// smallest distance found
+	unsigned found = 0;
+	struct pci_addr gpu_pci_addr;
+
+	if (! PSMI_IS_GPU_ENABLED)
+		return;
+
+	psm3_deferred_hwloc_topology_init();
+	if (psm3_hwloc_topology_init_failed)
+		return;	// hwloc incorrect version
+	psmi_assert(psm3_hwloc_topology_initialized);
+
+	// Get current GPU PCIe address into gpu_pci_addr
+#ifdef PSM_CUDA
+	{
+		int domain, bus, dev;
+		int num_devices;
+		CUdevice device;
+
+		PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices);
+		_HFI_DBG("%d Cuda GPUs found\n", num_devices);
+		if (! num_devices)
+			return;
+
+		if (num_devices == 1) {
+			PSMI_CUDA_CALL(cuDeviceGet, &device, 0);
+		} else {
+			// all GPUs will be visible to process, see if app chose one first
+			CUcontext ctxt = {0};
+			if (! psmi_cuCtxGetCurrent || psmi_cuCtxGetCurrent(&ctxt) || !
ctxt) {
+				_HFI_DBG("Unable to get Cuda ctxt\n");
+				//PSMI_CUDA_CALL(cuDeviceGet, &device, 0);
+				return;
+			} else {
+				PSMI_CUDA_CALL(cuCtxGetDevice, &device);
+			}
+		}
+		_HFI_DBG("Using Cuda GPU %d\n", device);
+		PSMI_CUDA_CALL(cuDeviceGetAttribute,
+				&domain,
+				CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID,
+				device);
+		PSMI_CUDA_CALL(cuDeviceGetAttribute,
+				&bus,
+				CU_DEVICE_ATTRIBUTE_PCI_BUS_ID,
+				device);
+		PSMI_CUDA_CALL(cuDeviceGetAttribute,
+				&dev,
+				CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID,
+				device);
+		gpu_pci_addr.domain = domain;
+		gpu_pci_addr.bus = bus;
+		gpu_pci_addr.dev = dev;
+		gpu_pci_addr.func = 0;
+	}
+#elif defined(PSM_ONEAPI)
+	{
+		ze_pci_ext_properties_t PciProperties;
+
+		_HFI_DBG("%d Level Zero GPUs found\n", num_ze_devices);
+		if (! num_ze_devices)
+			return;
+
+		// calling middleware will have limited GPUs visible to process
+		PSMI_ONEAPI_ZE_CALL(zeDevicePciGetPropertiesExt,
+				ze_devices[0].dev, &PciProperties);
+		gpu_pci_addr.domain = PciProperties.address.domain;
+		gpu_pci_addr.bus = PciProperties.address.bus;
+		gpu_pci_addr.dev = PciProperties.address.device;
+		gpu_pci_addr.func = PciProperties.address.function;
+	}
+#endif
+	_HFI_DBG("GPU PCIe address is %04x:%02x:%02x.%x\n",
+		gpu_pci_addr.domain, gpu_pci_addr.bus,
+		gpu_pci_addr.dev, gpu_pci_addr.func);
+
+	for (i=0; i < ninfo; i++) {
+		if (nic_info[i].filtered)
+			continue;
+		if (nic_info[i].pci_addr.domain == UINT32_MAX
+			&& psmi_hal_get_unit_pci_bus(nic_info[i].unit,
+				&nic_info[i].pci_addr.domain, &nic_info[i].pci_addr.bus,
+				&nic_info[i].pci_addr.dev, &nic_info[i].pci_addr.func)) {
+			_HFI_DBG("Unable to get NIC PCIe address for unit %d (%s)\n",
+				nic_info[i].unit, psm3_sysfs_unit_dev_name(nic_info[i].unit));
+			// can't filter out NIC because if all fail we won't have any.
+			// Unsure how to rank this NIC vs others, so use max distance
+			nic_info[i].gpu_distance = INT_MAX;
+			continue;
+		}
+		nic_info[i].gpu_distance = psm3_get_distance_between_pcis(
+				&nic_info[i].pci_addr, &gpu_pci_addr);
+		_HFI_DBG("NIC PCIe address for unit %d (%s) is %04x:%02x:%02x.%x distance to GPU: %d\n",
+			nic_info[i].unit, psm3_sysfs_unit_dev_name(nic_info[i].unit),
+			nic_info[i].pci_addr.domain, nic_info[i].pci_addr.bus,
+			nic_info[i].pci_addr.dev, nic_info[i].pci_addr.func,
+			nic_info[i].gpu_distance);
+		if (nic_info[i].gpu_distance < min_distance) {
+			min_distance = nic_info[i].gpu_distance;
+		}
+	}
+	if (min_distance == INT_MAX) {
+		_HFI_DBG("No NIC found with a known distance\n");
+		return;	// noop
+	}
+
+	// filter out all NICs with a distance > min_distance
+	for (i=0; i < ninfo; i++) {
+		if (nic_info[i].filtered)
+			continue;
+		psmi_assert(nic_info[i].gpu_distance >= min_distance);
+		nic_info[i].filtered = (nic_info[i].gpu_distance > min_distance);
+		found += ! nic_info[i].filtered;
+	}
+	_HFI_DBG("Found %d unfiltered NICs with GPU distance of %d\n",
+		found, min_distance);
+}
+#endif /* PSM_HAVE_GPU_CENTRIC_AFFINITY */
+
+// filter down the list of NICs with a CPU locality focus as priority.
+// If present, the GPU is considered last. If the GPU is NUMA local
+// to the CPU, the GPU filter can further limit NICs to those close to the
+// GPU (same PCIe switch). But if the GPU is not NUMA local to the CPU,
+// the gpu distance filter may still limit distance or end up being a noop.
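+// Editor's illustration (hypothetical system): starting from 4 NICs, the
+// sub-NUMA filter might keep the 2 NICs under the process's PCIe root,
+// the NUMA filter then becomes a noop (both already NUMA local), and the
+// GPU distance filter keeps the 1 NIC sharing the GPU's PCIe switch.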
+static void nic_info_filter_cpu_centric(struct nic_info *nic_info,
+				unsigned ninfo)
+{
+	_HFI_DBG("Filtering NICs with CPU Centric Strategy\n");
+	nic_info_filter_sub_numa(nic_info, ninfo);
+	nic_info_filter_numa(nic_info, ninfo);
+#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY
+	nic_info_filter_gpu_distance(nic_info, ninfo);
+#endif
+}
+
+#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY
+// filter down the list of NICs with a GPU locality focus as priority.
+// When there is a GPU, once we have selected NICs closest to that
+// GPU we are likely to have limited ourselves to NICs in the same
+// NUMA as the GPU, so the CPU NUMA tests will become noops.
+// For example, a GPU and NIC on the same PCIe switch will by definition
+// be in the same CPU root complex and hence same CPU NUMA.
+// But if there is no GPU, or none of the NICs are close to the GPU,
+// the CPU numa tests may narrow the list of NICs.
+static void nic_info_filter_gpu_centric(struct nic_info *nic_info,
+				unsigned ninfo)
+{
+	_HFI_DBG("Filtering NICs with GPU Centric Strategy\n");
+	nic_info_filter_gpu_distance(nic_info, ninfo);
+	nic_info_filter_numa(nic_info, ninfo);
+	nic_info_filter_sub_numa(nic_info, ninfo);
+}
+#endif /* PSM_HAVE_GPU_CENTRIC_AFFINITY */
+
+// analyze the refcount table and filter out NICs with refcounts
+// higher than the lowest found.
+// If all NICs have equal refcounts, noop.
+static void
+nic_info_filter_refcount(struct nic_info *nic_info, unsigned ninfo,
+				uint64_t *refcount, unsigned nunits, const char *str)
+{
+	unsigned i;
+	uint64_t min_refcount = UINT64_MAX;	// smallest refcount found
+	unsigned found = 0;
+
+	for (i=0; i < ninfo; i++) {
+		if (nic_info[i].filtered)
+			continue;
+		psmi_assert(nic_info[i].unit < nunits);
+		_HFI_DBG("NIC %s reference count for unit %d (%s) is %"PRIu64"\n", str,
+			nic_info[i].unit, psm3_sysfs_unit_dev_name(nic_info[i].unit),
+			refcount[nic_info[i].unit]);
+		if (refcount[nic_info[i].unit] < min_refcount) {
+			min_refcount = refcount[nic_info[i].unit];
+			psmi_assert(nic_info[i].unit < nunits);
+		}
+	}
+	if (min_refcount == UINT64_MAX) {
+		// unexpected, should have found a smaller value
+		_HFI_DBG("No NIC found with a low %s reference count\n", str);
+		return;	// noop
+	}
+
+	// filter out all NICs with a refcount > min_refcount
+	for (i=0; i < ninfo; i++) {
+		if (nic_info[i].filtered)
+			continue;
+		psmi_assert(refcount[nic_info[i].unit] >= min_refcount);
+		nic_info[i].filtered = (refcount[nic_info[i].unit] > min_refcount);
+		found += ! nic_info[i].filtered;
+	}
+	_HFI_DBG("Found %d unfiltered NICs with %s reference count of %"PRIu64"\n",
+		found, str, min_refcount);
+}
+
+// return index in nic_info of 1st unfiltered NIC
+static unsigned
+nic_info_get_first_unfiltered_nic(struct nic_info *nic_info, unsigned ninfo)
+{
+	unsigned i;
+	for (i=0; i < ninfo; i++) {
+		if (! nic_info[i].filtered)
+			return i;
+	}
+	psmi_assert(0);
+	return 0;
+}
+
+/*
+ * Select NIC among the unfiltered NICs in nic_info while
+ * scoreboarding use of each NIC and picking the one with lowest
+ * unit number and least use.
+ *
+ * Scoreboarding starts with the local process's NIC usage across all EPs.
+ * This helps to ensure a given process balances itself across unfiltered NICs
+ * on the assumption that all local processes will ultimately have the same
+ * number of endpoints.
+ *
+ * After the local process scoreboarding, the shm scoreboard is checked
+ * to pick a NIC based on lowest refcount within the server, thus balancing
+ * NIC usage within the server.
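+ * For illustration (hypothetical values): with unfiltered units {0,1},
+ * a process whose first endpoint already took unit 0 has local counts
+ * {1,0}, so its second endpoint narrows to unit 1 before the node-wide
+ * shm counts are even consulted.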
+ *
+ * Among NICs with the lowest reference counts, the lowest entry in nic_info
+ * (also lowest unit_id) is selected.
+ * This assumes only one entry appears in nic_info for each unit_id
+ * (e.g. nic_info_init was given per_addr_index of 0) and the entries in
+ * nic_info are sorted by unit_id (in order built by nic_info_init).
+ *
+ * Due to call sequence prior to this, nic_info list will already be sorted by
+ * unit_id since it was built in that order by nic_info_init.
+ * Returns index in nic_info of selected NIC.
+ * On any issues, selects 1st NIC
+ */
+static int
+psm3_open_shm_scoreboard_and_select_nic(
+				struct nic_info *nic_info, unsigned ninfo,
+				psm2_uuid_t const job_key, unsigned nunits)
+{
+	int ret, shm_location, index;
+
+	psmi_assert(nunits > 0);
+	psmi_assert(ninfo > 0);
+
+	// balance among endpoints within current process
+	nic_info_filter_refcount(nic_info, ninfo,
+				psm3_nic_refcount, nunits, "local process");
+
+	psm3_create_affinity_semaphores(job_key);
+	/*
+	 * Take affinity lock and open shared memory region to be able to
+	 * accurately determine which NIC to pick for this process. If any
+	 * issues, bail by picking first unfiltered NIC in nic_info
+	 */
+	if (!psm3_affinity_semaphore_open)
+		goto fallback;
+
+	ret = psm3_create_and_open_affinity_shm(job_key);
+	if (ret < 0)
+		goto fallback;
+
+	// start of scoreboard area, we keep refcount for each unit_id.
+	// Note that some other modes may organize the shm area differently,
+	// so it's important that all processes and all endpoints use the same
+	// fundamental modes for PSM3_MULTIRAIL and PSM3_NIC_SELECTION_ALG
+	shm_location = AFFINITY_SHM_HFI_INDEX_LOCATION;
+	if (shm_location + sizeof(*psm3_shared_affinity_ptr)*nunits > PSMI_PAGESIZE)
+		goto fallback;
+
+	// psm3_shared_affinity_nic_refcount_ptr points at a table in Linux
+	// shared memory, indexed by unit_id, with a reference count per NIC
+	// showing the total endpoints within the job which are using the NIC.
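+	// For illustration (hypothetical values): with nunits==4 the table
+	// occupies 4 uint64_t words starting at
+	// psm3_shared_affinity_ptr[AFFINITY_SHM_HFI_INDEX_LOCATION], one
+	// count per unit_id, all within the single page checked above.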
psm3_shared_affinity_nic_refcount_ptr =
+			&psm3_shared_affinity_ptr[shm_location];
+
+	/* Start critical section to read/write shm object */
+	if (psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name)) {
+		_HFI_VDBG("Could not enter critical section to update NIC index\n");
+		goto fallback;
+	}
+
+	// balance among processes within current node
+	nic_info_filter_refcount(nic_info, ninfo,
+				psm3_shared_affinity_nic_refcount_ptr,
+				nunits, "local node");
+
+	// use lowest index among those which remain
+	index = nic_info_get_first_unfiltered_nic(nic_info, ninfo);
+
+	// update reference counts for node level and process level
+	psm3_shared_affinity_nic_refcount_ptr[nic_info[index].unit]++;
+	psm3_nic_refcount[nic_info[index].unit]++;
+
+	/* End Critical Section */
+	psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name);
+
+	psmi_assert(index >= 0 && index < ninfo);
+	_HFI_DBG("Selected NIC unit %d(%s)\n",
+		nic_info[index].unit, psm3_sysfs_unit_dev_name(nic_info[index].unit));
+	return index;
+
+fallback:
+	index = nic_info_get_first_unfiltered_nic(nic_info, ninfo);
+	psm3_nic_refcount[nic_info[index].unit]++;	// inc process level refcount
+	return index;
+}
+
+// decrement reference counts which were incremented in local process
+// and in shm within node.
+// For modes which do not track this style of refcounts, psm3_nic_refcount
+// will be zero for every unit_id and psm3_shared_affinity_nic_refcount_ptr
+// will be NULL (likewise if psm3 has been finalized)
+void psm3_dec_nic_refcount(int unit_id)
+{
+	// in some modes we don't track refcount, in which case do nothing
+	if (psm3_nic_refcount[unit_id])
+		psm3_nic_refcount[unit_id]--;
+	if (psm3_affinity_shared_file_opened && psm3_affinity_semaphore_open
+		&& psm3_shared_affinity_nic_refcount_ptr) {
+		/* Start critical section to read/write shm object */
+		if (psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name)) {
+			_HFI_VDBG("Could not enter critical section to update NIC refcount\n");
+		} else {
+			psm3_shared_affinity_nic_refcount_ptr[unit_id]--;
+			/* End Critical Section */
+			psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name);
+		}
+	}
+}
+
+psm2_error_t
+psm3_compute_start_and_end_unit_cpu_centric(
+				psm2_uuid_t const job_key,
+				long *unit_start, long *unit_end, int nunits)
+{
+	unsigned index;
+	unsigned ninfo;
+	struct nic_info nic_info[PSMI_MAX_RAILS];
+
+	// caller will enumerate addr_index, so just get all active ports
+	ninfo = nic_info_init(nic_info, nunits, 0);
+	if (! ninfo) {
+		// should not happen, caller already confirmed there is >1 active unit
+		// mimic what caller of psm3_compute_start_and_end_unit would do
+		return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+			"PSM3 no nic units are active");
+	}
+
+	nic_info_filter_cpu_centric(nic_info, ninfo);
+
+	index = psm3_open_shm_scoreboard_and_select_nic(nic_info, ninfo,
+				job_key, nunits);
+	psmi_assert(index >= 0 && index < ninfo);
+
+	// caller will select 1st active port and an addr_index within unit
+	*unit_start = *unit_end = nic_info[index].unit;
+	return PSM2_OK;
+}
+
+#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY
+psm2_error_t
+psm3_compute_start_and_end_unit_gpu_centric(
+				psm2_uuid_t const job_key,
+				long *unit_start, long *unit_end, int nunits)
+{
+	unsigned index;
+	unsigned ninfo;
+	struct nic_info nic_info[PSMI_MAX_RAILS];
+
+	// caller will enumerate addr_index, so just get all active ports
+	ninfo = nic_info_init(nic_info, nunits, 0);
+	if (!
ninfo) {
+		// should not happen, caller already confirmed there is >1 active unit
+		// mimic what caller of psm3_compute_start_and_end_unit would do
+		return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+			"PSM3 no nic units are active");
+	}
+
+	nic_info_filter_gpu_centric(nic_info, ninfo);
+
+	index = psm3_open_shm_scoreboard_and_select_nic(nic_info, ninfo,
+				job_key, nunits);
+	psmi_assert(index >= 0 && index < ninfo);
+
+	// caller will select 1st active port and an addr_index within unit
+	*unit_start = *unit_end = nic_info[index].unit;
+	return PSM2_OK;
+}
+#endif /* PSM_HAVE_GPU_CENTRIC_AFFINITY */
+
+// return set of units to consider and which to start at.
+// caller will use 1st active unit which can be opened.
+// caller will wrap around so it's valid for start >= end
+// Note: When using multiple rails per PSM process, higher level code will
+// walk through desired units and unit_param will specify a specific unit.
+// If unit_param is PSM3_NIC_ANY, this will pick the starting point for the
+// nic search
+psm2_error_t
+psm3_compute_start_and_end_unit(long unit_param, long addr_index,
+				int nunitsactive, int nunits,
+				psm2_uuid_t const job_key,
+				long *unit_start, long *unit_end)
+{
+	unsigned short nic_sel_alg = PSMI_UNIT_SEL_ALG_ACROSS;
+	int node_id, found = 0;
+	int saved_hfis[nunits];
+
+	/* if the user did not set PSM3_NIC then ... */
+	if (unit_param == PSM3_NIC_ANY)
+	{
+		if (nunitsactive > 1) {
+			// if NICs are on different planes (non-routed subnets)
+			// we need to have all ranks default to the same plane
+			// so force 1st active NIC in that case
+			int have_subnet = 0, unit_id;
+			psmi_subnet128_t got_subnet = { };
+			for (unit_id = 0; unit_id < nunits; unit_id++) {
+				psmi_subnet128_t subnet;
+				if (psmi_hal_get_unit_active(unit_id) <= 0)
+					continue;
+				if (0 != psmi_hal_get_port_subnet(unit_id, 1 /* VERBS_PORT*/,
+							addr_index>0?addr_index:0,
+							&subnet, NULL, NULL, NULL))
+					continue;	// can't access NIC
+				if (! have_subnet) {
+					have_subnet = 1;
+					got_subnet = subnet;
+				} else if (! psm3_subnets_match(got_subnet,
+							subnet)) {
+					// active units have different tech
+					// (IB/OPA vs Eth) or different subnets
+					// caller will pick 1st active unit
+					*unit_start = 0;
+					*unit_end = nunits - 1;
+					_HFI_DBG("Multi-Plane config: Will select 1st viable NIC unit= %ld to %ld.\n",
+						*unit_start, *unit_end);
+					return PSM2_OK;
+				}
+			}
+		}
+
+		/* Get the actual selection algorithm from the environment: */
+		nic_sel_alg = psmi_parse_nic_selection_algorithm();
+		/* If round-robin is the selection algorithm and ... */
+		if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS) &&
+			/* there is more than 1 active unit then ... */
+			(nunitsactive > 1))
+		{
+			/*
+			 * Pick an HFI on same root complex as current task.
+			 * Linux IPC ensures balanced NIC usage within the job.
+			 * If none found, fall back to
+			 * RoundRobinAll load-balancing algorithm.
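+			 * For illustration (hypothetical values): on a node
+			 * with units {0,1} on NUMA 0 and {2,3} on NUMA 1, a
+			 * rank on NUMA 1 round-robins between units 2 and 3
+			 * via the shm counter; a rank on a NUMA with no NICs
+			 * falls back to RoundRobinAll across all 4 units.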
+ */ + node_id = psm3_get_current_proc_location(); + if (node_id >= 0) { + found = hfi_find_active_hfis(nunits, node_id, + saved_hfis); + if (found > 1) { + psm3_create_affinity_semaphores(job_key); + psmi_spread_hfi_within_socket(unit_start, unit_end, + node_id, saved_hfis, + found, job_key); + } else if (found == 1) { + *unit_start = *unit_end = saved_hfis[0]; + _HFI_DBG("RoundRobin Selected NIC unit= %ld, node = %d, local rank=%d, found=%d.\n", + *unit_start, node_id, + psm3_get_mylocalrank(), found); + } + } + + if (node_id < 0 || !found) { + _HFI_DBG("RoundRobin No local NIC found, using RoundRobinAll, node = %d, local rank=%d, found=%d.\n", + node_id, + psm3_get_mylocalrank(), found); + psmi_spread_nic_selection(job_key, unit_start, + unit_end, nunits); + } + } else if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS_ALL) && + (nunitsactive > 1)) { + psmi_spread_nic_selection(job_key, unit_start, + unit_end, nunits); + } else if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_CPU_CENTRIC) && + (nunitsactive > 1)) { + return psm3_compute_start_and_end_unit_cpu_centric(job_key, + unit_start, unit_end, nunits); +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY + } else if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_GPU_CENTRIC) && + (nunitsactive > 1)) { + return psm3_compute_start_and_end_unit_gpu_centric(job_key, + unit_start, unit_end, nunits); +#endif + } else { // PSMI_UNIT_SEL_ALG_WITHIN or only 1 active unit + // caller will pick 1st active unit + *unit_start = 0; + *unit_end = nunits - 1; + _HFI_DBG("%s: Will select 1st viable NIC unit= %ld to %ld.\n", + (nic_sel_alg == PSMI_UNIT_SEL_ALG_WITHIN) + ?"Packed":"Only 1 viable NIC", + *unit_start, *unit_end); + } + } else if (unit_param >= 0) { + /* the user specified PSM3_NIC, we use it. */ + *unit_start = *unit_end = unit_param; + _HFI_DBG("Caller selected NIC %ld.\n", *unit_start); + } else { + psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "PSM3 can't open unit: %ld for reading and writing", + unit_param); + return PSM2_EP_DEVICE_FAILURE; + } + + return PSM2_OK; +} + +static +int psmi_parse_nic_selection_algorithm(void) +{ + union psmi_envvar_val env_nic_alg; + int nic_alg = PSMI_UNIT_SEL_ALG_ACROSS; + + const char* PSM3_NIC_SELECTION_ALG_HELP = + "NIC Device Selection Algorithm to use. Round Robin[RoundRobin or rr] (Default) " + ", Packed[p], Round Robin All[RoundRobinAll or rra]," +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY + " CPU Centric Round Robin [CpuRoundRobin or crr]" + ", or GPU Centric Round Robin [GpuRoundRobin or grr]"; +#else + " or CPU Centric Round Robin [CpuRoundRobin or crr]"; +#endif + + + /* If a specific unit is set in the environment, use that one. 
*/ + psm3_getenv("PSM3_NIC_SELECTION_ALG", PSM3_NIC_SELECTION_ALG_HELP, + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)"rr", &env_nic_alg); + + if (!strcasecmp(env_nic_alg.e_str, "Round Robin") + || !strcasecmp(env_nic_alg.e_str, "RoundRobin") + || !strcasecmp(env_nic_alg.e_str, "rr")) + nic_alg = PSMI_UNIT_SEL_ALG_ACROSS; + else if (!strcasecmp(env_nic_alg.e_str, "Packed") + || !strcasecmp(env_nic_alg.e_str, "p")) + nic_alg = PSMI_UNIT_SEL_ALG_WITHIN; + else if (!strcasecmp(env_nic_alg.e_str, "Round Robin All") + || !strcasecmp(env_nic_alg.e_str, "RoundRobinAll") + || !strcasecmp(env_nic_alg.e_str, "rra")) + nic_alg = PSMI_UNIT_SEL_ALG_ACROSS_ALL; + else if (!strcasecmp(env_nic_alg.e_str, "CPU Centric Round Robin") + || !strcasecmp(env_nic_alg.e_str, "CpuRoundRobin") + || !strcasecmp(env_nic_alg.e_str, "crr")) + nic_alg = PSMI_UNIT_SEL_ALG_CPU_CENTRIC; +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY + else if (!strcasecmp(env_nic_alg.e_str, "GPU Centric Round Robin") + || !strcasecmp(env_nic_alg.e_str, "GpuRoundRobin") + || !strcasecmp(env_nic_alg.e_str, "grr")) + nic_alg = PSMI_UNIT_SEL_ALG_GPU_CENTRIC; +#endif + else { + _HFI_INFO( + "Invalid value for PSM3_NIC_SELECTION_ALG ('%s') %-40s Using: %s\n", + env_nic_alg.e_str, PSM3_NIC_SELECTION_ALG_HELP, "RoundRobin"); + nic_alg = PSMI_UNIT_SEL_ALG_ACROSS; + } + + return nic_alg; +} + +/* parse a list of NIC rails for PSM3_MULTIRAIL_MAP + * map is in format: unit:port-addr_index,unit:port-addr_index,...;unit.... + * where :port is optional (default of 1) and unit can be name or number + * -addr_index is also optional and defaults to "all" + * addr_index can be an integer between 0 and PSM3_ADDR_PER_NIC-1 + * or "any" or "all". "any" selects a single address using the hash and + * "all" setups a rail for each address. + * ; may separate sets of rails. When more than 1 set is presented, the + * map_index selects which set is used. + * Returns: + * 0 - successfully parsed, config_out updated + * -1 - str empty, config_out unchanged + * -2 - syntax error, config_out partially updated + */ +static int psm3_parse_multirail_map(const char *str, int map_index, + size_t errstr_size, char errstr[], + struct multirail_config *config_out) +{ + char temp[MAX_MAP_LEN+1]; + char *s; + char *delim; + char delim_char = '\0'; + unsigned i; + int ret; + int set_index = 0; + + if (!str || ! *str) + return -1; + + strncpy(temp, str, MAX_MAP_LEN); + if (temp[MAX_MAP_LEN-1] != 0) { + if (errstr_size) + snprintf(errstr, errstr_size, + " Value too long, limit %u characters", + MAX_MAP_LEN-1); + return -2; + } + config_out->num_rails = 0; + s = temp; + psmi_assert(*s); + do { + int u; + unsigned int p = 1; + int skip_port = 0; + int skip_addr_index = 0; + long a_index = PSM3_ADDR_INDEX_ALL; + + if (! 
*s) { // trailing ',' or ';' on 2nd or later loop + if (delim_char == ';') + set_index--; // never started next set + break; + } + if (delim_char == ';') { + // start of a new set + config_out->num_rails = 0; + } + if (config_out->num_rails >= PSMI_MAX_RAILS) { + if (errstr_size) + snprintf(errstr, errstr_size, + " Value too long, limit %u rails", + PSMI_MAX_RAILS); + return -2; + } + + // find end of unit field and put in \0 as needed + delim = strpbrk(s, ":-,;"); + if (!delim || *delim == ',' || *delim == ';') { + skip_port = 1; skip_addr_index = 1; + } else if (*delim == '-') { + skip_port = 1; + } + if (delim) { + delim_char = *delim; + *delim = '\0'; + } else { + delim_char = '\0'; + } + // parse unit + u = psm3_sysfs_find_unit(s); + if (u < 0) { + if (errstr_size) + snprintf(errstr, errstr_size, + " Invalid unit: '%s'", s); + return -2; + } + // find next field + if (delim) + s = delim+1; + if (! skip_port) { + // find end of port field and put in \0 as needed + delim = strpbrk(s, "-,;"); + if (!delim || *delim == ',' || *delim == ';') + skip_addr_index = 1; + if (delim) { + delim_char = *delim; + *delim = '\0'; + } else { + delim_char = '\0'; + } + // parse port + if (psm3_parse_str_uint(s, &p, 0, UINT_MAX) < 0) { + if (errstr_size) + snprintf(errstr, errstr_size, + " Invalid port: '%s'", s); + return -2; + } + // find next field + if (delim) + s = delim+1; + } + if (! skip_addr_index) { + // find end of addr_index field and put in \0 as needed + delim = strpbrk(s, ",;"); + if (delim) { + delim_char = *delim; + *delim = '\0'; + } else { + delim_char = '\0'; + } + // parse addr_index + if (0 == strcmp(s, "all")) + a_index = PSM3_ADDR_INDEX_ALL; // we will loop below + else if (0 == strcmp(s, "any")) + a_index = PSM3_ADDR_INDEX_ANY; // caller will pick + else if (psm3_parse_str_long(s, &a_index, 0, psm3_addr_per_nic-1)) { + if (errstr_size) + snprintf(errstr, errstr_size, + " Invalid addr index: '%s'", s); + return -2; + } + // find next field + if (delim) + s = delim+1; + } + + if (a_index == PSM3_ADDR_INDEX_ALL) { // all + for (a_index = 0; a_index < psm3_addr_per_nic; a_index++) { + if (config_out->num_rails >= PSMI_MAX_RAILS) { + if (errstr_size) + snprintf(errstr, errstr_size, + " Limit of %u rails exceeded due to multi-addr", + PSMI_MAX_RAILS); + return -2; + } + config_out->units[config_out->num_rails] = u; + config_out->ports[config_out->num_rails] = p; + config_out->addr_indexes[config_out->num_rails] = a_index; + config_out->num_rails++; + } + } else { + config_out->units[config_out->num_rails] = u; + config_out->ports[config_out->num_rails] = p; + config_out->addr_indexes[config_out->num_rails] = a_index; + config_out->num_rails++; + } + if (delim_char == ';') { + if (set_index == map_index) + break; // found it, stop parsing + set_index++; // start of next + } + } while (delim); + + // if only 1 set input, we use it, otherwise must have enough sets for us + psmi_assert(set_index >= 0); + if (set_index > 0 && set_index != map_index) { + if (errstr_size) + snprintf(errstr, errstr_size, + " Insufficient sets specified: %d need at least %d", + set_index+1, map_index+1); + return -2; + } + psmi_assert(set_index == 0 || set_index == map_index); + + // must have at least 1 rail. Since we caught empty string above, + // if we get here without any rails input must be something like "," or ";" + // and we'll treat that as a syntax error + if (! 
config_out->num_rails) {
+		if (errstr_size)
+			snprintf(errstr, errstr_size, " No rails specified");
+		return -2;
+	}
+
+	// Check if any of the ports are not usable. Just use addr_index 0 for check
+	for (i = 0; i < config_out->num_rails; i++) {
+		_HFI_VDBG("rail %d: %u(%s) %u\n", i,
+			config_out->units[i],
+			psm3_sysfs_unit_dev_name(config_out->units[i]),
+			config_out->ports[i]);
+
+		ret = psmi_hal_get_port_active(config_out->units[i],
+						config_out->ports[i]);
+		if (ret <= 0) {
+			if (errstr_size)
+				snprintf(errstr, errstr_size,
+					" Unit:port: %d(%s):%d is not active.",
+					config_out->units[i],
+					psm3_sysfs_unit_dev_name(config_out->units[i]),
+					config_out->ports[i]);
+			return -2;
+		}
+
+		ret = psmi_hal_get_port_lid(config_out->units[i],
+						config_out->ports[i], 0 /* addr_index*/);
+		if (ret <= 0) {
+			if (errstr_size)
+				snprintf(errstr, errstr_size,
+					" Unit:port: %d(%s):%d was filtered out, unable to use",
+					config_out->units[i],
+					psm3_sysfs_unit_dev_name(config_out->units[i]),
+					config_out->ports[i]);
+			return -2;
+		}
+
+		ret = psmi_hal_get_port_subnet(config_out->units[i],
+						config_out->ports[i], 0 /* addr_index*/,
+						NULL, NULL, NULL, NULL);
+		if (ret == -1) {
+			if (errstr_size)
+				snprintf(errstr, errstr_size,
+					" Couldn't get subnet for unit %d (%s):%d",
+					config_out->units[i],
+					psm3_sysfs_unit_dev_name(config_out->units[i]),
+					config_out->ports[i]);
+			return -2;
+		}
+	}
+
+	// valid input
+	return 0;
+}
+
+static int psm3_parse_check_multirail_map(int type,
+				const union psmi_envvar_val val, void *ptr,
+				size_t errstr_size, char errstr[])
+{
+	struct multirail_config temp;
+	int map_index = *(int*)ptr;
+	psmi_assert(type == PSMI_ENVVAR_TYPE_STR);
+	return psm3_parse_multirail_map(val.e_str, map_index, errstr_size, errstr,
+					&temp);
+}
+
+// comparison function for qsort.
+// Sort by subnet 1st, then by nic unit, then by addr_index.
+// NICs are already numbered in alphabetic order so this effectively
+// sorts by subnet, then nic name, then addr_index.
+// We simply ignore the filtered field; filtered NICs will also get sorted
+// but omitted from the final output list by the caller
+static int niccmpfunc(const void *p1, const void *p2)
+{
+	struct nic_info *a = ((struct nic_info *) p1);
+	struct nic_info *b = ((struct nic_info *) p2);
+	int ret;
+
+	ret = psmi_subnet128_cmp(a->subnet, b->subnet);
+	if (ret == 0) {
+		if (a->unit < b->unit)
+			return -1;
+		else if (a->unit > b->unit)
+			return 1;
+
+		if (a->addr_index < b->addr_index)
+			return -1;
+		else if (a->addr_index > b->addr_index)
+			return 1;
+	}
+	return ret;
+}
+
+/*
+ * Sort all the ports within nic_info from small to big.
+ * So, when there are multiple fabrics, we will use the fabric with the
+ * smallest subnet to make the master connection.
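+ * For illustration (hypothetical values): entries
+ * {subnetB/unit1, subnetA/unit2, subnetA/unit0} sort to
+ * {subnetA/unit0, subnetA/unit2, subnetB/unit1}, so subnetA carries the
+ * master connection.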
+ */
+static void
+psm3_copy_nic_info_to_multitrail_config(
+			struct nic_info *nic_info, unsigned ninfo,
+			struct multirail_config *multirail_config)
+{
+	unsigned i, j;
+
+	qsort(nic_info, ninfo, sizeof(nic_info[0]), niccmpfunc);
+
+	multirail_config->num_rails = 0;
+	j = 0;
+	for (i = 0; i < ninfo; i++) {
+		if (nic_info[i].filtered)
+			continue;
+		multirail_config->units[j] = nic_info[i].unit;
+		multirail_config->ports[j] = nic_info[i].port;
+		multirail_config->addr_indexes[j] = nic_info[i].addr_index;
+		multirail_config->num_rails++;
+		j++;
+	}
+}
+
+// select a list of NICs to use, optimizing for CPU locality first
+static psm2_error_t
+psm3_ep_multirail_autoselect_cpu_centric(uint32_t nunits,
+				struct multirail_config *multirail_config)
+{
+	unsigned ninfo;
+	struct nic_info nic_info[PSMI_MAX_RAILS];
+
+	// enumerate addr_index too
+	ninfo = nic_info_init(nic_info, nunits, 1);
+	if (! ninfo) {
+		// caller will try single NIC selection next
+		multirail_config->num_rails = 0;
+		return PSM2_OK;
+	}
+
+	nic_info_filter_cpu_centric(nic_info, ninfo);
+
+	// we will use all unfiltered units
+
+	// ensure psm3_context_set_affinity doesn't unnecessarily narrow CPU
+	// selection; it will be called per rail, and if rails are in
+	// different CPU NUMA domains it could have an undesired impact
+	setenv("PSM3_NO_AFFINITY", "1", 1);
+
+	psm3_copy_nic_info_to_multitrail_config(nic_info, ninfo, multirail_config);
+	return PSM2_OK;
+}
+
+#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY
+// select a list of NICs to use, optimizing for GPU locality first
+static psm2_error_t
+psm3_ep_multirail_autoselect_gpu_centric(uint32_t nunits,
+				struct multirail_config *multirail_config)
+{
+	unsigned ninfo;
+	struct nic_info nic_info[PSMI_MAX_RAILS];
+
+	// enumerate addr_index too
+	ninfo = nic_info_init(nic_info, nunits, 1);
+	if (!
ninfo) {
+		// caller will try single NIC selection next
+		multirail_config->num_rails = 0;
+		return PSM2_OK;
+	}
+
+	nic_info_filter_gpu_centric(nic_info, ninfo);
+
+	// we will use all unfiltered units
+
+	// ensure psm3_context_set_affinity doesn't unnecessarily narrow CPU
+	// selection; it will be called per rail, and if rails are in
+	// different CPU NUMA domains it could have an undesired impact
+	setenv("PSM3_NO_AFFINITY", "1", 1);
+
+	psm3_copy_nic_info_to_multitrail_config(nic_info, ninfo, multirail_config);
+	return PSM2_OK;
+}
+#endif /* PSM_HAVE_GPU_CENTRIC_AFFINITY */
+
+// for use in psm3_ep_multirail_autoselect so can sort rails
+// by subnet and addr_index
+struct rail_info {
+	psmi_subnet128_t subnet;
+	unsigned unit;
+	unsigned port;
+	unsigned addr_index;
+};
+
+static int cmpfunc(const void *p1, const void *p2)
+{
+	struct rail_info *a = ((struct rail_info *) p1);
+	struct rail_info *b = ((struct rail_info *) p2);
+	int ret;
+
+	ret = psmi_subnet128_cmp(a->subnet, b->subnet);
+	if (ret == 0) {
+		if (a->addr_index < b->addr_index)
+			return -1;
+		else if (a->addr_index > b->addr_index)
+			return 1;
+	}
+	return ret;
+}
+
+// Multirail enabled, autoselect one or more NICs for this process
+// multirail_mode is PSM3_MULTIRAIL selection (1=all NICs, 2=NUMA local NICs)
+static psm2_error_t
+psm3_ep_multirail_autoselect(int multirail_mode,
+				struct multirail_config *multirail_config)
+{
+	uint32_t num_units = 0;
+	psmi_subnet128_t subnet;
+	unsigned i, j, k, count = 0;
+	int ret;
+	psm2_error_t err = PSM2_OK;
+	struct rail_info rail_info[PSMI_MAX_RAILS];
+	int multirail_within_socket_used = 0;
+	int node_id = -1, found = 0;
+
+	if (multirail_mode == 2)
+		multirail_within_socket_used = 1;
+
+
+	if ((err = psm3_ep_num_devunits(&num_units))) {
+		return err;
+	}
+
+	if (num_units > PSMI_MAX_RAILS) {
+		_HFI_INFO
+			("Found %d units, max %d units are supported, using first %d\n",
+			num_units, PSMI_MAX_RAILS, PSMI_MAX_RAILS);
+		num_units = PSMI_MAX_RAILS;
+	}
+
+	if (multirail_mode == 3)
+		return psm3_ep_multirail_autoselect_cpu_centric(num_units, multirail_config);
+#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY
+	if (multirail_mode == 4)
+		return psm3_ep_multirail_autoselect_gpu_centric(num_units, multirail_config);
+#endif
+
+	/*
+	 * PSM3_MULTIRAIL=2 functionality:
+	 * - Try to find at least one NIC in the same root
+	 *   complex. If none found, continue to run and
+	 *   use the remaining NICs in the system.
+	 * - If we do find at least one NIC in same root complex, we
+	 *   go ahead and add to list.
+	 */
+	if (multirail_within_socket_used) {
+		node_id = psm3_get_current_proc_location();
+		for (i = 0; i < num_units; i++) {
+			if (psmi_hal_get_unit_active(i) <= 0)
+				continue;
+			int node_id_i;
+
+			if (!psmi_hal_get_node_id(i, &node_id_i)) {
+				if (node_id_i == node_id) {
+					found = 1;
+					break;
+				}
+			}
+		}
+	}
+/*
+ * Get all the ports and addr_index with a valid lid and gid, one port per
+ * unit, but up to PSM3_ADDR_PER_NIC addresses.
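+ * For illustration (hypothetical values): with psm3_addr_per_nic == 2, a
+ * unit whose first usable port reports a valid lid for addr_index 0 and 1
+ * contributes two rail_info entries, both on that port.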
If we are using the NUMA selection
+ * algorithm and found at least 1 NUMA local NIC above, limit the list to
+ * NUMA local NICs, otherwise list all NICs
+ */
+	for (i = 0; i < num_units; i++) {
+		int node_id_i;
+
+		if (!psmi_hal_get_node_id(i, &node_id_i))
+		{
+			if (multirail_within_socket_used &&
+				found && (node_id_i != node_id))
+				continue;
+		}
+
+		for (j = PSM3_NIC_MIN_PORT; j <= PSM3_NIC_MAX_PORT; j++) {
+			int got_port = 0;
+			for (k = 0; k < psm3_addr_per_nic; k++) {
+				ret = psmi_hal_get_port_lid(i, j, k);
+				if (ret <= 0)
+					continue;
+				ret = psmi_hal_get_port_subnet(i, j, k, &subnet, NULL, NULL, NULL);
+				if (ret == -1)
+					continue;
+
+				rail_info[count].subnet = subnet;
+				rail_info[count].unit = i;
+				rail_info[count].port = j;
+				rail_info[count].addr_index = k;
+				got_port = 1;
+				count++;
+			}
+			if (got_port) // one port per unit
+				break;
+		}
+	}
+
+/*
+ * Sort all the ports within rail_info from small to big.
+ * This is for multiple fabrics, and we use the fabric with the
+ * smallest subnet to make the master connection.
+ */
+	qsort(rail_info, count, sizeof(rail_info[0]), cmpfunc);
+
+	for (i = 0; i < count; i++) {
+		multirail_config->units[i] = rail_info[i].unit;
+		multirail_config->ports[i] = rail_info[i].port;
+		multirail_config->addr_indexes[i] = rail_info[i].addr_index;
+	}
+	multirail_config->num_rails = count;
+	return PSM2_OK;
+}
+
+// process PSM3_MULTIRAIL and PSM3_MULTIRAIL_MAP and return the
+// list of unit/port/addr_index in multirail_config.
+// When multirail_config->num_rails is returned as 0, multirail is not enabled
+// and other mechanisms (PSM3_NIC, PSM3_NIC_SELECTION_ALG) must be
+// used by the caller to select a single NIC for the process.
+// This can return num_rails==1 if exactly 1 NIC is to be used by this process,
+// or num_rails>1 if this process is to stripe data across multiple NICs,
+// in which case the 1st NIC in multirail_config should be used as the
+// primary NIC for job communications setup.
+psm2_error_t
+psm3_ep_multirail(struct multirail_config *multirail_config)
+{
+	int ret;
+	union psmi_envvar_val env_multirail;
+	union psmi_envvar_val env_multirail_map;
+	int map_index;
+
+	psm3_getenv_range("PSM3_MULTIRAIL",
+		"Control use of multiple NICs",
+		"-1: No PSM3 NIC autoselection (middleware selects 1 NIC per process).\n"
+		" 0: (default) Middleware may select NICs or use PSM3 'autoselect_one'\n"
+		"    interface. 'autoselect_one' interface will pick 1 NIC per process\n"
+		"    based on PSM3_NIC_SELECTION_ALG.\n"
+		" 1: Enable multirail, each process uses all available NICs. Only 'autoselect'\n"
+		"    interface presented to middleware.\n"
+		" 2: Enable multirail, each process uses all NUMA local NICs. Only 'autoselect'\n"
+		"    interface presented to middleware. If no NUMA local NICs found for a given\n"
+		"    process, PSM3 will use all available NICs for that process.\n"
+		" 3: Enable multirail, each process selects only ideally located NICs with\n"
+#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY
+		"    consideration of NIC, CPU"
+#ifdef PSM3_HAVE_CPU_SUBNUMA
+		" sub-NUMA"
+#endif
+		" and GPU locations with priority given\n"
+		"    to CPU locality. 
Only 'autoselect' interface presented to middleware.\n" + " If no NUMA local NICs are found for a given process and all NICs are equal\n" + " distance to the GPU, PSM3 will use all available NICs for that process.\n" +#else + " consideration of NIC and CPU" +#ifdef PSM3_HAVE_CPU_SUBNUMA + " sub-NUMA" +#endif + " locations.\n" + " Only 'autoselect' interface presented to middleware.\n" + " If no NUMA local NICs are found for a given process, PSM3 will use all\n" + " available NICs for that process.\n" +#endif +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY + " 4: Enable multirail, each process selects only ideally located NICs with\n" + " consideration of NIC, GPU, and CPU" +#ifdef PSM3_HAVE_CPU_SUBNUMA + " sub-NUMA" +#endif + " locations with priority given\n" + " to GPU locality. Only 'autoselect' interface presented to middleware.\n" + " If no NUMA local NICs are found for a given process, PSM3 will use all\n" + " available NICs of equal distance to the GPU for that process." +#endif + , + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)0, +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY + (union psmi_envvar_val)-1, (union psmi_envvar_val)4, +#else + (union psmi_envvar_val)-1, (union psmi_envvar_val)3, +#endif + NULL, NULL, &env_multirail); + if (env_multirail.e_int <= 0) { + // will pick 1 NIC per process + multirail_config->num_rails = 0; + return PSM2_OK; + } + + if (env_multirail.e_int == 1 || env_multirail.e_int == 2) { + // TBD - move this code to a separate function + // for PSM3_MULTIRAIL=1 or 2, PSM3_MULTIRAIL_MAP can explicitly select NICs. + // We treat invalid input, such as bad syntax or selection of an unusable + // port (down/missing/etc), as a fatal error instead of attempting to run + // on the default PSM3_MULTIRAIL_MAP config. This helps avoid + // inconsistent NIC selections, especially for down ports, which may + // cause confusing behaviors or errors. + // If PSM3_MULTIRAIL_MAP contains multiple lists of NICs, then + // if PSM3_MULTIRAIL=1 - use local rank index (0, ...) to select + // if PSM3_MULTIRAIL=2 - use process NUMA (0, ...) 
to select + if (env_multirail.e_int == 1) { + map_index = psm3_get_mylocalrank(); + } else if (env_multirail.e_int == 2) { + map_index = psm3_get_current_proc_location(); + if (map_index < 0) { + return psm3_handle_error(PSMI_EP_NORETURN, + PSM2_EP_DEVICE_FAILURE, + "Unable to get NUMA location of current process\n"); + } + } else { + psmi_assert(0); + } + ret = psm3_getenv_range("PSM3_MULTIRAIL_MAP", + "Explicit NIC selections for each rail", + "Specified as:\n" + " rail,rail,...;rail,rail,...\n" +#if 0 + "Where rail can be: unit:port-addr_index or unit\n" +#else + "Where rail can be: unit-addr_index or unit\n" +#endif + "unit can be device name or unit number\n" +#if 0 + "where :port is optional (default of 1)\n" +#endif + "addr_index can be 0 to PSM3_ADDR_PER_NIC-1, or 'any' or 'all'\n" + "When addr_index is omitted, it defaults to 'all'\n" + "When more than 1 set of rails is present (each set is separated by ;),\n" + "the set to use for a given process is selected based on PSM3_MULTIRAIL.\n" + " 1 - use local rank number to select\n" + " 2 - use local CPU NUMA to select\n" + "When empty, PSM3 will autoselect NICs as controlled by PSM3_MULTIRAIL.", + PSMI_ENVVAR_LEVEL_USER|PSMI_ENVVAR_FLAG_FATAL, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)"", + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + psm3_parse_check_multirail_map, &map_index, &env_multirail_map); + if (ret < 0) { // syntax error in input, ret error instead of using default + psmi_assert(0); // should not get here since specified FLAG_FATAL + multirail_config->num_rails = 0; + return psm3_handle_error(PSMI_EP_NORETURN, + PSM2_EP_DEVICE_FAILURE, + "Invalid value for PSM3_MULTIRAIL_MAP: '%s', can't proceed\n", + env_multirail_map.e_str); + } + if (! ret) { + // valid input + if (psm3_parse_multirail_map(env_multirail_map.e_str, map_index, 0, NULL, + multirail_config) < 0) { + // already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); + } + return PSM2_OK; + } + } + + // multirail enabled, automatically select 1 or more NICs + return psm3_ep_multirail_autoselect(env_multirail.e_int, multirail_config); +} + +// potential job start hwloc initialization. To avoid overhead +// when hwloc is not needed, we defer to the 1st actual need for hwloc +void +psm3_hwloc_topology_init() +{ +} + +#if defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA) +// deferred hwloc initialization. Caller must hold psm3_creation_lock +static void psm3_deferred_hwloc_topology_init() +{ + unsigned version; + Dl_info info_hwloc; + const char *location; + + // only try once + if (psm3_hwloc_topology_initialized || psm3_hwloc_topology_init_failed) + return; + +#define SHOW_HWLOC_VERSION(ver) (ver)>>16, ((ver) >> 8) & 0xff, (ver) & 0xff + version = hwloc_get_api_version(); + location = dladdr(hwloc_topology_init, &info_hwloc) ? 
+ info_hwloc.dli_fname : "hwloc path not available"; + if ((version >> 16) != (HWLOC_API_VERSION >> 16)) { + _HFI_ERROR("PSM3 was compiled for hwloc API %u.%u.%u but found library API %u.%u.%u at %s.\n" + "You may need to point LD_LIBRARY_PATH to the right hwloc library.\n" + "Disabling some NIC selection affinity features\n", + SHOW_HWLOC_VERSION(HWLOC_API_VERSION), SHOW_HWLOC_VERSION(version), + location); + psm3_hwloc_topology_init_failed = 1; + return; + } + // HWLOC_VERSION string mentioned in docs, but not defined in headers + psm3_print_identify("%s %s hwloc runtime API %u.%u.%u at %s, built against API %u.%u.%u\n", + psm3_get_mylabel(), psm3_ident_tag, + SHOW_HWLOC_VERSION(version), location, + SHOW_HWLOC_VERSION(HWLOC_API_VERSION)); + + hwloc_topology_init(&psm3_hwloc_topology); + // detection configuration, need all PCI devices and CPU sub-numa + // HWLOC_API_VERSION is rev X.Y.Z as (X<<16)+(Y<<8)+Z + // significant API changes from 1.0 to 2.0, including ABI changes +#if HWLOC_API_VERSION < 0x20000 + hwloc_topology_set_flags(psm3_hwloc_topology, + HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_IO_BRIDGES); +#else + hwloc_topology_set_io_types_filter(psm3_hwloc_topology, + HWLOC_TYPE_FILTER_KEEP_ALL); +#endif + hwloc_topology_load(psm3_hwloc_topology); + psm3_hwloc_topology_initialized = 1; +} +#endif /* defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA) */ + +void +psm3_hwloc_topology_destroy() +{ +#if defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA) + if (psm3_hwloc_topology_initialized) { + psm3_hwloc_topology_initialized = 0; + hwloc_topology_destroy(psm3_hwloc_topology); + } +#endif +} + +#if defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA) +/* Get the next PCI device in the system. + * + * return the first PCI device if prev is NULL. + * looping on this allows iterating through all PCIe devices + * device=any PCIe component (root controller, bridge, switch, device, etc) + */ +static inline hwloc_obj_t +get_next_pcidev(hwloc_topology_t topology, hwloc_obj_t prev) +{ + return hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_PCI_DEVICE, prev); +} + +/* Find the PCI device hwloc object matching the PCI bus id + * given domain, bus, device and func PCI bus id. + */ +static hwloc_obj_t +get_pcidev_by_busid(hwloc_topology_t topology, + const struct pci_addr *addr) +{ + hwloc_obj_t obj = NULL; + while ((obj = get_next_pcidev(topology, obj)) != NULL) { + if (obj->attr->pcidev.domain == addr->domain + && obj->attr->pcidev.bus == addr->bus + && obj->attr->pcidev.dev == addr->dev + && obj->attr->pcidev.func == addr->func) + return obj; + } + return NULL; +} +#endif /* defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA) */ + +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY +// compare two hwloc objects for equality +// 1 on match, 0 on mismatch +static int equal_hwlocobj(const hwloc_obj_t obj1, const hwloc_obj_t obj2) +{ + return (obj1->type == obj2->type + && obj1->depth == obj2->depth + && obj1->logical_index == obj2->logical_index); +} + +// compute distance in between objects (PCIe devices). 
+// If the devices are on different PCIe controllers and/or different CPU sockets
+// returns INT_MAX
+static int get_distance_to_common_ancestor(const hwloc_obj_t obj1, const hwloc_obj_t obj2)
+{
+	int d1 = 0;
+	int d2 = 0;
+	hwloc_obj_t temp1 = obj1;
+
+	while (temp1) {
+		hwloc_obj_t temp2 = obj2;
+		d2 = 0;
+
+		while (temp2) {
+			/* common ancestor found */
+			if (equal_hwlocobj(temp1, temp2)) {
+				return d1 + d2;
+			}
+			temp2 = temp2->parent;
+			d2++;
+		}
+		temp1 = temp1->parent;
+		d1++;
+	}
+
+	/* No common ancestor found, return INT_MAX as the distance */
+	return INT_MAX;
+}
+
+// compute distance in PCIe hops between devices.
+// If the devices are on different PCIe controllers and/or different CPU sockets
+// returns INT_MAX
+static int psm3_get_distance_between_pcis(const struct pci_addr *pci_addr_1,
+					const struct pci_addr *pci_addr_2)
+{
+	hwloc_obj_t obj1 = get_pcidev_by_busid(psm3_hwloc_topology, pci_addr_1);
+	hwloc_obj_t obj2 = get_pcidev_by_busid(psm3_hwloc_topology, pci_addr_2);
+	return get_distance_to_common_ancestor(obj1, obj2);
+}
+#endif /* PSM_HAVE_GPU_CENTRIC_AFFINITY */
+
+#ifdef PSM3_HAVE_CPU_SUBNUMA
+// find ancestor of a device, namely the PCIe controller in the CPU socket
+static hwloc_obj_t psm3_get_non_io_ancestor_obj(
+					const struct pci_addr *pci_addr)
+{
+	hwloc_obj_t obj = get_pcidev_by_busid(psm3_hwloc_topology, pci_addr);
+	if (! obj)
+		return NULL;
+	return hwloc_get_non_io_ancestor_obj(psm3_hwloc_topology, obj);
+}
+#endif /* PSM3_HAVE_CPU_SUBNUMA */
diff --git a/prov/psm3/psm3/psm_nic_select.h b/prov/psm3/psm3/psm_nic_select.h
new file mode 100644
index 00000000000..cfd23ea1081
--- /dev/null
+++ b/prov/psm3/psm3/psm_nic_select.h
@@ -0,0 +1,116 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2024 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2024 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED.
  IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2024 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_IN_USER_H
+#error psm_nic_select.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSM_NIC_SELECT_H
+#define _PSM_NIC_SELECT_H
+
+// PSM3_NIC_SELECTION_ALG choices
+/*
+ * round robin contexts across HFIs, then
+ * ports; this is the default.
+ * This option spreads the HFI selection within the local socket.
+ * If it is preferred to spread the job over the entire set of
+ * HFIs within the system, see ALG_ACROSS_ALL below.
+ */
+#define PSMI_UNIT_SEL_ALG_ACROSS    PSM_HAL_ALG_ACROSS
+
+#define PSMI_UNIT_SEL_ALG_ACROSS_ALL PSM_HAL_ALG_ACROSS_ALL
+
+/*
+ * use all contexts on an HFI (round robin
+ * active ports within), then next HFI
+ */
+#define PSMI_UNIT_SEL_ALG_WITHIN    PSM_HAL_ALG_WITHIN
+
+#define PSMI_UNIT_SEL_ALG_CPU_CENTRIC PSM_HAL_ALG_CPU_CENTRIC
+#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY
+#define PSMI_UNIT_SEL_ALG_GPU_CENTRIC PSM_HAL_ALG_GPU_CENTRIC
+#endif
+
+struct multirail_config {
+	int num_rails;
+	uint32_t units[PSMI_MAX_RAILS];
+	uint16_t ports[PSMI_MAX_RAILS];
+	int addr_indexes[PSMI_MAX_RAILS];
+};
+
+// return set of units to consider and which to start at.
+// caller will use 1st active unit which can be opened.
+// caller will wrap around so it's valid for start >= end.
+// Note: When using multiple rails per PSM process, higher level code will
+// walk through desired units and unit_param will specify a specific unit.
+// if unit_param is PSM3_NIC_ANY, this will pick starting point for nic search
+psm2_error_t
+psm3_compute_start_and_end_unit(long unit_param, long addr_index,
+				int nunitsactive, int nunits,
+				psm2_uuid_t const job_key,
+				long *unit_start, long *unit_end);
+
+psm2_error_t
+psm3_ep_multirail(struct multirail_config *multirail_config);
+
+// decrement any NIC refcounts which may have been
+// incremented by psm3_compute_start_and_end_unit
+void psm3_dec_nic_refcount(int unit_id);
+
+// manage hwloc topology discovery. These are no-ops when PSM_USE_HWLOC
+// is not defined.
+void psm3_hwloc_topology_init();
+void psm3_hwloc_topology_destroy();
+
+#endif /* _PSM_NIC_SELECT_H */
diff --git a/prov/psm3/psm3/psm_oneapi_ze.c b/prov/psm3/psm3/psm_oneapi_ze.c
index 568581ad84b..2090fb68326 100644
--- a/prov/psm3/psm3/psm_oneapi_ze.c
+++ b/prov/psm3/psm3/psm_oneapi_ze.c
@@ -70,6 +70,7 @@ int psm3_num_ze_dev_fds;
 #endif
 int psm3_oneapi_immed_sync_copy;
 int psm3_oneapi_immed_async_copy;
+unsigned psm3_oneapi_parallel_dtod_copy_thresh;
 const char* psmi_oneapi_ze_result_to_string(const ze_result_t result) {
 #define ZE_RESULT_CASE(RES) case ZE_RESULT_##RES: return STRINGIFY(RES)
@@ -203,6 +204,72 @@ void psmi_oneapi_ze_memcpy(void *dstptr, const void *srcptr, size_t size)
 	}
 }
 
+// synchronous GPU memcpy DTOD (xeLink)
+void psmi_oneapi_ze_memcpy_DTOD(void *dstptr, const void *srcptr, size_t size)
+{
+	struct ze_dev_ctxt *ctxt;
+
+	psmi_assert(size > 0);
+	ctxt = psmi_oneapi_dev_ctxt_get(dstptr);
+	if (!ctxt) {
+		_HFI_ERROR("dst %p src %p not GPU buf for copying\n",
+				dstptr, srcptr);
+		return;
+	}
+	if (size <= psm3_oneapi_parallel_dtod_copy_thresh) {
+		if (psm3_oneapi_immed_sync_copy) {
+			PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl,
+				dstptr, srcptr, size, NULL, 0, NULL);
+		} else {
+			PSMI_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->cl);
+			PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl,
+				dstptr, srcptr, size, NULL, 0, NULL);
+			PSMI_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->cl);
+			PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->cq,
+				1, &ctxt->cl, NULL);
+			PSMI_ONEAPI_ZE_CALL(zeCommandQueueSynchronize, ctxt->cq, UINT32_MAX);
+		}
+	} else {
+		// for large DTOD copies, start 2 parallel commands
+		// then wait for both
+		size_t size0 = ROUNDUP64P2(size/2, 64*1024);
+		size_t size1 = size - size0;
+
+		if (psm3_oneapi_immed_sync_copy) {
+			PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl0,
+				dstptr, srcptr, size0, ctxt->copy_status0, 0, NULL);
+
+			PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl1,
+				(void*)((uintptr_t)dstptr+size0),
+				(void*)((uintptr_t)srcptr+size0), size1, ctxt->copy_status1,
+				0, NULL);
+		} else {
+			PSMI_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->async_cl0);
+			PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl0,
+				dstptr, srcptr, size0, ctxt->copy_status0, 0, NULL);
+			PSMI_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->async_cl0);
+			PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->async_cq0,
+				1, &ctxt->async_cl0, NULL);
+
+			PSMI_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->async_cl1);
+			PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl1,
+				(void*)((uintptr_t)dstptr+size0),
+				(void*)((uintptr_t)srcptr+size0), size1, ctxt->copy_status1,
+				0, NULL);
+			PSMI_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->async_cl1);
+			PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->async_cq1,
+				1, &ctxt->async_cl1, NULL);
+		}
+		// 2nd copy may be slightly smaller so wait for it first so we
+		// can potentially hide its Reset latency while the 1st copy completes
+		PSMI_ONEAPI_ZE_CALL(zeEventHostSynchronize, ctxt->copy_status1, UINT32_MAX);
+		PSMI_ONEAPI_ZE_CALL(zeEventHostReset, ctxt->copy_status1);
+
+		PSMI_ONEAPI_ZE_CALL(zeEventHostSynchronize, ctxt->copy_status0, UINT32_MAX);
+		PSMI_ONEAPI_ZE_CALL(zeEventHostReset, ctxt->copy_status0);
+	}
+}
+
 // for pipelined async GPU memcpy
 // *p_cq is left as NULL when psm3_oneapi_immed_async_copy enabled
 void psmi_oneapi_async_cmd_create(struct ze_dev_ctxt *ctxt,
diff --git a/prov/psm3/psm3/psm_perf.c
b/prov/psm3/psm3/psm_perf.c index 6b30ca60eeb..5e2f6c4f169 100644 --- a/prov/psm3/psm3/psm_perf.c +++ b/prov/psm3/psm3/psm_perf.c @@ -207,7 +207,7 @@ static void psmi_rdpmc_perf_framework_init() * * Read the current value of a running performance counter. */ -unsigned long long rdpmc_read(struct rdpmc_ctx *ctx) +unsigned long long psm3_rdpmc_read(struct rdpmc_ctx *ctx) { static __thread int rdpmc_perf_initialized = 0; diff --git a/prov/psm3/psm3/psm_perf.h b/prov/psm3/psm3/psm_perf.h index db51ceb2fa7..8fdea147fca 100644 --- a/prov/psm3/psm3/psm_perf.h +++ b/prov/psm3/psm3/psm_perf.h @@ -87,7 +87,7 @@ extern char global_rdpmc_slot_name[RDPMC_PERF_MAX_SLOT_NUMBER][RDPMC_PERF_MAX_SL extern unsigned int global_rdpmc_type; extern unsigned int global_rdpmc_config; -extern unsigned long long rdpmc_read(struct rdpmc_ctx *ctx); +extern unsigned long long psm3_rdpmc_read(struct rdpmc_ctx *ctx); #define RDPMC_PERF_INIT() \ { \ @@ -111,12 +111,12 @@ extern unsigned long long rdpmc_read(struct rdpmc_ctx *ctx); #define RDPMC_PERF_BEGIN(slot_number) \ { \ - global_rdpmc_begin[(slot_number)] = rdpmc_read(&global_rdpmc_ctx); \ + global_rdpmc_begin[(slot_number)] = psm3_rdpmc_read(&global_rdpmc_ctx); \ } #define RDPMC_PERF_END(slot_number) \ { \ - global_rdpmc_summ[(slot_number)] += (rdpmc_read(&global_rdpmc_ctx) - global_rdpmc_begin[(slot_number)]); \ + global_rdpmc_summ[(slot_number)] += (psm3_rdpmc_read(&global_rdpmc_ctx) - global_rdpmc_begin[(slot_number)]); \ global_rdpmc_number[(slot_number)]++; \ } diff --git a/prov/psm3/psm3/psm_stats.c b/prov/psm3/psm3/psm_stats.c index 400a8e8c55e..4ae33fe9a85 100644 --- a/prov/psm3/psm3/psm_stats.c +++ b/prov/psm3/psm3/psm_stats.c @@ -641,30 +641,54 @@ psm2_error_t psm3_stats_initialize(void) { union psmi_envvar_val env_stats_freq; + union psmi_envvar_val env_stats_prefix; union psmi_envvar_val env_stats_help; union psmi_envvar_val env_statsmask; - int got_stats_freq; - int got_stats_help; - int got_statsmask; + int noenv_stats_freq; // env var not specified, used default + int noenv_stats_prefix; // env var not specified, used default + int noenv_stats_help; // env var not specified, used default + int noenv_statsmask; // env var not specified, used default psmi_assert(! 
perf_stats_initialized);
 
-	got_stats_freq = psm3_getenv("PSM3_PRINT_STATS",
-			"Prints performance stats every n seconds to file "
-			"./psm3-perf-stat-[hostname]-pid-[pid] when set to -1 stats are "
-			"printed only once on 1st ep close",
-			PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
-			(union psmi_envvar_val) 0, &env_stats_freq);
-	print_stats_freq = env_stats_freq.e_uint;
-
-	got_stats_help = psm3_getenv("PSM3_PRINT_STATS_HELP",
+	noenv_stats_freq = (0 < psm3_getenv_range("PSM3_PRINT_STATS",
+			"Prints performance stats every n seconds",
+			"  0 - disable output\n"
+			"  -1 - only output once at end of job on 1st ep close\n"
+			"  >=1 - output every n seconds\n"
+			"  val: - limit output to rank 0 (for val of -1 or >=1)\n"
+			"  val:pattern - limit output to processes whose label matches\n    "
+#ifdef FNM_EXTMATCH
+			"extended "
+#endif
+			"glob pattern (for val of -1 or >=1)\n"
+			"Output goes to file ${PSM3_PRINT_STATS_PREFIX}psm3-perf-stat-[hostname]-pid-[pid]",
+			PSMI_ENVVAR_LEVEL_USER|PSMI_ENVVAR_FLAG_NOABBREV,
+			PSMI_ENVVAR_TYPE_STR_VAL_PAT_INT,
+			(union psmi_envvar_val)"0",
+			(union psmi_envvar_val)-1, (union psmi_envvar_val)INT_MAX,
+			NULL, NULL, &env_stats_freq));
+	(void)psm3_parse_val_pattern_int(env_stats_freq.e_str, 0,
+			&print_stats_freq,
+			PSMI_ENVVAR_FLAG_NOABBREV, -1, INT_MAX);
+
+	noenv_stats_prefix = (0 < psm3_getenv_range("PSM3_PRINT_STATS_PREFIX",
+			"Prefix for filename for performance stats output",
+			"May be used to add a prefix possibly including directory for output",
+			PSMI_ENVVAR_LEVEL_USER|PSMI_ENVVAR_FLAG_NOABBREV,
+			PSMI_ENVVAR_TYPE_STR,
+			(union psmi_envvar_val)"./",
+			(union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL,
+			NULL, NULL, &env_stats_prefix));
+
+	noenv_stats_help = (0 < psm3_getenv("PSM3_PRINT_STATS_HELP",
 			"Prints performance stats help text on rank 0 to file "
-			"./psm3-perf-stat-help-[hostname]-pid-[pid]",
+			"${PSM3_PRINT_STATS_PREFIX}psm3-perf-stat-help-[hostname]-pid-[pid]",
 			PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
-			(union psmi_envvar_val) 0, &env_stats_help);
+			(union psmi_envvar_val) 0, &env_stats_help));
 	print_stats_help = env_stats_help.e_uint && (psm3_get_myrank() == 0);
 
-	got_statsmask = psm3_getenv("PSM3_PRINT_STATSMASK",
+	noenv_statsmask = (0 < psm3_getenv("PSM3_PRINT_STATSMASK",
 			"Mask of statistic types to print: "
 			"MQ=1, RCVTHREAD=0x100, IPS=0x200"
#if defined(PSM_HAVE_REG_MR)
@@ -681,21 +705,21 @@ psm3_stats_initialize(void)
#endif
			". 0x100000 causes zero values to also be shown",
			PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
-			(union psmi_envvar_val) PSMI_STATSTYPE_ALL, &env_statsmask);
+			(union psmi_envvar_val) PSMI_STATSTYPE_ALL, &env_statsmask));
 	print_statsmask = env_statsmask.e_uint;
 
 	stats_start = time(NULL);
 
 	snprintf(perf_file_name, sizeof(perf_file_name),
-			"./psm3-perf-stat-%s-pid-%d",
-			psm3_gethostname(), getpid());
+			"%spsm3-perf-stat-%s-pid-%d",
+			env_stats_prefix.e_str, psm3_gethostname(), getpid());
 
 	if (print_stats_help) {
 		// a few options, such as CUDA, ONEAPI_ZE, RDMA affect what is
 		// included in help, so use a unique filename per job
 		snprintf(perf_help_file_name, sizeof(perf_help_file_name),
-				"./psm3-perf-stat-help-%s-pid-%d",
-				psm3_gethostname(), getpid());
+				"%spsm3-perf-stat-help-%s-pid-%d",
+				env_stats_prefix.e_str, psm3_gethostname(), getpid());
 		perf_help_fd = fopen(perf_help_file_name, "w");
 		if (!perf_help_fd)
 			_HFI_ERROR("Failed to create fd for performance logging help: %s: %s\n",
@@ -706,13 +730,19 @@ psm3_stats_initialize(void)
 	print_job_info_help();
 	print_basic_job_info();
 
-	if (got_stats_freq)
+	// if got a valid value or an invalid value, psm3_getenv will have
+	// stashed it and print_basic_job_info will have put in stats file
+	// otherwise we want to always report the STATS variable settings
+	if (noenv_stats_freq)
 		psm3_stats_print_env_val("PSM3_PRINT_STATS",
 				PSMI_ENVVAR_TYPE_UINT, env_stats_freq);
-	if (got_stats_help)
+	if (noenv_stats_prefix)
+		psm3_stats_print_env_val("PSM3_PRINT_STATS_PREFIX",
+				PSMI_ENVVAR_TYPE_STR, env_stats_prefix);
+	if (noenv_stats_help)
 		psm3_stats_print_env_val("PSM3_PRINT_STATS_HELP",
 				PSMI_ENVVAR_TYPE_UINT, env_stats_help);
-	if (got_statsmask)
+	if (noenv_statsmask)
 		psm3_stats_print_env_val("PSM3_PRINT_STATSMASK",
 				PSMI_ENVVAR_TYPE_UINT_FLAGS, env_statsmask);
diff --git a/prov/psm3/psm3/psm_sysbuf.c b/prov/psm3/psm3/psm_sysbuf.c
index f9bee0be199..698507e8528 100644
--- a/prov/psm3/psm3/psm_sysbuf.c
+++ b/prov/psm3/psm3/psm_sysbuf.c
@@ -77,11 +77,46 @@ struct psmi_mem_block_ctrl {
 void psm3_mq_sysbuf_init(psm2_mq_t mq)
 {
 	int i;
+	// sysbuf is used for unexpected eager messages in nic, shm and self.
+	// for self, unexpected is a courtesy to bad apps; app should always post
+	// recv before send when sending to self.
+	// for nic, eager is only messages below rendezvous threshold.
+	// In TCP and CPU jobs threshold can be larger.  TCP allows up to 256K.
+	// Typical verbs rendezvous threshold is 8000-64K bytes, with GPU
+	// tending to use a lower threshold as GPU copies are expensive.
+	// for shm, GPU messages use rendezvous anytime GPU supports Scale-Up
+	// GPU to GPU comms, such as xeLink or nvLink.
+	// A message which exceeds the largest block_size[] will have a temporary
+	// sysbuf allocated and freed.  For CPU this is ok as malloc is not
+	// terribly expensive.  However for GPU, the subsequent copy will pay
+	// a GPU DMA registration cost in CUDA or Level Zero, so it is best to
+	// avoid temporary buffers.  Fortunately GPU apps tend to have fewer
+	// processes per node and hence more available CPU memory to hold the
+	// buffers.
+	//
+	// So for GPU jobs, we allow a few larger block sizes just in case
+	// rendezvous threshold is set high or TCP is being used with a large
+	// eager message size (aka PSM3_MTU).
+	// replenishing_rate is how many we add to pool at a time; there is
+	// no upper bound to the pool.
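+	// Worked example (illustration only, not behavior added by this patch):
+	// with the GPU tables below, a 200000 byte unexpected eager message
+	// falls into the 262144 byte pool and its buffer is recycled on free;
+	// in a CPU-only build the largest sized pool is 8192 bytes, so the same
+	// message lands in the (uint32_t)-1 catch-all and uses a transient
+	// malloc/free sysbuf instead.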
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+	uint32_t gpu_block_sizes[] = {256, 512, 1024, 2048, 4096, 8192, 65536, 262144, (uint32_t)-1};
+	uint32_t gpu_replenishing_rate[] = {128, 64, 32, 16, 8, 4, 2, 2, 0};
+	uint32_t block_sizes[] = {256, 512, 1024, 2048, 4096, 8192, (uint32_t)-1, (uint32_t)-1, (uint32_t)-1};
+	uint32_t replenishing_rate[] = {128, 64, 32, 16, 8, 4, 0, 0, 0};
+#else
 	uint32_t block_sizes[] = {256, 512, 1024, 2048, 4096, 8192, (uint32_t)-1};
 	uint32_t replenishing_rate[] = {128, 64, 32, 16, 8, 4, 0};
+#endif
 
 	if (mq->mem_ctrl_is_init)
 		return;
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+	if (PSMI_IS_GPU_ENABLED) {
+		memcpy(block_sizes, gpu_block_sizes, sizeof(block_sizes));
+		memcpy(replenishing_rate, gpu_replenishing_rate, sizeof(replenishing_rate));
+	}
+#endif
 	mq->mem_ctrl_is_init = 1;
 
 	for (i=0; i < MM_NUM_OF_POOLS; i++) {
@@ -125,9 +160,35 @@ void psm3_mq_sysbuf_fini(psm2_mq_t mq)  // free all buffers that is currently no
 	for (i=0; i < MM_NUM_OF_POOLS; i++) {
 		while ((block = mq->handler_index[i].free_list) != NULL) {
 			mq->handler_index[i].free_list = block->next;
+#if defined(PSM_CUDA) && !defined(PSM_NO_CUDA_REGISTER)
+			if (PSMI_IS_GPU_ENABLED && cu_ctxt) {
+				/* ignore NOT_REGISTERED in case cuda initialized late */
+				/* ignore other errors as context could be destroyed before this */
+				CUresult cudaerr;
+				//PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED,
+				//		cuMemHostUnregister, block);
+				psmi_count_cuMemHostUnregister++;
+				cudaerr = psmi_cuMemHostUnregister(block);
+				if (cudaerr) {
+					const char *pStr = NULL;
+					psmi_count_cuGetErrorString++;
+					psmi_cuGetErrorString(cudaerr, &pStr);
+					_HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n",
+						cudaerr, pStr?pStr:"Unknown");
+				}
+			}
+#endif
#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT)
-			if (PSMI_IS_GPU_ENABLED)
-				PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, block);
+			if (PSMI_IS_GPU_ENABLED) {
+				ze_result_t result;
+				//PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, block);
+				psmi_count_zexDriverReleaseImportedPointer++;
+				result = psmi_zexDriverReleaseImportedPointer(ze_driver,
+								block);
+				if (result != ZE_RESULT_SUCCESS) {
+					_HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result));
+				}
+			}
#endif
 			psmi_free(block);
 		}
@@ -168,6 +229,13 @@ void *psm3_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t alloc_size)
 		new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz);
 
 		if (new_block) {
+#if defined(PSM_CUDA) && !defined(PSM_NO_CUDA_REGISTER)
+			// for transient buffers, no use Importing, adds cost for
+			// CPU copy, just pay GPU cost on the copy, we use once & free
+			//if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt())
+			//	PSMI_CUDA_CALL(cuMemHostRegister, new_block, newsz,
+			//			CU_MEMHOSTALLOC_PORTABLE);
+#endif
#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT)
 			// for transient buffers, no use Importing, adds cost for
 			// CPU copy, just pay GPU cost on the copy, we use once & free
@@ -189,6 +257,14 @@ void *psm3_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t alloc_size)
 		new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz);
 
 		if (new_block) {
+#if defined(PSM_CUDA) && !defined(PSM_NO_CUDA_REGISTER)
+			// By registering memory with CUDA, we make
+			// cuMemcpy* run faster for copies between
+			// GPU and this sysbuf
+			if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt())
+				PSMI_CUDA_CALL(cuMemHostRegister, new_block, newsz,
+						CU_MEMHOSTALLOC_PORTABLE);
+#endif
#if defined(PSM_ONEAPI)
&& !defined(PSM3_NO_ONEAPI_IMPORT) // By registering memory with Level Zero, we make // zeCommandListAppendMemoryCopy run faster for copies between @@ -233,11 +309,21 @@ void psm3_mq_sysbuf_free(psm2_mq_t mq, void * mem_to_free) mm_handler = block_to_free->mem_handler; if (mm_handler->flags & MM_FLAG_TRANSIENT) { +#if defined(PSM_CUDA) && !defined(PSM_NO_CUDA_REGISTER) + // for transient buffers, no use Importing, adds cost for + // CPU copy, just pay GPU cost on the copy, we use once & free + //if (PSMI_IS_GPU_ENABLED && cu_ctxt) { + // /* ignore NOT_REGISTERED in case cuda initialized late */ + // CUresult cudaerr; + // PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, + // cuMemHostUnregister, block_to_free); + //} +#endif #if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) // for transient buffers, no use Importing, adds cost for // CPU copy, just pay GPU cost on the copy, we use once & free //if (PSMI_IS_GPU_ENABLED) - // PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, block); + // PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, block_to_free); #endif psmi_free(block_to_free); } else { diff --git a/prov/psm3/psm3/psm_sysbuf.h b/prov/psm3/psm3/psm_sysbuf.h index 90945d520ed..31ff116d088 100644 --- a/prov/psm3/psm3/psm_sysbuf.h +++ b/prov/psm3/psm3/psm_sysbuf.h @@ -58,7 +58,11 @@ #include "psm_user.h" +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#define MM_NUM_OF_POOLS 9 +#else #define MM_NUM_OF_POOLS 7 +#endif typedef struct psmi_mem_ctrl { struct psmi_mem_block_ctrl *free_list; diff --git a/prov/psm3/psm3/psm_user.h b/prov/psm3/psm3/psm_user.h index 38e9b8d9310..18c58d9934d 100644 --- a/prov/psm3/psm3/psm_user.h +++ b/prov/psm3/psm3/psm_user.h @@ -60,6 +60,13 @@ extern "C" { #endif +#if defined(PSM_CUDA) +// if defined, do not use cuMemHostRegister for malloced pipeline +// copy bounce buffers +// otherwise, use cuMemHostRegister when malloc buffer +//#define PSM3_NO_CUDA_REGISTER +#endif + #if defined(PSM_ONEAPI) // if defined, use malloc for pipeline copy bounce buffers // otherwise, use zeMemAllocHost @@ -116,6 +123,10 @@ extern "C" { #endif /* RNDV_MOD */ +#if (defined(PSM_CUDA) || defined(PSM_ONEAPI)) && defined(PSM_USE_HWLOC) +#define PSM_HAVE_GPU_CENTRIC_AFFINITY +#endif + #include "psm_config.h" #include #include @@ -166,6 +177,7 @@ typedef void *psmi_hal_hw_context; #include "psm_help.h" #include "psm_error.h" +#include "psm_nic_select.h" #include "psm_context.h" #include "psm_utils.h" #include "psm_timer.h" @@ -208,6 +220,7 @@ extern int psm3_opened_endpoint_count; extern int psm3_affinity_shared_file_opened; extern uint64_t *psm3_shared_affinity_ptr; +extern uint64_t *psm3_shared_affinity_nic_refcount_ptr; extern char *psm3_affinity_shm_name; extern sem_t *psm3_sem_affinity_shm_rw; @@ -378,6 +391,8 @@ extern uint32_t gpudirect_rdma_send_limit; extern uint32_t gpudirect_rdma_recv_limit; extern uint32_t gpu_thresh_rndv; +#define MAX_ZE_DEVICES 8 + struct ips_gpu_hostbuf { STAILQ_ENTRY(ips_gpu_hostbuf) req_next; STAILQ_ENTRY(ips_gpu_hostbuf) next; @@ -390,8 +405,9 @@ struct ips_gpu_hostbuf { CUevent copy_status; #elif defined(PSM_ONEAPI) ze_event_pool_handle_t event_pool; - ze_command_list_handle_t command_list; + ze_command_list_handle_t command_lists[MAX_ZE_DEVICES]; ze_event_handle_t copy_status; + int cur_dev_inx; #endif psm2_mq_req_t req; void* host_buf; @@ -413,8 +429,6 @@ extern void *psmi_cuda_lib; #ifdef PSM_ONEAPI -#define MAX_ZE_DEVICES 8 - int psmi_oneapi_ze_initialize(void); psm2_error_t psm3_ze_init_fds(void); 
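+// Usage sketch (an inference for review purposes, not something this patch
+// defines): psm3_ze_init_fds() is expected to open the per-device Level Zero
+// fds once at job init, and psm3_ze_get_dev_fds() to return that fd array
+// with its count stored in *nfds.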
int *psm3_ze_get_dev_fds(int *nfds); @@ -428,11 +442,22 @@ extern int psm3_num_ze_dev_fds; struct ze_dev_ctxt { ze_device_handle_t dev; + int dev_index; /* Index in ze_devices[] */ uint32_t ordinal; /* CmdQGrp ordinal for the 1st copy_only engine */ uint32_t index; /* Cmdqueue index within the CmdQGrp */ uint32_t num_queues; /* Number of queues in the CmdQGrp */ + // for most sync copies ze_command_queue_handle_t cq; // NULL if psm3_oneapi_immed_sync_copy ze_command_list_handle_t cl; + // fields below are only used for large DTOD sync copy so can do 2 + // parallel async copies then wait for both + ze_event_handle_t copy_status0; + ze_event_handle_t copy_status1; + ze_command_list_handle_t async_cl0; + ze_command_list_handle_t async_cl1; + ze_command_queue_handle_t async_cq0;// NULL if psm3_oneapi_immed_sync_copy + ze_command_queue_handle_t async_cq1;// NULL if psm3_oneapi_immed_sync_copy + ze_event_pool_handle_t event_pool; }; extern ze_api_version_t zel_api_version; @@ -444,6 +469,7 @@ extern int num_ze_devices; extern struct ze_dev_ctxt *cur_ze_dev; extern int psm3_oneapi_immed_sync_copy; extern int psm3_oneapi_immed_async_copy; +extern unsigned psm3_oneapi_parallel_dtod_copy_thresh; const char* psmi_oneapi_ze_result_to_string(const ze_result_t result); void psmi_oneapi_async_cmd_create(struct ze_dev_ctxt *ctxt, @@ -467,6 +493,7 @@ extern int psm3_oneapi_ze_using_zemem_alloc; extern void psm3_oneapi_ze_can_use_zemem(); void psmi_oneapi_ze_memcpy(void *dstptr, const void *srcptr, size_t size); +void psmi_oneapi_ze_memcpy_DTOD(void *dstptr, const void *srcptr, size_t size); static inline int device_support_gpudirect() @@ -501,6 +528,8 @@ extern CUresult (*psmi_cuEventRecord)(CUevent hEvent, CUstream hStream); extern CUresult (*psmi_cuEventSynchronize)(CUevent hEvent); extern CUresult (*psmi_cuMemHostAlloc)(void** pp, size_t bytesize, unsigned int Flags); extern CUresult (*psmi_cuMemFreeHost)(void* p); +extern CUresult (*psmi_cuMemHostRegister)(void* p, size_t bytesize, unsigned int Flags); +extern CUresult (*psmi_cuMemHostUnregister)(void* p); extern CUresult (*psmi_cuMemcpy)(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); extern CUresult (*psmi_cuMemcpyDtoD)(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); extern CUresult (*psmi_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); @@ -527,6 +556,7 @@ extern ze_result_t (*psmi_zexDriverImportExternalPointer)(ze_driver_handle_t hDr extern ze_result_t (*psmi_zexDriverReleaseImportedPointer)(ze_driver_handle_t hDriver, void *ptr); #endif extern ze_result_t (*psmi_zeDeviceGet)(ze_driver_handle_t hDriver, uint32_t *pCount, ze_device_handle_t *phDevices); +extern ze_result_t (*psmi_zeDevicePciGetPropertiesExt)(ze_device_handle_t hDevice, ze_pci_ext_properties_t *pPciProperties); #ifndef PSM3_NO_ONEAPI_IMPORT extern ze_result_t (*psmi_zeDriverGetExtensionFunctionAddress)(ze_driver_handle_t hDriver, const char *name, void **ppFunctionAddress); #endif @@ -591,6 +621,8 @@ extern uint64_t psmi_count_cuEventRecord; extern uint64_t psmi_count_cuEventSynchronize; extern uint64_t psmi_count_cuMemHostAlloc; extern uint64_t psmi_count_cuMemFreeHost; +extern uint64_t psmi_count_cuMemHostRegister; +extern uint64_t psmi_count_cuMemHostUnregister; extern uint64_t psmi_count_cuMemcpy; extern uint64_t psmi_count_cuMemcpyDtoD; extern uint64_t psmi_count_cuMemcpyDtoH; @@ -617,6 +649,7 @@ extern uint64_t psmi_count_zexDriverImportExternalPointer; extern uint64_t psmi_count_zexDriverReleaseImportedPointer; #endif extern 
uint64_t psmi_count_zeDeviceGet;
+extern uint64_t psmi_count_zeDevicePciGetPropertiesExt;
#ifndef PSM3_NO_ONEAPI_IMPORT
 extern uint64_t psmi_count_zeDriverGetExtensionFunctionAddress;
#endif
@@ -679,6 +712,20 @@ static int check_set_cuda_ctxt(void)
 	return 0;
 }
 
+/* Make sure we have a real GPU job.  Set cu_ctxt if available */
+PSMI_ALWAYS_INLINE(
+int check_have_cuda_ctxt(void))
+{
+	if (! cu_ctxt) {
+		if (unlikely(check_set_cuda_ctxt())) {
+			psm3_handle_error(PSMI_EP_NORETURN,
+				PSM2_INTERNAL_ERR, "Failed to set/synchronize"
+				" CUDA context.\n");
+		}
+	}
+	return (cu_ctxt != NULL);
+}
+
 #define PSMI_CUDA_CALL(func, args...) do { \
 		CUresult cudaerr; \
@@ -688,19 +735,18 @@ static int check_set_cuda_ctxt(void)
 				" CUDA context.\n"); \
 		} \
 		psmi_count_##func++; \
-		cudaerr = psmi_##func(args); \
+		cudaerr = (CUresult)psmi_##func(args); \
 		if (cudaerr != CUDA_SUCCESS) { \
 			const char *pStr = NULL; \
 			psmi_count_cuGetErrorString++; \
 			psmi_cuGetErrorString(cudaerr, &pStr); \
 			_HFI_ERROR( \
 				"CUDA failure: %s() (at %s:%d)" \
-				"returned %d: %s\n", \
+				" returned %d: %s\n", \
 				#func, __FILE__, __LINE__, cudaerr, \
 				pStr?pStr:"Unknown"); \
-			psm3_handle_error( \
-				PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \
-				"Error returned from CUDA function.\n");\
+			psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \
+				"Error returned from CUDA function %s.\n", #func);\
 		} \
 	} while (0)
 #endif // PSM_CUDA
@@ -712,12 +758,12 @@
 		psmi_count_##func++; \
 		result = psmi_##func(args); \
 		if(result != ZE_RESULT_SUCCESS) { \
-			_HFI_ERROR( "OneAPI Level Zero failure: %s() (at %s:%d) " \
-				"returned %d(%s)\n", \
-				#func, __FILE__, __LINE__, result, psmi_oneapi_ze_result_to_string(result)); \
-			psm3_handle_error( PSMI_EP_NORETURN, \
-				PSM2_INTERNAL_ERR, \
-				"Error returned from OneAPI Level Zero function %s.\n", STRINGIFY(func)); \
+			_HFI_ERROR( "OneAPI Level Zero failure: %s() (at %s:%d)" \
+				" returned 0x%x: %s\n", \
+				#func, __FILE__, __LINE__, result, \
+				psmi_oneapi_ze_result_to_string(result)); \
+			psm3_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \
+				"Error returned from OneAPI Level Zero function %s.\n", #func); \
 		} \
 	} while (0)
@@ -755,7 +801,7 @@ _psmi_is_oneapi_ze_mem(const void *ptr, struct ze_dev_ctxt **ctxt))
 	if (result == ZE_RESULT_SUCCESS &&
 		(mem_props.type != ZE_MEMORY_TYPE_UNKNOWN)) {
 		ret = 1;
-		_HFI_VDBG("ptr %p type %d dev %p ze_device %p\n",
+		_HFI_VDBG("ptr %p type %d dev %p cur_ze_dev %p\n",
 			ptr, mem_props.type, dev, cur_ze_dev->dev);
 		/*
 		 * Check if the gpu device has changed.
@@ -782,6 +828,7 @@ _psmi_is_oneapi_ze_mem(const void *ptr, struct ze_dev_ctxt **ctxt)) break; } } + _HFI_VDBG("check ze_device[%d-%d] for dev %p: no match\n", 0, num_ze_devices-1, dev); } } @@ -947,19 +994,18 @@ int gpu_p2p_supported()) "before psm3_ep_open call \n"); \ _HFI_ERROR( \ "CUDA failure: %s() (at %s:%d)" \ - "returned %d: %s\n", \ + " returned %d: %s\n", \ #func, __FILE__, __LINE__, cudaerr, \ pStr?pStr:"Unknown"); \ - psm3_handle_error( \ - PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ - "Error returned from CUDA function.\n");\ + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ + "Error returned from CUDA function %s.\n", #func);\ } else if (cudaerr == except_err) { \ const char *pStr = NULL; \ psmi_count_cuGetErrorString++; \ psmi_cuGetErrorString(cudaerr, &pStr); \ _HFI_DBG( \ "CUDA non-zero return value: %s() (at %s:%d)" \ - "returned %d: %s\n", \ + " returned %d: %s\n", \ #func, __FILE__, __LINE__, cudaerr, \ pStr?pStr:"Unknown"); \ } \ @@ -974,12 +1020,11 @@ int gpu_p2p_supported()) psmi_count_cuGetErrorString++; \ psmi_cuGetErrorString(cudaerr, &pStr); \ _HFI_ERROR( \ - "CUDA failure: %s() returned %d: %s\n", \ - "cuEventQuery", cudaerr, \ + "CUDA failure: %s() (at %s:%d) returned %d: %s\n", \ + "cuEventQuery", __FILE__, __LINE__, cudaerr, \ pStr?pStr:"Unknown"); \ - psm3_handle_error( \ - PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ - "Error returned from CUDA function.\n");\ + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ + "Error returned from CUDA function cuEventQuery.\n");\ } \ } while (0) @@ -1063,13 +1108,12 @@ int _psm3_oneapi_ze_memcpy_done(const struct ips_gpu_hostbuf *ghb) } else if (result == ZE_RESULT_NOT_READY) { return 0; } else { - _HFI_ERROR( "OneAPI LZ failure: %s() returned %d(%s)\n", - __FUNCTION__, result, + _HFI_ERROR("OneAPI Level Zero failure: %s() (at %s:%d) returned 0x%x: %s\n", + "zeEventQueryStatus", __FILE__, __LINE__, result, psmi_oneapi_ze_result_to_string(result)); - psm3_handle_error( PSMI_EP_NORETURN, - PSM2_INTERNAL_ERR, - "Error returned from OneAPI LZ function %s.\n", - __FUNCTION__); + psm3_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Error returned from OneAPI Level Zero function %s.\n", + "zeEventQueryStatus"); } return 0; } @@ -1219,16 +1263,13 @@ _psmi_is_gdr_copy_enabled()) PSMI_CUDA_CALL(cuEventRecord, ghb->copy_status, \ protoexp->cudastream_recv); \ } while (0) -#define PSM3_GPU_MEMCPY_DTOH_START(proto, ghb, len, bufsz) \ +#define PSM3_GPU_MEMCPY_DTOH_START(proto, ghb, len) \ do { \ if (proto->cudastream_send == NULL) { \ PSMI_CUDA_CALL(cuStreamCreate, \ &proto->cudastream_send, \ CU_STREAM_NON_BLOCKING); \ } \ - if (ghb->host_buf == NULL && bufsz) { \ - PSM3_GPU_HOST_ALLOC(&ghb->host_buf, bufsz); \ - } \ if (ghb->copy_status == NULL) { \ PSMI_CUDA_CALL(cuEventCreate, \ &ghb->copy_status, CU_EVENT_DEFAULT); \ @@ -1246,13 +1287,6 @@ _psmi_is_gdr_copy_enabled()) ghb->copy_status = NULL; \ ghb->host_buf = NULL; \ } while (0) -// TBD, create of Event here could be omitted and let HTOD/DTOH_START create it -#define PSM3_GPU_HOSTBUF_FORCE_INIT(ghb, bufsz) \ - do { \ - PSM3_GPU_HOST_ALLOC(&ghb->host_buf, bufsz); \ - PSMI_CUDA_CALL(cuEventCreate, \ - &ghb->copy_status, CU_EVENT_DEFAULT); \ - } while (0) #define PSM3_GPU_HOSTBUF_RESET(ghb) \ do { \ } while (0) @@ -1278,6 +1312,10 @@ _psmi_is_gdr_copy_enabled()) PSMI_CUDA_CALL(cuMemHostAlloc, (void **)(ret_ptr), \ (size),CU_MEMHOSTALLOC_PORTABLE); \ } while (0) +#define PSM3_GPU_HOST_FREE(ptr) \ + do { \ + PSMI_CUDA_CALL(cuMemFreeHost, (void *)ptr); \ + } while (0) // 
HOST_ALLOC memory treated as CPU memory for Verbs MRs #define PSM3_GPU_ADDR_SEND_MR(mqreq) \ ( (mqreq)->is_buf_gpu_mem && ! (mqreq)->gpu_hostbuf_used ) @@ -1295,24 +1333,40 @@ _psmi_is_gdr_copy_enabled()) #elif defined(PSM_ONEAPI) #define PSM3_GPU_PREPARE_HTOD_MEMCPYS(protoexp) \ do { \ - protoexp->cq_recv = NULL; \ + int i; \ + \ + for (i = 0; i < MAX_ZE_DEVICES; i++) \ + protoexp->cq_recvs[i] = NULL; \ } while (0) #define PSM3_GPU_PREPARE_DTOH_MEMCPYS(proto) \ do { \ - proto->cq_send = NULL; \ + int i; \ + \ + for (i = 0; i < MAX_ZE_DEVICES; i++) \ + proto->cq_sends[i] = NULL; \ } while (0) #define PSM3_GPU_SHUTDOWN_HTOD_MEMCPYS(protoexp) \ do { \ - if (protoexp->cq_recv) { \ - PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, \ - protoexp->cq_recv); \ + int i; \ + \ + for (i = 0; i < MAX_ZE_DEVICES; i++) { \ + if (protoexp->cq_recvs[i]) { \ + PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, \ + protoexp->cq_recvs[i]); \ + protoexp->cq_recvs[i] = NULL; \ + } \ } \ } while (0) #define PSM3_GPU_SHUTDOWN_DTOH_MEMCPYS(proto) \ do { \ - if (proto->cq_send) { \ - PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, \ - proto->cq_send); \ + int i; \ + \ + for (i = 0; i < MAX_ZE_DEVICES; i++) { \ + if (proto->cq_sends[i]) { \ + PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, \ + proto->cq_sends[i]); \ + proto->cq_sends[i] = NULL; \ + } \ } \ } while (0) @@ -1330,13 +1384,14 @@ _psmi_is_gdr_copy_enabled()) .index = 0 \ }; \ struct ze_dev_ctxt *ctxt; \ + int inx; \ \ ctxt = psmi_oneapi_dev_ctxt_get(ghb->gpu_buf); \ if (!ctxt) \ psm3_handle_error(PSMI_EP_NORETURN, \ PSM2_INTERNAL_ERR, \ - "%s HTOD: no dev ctxt\n", \ - __FUNCTION__); \ + "%s HTOD: unknown GPU device for addr %p\n", \ + __FUNCTION__, ghb->gpu_buf);\ if (ghb->event_pool == NULL) { \ PSMI_ONEAPI_ZE_CALL(zeEventPoolCreate, \ ze_context, &pool_desc, 0, NULL, \ @@ -1347,23 +1402,26 @@ _psmi_is_gdr_copy_enabled()) ghb->event_pool, &event_desc, \ &ghb->copy_status); \ } \ - if (! ghb->command_list) { \ + inx = ctxt->dev_index; \ + if (! ghb->command_lists[inx]) { \ psmi_oneapi_async_cmd_create(ctxt, \ - &protoexp->cq_recv, &ghb->command_list);\ + &protoexp->cq_recvs[inx], \ + &ghb->command_lists[inx]); \ } \ + ghb->cur_dev_inx = inx; \ PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, \ - ghb->command_list, \ + ghb->command_lists[inx], \ ghb->gpu_buf, ghb->host_buf, len, \ ghb->copy_status, 0, NULL); \ if (! 
psm3_oneapi_immed_async_copy) { \ PSMI_ONEAPI_ZE_CALL(zeCommandListClose, \ - ghb->command_list); \ + ghb->command_lists[inx]); \ PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists,\ - protoexp->cq_recv, 1, \ - &ghb->command_list, NULL); \ + protoexp->cq_recvs[inx], 1, \ + &ghb->command_lists[inx], NULL); \ } \ } while (0) -#define PSM3_GPU_MEMCPY_DTOH_START(proto, ghb, len, bufsz) \ +#define PSM3_GPU_MEMCPY_DTOH_START(proto, ghb, len) \ do { \ ze_event_pool_desc_t pool_desc = { \ .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, \ @@ -1377,13 +1435,14 @@ _psmi_is_gdr_copy_enabled()) .index = 0 \ }; \ struct ze_dev_ctxt *ctxt; \ + int inx; \ \ ctxt = psmi_oneapi_dev_ctxt_get(ghb->gpu_buf); \ if (!ctxt) \ psm3_handle_error(PSMI_EP_NORETURN, \ PSM2_INTERNAL_ERR, \ - "%s DTOH: no dev ctxt\n", \ - __FUNCTION__); \ + "%s DTOH: unknown GPU device for addr %p\n", \ + __FUNCTION__, ghb->gpu_buf);\ if (ghb->event_pool == NULL) { \ PSMI_ONEAPI_ZE_CALL(zeEventPoolCreate, \ ze_context, &pool_desc, 0, NULL, \ @@ -1394,68 +1453,50 @@ _psmi_is_gdr_copy_enabled()) ghb->event_pool, &event_desc, \ &ghb->copy_status); \ } \ - if (ghb->host_buf == NULL && bufsz) { \ - PSM3_GPU_HOST_ALLOC(&ghb->host_buf, bufsz); \ - } \ - if (! ghb->command_list) { \ + inx = ctxt->dev_index; \ + if (! ghb->command_lists[inx]) { \ psmi_oneapi_async_cmd_create(ctxt, \ - &proto->cq_send, &ghb->command_list);\ + &proto->cq_sends[inx], \ + &ghb->command_lists[inx]); \ } \ + ghb->cur_dev_inx = inx; \ PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, \ - ghb->command_list, \ + ghb->command_lists[inx], \ ghb->host_buf, ghb->gpu_buf, len, \ ghb->copy_status, 0, NULL); \ if (! psm3_oneapi_immed_async_copy) { \ PSMI_ONEAPI_ZE_CALL(zeCommandListClose, \ - ghb->command_list); \ + ghb->command_lists[inx]); \ PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists,\ - proto->cq_send, 1, \ - &ghb->command_list, NULL); \ + proto->cq_sends[inx], 1, \ + &ghb->command_lists[inx], NULL); \ } \ } while (0) #define PSM3_GPU_MEMCPY_DONE(ghb) \ _psm3_oneapi_ze_memcpy_done(ghb) #define PSM3_GPU_HOSTBUF_LAZY_INIT(ghb) \ do { \ + int i; \ + \ ghb->event_pool = NULL; \ ghb->copy_status = NULL; \ - ghb->command_list = NULL; \ + for (i = 0; i < MAX_ZE_DEVICES; i++) \ + ghb->command_lists[i] = NULL; \ ghb->host_buf = NULL; \ } while (0) -// TBD, create of Event and command list here could be omitted and let -// HTOD/DTOH_START create it -#define PSM3_GPU_HOSTBUF_FORCE_INIT(ghb, bufsz) \ - do { \ - ze_event_pool_desc_t pool_desc = { \ - .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, \ - .flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE, \ - .count = 1 \ - }; \ - ze_event_desc_t event_desc = { \ - .stype = ZE_STRUCTURE_TYPE_EVENT_DESC, \ - .signal = ZE_EVENT_SCOPE_FLAG_HOST, \ - .wait = ZE_EVENT_SCOPE_FLAG_HOST, \ - .index = 0 \ - }; \ - PSMI_ONEAPI_ZE_CALL(zeEventPoolCreate, \ - ze_context, &pool_desc, 0, NULL, \ - &ghb->event_pool); \ - PSMI_ONEAPI_ZE_CALL(zeEventCreate, \ - ghb->event_pool, &event_desc, \ - &ghb->copy_status); \ - PSM3_GPU_HOST_ALLOC(&ghb->host_buf, bufsz); \ - } while (0) #define PSM3_GPU_HOSTBUF_RESET(ghb) \ do { \ if (! 
psm3_oneapi_immed_async_copy) { \ PSMI_ONEAPI_ZE_CALL(zeCommandListReset, \ - ghb->command_list); \ + ghb->command_lists[ghb->cur_dev_inx]);\ } \ PSMI_ONEAPI_ZE_CALL(zeEventHostReset, \ ghb->copy_status); \ } while (0) #define PSM3_GPU_HOSTBUF_DESTROY(ghb) \ do { \ + int i; \ + \ if (ghb->copy_status != NULL) { \ PSMI_ONEAPI_ZE_CALL(zeEventDestroy, \ ghb->copy_status); \ @@ -1467,13 +1508,17 @@ _psmi_is_gdr_copy_enabled()) PSMI_ONEAPI_ZE_CALL(zeEventPoolDestroy, \ ghb->event_pool); \ } \ - if (ghb->command_list != NULL) { \ - PSMI_ONEAPI_ZE_CALL(zeCommandListDestroy, \ - ghb->command_list); \ + for (i = 0; i < MAX_ZE_DEVICES; i++) { \ + if (ghb->command_lists[i]) { \ + PSMI_ONEAPI_ZE_CALL( \ + zeCommandListDestroy, \ + ghb->command_lists[i]); \ + ghb->command_lists[i] = NULL; \ + } \ } \ } while (0) #define PSM3_GPU_MEMCPY_DTOD(dstptr, srcptr, len) \ - do { psmi_oneapi_ze_memcpy(dstptr, srcptr, len); } while(0) + do { psmi_oneapi_ze_memcpy_DTOD(dstptr, srcptr, len); } while(0) #define PSM3_GPU_MEMCPY_HTOD(dstptr, srcptr, len) \ do { psmi_oneapi_ze_memcpy(dstptr, srcptr, len); } while(0) #define PSM3_GPU_SYNCHRONIZE_MEMCPY() \ @@ -1506,6 +1551,7 @@ _psmi_is_gdr_copy_enabled()) ( (tidrecvc)->is_ptr_gpu_backed \ || ((mqreq)->gpu_hostbuf_used && psm3_oneapi_ze_using_zemem_alloc)) #endif /* PSM3_USE_ONEAPI_MALLOC */ +#define PSM3_GPU_HOST_FREE(ptr) PSM3_ONEAPI_ZE_HOST_FREE(ptr) #define PSM3_MARK_BUF_SYNCHRONOUS(buf) do { /* not needed for OneAPI ZE */ } while (0) #define PSM3_GPU_MEMCPY_DTOH(dstptr, srcptr, len) \ do { psmi_oneapi_ze_memcpy(dstptr, srcptr, len); } while (0) diff --git a/prov/psm3/psm3/psm_utils.c b/prov/psm3/psm3/psm_utils.c index e99b950e1bf..c2525fa935c 100644 --- a/prov/psm3/psm3/psm_utils.c +++ b/prov/psm3/psm3/psm_utils.c @@ -2550,14 +2550,12 @@ unsigned psmi_parse_gpudirect_rdma_send_limit(int force) /* Default send threshold for Gpu-direct set to UINT_MAX * (always use GPUDIRECT) */ - psm3_getenv("PSM3_GPUDIRECT_RDMA_SEND_LIMIT", - "GPUDirect RDMA feature on send side will be switched off for messages larger than limit.", + psm3_getenv_range("PSM3_GPUDIRECT_RDMA_SEND_LIMIT", + "GPUDirect RDMA feature on send side will be switched off for messages larger than limit.", NULL, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, -#ifdef PSM_ONEAPI - (union psmi_envvar_val)(1024*1024), &envval); -#else - (union psmi_envvar_val)UINT_MAX, &envval); -#endif + (union psmi_envvar_val)UINT_MAX, + (union psmi_envvar_val)0, (union psmi_envvar_val)UINT_MAX, + NULL, NULL, &envval); saved = envval.e_uint; done: @@ -2584,10 +2582,16 @@ unsigned psmi_parse_gpudirect_rdma_recv_limit(int force) /* Default receive threshold for Gpu-direct set to UINT_MAX * (always use GPUDIRECT) */ - psm3_getenv("PSM3_GPUDIRECT_RDMA_RECV_LIMIT", - "GPUDirect RDMA feature on receive side will be switched off for messages larger than limit.", + psm3_getenv_range("PSM3_GPUDIRECT_RDMA_RECV_LIMIT", + "GPUDirect RDMA feature on receive side will be switched off for messages larger than limit.", NULL, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)UINT_MAX, &envval); +#ifdef PSM_CUDA + (union psmi_envvar_val)UINT_MAX, +#elif defined(PSM_ONEAPI) + (union psmi_envvar_val)1, +#endif + (union psmi_envvar_val)0, (union psmi_envvar_val)UINT_MAX, + NULL, NULL, &envval); saved = envval.e_uint; done: @@ -2611,10 +2615,11 @@ unsigned psmi_parse_gpudirect_rv_gpu_cache_size(int reload) // RV defaults are sufficient for default PSM parameters // but for HALs with RDMA, if user adjusts ep->hfi_num_send_rdma or 
- // mq->hfi_base_window_rv they also need to increase the cache size. + // mq->ips_gpu_window_rv they also need to increase the cache size. // psm3_verbs_alloc_mr_cache will verify cache size is sufficient. // min size is (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) * - // chunk size (mq->hfi_base_window_rv after psmi_mq_initialize_params) + // chunk size (psm3_mq_max_window_rv(mq, 1) after + // psmi_mq_initialize_params) if (PSMI_IS_GPU_ENABLED && psmi_parse_gpudirect() ) { psm3_getenv("PSM3_RV_GPU_CACHE_SIZE", "kernel space GPU cache size" @@ -2665,23 +2670,28 @@ int psm3_parse_identify(void) { union psmi_envvar_val myenv; static int have_value; - static unsigned saved_identify; + static int saved_identify; // only parse once so doesn't appear in PSM3_VERBOSE_ENV multiple times if (have_value) return saved_identify; - psm3_getenv("PSM3_IDENTIFY", "Identify PSM version being run " - "(0 - disable, 1 - enable, 1: - limit output to rank 0, " - "1:pattern - limit output " - "to processes whose label matches " + psm3_getenv_range("PSM3_IDENTIFY", "Identify PSM version being run", + " 0 - disable\n" + " 1 - enable\n" + " 1: - limit output to rank 0\n" + " 1:pattern - limit output to processes whose label matches\n " #ifdef FNM_EXTMATCH "extended " #endif "glob pattern)", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR_VAL_PAT, - (union psmi_envvar_val)"0", &myenv); - (void)psm3_parse_val_pattern(myenv.e_str, 0, &saved_identify); + PSMI_ENVVAR_LEVEL_USER|PSMI_ENVVAR_FLAG_NOABBREV, + PSMI_ENVVAR_TYPE_STR_VAL_PAT_INT, + (union psmi_envvar_val)"0", + (union psmi_envvar_val)0, (union psmi_envvar_val)1, + NULL, NULL, &myenv); + (void)psm3_parse_val_pattern_int(myenv.e_str, 0, &saved_identify, + PSMI_ENVVAR_FLAG_NOABBREV, 0, 1); have_value = 1; return saved_identify; @@ -2891,11 +2901,12 @@ void psm3_print_ep_identify(psm2_ep_t ep) (void)psmi_hal_get_port_speed(ep->unit_id, ep->portnum, &link_speed); psmi_hal_get_node_id(ep->unit_id, &node_id); - psm3_print_identify("%s %s NIC %u (%s) Port %u %"PRIu64" Mbps NUMA %d %s%s\n", + psm3_print_identify("%s %s NIC %u (%s) Port %u %"PRIu64" Mbps NUMA %d %s%s%s\n", psm3_get_mylabel(), psm3_ident_tag, ep->unit_id, ep->dev_name, ep->portnum, link_speed/(1000*1000), node_id, psm3_epid_fmt_addr(ep->epid, 0), + ep->addl_nic_info?ep->addl_nic_info:"", (! psm3_ep_device_is_enabled(ep, PTL_DEVID_AMSH) && (((struct ptl_ips *)(ep->ptl_ips.ptl))->proto.flags & IPS_PROTO_FLAG_LOOPBACK))?" 
loopback":""); @@ -3011,7 +3022,7 @@ void psm3_parse_multi_ep() #ifdef PSM_FI -unsigned psm3_faultinj_enabled = 0; +int psm3_faultinj_enabled = 0; int psm3_faultinj_verbose = 0; char *psm3_faultinj_outfile = NULL; int psm3_faultinj_sec_rail = 0; @@ -3025,21 +3036,25 @@ void psm3_parse_faultinj() { union psmi_envvar_val env_fi; - psm3_getenv("PSM3_FI", "PSM Fault Injection " - "(0 - disable, 1 - enable, " - "2 - enable but default each injector to 0 rate " - "#: - limit to rank 0, " - "#:pattern - limit " - "to processes whose label matches " + psm3_getenv_range("PSM3_FI", "PSM Fault Injection", + " 0 - disable\n" + " 1 - enable\n" + " 2 - enable but default each injector to 0 rate\n" + " #: - limit to rank 0\n" + " #:pattern - limit to processes whose label matches\n " #ifdef FNM_EXTMATCH "extended " #endif - "glob pattern) " - "mode 2 can be useful to generate full stats help " - "when PSM3_PRINT_STATS_HELP enabled", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR_VAL_PAT, - (union psmi_envvar_val)"0", &env_fi); - (void)psm3_parse_val_pattern(env_fi.e_str, 0, &psm3_faultinj_enabled); + "glob pattern\n" + "mode 2 can be useful to generate help for all injectors\n" + "when PSM3_PRINT_STATS_HELP=1 or PSM3_VERBOSE_ENV=3:", + PSMI_ENVVAR_LEVEL_HIDDEN|PSMI_ENVVAR_FLAG_NOABBREV, + PSMI_ENVVAR_TYPE_STR_VAL_PAT_INT, + (union psmi_envvar_val)"0", + (union psmi_envvar_val)0, (union psmi_envvar_val)2, + NULL, NULL, &env_fi); + (void)psm3_parse_val_pattern_int(env_fi.e_str, 0, + &psm3_faultinj_enabled, PSMI_ENVVAR_FLAG_NOABBREV, 0, 2); if (psm3_faultinj_enabled) { char *def = NULL; @@ -3143,6 +3158,52 @@ void psm3_faultinj_fini() return; } +/* parse fault injection controls + * format is num:denom:initial_seed + * denom must be >= num and > 0 + * Either field can be omitted in which case default (input fvals) is used + * for given field. + * 0 - successfully parsed, fvals updated + * -1 - str empty, fvals unchanged + * -2 - syntax error, fvals may have been changed + */ +static int parse_faultinj_control(const char *str, + size_t errstr_size, char errstr[], + int fvals[3]) +{ + psmi_assert(fvals); + int ret = psm3_parse_str_tuples(str, 3, fvals); + if (ret < 0) + return ret; + if (! fvals[1]) { + if (errstr_size) + snprintf(errstr, errstr_size, " denom must be non-zero"); + return -2; + } + if (fvals[0] < 0 || fvals[1] < 0) { + if (errstr_size) + snprintf(errstr, errstr_size, " Negative values for num and denom not allowed"); + return -2; + } + if (fvals[0] > fvals[1]) { + if (errstr_size) + snprintf(errstr, errstr_size, " num (%d) must be <= denom (%d)", fvals[0], fvals[1]); + return -2; + } + return 0; +} + +static int parse_check_faultinj_control(int type, + const union psmi_envvar_val val, void *ptr, + size_t errstr_size, char errstr[]) +{ + // parser will set fvals to result, use a copy to protect input of defaults + int fvals[3] = { ((int*)ptr)[0], ((int*)ptr)[1], ((int*)ptr)[2] }; + psmi_assert(type == PSMI_ENVVAR_TYPE_STR_TUPLES); + return parse_faultinj_control(val.e_str, errstr_size, errstr, fvals); +} + + /* * Intended to be used only once, not in the critical path */ @@ -3186,27 +3247,34 @@ struct psm3_faultinj_spec *psm3_faultinj_getspec(const char *spec_name, * error condition. 
*/ { - int fvals[3] = { num, denom, (int)getpid() }; + int fvals[3] = { fi->num, fi->denom, fi->initial_seed }; union psmi_envvar_val env_fi; char fvals_str[128]; char fname[128]; char fdesc[300]; + int ret; snprintf(fvals_str, sizeof(fvals_str), "%d:%d:%d", fi->num, fi->denom, fi->initial_seed); snprintf(fname, sizeof(fname), "PSM3_FI_%s", spec_name); - snprintf(fdesc, sizeof(fdesc), "Fault Injection - %s <%s>", - help, fvals_str); - - if (!psm3_getenv(fname, fdesc, PSMI_ENVVAR_LEVEL_HIDDEN, - PSMI_ENVVAR_TYPE_STR_TUPLES, - (union psmi_envvar_val)fvals_str, &env_fi)) { + snprintf(fdesc, sizeof(fdesc), "Fault Injection - %s", help); + + ret = psm3_getenv_range(fname, fdesc, + "Specified as num:denom:seed, where num/denom is approx probability\nand seed seeds the random number generator", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR_TUPLES, + (union psmi_envvar_val)fvals_str, + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + parse_check_faultinj_control, fvals, &env_fi); + if (ret == 0) { /* not using default values */ - (void)psm3_parse_str_tuples(env_fi.e_str, 3, fvals); + if (parse_faultinj_control(env_fi.e_str, 0, NULL, fvals) < 0) { + // already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); + } fi->num = fvals[0]; fi->denom = fvals[1]; fi->initial_seed = fvals[2]; - } else if (psm3_faultinj_enabled == 2) { + } else if (ret == 1 && psm3_faultinj_enabled == 2) { // default unspecified injectors to off fi->num = 0; } diff --git a/prov/psm3/psm3/psm_utils.h b/prov/psm3/psm3/psm_utils.h index ab654cb451d..d39b49e6711 100644 --- a/prov/psm3/psm3/psm_utils.h +++ b/prov/psm3/psm3/psm_utils.h @@ -528,7 +528,7 @@ void psm3_parse_multi_ep(); * pri_reg_mr - priority register MR failure (ENOMEM) * gdrmmap - GPU gdrcopy pin and mmap failure */ -extern unsigned psm3_faultinj_enabled; /* use macro to test */ +extern int psm3_faultinj_enabled; /* use macro to test */ extern int psm3_faultinj_verbose; /* use IS_FAULT macro to test */ extern int psm3_faultinj_sec_rail;/* faults only on secondary rails or EPs */ diff --git a/prov/psm3/psm3/ptl.h b/prov/psm3/psm3/ptl.h index dcdba3a7c6d..44110636411 100644 --- a/prov/psm3/psm3/ptl.h +++ b/prov/psm3/psm3/ptl.h @@ -68,14 +68,6 @@ #include #include -/* We currently have 3 PTLs, 0 is reserved. */ -#define PTL_DEVID_IPS 1 -#define PTL_DEVID_AMSH 2 -#define PTL_DEVID_SELF 3 - -/* We can currently initialize up to 3 PTLs */ -#define PTL_MAX_INIT 3 - /* struct ptl is an incomplete type, and it serves as a generic or opaque container. It should remain an incomplete type in the entire psm source base. 
   concrete ptl types need to have a suffix such as ptl_self,
diff --git a/prov/psm3/psm3/ptl_am/am_config.h b/prov/psm3/psm3/ptl_am/am_config.h
index f436f471c25..79600601037 100644
--- a/prov/psm3/psm3/ptl_am/am_config.h
+++ b/prov/psm3/psm3/ptl_am/am_config.h
@@ -67,6 +67,14 @@
 #define AMSH_HAVE_CMA   0x1
 #define AMSH_HAVE_KASSIST 0x1
 
+#if defined(PSM_CUDA)
+/* Threshold for GPU rendezvous (aka scale-up transfer vs via CPU shared mem) */
+#define PSMI_MQ_GPU_RV_THRESH 127
+#elif defined(PSM_ONEAPI)
+/* Threshold for GPU rendezvous (aka scale-up transfer vs via CPU shared mem) */
+#define PSMI_MQ_GPU_RV_THRESH 127
+#endif
+
 /* Each block reserves some space at the beginning to store auxiliary data */
 #define AMSH_BLOCK_HEADER_SIZE 4096
diff --git a/prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.c b/prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.c
index a8151240469..ac561c6d32f 100644
--- a/prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.c
+++ b/prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.c
@@ -96,7 +96,7 @@ typedef struct {
 static psm2_error_t am_ze_memhandle_mpool_alloc(
 		am_ze_memhandle_cache_t cache, uint32_t memcache_size);
-void am_ze_memhandle_delete(void *buf_ptr);
+static void am_ze_memhandle_delete(void *buf_ptr);
 
 /*
  * Custom comparator
@@ -653,9 +653,9 @@ am_ze_memhandle_acquire(am_ze_memhandle_cache_t cache,
 }
 
+#if defined(HAVE_DRM) || defined(HAVE_LIBDRM)
 void am_ze_memhandle_delete(void *buf_ptr)
 {
-#if defined(HAVE_DRM) || defined(HAVE_LIBDRM)
 	/* Release the reference to the buffer */
 	PSMI_ONEAPI_ZE_CALL(zeMemFree, ze_context, buf_ptr);
 
@@ -679,8 +679,8 @@ void am_ze_memhandle_delete(void *buf_ptr)
 	 * GEM_CLOSE.
 	 */
 #endif
-#endif /* HAVE_DRM or HAVE_LIBDRM */
 }
+#endif /* HAVE_DRM or HAVE_LIBDRM */
 
 void am_ze_memhandle_release(am_ze_memhandle_cache_t cache,
diff --git a/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c b/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c
index 2cea9932454..020f3afb349 100644
--- a/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c
+++ b/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c
@@ -88,6 +88,9 @@
 #endif
 
 int psm3_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_NO_KASSIST;
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+int psm3_shm_mq_gpu_rv_thresh = PSMI_MQ_GPU_RV_THRESH;
+#endif
 
 // qcounts and qelemsz tunable via amsh_fifo_getconfig();
 static amsh_qinfo_t amsh_qcounts = {
@@ -371,6 +374,16 @@ psm2_error_t psm3_shm_create(ptl_t *ptl_gen)
 	}
 	memset((void *) mapptr, 0, segsz); /* touch all of my pages */
+#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER)
+	if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt())
+		PSMI_CUDA_CALL(cuMemHostRegister, mapptr, segsz,
+				CU_MEMHOSTALLOC_PORTABLE);
+#endif
+#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT)
+	if (PSMI_IS_GPU_ENABLED)
+		PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, ze_driver,
+				mapptr, segsz);
+#endif
 
 	/* Our own ep's info for ptl_am resides at the start of the shm object.
Other processes need some of this info to @@ -418,6 +431,37 @@ psm2_error_t psm3_epdir_extend(ptl_t *ptl_gen) psm2_error_t psm3_do_unmap(uintptr_t shmbase) { psm2_error_t err = PSM2_OK; +#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) + if (PSMI_IS_GPU_ENABLED && cu_ctxt) { + /* ignore NOT_REGISTERED in case cuda initialized late */ + /* ignore other errors as context could be destroyed before this */ + CUresult cudaerr; + //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, + // cuMemHostUnregister, (void*)shmbase); + psmi_count_cuMemHostUnregister++; + cudaerr = psmi_cuMemHostUnregister((void*)shmbase); + if (cudaerr) { + const char *pStr = NULL; + psmi_count_cuGetErrorString++; + psmi_cuGetErrorString(cudaerr, &pStr); + _HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n", + cudaerr, pStr?pStr:"Unknown"); + } + } +#endif +#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) + if (PSMI_IS_GPU_ENABLED) { + ze_result_t result; + //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, + // (void *)shmbase); + psmi_count_zexDriverReleaseImportedPointer++; + result = psmi_zexDriverReleaseImportedPointer(ze_driver, + (void *)shmbase); + if (result != ZE_RESULT_SUCCESS) { + _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); + } + } +#endif if (munmap((void *)shmbase, am_ctl_sizeof_block())) { err = psm3_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, @@ -550,6 +594,16 @@ psm2_error_t psm3_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm // read every page in segment so faulted into our address space psm3_touch_mmap(dest_mapptr, segsz); +#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) + if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt()) + PSMI_CUDA_CALL(cuMemHostRegister, dest_mapptr, segsz, + CU_MEMHOSTALLOC_PORTABLE); +#endif +#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) + if (PSMI_IS_GPU_ENABLED) + PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, ze_driver, + dest_mapptr, segsz); +#endif shmidx = -1; if ((ptl->max_ep_idx + 1) == ptl->am_ep_size) { @@ -711,6 +765,37 @@ psm2_error_t psm3_shm_detach(ptl_t *ptl_gen) shm_unlink(ptl->amsh_keyname); psmi_free(ptl->amsh_keyname); +#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) + if (PSMI_IS_GPU_ENABLED && cu_ctxt) { + /* ignore NOT_REGISTERED in case cuda initialized late */ + /* ignore other errors as context could be destroyed before this */ + CUresult cudaerr; + //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, + // cuMemHostUnregister, (void*)shmbase); + psmi_count_cuMemHostUnregister++; + cudaerr = psmi_cuMemHostUnregister((void*)shmbase); + if (cudaerr) { + const char *pStr = NULL; + psmi_count_cuGetErrorString++; + psmi_cuGetErrorString(cudaerr, &pStr); + _HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n", + cudaerr, pStr?pStr:"Unknown"); + } + } +#endif +#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) + if (PSMI_IS_GPU_ENABLED) { + ze_result_t result; + //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, + // (void *)shmbase); + psmi_count_zexDriverReleaseImportedPointer++; + result = psmi_zexDriverReleaseImportedPointer(ze_driver, + (void *)shmbase); + if (result != ZE_RESULT_SUCCESS) { + _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); + } + } +#endif if (munmap((void *)shmbase, am_ctl_sizeof_block())) { err = 
psm3_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, @@ -2382,7 +2467,8 @@ amsh_mq_send_inner_eager(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, args[2].u32w1 = tag->tag[2]; args[2].u32w0 = 0; - if (!flags_user && len <= AMLONG_MTU) { + psmi_assert(!(flags_user & PSM2_MQ_FLAG_SENDSYNC));// needs rndv + if (len <= AMLONG_MTU) { if (len <= 32) args[0].u32w0 = MQ_MSG_TINY; else @@ -2445,26 +2531,29 @@ amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, if (PSM3_IS_BUFFER_GPU_MEM(ubuf, len)) { gpu_mem = 1; - /* All sends from a gpu buffer use the rendezvous protocol if p2p is supported */ - if (ep_supports_p2p) { + /* SENDSYNC gets priority, assume not used for MPI_isend w/INJECT */ + /* otherwise use eager for INJECT as caller is waiting */ + if ((flags_user & (PSM2_MQ_FLAG_SENDSYNC|PSM2_MQ_FLAG_INJECT)) + == PSM2_MQ_FLAG_INJECT) + goto do_eager; + + /* larger sends from a gpu buffer use the rendezvous protocol if p2p is supported */ + if (ep_supports_p2p && len > mq->shm_gpu_thresh_rv) { goto do_rendezvous; } - - /* - * Use eager messages if P2P is unsupported between endpoints. - * Potentially use rendezvous with blocking requests only. - */ - if (!is_blocking) - goto do_eager; - } + } else #endif + /* SENDSYNC gets priority, assume not used for MPI_isend w/INJECT */ + /* otherwise use eager for INJECT as caller is waiting */ + if ((flags_user & (PSM2_MQ_FLAG_SENDSYNC|PSM2_MQ_FLAG_INJECT)) + == PSM2_MQ_FLAG_INJECT) + goto do_eager; + if (flags_user & PSM2_MQ_FLAG_SENDSYNC) goto do_rendezvous; if (len <= mq->shm_thresh_rv) -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) do_eager: -#endif return amsh_mq_send_inner_eager(mq, req, epaddr, args, flags_user, flags_internal, tag, ubuf, len); do_rendezvous: @@ -2600,17 +2689,31 @@ int psm3_get_kassist_mode() return PSMI_KASSIST_OFF; #endif -#if !defined(PSM_CUDA) && !defined(PSM_ONEAPI) union psmi_envvar_val env_kassist; const char *PSM3_KASSIST_MODE_HELP = "PSM Shared memory kernel assist mode " "(cma-put, cma-get, none)"; +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + // GPU limits KASSIST choices to cma-get or none + const char *PSM3_KASSIST_MODE_GPU_HELP = "PSM Shared memory kernel assist mode " + "(cma-get, none)"; +#endif - if (!psm3_getenv("PSM3_KASSIST_MODE", PSM3_KASSIST_MODE_HELP, + if (!psm3_getenv("PSM3_KASSIST_MODE", +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + PSMI_IS_GPU_ENABLED? + PSM3_KASSIST_MODE_GPU_HELP:PSM3_KASSIST_MODE_HELP, +#else + PSM3_KASSIST_MODE_HELP, +#endif PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val) PSMI_KASSIST_MODE_DEFAULT_STRING, &env_kassist)) { char *s = env_kassist.e_str; - if (strcasecmp(s, "cma-put") == 0) + if ( +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + ! 
PSMI_IS_GPU_ENABLED && +#endif + strcasecmp(s, "cma-put") == 0) mode = PSMI_KASSIST_CMA_PUT; else if (strcasecmp(s, "cma-get") == 0) mode = PSMI_KASSIST_CMA_GET; @@ -2622,7 +2725,6 @@ int psm3_get_kassist_mode() mode = PSMI_KASSIST_CMA_GET; } } -#endif return mode; } @@ -3005,11 +3107,9 @@ amsh_init(psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val) CUDA_MEMHANDLE_CACHE_SIZE, &env_memcache_size); -#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) if ((err = am_cuda_memhandle_cache_alloc(&ptl->memhandle_cache, env_memcache_size.e_uint, &ep->mq->stats) != PSM2_OK)) goto fail; -#endif } } #endif @@ -3160,6 +3260,10 @@ static psm2_error_t amsh_fini(ptl_t *ptl_gen, int force, uint64_t timeout_ns) am_ze_memhandle_cache_free(ptl->memhandle_cache); #endif ptl->memhandle_cache = NULL; +#endif +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + if (PSMI_IS_GPU_ENABLED && ptl->gpu_bounce_buf) + PSM3_GPU_HOST_FREE(ptl->gpu_bounce_buf); #endif return PSM2_OK; fail: diff --git a/prov/psm3/psm3/ptl_am/psm_am_internal.h b/prov/psm3/psm3/ptl_am/psm_am_internal.h index 56df72a6c13..203b9512c3a 100644 --- a/prov/psm3/psm3/ptl_am/psm_am_internal.h +++ b/prov/psm3/psm3/ptl_am/psm_am_internal.h @@ -468,6 +468,10 @@ struct ptl_am { #ifdef PSM_ONEAPI am_ze_memhandle_cache_t memhandle_cache; #endif +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#define AMSH_GPU_BOUNCE_BUF_SZ (256*1024) + void *gpu_bounce_buf; // for H to D +#endif } __attribute__((aligned(64))); #endif diff --git a/prov/psm3/psm3/ptl_am/ptl.c b/prov/psm3/psm3/ptl_am/ptl.c index 62142f898a9..8a38d22ad4d 100644 --- a/prov/psm3/psm3/ptl_am/ptl.c +++ b/prov/psm3/psm3/ptl_am/ptl.c @@ -54,6 +54,7 @@ /* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ #include "psm_user.h" +#include "psm2_hal.h" #include "psm_mq_internal.h" #include "psm_am_internal.h" #include "cmarw.h" @@ -162,19 +163,32 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, * resides on the GPU */ if (req->is_buf_gpu_mem) { - void* gpu_ipc_bounce_buf = psmi_malloc(PSMI_EP_NONE, UNDEFINED, req->req_data.recv_msglen); - size_t nbytes = psm3_cma_get(pid, (void *)req->rts_sbuf, - gpu_ipc_bounce_buf, req->req_data.recv_msglen); - psmi_assert_always(nbytes == req->req_data.recv_msglen); - PSM3_GPU_MEMCPY_HTOD(req->req_data.buf, gpu_ipc_bounce_buf, - req->req_data.recv_msglen); + size_t cnt = 0; + if (!ptl->gpu_bounce_buf) + PSM3_GPU_HOST_ALLOC(&ptl->gpu_bounce_buf, AMSH_GPU_BOUNCE_BUF_SZ); + while (cnt < req->req_data.recv_msglen) { + size_t nbytes = min(req->req_data.recv_msglen-cnt, + AMSH_GPU_BOUNCE_BUF_SZ); + size_t res = psm3_cma_get(pid, (void *)(req->rts_sbuf+cnt), + ptl->gpu_bounce_buf, nbytes); + void *buf; + psmi_assert_always(nbytes == res); + if (PSMI_USE_GDR_COPY_RECV(nbytes) + && NULL != (buf = psmi_hal_gdr_convert_gpu_to_host_addr( + (unsigned long)req->req_data.buf+cnt, + nbytes, 1, ptl->ep))) + psm3_mq_mtucpy_host_mem(buf, ptl->gpu_bounce_buf, nbytes); + else + PSM3_GPU_MEMCPY_HTOD(req->req_data.buf+cnt, + ptl->gpu_bounce_buf, nbytes); + cnt+= nbytes; + } /* Cuda library has recent optimizations where they do * not guarantee synchronus nature for Host to Device * copies for msg sizes less than 64k. The event record * and synchronize calls are to guarentee completion. */ PSM3_GPU_SYNCHRONIZE_MEMCPY(); - psmi_free(gpu_ipc_bounce_buf); } else { /* cma can be done in handler context or not. 
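 * Either way the copy itself is a single psm3_cma_get() (Linux cross-memory
 * attach) read straight from the sender's address space, no bounce buffer.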
*/ size_t nbytes = psm3_cma_get(pid, (void *)req->rts_sbuf, diff --git a/prov/psm3/psm3/ptl_am/ptl_fwd.h b/prov/psm3/psm3/ptl_am/ptl_fwd.h index e7dcd060d22..85593aad847 100644 --- a/prov/psm3/psm3/ptl_am/ptl_fwd.h +++ b/prov/psm3/psm3/ptl_am/ptl_fwd.h @@ -60,5 +60,6 @@ extern struct ptl_ctl_init psm3_ptl_amsh; extern int psm3_shm_mq_rv_thresh; +extern int psm3_shm_mq_gpu_rv_thresh; #endif diff --git a/prov/psm3/psm3/ptl_ips/ips_expected_proto.h b/prov/psm3/psm3/ptl_ips/ips_expected_proto.h index 6e9b94f3a97..2bdd85a309c 100644 --- a/prov/psm3/psm3/ptl_ips/ips_expected_proto.h +++ b/prov/psm3/psm3/ptl_ips/ips_expected_proto.h @@ -137,7 +137,8 @@ struct ips_protoexp { #ifdef PSM_CUDA CUstream cudastream_recv; #elif defined(PSM_ONEAPI) - ze_command_queue_handle_t cq_recv; // NULL if psm3_oneapi_immed_async_copy + /* Will not be used if psm3_oneapi_immed_async_copy */ + ze_command_queue_handle_t cq_recvs[MAX_ZE_DEVICES]; #endif }; @@ -201,6 +202,7 @@ struct ips_tid_send_desc { * would need to attach to a tidsendc would be 2 */ struct ips_gpu_hostbuf *gpu_hostbuf[2]; + struct ips_gpu_hostbuf *gpu_split_buf; /* Number of hostbufs attached */ uint8_t gpu_num_buf; #endif @@ -362,4 +364,11 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, ptl_arg_t rdescid, uint32_t tidflow_genseq, ips_tid_session_list *tid_list, uint32_t tid_list_size); + +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +// buffers for GPU send copy pipeline +struct ips_gpu_hostbuf* psm3_ips_allocate_send_chb(struct ips_proto *proto, + uint32_t nbytes, int allow_temp); +void psm3_ips_deallocate_send_chb(struct ips_gpu_hostbuf* chb, int reset); +#endif #endif /* #ifndef __IPS_EXPECTED_PROTO_H__ */ diff --git a/prov/psm3/psm3/ptl_ips/ips_path_rec.c b/prov/psm3/psm3/ptl_ips/ips_path_rec.c index 3db38328818..de57f5317e9 100644 --- a/prov/psm3/psm3/ptl_ips/ips_path_rec.c +++ b/prov/psm3/psm3/ptl_ips/ips_path_rec.c @@ -127,8 +127,12 @@ enum psm3_ibv_rate ips_link_speed_to_enum(uint64_t link_speed) return PSM3_IBV_RATE_300_GBPS; else if (link_speed <= 400*PSM3_GIGABIT) return PSM3_IBV_RATE_400_GBPS; - else + else if (link_speed <= 600*PSM3_GIGABIT) return PSM3_IBV_RATE_600_GBPS; + else if (link_speed <= 800*PSM3_GIGABIT) + return PSM3_IBV_RATE_800_GBPS; + else + return PSM3_IBV_RATE_1200_GBPS; } static uint64_t ips_enum_to_link_speed(enum psm3_ibv_rate rate) @@ -155,6 +159,8 @@ static uint64_t ips_enum_to_link_speed(enum psm3_ibv_rate rate) case PSM3_IBV_RATE_50_GBPS: return 50*PSM3_GIGABIT; case PSM3_IBV_RATE_400_GBPS: return 400*PSM3_GIGABIT; case PSM3_IBV_RATE_600_GBPS: return 600*PSM3_GIGABIT; + case PSM3_IBV_RATE_800_GBPS: return 800*PSM3_GIGABIT; + case PSM3_IBV_RATE_1200_GBPS: return 1200*PSM3_GIGABIT; default: return 100*PSM3_GIGABIT; } } @@ -458,6 +464,51 @@ ips_none_path_rec(struct ips_proto *proto, return err; } +/* parse error check timeouts for PSM3_ERRCHK_TIMEOUT or PSM3_ERRCHK_TIMEOUT_US + * format is min:max:factor + * all must be non-zero, min must be <= max + * Any field can be omitted, in which case the default (input tvals) is used + * for that field.
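+ * e.g. "10:100:2" sets min=10, max=100, factor=2; ":200:" overrides only max
+ * Returns: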
+ * 0 - successfully parsed, tvals updated + * -1 - str empty, tvals unchanged + * -2 - syntax error, tvals may have been changed + */ +static int parse_errchk_timeout(const char *str, + size_t errstr_size, char errstr[], + int tvals[3]) +{ + psmi_assert(tvals); + int ret = psm3_parse_str_tuples(str, 3, tvals); + if (ret < 0) + return ret; + if (tvals[0] < 0 || tvals[1] < 0 || tvals[2] < 0) { + if (errstr_size) + snprintf(errstr, errstr_size, " Negative values not allowed"); + return -2; + } + if (tvals[0] == 0 || tvals[1] == 0 || tvals[2] == 0) { + if (errstr_size) + snprintf(errstr, errstr_size, " Zero values not allowed"); + return -2; + } + if (tvals[0] > tvals[1]) { + if (errstr_size) + snprintf(errstr, errstr_size, " min (%d) must be <= max (%d)", tvals[0], tvals[1]); + return -2; + } + return 0; +} + +static int parse_check_errchk_timeout(int type, + const union psmi_envvar_val val, void *ptr, + size_t errstr_size, char errstr[]) +{ + // parser will set tvals to result, use a copy to protect input of defaults + int tvals[3] = { ((int*)ptr)[0], ((int*)ptr)[1], ((int*)ptr)[2] }; + psmi_assert(type == PSMI_ENVVAR_TYPE_STR_TUPLES); + return parse_errchk_timeout(val.e_str, errstr_size, errstr, tvals); +} + static psm2_error_t ips_none_path_rec_init(struct ips_proto *proto) { psm2_error_t err = PSM2_OK; @@ -478,17 +529,18 @@ static psm2_error_t ips_none_path_rec_init(struct ips_proto *proto) IPS_PROTO_ERRCHK_FACTOR_DEFAULT }; - if (!psm3_getenv("PSM3_ERRCHK_TIMEOUT", - "Errchk timeouts in mS ", + (void)psm3_getenv_range("PSM3_ERRCHK_TIMEOUT", + "Errchk timeouts in milliseconds ", + "Specified as min:max:factor where min and max is the range of timeouts\nand factor is the multiplier for growing timeout", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR_TUPLES, (union psmi_envvar_val)PSM_TID_TIMEOUT_DEFAULT, - &env_to)) { - /* Not using default values, parse what we can */ - (void)psm3_parse_str_tuples(env_to.e_str, 3, tvals); - /* Adjust for max smaller than min, things would break */ - if (tvals[1] < tvals[0]) - tvals[1] = tvals[0]; + (union psmi_envvar_val)NULL, + (union psmi_envvar_val)NULL, + parse_check_errchk_timeout, tvals, &env_to); + if (parse_errchk_timeout(env_to.e_str, 0, NULL, tvals) < 0) { + // already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); } proto->epinfo.ep_timeout_ack = ms_2_cycles(tvals[0]); @@ -502,22 +554,26 @@ static psm2_error_t ips_none_path_rec_init(struct ips_proto *proto) * This allows values in units of microseconds and will override * any values specified in PSM3_ERRCHK_TIMEOUT */ - if (!psm3_getenv("PSM3_ERRCHK_TIMEOUT_US", - "Errchk timeouts in usec ", + int us_tvals[3] = { + IPS_PROTO_ERRCHK_MS_MIN_DEFAULT*1000, + IPS_PROTO_ERRCHK_MS_MAX_DEFAULT*1000, + IPS_PROTO_ERRCHK_FACTOR_DEFAULT + }; + if (1 > psm3_getenv_range("PSM3_ERRCHK_TIMEOUT_US", + "Errchk timeouts in microseconds ", + "Specified as min:max:factor where min and max is the range of timeouts\nand factor is the multiplier for growing timeout", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR_TUPLES, (union psmi_envvar_val)PSM_TID_TIMEOUT_DEFAULT_US, - &env_to)) { - /* Not using default values, parse what we can */ - int us_tvals[3] = { - IPS_PROTO_ERRCHK_MS_MIN_DEFAULT*1000, - IPS_PROTO_ERRCHK_MS_MAX_DEFAULT*1000, - IPS_PROTO_ERRCHK_FACTOR_DEFAULT - }; - (void)psm3_parse_str_tuples(env_to.e_str, 3, us_tvals); - /* Adjust for max smaller than min, things would break */ - if (us_tvals[1] < us_tvals[0]) - us_tvals[1] = us_tvals[0]; + (union psmi_envvar_val)NULL, + (union 
psmi_envvar_val)NULL, + parse_check_errchk_timeout, us_tvals, &env_to)) { + // value specified (perhaps bad input), use + // what was returned (will be default if bad input) + if (parse_errchk_timeout(env_to.e_str, 0, NULL, us_tvals) < 0) { + // already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); + } proto->epinfo.ep_timeout_ack = us_2_cycles(us_tvals[0]); proto->epinfo.ep_timeout_ack_max = us_2_cycles(us_tvals[1]); proto->epinfo.ep_timeout_ack_factor = us_tvals[2]; diff --git a/prov/psm3/psm3/ptl_ips/ips_path_rec.h b/prov/psm3/psm3/ptl_ips/ips_path_rec.h index ebca755e95a..17fa819a396 100644 --- a/prov/psm3/psm3/ptl_ips/ips_path_rec.h +++ b/prov/psm3/psm3/ptl_ips/ips_path_rec.h @@ -124,6 +124,8 @@ enum psm3_ibv_rate { PSM3_IBV_RATE_50_GBPS = 20, PSM3_IBV_RATE_400_GBPS = 21, PSM3_IBV_RATE_600_GBPS = 22, + PSM3_IBV_RATE_800_GBPS = 23, + PSM3_IBV_RATE_1200_GBPS = 24, }; static inline int opa_mtu_enum_to_int(enum opa_mtu mtu) diff --git a/prov/psm3/psm3/ptl_ips/ips_proto.c b/prov/psm3/psm3/ptl_ips/ips_proto.c index d4c723a430a..f6c9c215bcb 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto.c @@ -452,6 +452,8 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED)) { struct psmi_rlimit_mpool rlim = GPU_HOSTBUFFER_LIMITS; uint32_t maxsz, chunksz, max_elements; + uint32_t pool_num_obj_max_total; + uint32_t small_pool_num_obj_max_total; if ((err = psm3_parse_mpool_env(proto->mq, 1, &rlim, &maxsz, &chunksz))) @@ -459,10 +461,12 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, /* the maxsz is the amount in MB, not the number of entries, * since the element size depends on the window size */ - max_elements = (maxsz*1024*1024) / proto->mq->hfi_base_window_rv; + max_elements = (maxsz*1024*1024) / psm3_mq_max_window_rv(proto->mq, 1); /* mpool requires max_elements to be power of 2. round down. 
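 * e.g. 1 << (31 - __builtin_clz(100)) == 64, the largest power of 2 <= 100.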
*/ max_elements = 1 << (31 - __builtin_clz(max_elements)); - proto->gpu_hostbuf_send_cfg.bufsz = proto->mq->hfi_base_window_rv; + /* need at least 3 buffers */ + max_elements = max(4, max_elements); + proto->gpu_hostbuf_send_cfg.bufsz = psm3_mq_max_window_rv(proto->mq, 1); proto->gpu_hostbuf_pool_send = psm3_mpool_create_for_gpu(sizeof(struct ips_gpu_hostbuf), chunksz, max_elements, 0, @@ -476,6 +480,8 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, "Couldn't allocate GPU host send buffer pool"); goto fail; } + psm3_mpool_get_obj_info(proto->gpu_hostbuf_pool_send, + NULL, &pool_num_obj_max_total); /* use the same number of elements for the small pool */ proto->gpu_hostbuf_small_send_cfg.bufsz = GPU_SMALLHOSTBUF_SZ; @@ -492,6 +498,8 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, "Couldn't allocate GPU host small send buffer pool"); goto fail; } + psm3_mpool_get_obj_info(proto->gpu_hostbuf_pool_small_send, + NULL, &small_pool_num_obj_max_total); /* Configure the amount of prefetching */ union psmi_envvar_val env_prefetch_limit; @@ -502,6 +510,12 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, (union psmi_envvar_val)GPU_WINDOW_PREFETCH_DEFAULT, &env_prefetch_limit); proto->gpu_prefetch_limit = env_prefetch_limit.e_uint; + _HFI_DBG("GPU Send Copy Pipeline: %u of %u bytes (small), %u of %u bytes, prefetch %u\n", + small_pool_num_obj_max_total, + proto->gpu_hostbuf_small_send_cfg.bufsz, + pool_num_obj_max_total, + proto->gpu_hostbuf_send_cfg.bufsz, + proto->gpu_prefetch_limit); } #endif /* PSM_CUDA || PSM_ONEAPI */ @@ -530,7 +544,8 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, // but can survive if it's smaller as we will delay transfer til avail if (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) { cache_pri_entries = HFI_TF_NFLOWS + proto->ep->hfi_num_send_rdma; - cache_pri_size = (uint64_t)cache_pri_entries * proto->mq->hfi_base_window_rv; + cache_pri_size = (uint64_t)cache_pri_entries * + psm3_mq_max_window_rv(proto->mq, 0); if (MR_CACHE_USER_CACHING(proto->ep->mr_cache_mode)) { // we attempt to cache, so can benefit from more than inflight // make enough room to have a good number of entries @@ -578,7 +593,7 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, default_cache_entries = max(default_cache_entries, ((uint64_t)env_mr_cache_size_mb.e_uint * (1024*1024)) - / max( proto->mq->hfi_base_window_rv/2, + / max(psm3_mq_max_window_rv(proto->mq, 0)/2, proto->mq->hfi_thresh_rv)); } else { // only send DMA, size based on smaller MRs @@ -2292,10 +2307,10 @@ ips_proto_register_stats(struct ips_proto *proto) "RDMA rendezvous message bytes received direct into a GPU buffer", &proto->strat_stats.rndv_rdma_gdr_recv_bytes), PSMI_STATS_DECLU64("rndv_rdma_hbuf_recv", - "RDMA rendezvous messages received into via pipelined GPU copy", + "RDMA rendezvous messages received into a GPU buffer via pipelined GPU copy", &proto->strat_stats.rndv_rdma_hbuf_recv), PSMI_STATS_DECLU64("rndv_rdma_hbuf_recv_bytes", - "RDMA rendezvous message bytes received into via pipelined GPU copy", + "RDMA rendezvous message bytes received into a GPU buffer via pipelined GPU copy", &proto->strat_stats.rndv_rdma_hbuf_recv_bytes), #endif PSMI_STATS_DECLU64("rndv_rdma_cpu_send", @@ -2312,10 +2327,10 @@ ips_proto_register_stats(struct ips_proto *proto) "RDMA rendezvous message bytes sent from a GPU buffer via send RDMA", &proto->strat_stats.rndv_rdma_gdr_send_bytes), PSMI_STATS_DECLU64("rndv_rdma_hbuf_send", - "RDMA rendezvous messages sent from a GPU buffer into via pipelined GPU copy", + "RDMA rendezvous 
messages sent from a GPU buffer via pipelined GPU copy", &proto->strat_stats.rndv_rdma_hbuf_send), PSMI_STATS_DECLU64("rndv_rdma_hbuf_send_bytes", - "RDMA rendezvous message bytes sent from a GPU buffer into via pipelined GPU copy", + "RDMA rendezvous message bytes sent from a GPU buffer via pipelined GPU copy", &proto->strat_stats.rndv_rdma_hbuf_send_bytes), #endif }; diff --git a/prov/psm3/psm3/ptl_ips/ips_proto.h b/prov/psm3/psm3/ptl_ips/ips_proto.h index eccd6ce3d25..9c1b920f075 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto.h +++ b/prov/psm3/psm3/ptl_ips/ips_proto.h @@ -437,7 +437,8 @@ struct ips_proto { #ifdef PSM_CUDA CUstream cudastream_send; #elif defined(PSM_ONEAPI) - ze_command_queue_handle_t cq_send; // NULL if psm3_oneapi_immed_async_copy + /* Will not be used if psm3_oneapi_immed_async_copy */ + ze_command_queue_handle_t cq_sends[MAX_ZE_DEVICES]; #endif #if defined(PSM_CUDA) || defined(PSM_ONEAPI) diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_expected.c b/prov/psm3/psm3/ptl_ips/ips_proto_expected.c index 057bdb74c5c..c39231b8679 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_expected.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto_expected.c @@ -260,10 +260,11 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, #if defined(PSM_CUDA) || defined(PSM_ONEAPI) { - if (PSMI_IS_GPU_ENABLED && - !(proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) { + if (PSMI_IS_GPU_ENABLED) { struct psmi_rlimit_mpool rlim = GPU_HOSTBUFFER_LIMITS; uint32_t maxsz, chunksz, max_elements; + uint32_t pool_num_obj_max_total; + uint32_t small_pool_num_obj_max_total; if ((err = psm3_parse_mpool_env(protoexp->proto->mq, 1, &rlim, &maxsz, &chunksz))) @@ -271,11 +272,14 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, /* the maxsz is the amount in MB, not the number of entries, * since the element size depends on the window size */ - max_elements = (maxsz*1024*1024) / proto->mq->hfi_base_window_rv; + max_elements = (maxsz*1024*1024) / + psm3_mq_max_window_rv(proto->mq, 1); /* mpool requires max_elements to be power of 2. round down. 
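 * 31 - __builtin_clz(x) is the bit index of x's highest set bit, so the
 * shift below yields the largest power of 2 that is <= x.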
*/ max_elements = 1 << (31 - __builtin_clz(max_elements)); + /* need at least 2 buffers */ + max_elements = max(2, max_elements); protoexp->gpu_hostbuf_recv_cfg.bufsz = - proto->mq->hfi_base_window_rv; + psm3_mq_max_window_rv(proto->mq, 1); protoexp->gpu_hostbuf_pool_recv = psm3_mpool_create_for_gpu(sizeof(struct ips_gpu_hostbuf), @@ -290,6 +294,8 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, "Couldn't allocate GPU host receive buffer pool"); goto fail; } + psm3_mpool_get_obj_info(protoexp->gpu_hostbuf_pool_recv, + NULL, &pool_num_obj_max_total); protoexp->gpu_hostbuf_small_recv_cfg.bufsz = GPU_SMALLHOSTBUF_SZ; @@ -306,6 +312,13 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, "Couldn't allocate GPU host small receive buffer pool"); goto fail; } + psm3_mpool_get_obj_info(protoexp->gpu_hostbuf_pool_small_recv, + NULL, &small_pool_num_obj_max_total); + _HFI_DBG("GPU Recv Copy Pipeline: %u of %u bytes (small), %u of %u bytes\n", + small_pool_num_obj_max_total, + protoexp->gpu_hostbuf_small_recv_cfg.bufsz, + pool_num_obj_max_total, + protoexp->gpu_hostbuf_recv_cfg.bufsz); PSM3_GPU_PREPARE_HTOD_MEMCPYS(protoexp); STAILQ_INIT(&protoexp->gpupend_getreqsq); } else { @@ -460,7 +473,7 @@ psm3_ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp, uint64_t nbytes; PSM2_LOG_MSG("entering"); - psmi_assert((req->mq->hfi_base_window_rv % PSMI_PAGESIZE) == 0); + psmi_assert((psm3_mq_get_window_rv(req) % PSMI_PAGESIZE) == 0); getreq = (struct ips_tid_get_request *) psm3_mpool_get(protoexp->tid_getreq_pool); @@ -519,8 +532,9 @@ psm3_ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp, else #endif nbytes = PSMI_ALIGNUP((length + count - 1) / count, PSMI_PAGESIZE); - getreq->tidgr_rndv_winsz = - min(nbytes, req->mq->hfi_base_window_rv); + getreq->tidgr_rndv_winsz = psm3_mq_get_window_rv(req); + if (nbytes < getreq->tidgr_rndv_winsz) + getreq->tidgr_rndv_winsz = nbytes; _HFI_MMDBG("posting TID get request: nbytes=%"PRIu64" winsz=%u len=%u\n", nbytes, getreq->tidgr_rndv_winsz, getreq->tidgr_length); // we have now computed the size of each TID sequence (tidgr_rndv_winsz) @@ -635,12 +649,19 @@ psm3_ips_protoexp_send_tid_grant(struct ips_tid_recv_desc *tidrecvc) #if defined(PSM_CUDA) || defined(PSM_ONEAPI) -static -void psmi_deallocate_chb(struct ips_gpu_hostbuf* chb) +void psm3_ips_deallocate_send_chb(struct ips_gpu_hostbuf* chb, int reset) { - PSM3_GPU_HOSTBUF_DESTROY(chb); - psmi_free(chb); - return; + if (chb->is_tempbuf) { + PSM3_GPU_HOSTBUF_DESTROY(chb); + psmi_free(chb); + } else { + chb->req = NULL; + chb->offset = 0; + chb->bytes_read = 0; + if (reset) + PSM3_GPU_HOSTBUF_RESET(chb); + psm3_mpool_put(chb); + } } #endif @@ -677,19 +698,13 @@ ips_protoexp_tidsendc_complete(struct ips_tid_send_desc *tidsendc) STAILQ_REMOVE(&req->sendreq_prefetch, tidsendc->gpu_hostbuf[0], ips_gpu_hostbuf, req_next); - if (tidsendc->gpu_hostbuf[0]->is_tempbuf) - psmi_deallocate_chb(tidsendc->gpu_hostbuf[0]); - else { - tidsendc->gpu_hostbuf[0]->req = NULL; - tidsendc->gpu_hostbuf[0]->offset = 0; - tidsendc->gpu_hostbuf[0]->bytes_read = 0; - PSM3_GPU_HOSTBUF_RESET(tidsendc->gpu_hostbuf[0]); - psm3_mpool_put(tidsendc->gpu_hostbuf[0]); - } + psm3_ips_deallocate_send_chb(tidsendc->gpu_hostbuf[0], 1); psmi_gpu_run_prefetcher(protoexp, tidsendc); } - } else - psmi_free(tidsendc->userbuf); + } else { + psm3_ips_deallocate_send_chb(tidsendc->gpu_split_buf, 0); + tidsendc->gpu_split_buf = NULL; + } } #endif /* Check if we can complete the send request. 
*/ @@ -1220,7 +1235,9 @@ int ips_protoexp_handle_immed_data(struct ips_proto *proto, uint64_t conn_ref, // For User RC conn_ref is context we set in rc_qp_create (*ipsaddr) // For Kernel RC, conn_ref is the conn handle (psm3_rv_conn_get_conn_handle) // maybe this should be an assert so don't add test in production code + // caller can't get qp_context (conn_ref) from rbuf_qp for SRQ if ((conn_type == RDMA_IMMED_USER_RC) + && ! proto->ep->verbs_ep.srq && (uint64_t)tidrecvc->ipsaddr != conn_ref) { // RDWA Write is not on expected RC QP from remote node _HFI_ERROR("RDMA Write on Wrong User QP 0x%"PRIx64", expect 0x%"PRIx64"\n", @@ -1304,19 +1321,41 @@ psmi_gpu_reclaim_hostbufs(struct ips_tid_get_request *getreq) } return PSM2_OK; } -static -struct ips_gpu_hostbuf* psmi_allocate_chb(uint32_t window_len) + +// allocate a chb control structure. The actual buffer and event needed for the +// DTOH async copy are allocated in chb's 1st use in PSM3_GPU_MEMCPY_DTOH_START +struct ips_gpu_hostbuf* psm3_ips_allocate_send_chb(struct ips_proto *proto, + uint32_t nbytes, int allow_temp) { - struct ips_gpu_hostbuf* chb = (struct ips_gpu_hostbuf*) - psmi_calloc(PSMI_EP_NONE, + struct ips_gpu_hostbuf* chb = NULL; + unsigned bufsz; + + if (nbytes <= GPU_SMALLHOSTBUF_SZ) { + chb = (struct ips_gpu_hostbuf *) psm3_mpool_get( + proto->gpu_hostbuf_pool_small_send); + bufsz = proto->gpu_hostbuf_small_send_cfg.bufsz; + } + if (chb == NULL) { + chb = (struct ips_gpu_hostbuf *) psm3_mpool_get( + proto->gpu_hostbuf_pool_send); + bufsz = proto->gpu_hostbuf_send_cfg.bufsz; + } + + /* were any buffers available? If not force allocate */ + if (chb == NULL && allow_temp) { + chb = (struct ips_gpu_hostbuf*) psmi_calloc(PSMI_EP_NONE, UNDEFINED, 1, sizeof(struct ips_gpu_hostbuf)); - if_pf (chb == NULL) { - psm3_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, - "Couldn't allocate cuda host buffers "); - return NULL; + if_pf (chb == NULL) { + psm3_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, + "Couldn't allocate GPU host bounce buffers "); + return NULL; + } + chb->is_tempbuf = 1; + bufsz = nbytes; } - PSM3_GPU_HOSTBUF_FORCE_INIT(chb, window_len); + if (chb && ! chb->host_buf) + PSM3_GPU_HOST_ALLOC(&chb->host_buf, bufsz); return chb; } @@ -1333,21 +1372,12 @@ void psmi_gpu_run_prefetcher(struct ips_protoexp *protoexp, if (req->prefetch_send_msgoff < req->req_data.send_msglen) { /* some data remains to be sent */ offset = req->prefetch_send_msgoff; + psmi_assert(req->is_buf_gpu_mem); window_len = ips_gpu_next_window( - proto->mq->hfi_base_window_rv, + psm3_mq_get_window_rv(req), offset, req->req_data.buf_len); - unsigned bufsz = 0; - if (window_len <= GPU_SMALLHOSTBUF_SZ) { - chb = (struct ips_gpu_hostbuf *) psm3_mpool_get( - proto->gpu_hostbuf_pool_small_send); - bufsz = proto->gpu_hostbuf_small_send_cfg.bufsz; - } - if (chb == NULL) { - chb = (struct ips_gpu_hostbuf *) psm3_mpool_get( - proto->gpu_hostbuf_pool_send); - bufsz = proto->gpu_hostbuf_send_cfg.bufsz; - } + chb = psm3_ips_allocate_send_chb(proto, window_len, 0); /* were any buffers available for the prefetcher? 
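 * If not, just return; the prefetcher is run again from the completion
 * path once psm3_ips_deallocate_send_chb() returns a buffer to the pool.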
*/ if (chb == NULL) return; @@ -1358,7 +1388,7 @@ void psmi_gpu_run_prefetcher(struct ips_protoexp *protoexp, chb->gpu_buf = (uint8_t*)req->req_data.buf + offset; chb->bytes_read = 0; - PSM3_GPU_MEMCPY_DTOH_START(proto, chb, window_len, bufsz); + PSM3_GPU_MEMCPY_DTOH_START(proto, chb, window_len); STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb, req_next); return; @@ -1384,28 +1414,13 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, while (req->prefetch_send_msgoff < tsess_srcoff + tsess_length) { /* some data remains to be sent */ offset = req->prefetch_send_msgoff; + psmi_assert(req->is_buf_gpu_mem); window_len = ips_gpu_next_window( - proto->mq->hfi_base_window_rv, + psm3_mq_get_window_rv(req), offset, req->req_data.buf_len); - unsigned bufsz = 0; - if (window_len <= GPU_SMALLHOSTBUF_SZ) { - chb = (struct ips_gpu_hostbuf *) psm3_mpool_get( - proto->gpu_hostbuf_pool_small_send); - bufsz = proto->gpu_hostbuf_small_send_cfg.bufsz; - } - if (chb == NULL) { - chb = (struct ips_gpu_hostbuf *) psm3_mpool_get( - proto->gpu_hostbuf_pool_send); - bufsz = proto->gpu_hostbuf_send_cfg.bufsz; - } - - /* were any buffers available? If not force allocate */ - if (chb == NULL) { - chb = psmi_allocate_chb(window_len); - psmi_assert(chb); - chb->is_tempbuf = 1; - } + /* if no buffers available, force allocate of a temp buf */ + chb = psm3_ips_allocate_send_chb(proto, window_len, 1); req->prefetch_send_msgoff += window_len; chb->offset = offset; chb->size = window_len; @@ -1413,19 +1428,24 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, chb->gpu_buf = (uint8_t*)req->req_data.buf + offset; chb->bytes_read = 0; - PSM3_GPU_MEMCPY_DTOH_START(proto, chb, window_len, bufsz); + PSM3_GPU_MEMCPY_DTOH_START(proto, chb, window_len); STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb, req_next); if (type == PSMI_GPU_PARTIAL_MATCH_FOUND) { + // caller matched 1st chb, but needed more prefetched + // see if we have what we need now if ((tsess_srcoff < chb->offset) && ((tsess_srcoff + tsess_length) > chb->offset)) { + // will collect the 2 prefetched chb's for this + // RDMA Write send into a single CPU temp buffer + // do alloc now, hoping to hide it behind GPU async copy to chb tidsendc->gpu_hostbuf[0] = chb_prev; tidsendc->gpu_hostbuf[1] = chb; tidsendc->gpu_num_buf = 2; - void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED, - tsess_length); + tidsendc->gpu_split_buf = psm3_ips_allocate_send_chb(proto, + tsess_length, 1); tidsendc->userbuf = - (void *)((uintptr_t) buffer); + (void *)((uintptr_t) tidsendc->gpu_split_buf->host_buf); tidsendc->buffer = (void *)((uintptr_t)tidsendc->userbuf + tsess_unaligned_start); @@ -1433,29 +1453,35 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, } } else { if (attached) { + // we attached one in prior loop, now have + // a second, should have what we need now + psmi_assert((tsess_srcoff + tsess_length) > chb->offset); + // will collect the 2 prefetched chb's for this + // RDMA Write send into a single CPU temp buffer + // do alloc now, hoping to hide it behind GPU async copy to chb tidsendc->gpu_hostbuf[0] = chb_prev; tidsendc->gpu_hostbuf[1] = chb; tidsendc->gpu_num_buf = 2; - void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED, - tsess_length); + tidsendc->gpu_split_buf = psm3_ips_allocate_send_chb(proto, + tsess_length, 1); tidsendc->userbuf = - (void *)((uintptr_t) buffer); + (void *)((uintptr_t) tidsendc->gpu_split_buf->host_buf); tidsendc->buffer = (void *)((uintptr_t)tidsendc->userbuf + tsess_unaligned_start); - attached = 0; 
return; } if ((tsess_srcoff > chb->offset) && (tsess_srcoff < (chb->offset + chb->size)) && ((tsess_srcoff + tsess_length) > (chb->offset + chb->size))) { + // we prefetched one, but need another chb_prev = chb; attached = 1; - chb = NULL; continue; } else if ((chb->offset <= tsess_srcoff) && ((tsess_srcoff + tsess_length) <= (chb->offset+chb->size))) { + // we prefetched one and have what we need tidsendc->gpu_hostbuf[0] = chb; tidsendc->gpu_hostbuf[1] = NULL; tidsendc->gpu_num_buf = 1; @@ -1466,8 +1492,7 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, (void *)((uintptr_t)tidsendc->userbuf + tsess_unaligned_start ); return; - } else - chb = NULL; + } } } } @@ -1575,11 +1600,11 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, tidsendc->gpu_num_buf = 0; if (req->gpu_hostbuf_used) { /* To get a match: - * 1. Tid list offset + length is contained within a chb - * 2. Tid list offset + length is contained within - * the prefetched offset of this req. - * 3. Tid list offset + length is partially prefetched - * within one chb. (A partial match) + * 1. FULL - Tid list offset + length is contained within a chb + * 2. SPLIT - Tid list offset + length is contained within + * the prefetched offset of this req. (2 chb) + * 3. PARTIAL - Tid list offset + length is partially prefetched + * within one chb. */ STAILQ_FOREACH(chb, &req->sendreq_prefetch, req_next) { rc = psmi_find_match_in_prefeteched_chb(chb, @@ -1600,10 +1625,13 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, tidsendc->gpu_hostbuf[1] = NULL; tidsendc->gpu_num_buf = 1; } else if (rc == PSMI_GPU_SPLIT_MATCH_FOUND){ - void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED, - tid_list->tsess_length); + // will collect the 2 prefetched chb's for this + // RDMA Write send into a single CPU temp buffer + // do alloc now, hoping to hide it behind GPU async copy to chb + tidsendc->gpu_split_buf =psm3_ips_allocate_send_chb(protoexp->proto, + tid_list->tsess_length, 1); tidsendc->userbuf = - (void *)((uintptr_t) buffer); + (void *)((uintptr_t) tidsendc->gpu_split_buf->host_buf); tidsendc->buffer = (void *)((uintptr_t)tidsendc->userbuf ); @@ -1612,6 +1640,7 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, tidsendc->gpu_hostbuf[1] = chb_next; tidsendc->gpu_num_buf = 2; } else if (rc == PSMI_GPU_PARTIAL_MATCH_FOUND) { + // need to prefetch more psmi_attach_chb_to_tidsendc(protoexp, req, tidsendc, chb, @@ -1620,6 +1649,7 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, 0, rc); } else { + // no match, need to prefetch psmi_attach_chb_to_tidsendc(protoexp, req, tidsendc, NULL, @@ -1849,6 +1879,7 @@ psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc) #if defined(PSM_CUDA) || defined(PSM_ONEAPI) struct ips_gpu_hostbuf *chb, *chb_next; uint32_t offset_in_chb, i; + // wait for async copies into needed prefetcher chb's to finish for (i = 0; i < tidsendc->gpu_num_buf; i++) { chb = tidsendc->gpu_hostbuf[i]; if (chb) { @@ -1864,8 +1895,9 @@ psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc) chb = tidsendc->gpu_hostbuf[0]; chb_next = tidsendc->gpu_hostbuf[1]; offset_in_chb = tidsendc->tid_list.tsess_srcoff - chb->offset; - /* Copying data from multiple cuda - * host buffers into a bounce buffer. + /* Copying data from multiple prefetched + * host buffers into a single temp CPU bounce buffer. 
+ * so can issue a single RDMA Write from the temp bounce buffer */ memcpy(tidsendc->buffer, (void *)((uintptr_t)chb->host_buf + offset_in_chb), chb->size-offset_in_chb); @@ -1881,29 +1913,13 @@ psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc) if(chb->bytes_read == chb->size) { STAILQ_REMOVE(&tidsendc->mqreq->sendreq_prefetch, chb, ips_gpu_hostbuf, req_next); - if (chb->is_tempbuf) - psmi_deallocate_chb(chb); - else { - chb->req = NULL; - chb->offset = 0; - chb->bytes_read = 0; - PSM3_GPU_HOSTBUF_RESET(chb); - psm3_mpool_put(chb); - } + psm3_ips_deallocate_send_chb(chb, 1); psmi_gpu_run_prefetcher(protoexp, tidsendc); } if(chb_next->bytes_read == chb_next->size) { STAILQ_REMOVE(&tidsendc->mqreq->sendreq_prefetch, chb_next, ips_gpu_hostbuf, req_next); - if (chb_next->is_tempbuf) - psmi_deallocate_chb(chb_next); - else{ - chb_next->req = NULL; - chb_next->offset = 0; - chb_next->bytes_read = 0; - PSM3_GPU_HOSTBUF_RESET(chb_next); - psm3_mpool_put(chb_next); - } + psm3_ips_deallocate_send_chb(chb_next, 1); psmi_gpu_run_prefetcher(protoexp, tidsendc); } /* Clean Up tidsendc ref's to split cuda hostbufs when no longer needed */ @@ -2190,8 +2206,10 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp, tidrecvc->stats.nReXmit = 0; tidrecvc->stats.nErrChkReceived = 0; - _HFI_EXP("alloc tidrecv=%d\n", - tidrecvc->rdescid._desc_idx); + _HFI_EXP("alloc tidrecv=%d srcoff=%u length=%u\n", + tidrecvc->rdescid._desc_idx, + tidrecvc->tid_list.tsess_srcoff, + tidrecvc->tid_list.tsess_length); tidrecvc->grantscb = grantscb; diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_mq.c b/prov/psm3/psm3/ptl_ips/ips_proto_mq.c index b4582c6521d..cdcc480e89a 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_mq.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto_mq.c @@ -158,8 +158,7 @@ int ips_proto_mq_eager_complete(void *reqp, uint32_t nbytes) chb = STAILQ_FIRST(&req->sendreq_prefetch); STAILQ_REMOVE_HEAD(&req->sendreq_prefetch, req_next); - PSM3_GPU_HOSTBUF_RESET(chb); - psm3_mpool_put(chb); + psm3_ips_deallocate_send_chb(chb, 1); } } #endif @@ -508,24 +507,13 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, while ((offset < len) && (prefetch_lookahead < proto->gpu_prefetch_limit)) { chb = NULL; + psmi_assert(req->is_buf_gpu_mem); window_len = ips_gpu_next_window( - proto->mq->hfi_base_window_rv, + psm3_mq_get_window_rv(req), offset, len); - unsigned bufsz; - if (window_len <= GPU_SMALLHOSTBUF_SZ) { - chb = (struct ips_gpu_hostbuf *) - psm3_mpool_get( - proto->gpu_hostbuf_pool_small_send); - bufsz = proto->gpu_hostbuf_small_send_cfg.bufsz; - } - if (chb == NULL) { - chb = (struct ips_gpu_hostbuf *) - psm3_mpool_get( - proto->gpu_hostbuf_pool_send); - bufsz = proto->gpu_hostbuf_send_cfg.bufsz; - } + chb = psm3_ips_allocate_send_chb(proto, window_len, 0); /* any buffers available? */ if (chb == NULL) { @@ -540,7 +528,7 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, chb->gpu_buf = (uint8_t*)buf + offset; chb->bytes_read = 0; - PSM3_GPU_MEMCPY_DTOH_START(proto, chb, window_len, bufsz); + PSM3_GPU_MEMCPY_DTOH_START(proto, chb, window_len); STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb, req_next); @@ -590,7 +578,7 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, && ips_epaddr_rdma_connected(ipsaddr) && !req->mr #if defined(PSM_CUDA) || defined(PSM_ONEAPI) - && len > GPUDIRECT_THRESH_RV + && (!PSMI_IS_GPU_ENABLED || len > GPUDIRECT_THRESH_RV) && ! 
req->gpu_hostbuf_used #endif ) { @@ -625,9 +613,11 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, #if defined(PSM_CUDA) || defined(PSM_ONEAPI) static inline -int psm3_is_needed_rendezvous(struct ips_proto *proto, uint32_t len) +int psm3_is_needed_rendezvous(struct ips_proto *proto, uint32_t len, + uint32_t flags_user) { if ( + !(flags_user & PSM2_MQ_FLAG_INJECT) && len > gpu_thresh_rndv){ return 1; } @@ -667,6 +657,8 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user ipsaddr = (ips_epaddr_t *)mepaddr; } psmi_assert(ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED); + // psmx3 layer never uses mq_isend for FI_INJECT + psmi_assert(! (flags_user & PSM2_MQ_FLAG_INJECT)); proto = ((psm2_epaddr_t) ipsaddr)->proto; @@ -681,7 +673,7 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user if (req->is_buf_gpu_mem) { gpu_mem = 1; PSM3_MARK_BUF_SYNCHRONOUS(ubuf); - if (psm3_is_needed_rendezvous(proto, len)) + if (psm3_is_needed_rendezvous(proto, len, 0)) goto do_rendezvous; } #endif @@ -1026,12 +1018,13 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, gpu_mem = PSM3_IS_BUFFER_GPU_MEM(ubuf, len); if (gpu_mem) { PSM3_MARK_BUF_SYNCHRONOUS(ubuf); - if (psm3_is_needed_rendezvous(proto, len)) + if (psm3_is_needed_rendezvous(proto, len, flags)) goto do_rendezvous; } #endif flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO]; + /* SENDSYNC gets priority, assume not used for MPI_isend w/INJECT */ if (flags & PSM2_MQ_FLAG_SENDSYNC) { goto do_rendezvous; } else if (len <= mq->hfi_thresh_tiny) { @@ -1117,7 +1110,11 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, } else { user_buffer = ubuf; #ifdef PSM_HAVE_REG_MR - if (len > proto->iovec_gpu_thresh_eager_blocking) { + if (len > proto->iovec_gpu_thresh_eager_blocking +#ifdef PSM_INJECT_NOSDMA + && !(flags & PSM2_MQ_FLAG_INJECT) +#endif + ) { scb->mr = psm3_verbs_reg_mr( proto->mr_cache, 0, (void*)user_buffer, len, IBV_ACCESS_IS_GPU_ADDR); @@ -1142,7 +1139,11 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, #endif // PSM_CUDA || PSM_ONEAPI { #ifdef PSM_HAVE_REG_MR - if (len > proto->iovec_thresh_eager_blocking) { + if (len > proto->iovec_thresh_eager_blocking +#ifdef PSM_INJECT_NOSDMA + && !(flags & PSM2_MQ_FLAG_INJECT) +#endif + ) { scb->mr = psm3_verbs_reg_mr(proto->mr_cache, 0, (void*)user_buffer, len, 0); } else @@ -1240,6 +1241,7 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]); } else if (len <= mq->hfi_thresh_rv) { + // for FI_INJECT, eager data comes from the user buffer and needs an end to end ack psm2_mq_req_t req; /* Block until we can get a req */ diff --git a/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c b/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c index f1cee4faffd..562721a0b37 100644 --- a/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c +++ b/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c @@ -264,11 +264,64 @@ void psm3_ips_ptl_rcvthread_transfer_ownership(ptl_t *from_ptl_gen, ptl_t *to_pt rcvc->ptl = to_ptl_gen; } +/* parse recv thread frequency for PSM3_RCVTHREAD_FREQ + * format is min_freq[:max_freq[:shift_freq]] + * Any field can be omitted, in which case the default (input tvals) is used + * for that field.
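+ * e.g. "10:100:1" polls between 10 and 100 times/sec, scaling the rate by 2^1;
+ * a min_freq or max_freq of 0 disables timeout polling (interrupt only)
+ * Returns: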
+ * 0 - successfully parsed, tvals updated + * -1 - str empty, tvals unchanged + * -2 - syntax error, tvals may have been changed + */ +static int parse_rcvthread_freq(const char *str, + size_t errstr_size, char errstr[], + int tvals[3]) +{ + psmi_assert(tvals); + int ret = psm3_parse_str_tuples(str, 3, tvals); + if (ret < 0) + return ret; + if (tvals[0] == 0 || tvals[1] == 0) { + // disables receiver thread, no other checks needed + return 0; + } + if (tvals[0] < 0 || tvals[0] > 1000) { + if (errstr_size) + snprintf(errstr, errstr_size, " min_freq must be 0 to 1000"); + return -2; + } + if (tvals[1] < 0 || tvals[1] > 1000) { + if (errstr_size) + snprintf(errstr, errstr_size, " max_freq must be 0 to 1000"); + return -2; + } + if (tvals[0] > tvals[1]) { + if (errstr_size) + snprintf(errstr, errstr_size, " min_freq (%d) must be <= max_freq (%d)", tvals[0], tvals[1]); + return -2; + } + if (tvals[2] < 0 || tvals[2] > 10) { + if (errstr_size) + snprintf(errstr, errstr_size, " shift_freq must be 0 to 10"); + return -2; + } + return 0; +} + +static int parse_check_rcvthread_freq(int type, + const union psmi_envvar_val val, void *ptr, + size_t errstr_size, char errstr[]) +{ + // parser will set tvals to result, use a copy to protect input of defaults + int tvals[3] = { ((int*)ptr)[0], ((int*)ptr)[1], ((int*)ptr)[2] }; + psmi_assert(type == PSMI_ENVVAR_TYPE_STR_TUPLES); + return parse_rcvthread_freq(val.e_str, errstr_size, errstr, tvals); +} + + psm2_error_t rcvthread_initsched(struct ptl_rcvthread *rcvc) { union psmi_envvar_val env_to; char rcv_freq[192]; - int no_timeout = 0; int tvals[3] = { RCVTHREAD_TO_MIN_FREQ, RCVTHREAD_TO_MAX_FREQ, RCVTHREAD_TO_SHIFT @@ -276,40 +329,19 @@ psm2_error_t rcvthread_initsched(struct ptl_rcvthread *rcvc) snprintf(rcv_freq, sizeof(rcv_freq) - 1, "%d:%d:%d", RCVTHREAD_TO_MIN_FREQ, RCVTHREAD_TO_MAX_FREQ, RCVTHREAD_TO_SHIFT); - rcv_freq[sizeof(rcv_freq) - 1] = '\0'; - if (!psm3_getenv("PSM3_RCVTHREAD_FREQ", + (void)psm3_getenv_range("PSM3_RCVTHREAD_FREQ", "Recv Thread frequency (per sec) ", + "Specified as min_freq[:max_freq[:shift_freq]]\nwhere min_freq and max_freq are polls per second\n(0 disables receiver thread)\nand 2^shift_freq is amount to multiply or divide frequency by", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR_TUPLES, - (union psmi_envvar_val)rcv_freq, &env_to)) { - /* not using default values */ - (void)psm3_parse_str_tuples(env_to.e_str, 3, tvals); - int invalid = 0; - - if (tvals[0] == 0 || tvals[1] == 0) { - no_timeout = 1; - } else { - if (tvals[0] > 1000) - invalid = 1; - if (tvals[1] > 1000 || tvals[1] < tvals[0]) - invalid = 1; - if (tvals[2] > 10) - invalid = 1; - } - - if (invalid) { - _HFI_INFO - ("Overriding invalid request for RcvThread frequency" - " settings of %s to be <%d:%d:%d>\n", env_to.e_str, - RCVTHREAD_TO_MIN_FREQ, RCVTHREAD_TO_MAX_FREQ, - RCVTHREAD_TO_SHIFT); - tvals[0] = RCVTHREAD_TO_MIN_FREQ; - tvals[1] = RCVTHREAD_TO_MAX_FREQ; - tvals[2] = RCVTHREAD_TO_SHIFT; - } + (union psmi_envvar_val)rcv_freq, + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + parse_check_rcvthread_freq, tvals, &env_to); + if (parse_rcvthread_freq(env_to.e_str, 0, NULL, tvals) < 0) { + // already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); } - - if (no_timeout) { + if (tvals[0] == 0 || tvals[1] == 0) { rcvc->last_timeout = -1; _HFI_PRDBG("PSM3_RCVTHREAD_FREQ set to only interrupt " "(no timeouts)\n"); diff --git a/prov/psm3/psm3/ptl_self/ptl.c b/prov/psm3/psm3/ptl_self/ptl.c index 35181f0f3ba..19231015d9b 100644 --- 
a/prov/psm3/psm3/ptl_self/ptl.c +++ b/prov/psm3/psm3/ptl_self/ptl.c @@ -80,6 +80,14 @@ ptl_handle_rtsmatch(psm2_mq_req_t recv_req, int was_posted) psm2_mq_req_t send_req = (psm2_mq_req_t) recv_req->ptl_req_ptr; if (recv_req->req_data.recv_msglen > 0) { +#ifdef PSM_DSA + if (psm3_use_dsa(recv_req->req_data.recv_msglen)) + psm3_dsa_memcpy(recv_req->req_data.buf, + send_req->req_data.buf, + recv_req->req_data.recv_msglen, 0, + &send_req->mq->stats.dsa_stats[0]); + else +#endif psm3_mq_mtucpy(recv_req->req_data.buf, send_req->req_data.buf, recv_req->req_data.recv_msglen); } diff --git a/prov/psm3/psm3/utils/utils_dsa.c b/prov/psm3/psm3/utils/utils_dsa.c index 219f8201fb1..2c697b1cf20 100644 --- a/prov/psm3/psm3/utils/utils_dsa.c +++ b/prov/psm3/psm3/utils/utils_dsa.c @@ -97,9 +97,14 @@ static uint32_t dsa_thresh; // copies > thresh will use DSA // per process (such as OneCCL workers or Intel MPI Multi-EP threading). // But expected counts for such are modest (2-4 for Intel MPI, 8-16 for OneCCL) #define DSA_MAX_QUEUES 32 + +// Default: 2 MB. +#define DSA_MAX_XFER_SIZE_DEFAULT (1 << 21) + // information parsed from PSM3_DSA_WQS static char *dsa_wq_filename[DSA_MAX_PROC][DSA_MAX_QUEUES]; static uint8_t dsa_wq_mode[DSA_MAX_PROC][DSA_MAX_QUEUES]; +static uint32_t dsa_wq_max_xfer_size[DSA_MAX_PROC][DSA_MAX_QUEUES]; static uint32_t dsa_num_wqs[DSA_MAX_PROC]; static uint32_t dsa_num_proc; @@ -108,6 +113,7 @@ struct dsa_wq { const char *wq_filename; // points into dsa_wq_filename void *wq_reg; // mmap memory uint32_t use_count; // how many threads assigned to this WQ + uint32_t max_xfer_size; // Maximum supported transfer size uint8_t dedicated; // is this a dedicated (1) or shared (0) WQ }; static struct dsa_wq dsa_wqs[DSA_MAX_QUEUES]; @@ -119,6 +125,7 @@ static psmi_spinlock_t dsa_wq_lock; // protects dsa_wq.use_count // Each thread is assigned a DSA WQ on 1st memcpy static __thread void *dsa_wq_reg = NULL; static __thread uint8_t dsa_wq_dedicated; +static __thread uint32_t dsa_wq_xfer_limit; // we keep completion record in thread local storage instead of stack // this way if a DSA completion times out and arrives late it still has a @@ -163,6 +170,13 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, uint32_t cpu_n; uint64_t start_cycles, end_cycles; uint64_t loops; + uint32_t dsa_chk_size; + uint32_t cpu_chk_size; + int t_chunks; + uint32_t dsa_copied_len = 0; + uint32_t cpu_copied_len = 0; + int copied_chunks = 0; + uint32_t dsa_cp_len; #if defined(PSM_CUDA) || defined(PSM_ONEAPI) if (n && PSMI_IS_GPU_ENABLED && (PSMI_IS_GPU_MEM(dest) || PSMI_IS_GPU_MEM((void *) src))) { @@ -177,22 +191,31 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, return; } + /* + * Calculate the total chunks. + */ + t_chunks = (n + dsa_wq_xfer_limit - 1) / dsa_wq_xfer_limit; + // TBD - add some statistics for DSA vs CPU copy use // to maximize performance we do part of the copy with CPU while we // wait for DSA to copy the rest if (dsa_ratio) { cpu_n = n/dsa_ratio; + cpu_chk_size = cpu_n / t_chunks; // TBD - should we compute so DSA gets a full multiple of pages and CPU // does the rest? Should we start DSA on a page boundary? 
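// e.g. dsa_ratio=8, n=1MB: the CPU copies 128KB while DSA moves the other
// 896KB, both sides split into t_chunks pieces so no single DSA descriptor
// exceeds the WQ max_transfer_size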
// round down to page boundary //cpu_n = ROUNDDOWNP2(cpu_n, PSMI_PAGESIZE); // round to a multiple of 8 bytes at least - cpu_n = ROUNDDOWNP2(cpu_n, 8); + cpu_chk_size = ROUNDDOWNP2(cpu_chk_size, 8); + cpu_n = cpu_chk_size * t_chunks; } else { cpu_n = 0; + cpu_chk_size = 0; } dsa_n = n - cpu_n; + dsa_chk_size = (dsa_n + t_chunks - 1)/t_chunks; dsa_src = (void*)((uintptr_t)src + cpu_n); dsa_dest = (void*)((uintptr_t)dest + cpu_n); psmi_assert(dsa_n); @@ -200,6 +223,8 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, // comp ptr must be 32 byte aligned comp = (struct dsa_completion_record *)(((uintptr_t)&dsa_comp[0] + 0x1f) & ~0x1f); + +restart: comp->status = 0; desc.opcode = DSA_OPCODE_MEMMOVE; /* set CRAV (comp address valid) and RCR (request comp) so get completion */ @@ -218,9 +243,13 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, // for overall server. Best to take the pain here as page faults should // be rare during steady state of most apps // desc.flags |= IDXD_OP_FLAG_BOF; - desc.xfer_size = dsa_n; - desc.src_addr = (uintptr_t)dsa_src; - desc.dst_addr = (uintptr_t)dsa_dest; + if (copied_chunks < (t_chunks - 1)) + dsa_cp_len = dsa_chk_size; + else + dsa_cp_len = dsa_n - dsa_copied_len; + desc.xfer_size = dsa_cp_len; + desc.src_addr = (uintptr_t)dsa_src + dsa_copied_len; + desc.dst_addr = (uintptr_t)dsa_dest + dsa_copied_len; desc.completion_addr = (uintptr_t)comp; // make sure completion status zeroing fully written before post to HW @@ -239,9 +268,8 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, if (get_cycles() > end_cycles) { _HFI_INFO("Disabling DSA: DSA SWQ Enqueue Timeout\n"); dsa_available = 0; - memcpy(dest, src, n); stats->dsa_error++; - return; + goto memcpy_exit; } } stats->dsa_swq_wait_ns += cycles_to_nanosecs(get_cycles() - start_cycles); @@ -252,11 +280,13 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, if (cpu_n) { // while DSA does it's thing, we copy rest via CPU - memcpy(dest, src, cpu_n); + memcpy((void *)((uintptr_t)dest + cpu_copied_len), + (void *)((uintptr_t)src + cpu_copied_len), cpu_chk_size); + cpu_copied_len += cpu_chk_size; } stats->dsa_copy++; - stats->dsa_copy_bytes += dsa_n; + stats->dsa_copy_bytes += dsa_cp_len; // wait for DSA to finish start_cycles = get_cycles(); @@ -269,8 +299,8 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, if (get_cycles() > end_cycles && comp->status == 0) { _HFI_INFO("Disabling DSA: DSA Hardware Timeout\n"); dsa_available = 0; - memcpy(dsa_dest, dsa_src, dsa_n); stats->dsa_error++; + goto memcpy_exit; return; } loops++; @@ -294,9 +324,22 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, stats->dsa_page_fault_rd++; _HFI_VDBG("DSA desc failed: page fault status %u\n", comp->status); } - memcpy(dsa_dest, dsa_src, dsa_n); - return; + goto memcpy_exit; } + /* Check loop status */ + dsa_copied_len += dsa_cp_len; + if (++copied_chunks < t_chunks) + goto restart; + + return; + +memcpy_exit: + memcpy((void *)((uintptr_t)dsa_dest + dsa_copied_len), + (void *)((uintptr_t)dsa_src + dsa_copied_len), + dsa_n - dsa_copied_len); + memcpy((void *)((uintptr_t)dest + cpu_copied_len), + (void *)((uintptr_t)src + cpu_copied_len), + cpu_n - cpu_copied_len); return; } @@ -378,6 +421,58 @@ static int psm3_dsa_mode(const char *wq_filename) return -1; } +// determine the max transfer size for a DSA WQ by reading the max_transfer_size +// file under DSA_DEVICES/wqX.Y/ +// where wqX.Y is last part of 
supplied wq_filename +// return the max_transfer_size. +// on error returns 0 and an _HFI_ERROR message has been output +static int psm3_dsa_max_xfer_size(const char *wq_filename) +{ + char wq_size_filename[PATH_MAX]; + const char *p; + char buf[20]; + int fd; + int res; + + p = strrchr(wq_filename, '/'); + if (p) + p++; // skip '/' + else + p = wq_filename; + res = snprintf(wq_size_filename, sizeof(wq_size_filename), + "%s/%s/max_transfer_size", DSA_DEVICES, p); + if (res < 0 || res > sizeof(wq_size_filename) - 1) { + _HFI_ERROR("Unable to determine DSA WQ max xfer size for %s\n", + wq_filename); + return 0; + } + fd = open(wq_size_filename, O_RDONLY); + if (fd < 0) { + _HFI_ERROR("Failed to open DSA WQ max xfer size: %s: %s\n", + wq_size_filename, strerror(errno)); + return 0; + } + res = read(fd, buf, sizeof(buf)-1); + if (res < 0) { + _HFI_ERROR("Failed to read DSA WQ max xfer size: %s: %s\n", + wq_size_filename, strerror(errno)); + close(fd); + return 0; + } + close(fd); + if (! res) { + _HFI_ERROR("Failed to read DSA WQ max xfer size: %s: empty file\n", + wq_size_filename); + return 0; + } + if (buf[res-1] == '\n') + buf[res-1] = '\0'; + else + buf[res] = '\0'; + _HFI_DBG("DSA WQ %s max xfer size %s\n", wq_filename, buf); + return (uint32_t)strtoul(buf, NULL, 0); +} + /* initialize DSA - call once per process */ /* Some invalid inputs and DSA initialization errors are treated as fatal errors * since if DSA gets initialized on some nodes, but not on others, the @@ -410,11 +505,11 @@ int psm3_dsa_init(void) if (! psm3_getenv("PSM3_DSA_WQS", "List of DSA WQ devices to use, one list per local process or per\n" "CPU socket:\n" - " wq0,wq2:wq4,wq6:,...\n" + " wq0,wq2;wq4,wq6;,...\n" "Each wq should be a shared workqueue DSA device or a unique\n" "dedicated workqueue DSA device,\n" " such as /dev/dsa/wq0.0\n" - "Colon separates the lists for different processes\n" + "Semicolon separates the lists for different processes\n" " default is '' in which case DSA is not used\n", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val)"", &env_dsa_wq)) { @@ -430,10 +525,13 @@ int psm3_dsa_init(void) } s = temp; psmi_assert(*s); + // both : and ; are treated the same below, : is deprecated do { int mode; + uint32_t xfer_size; + new_proc = 0; - if (! *s) // trailing ',' or ':' on 2nd or later loop + if (! *s) // trailing ',' or ':' or ';' on 2nd or later loop break; if (proc >= DSA_MAX_PROC) { _HFI_ERROR("PSM3_DSA_WQS exceeds %u per node process limit: '%s'", @@ -441,9 +539,9 @@ int psm3_dsa_init(void) psmi_free(temp); goto fail; } - delim = strpbrk(s, ",:"); + delim = strpbrk(s, ",:;"); if (delim) { - new_proc = (*delim == ':'); + new_proc = (*delim == ':' || *delim == ';'); *delim = '\0'; } if (dsa_num_wqs[proc] > DSA_MAX_QUEUES) { @@ -460,6 +558,9 @@ int psm3_dsa_init(void) } if (mode) all_are_shared = 0; + xfer_size = psm3_dsa_max_xfer_size(s); + dsa_wq_max_xfer_size[proc][dsa_num_wqs[proc]] = xfer_size > 0 ? 
+ xfer_size : DSA_MAX_XFER_SIZE_DEFAULT; dsa_wq_mode[proc][dsa_num_wqs[proc]] = mode; dsa_wq_filename[proc][dsa_num_wqs[proc]] = psmi_strdup(PSMI_EP_NONE, s); dsa_num_wqs[proc]++; @@ -468,7 +569,7 @@ int psm3_dsa_init(void) s = delim+1; } while (delim); psmi_free(temp); - // new_proc means trailing :, ignore it + // new_proc means trailing : or ;, ignore it // otherwise, last we processed counts if (!new_proc && proc < DSA_MAX_PROC && dsa_num_wqs[proc]) proc++; @@ -580,6 +681,7 @@ int psm3_dsa_init(void) // key off having rw access to the DSA WQ to decide if DSA is available dsa_wqs[i].wq_filename = dsa_wq_filename[proc][i]; dsa_wqs[i].dedicated = dsa_wq_mode[proc][i]; + dsa_wqs[i].max_xfer_size = dsa_wq_max_xfer_size[proc][i]; if (! realpath(dsa_wqs[i].wq_filename, dsa_filename)) { _HFI_ERROR("Failed to resolve DSA WQ path %s\n", dsa_wqs[i].wq_filename); goto fail; @@ -658,6 +760,7 @@ static inline void psm3_dsa_pick_wq(void) found: dsa_wq_reg = dsa_wqs[sel].wq_reg; dsa_wq_dedicated = dsa_wqs[sel].dedicated; + dsa_wq_xfer_limit = dsa_wqs[sel].max_xfer_size; } diff --git a/prov/psm3/psm3/utils/utils_env.c b/prov/psm3/psm3/utils/utils_env.c index f8c2dbd8b96..55efb77bc2b 100644 --- a/prov/psm3/psm3/utils/utils_env.c +++ b/prov/psm3/psm3/utils/utils_env.c @@ -90,7 +90,8 @@ int psm3_env_initialize(void) // get verbosity level setting for env logging // if invalid syntax, will output warning when parse during psm3_getenv const char *verb_env = getenv("PSM3_VERBOSE_ENV"); - (void)psm3_parse_val_pattern(verb_env, 0, &verb_env_val); + (void)psm3_parse_val_pattern_uint(verb_env, 0, &verb_env_val, + PSMI_ENVVAR_FLAG_NOABBREV, 0, 3); if (verb_env_val) env_log_level = 0; // log at INFO level @@ -119,7 +120,7 @@ int psm3_env_initialize(void) c = fgetc(f); if (c != EOF) { // line too long, fgetc until read newline - _HFI_INFO("%s: Ignoring line too long: '%s' ...\n", + _HFI_ENV_ERROR("%s: Ignoring line too long: '%s' ...\n", PSM3_ENV_FILENAME, buf); while (c != (int)(unsigned char)'\n' && (c = fgetc(f)) != EOF) ; @@ -150,7 +151,7 @@ int psm3_env_initialize(void) j = strspn(&buf[i], "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"); if (buf[i+j] != '=') { // malformed assignment,skip - _HFI_INFO("%s: Ignoring malformed assignment: '%s'\n", + _HFI_ENV_ERROR("%s: Ignoring malformed assignment: '%s'\n", PSM3_ENV_FILENAME, buf); continue; } @@ -180,7 +181,8 @@ int psm3_env_initialize(void) // allow /etc/psm3.conf to set PSM3_VERBOSE_ENV when defaulted // if invalid syntax, will output warning when parse during psm3_getenv if (! 
verb_env && 0 == strcmp("PSM3_VERBOSE_ENV", var.name)) { - (void)psm3_parse_val_pattern(var.value, 0, &verb_env_val); + (void)psm3_parse_val_pattern_uint(var.value, 0, &verb_env_val, + PSMI_ENVVAR_FLAG_NOABBREV, 0, 3); if (verb_env_val) env_log_level = 0; // log at INFO level } @@ -189,7 +191,7 @@ int psm3_env_initialize(void) // this must be parsed in a constructor prior to this function, // so we ignore it here if (0 == strcmp(var.name, "PSM3_DISABLE_MMAP_MALLOC")) { - _HFI_INFO("WARNING: %s Ignoring %s\n", PSM3_ENV_FILENAME,var.name); + _HFI_ENV_ERROR("WARNING: %s Ignoring %s\n", PSM3_ENV_FILENAME,var.name); psmi_free(var.name); psmi_free(var.value); continue; @@ -252,7 +254,9 @@ void psm3_env_print_val(FILE *f, const char *name, int type, switch (type) { case PSMI_ENVVAR_TYPE_STR: case PSMI_ENVVAR_TYPE_STR_TUPLES: - case PSMI_ENVVAR_TYPE_STR_VAL_PAT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_INT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT_FLAGS: fprintf(f, "%s=%s\n", name, val.e_str); break; case PSMI_ENVVAR_TYPE_INT: @@ -286,7 +290,9 @@ int psm3_env_snprint_val(char *buf, size_t size, const char *name, int type, switch (type) { case PSMI_ENVVAR_TYPE_STR: case PSMI_ENVVAR_TYPE_STR_TUPLES: - case PSMI_ENVVAR_TYPE_STR_VAL_PAT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_INT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT_FLAGS: return snprintf(buf, size, "%s=%s\n", name, val.e_str); break; case PSMI_ENVVAR_TYPE_INT: @@ -332,20 +338,18 @@ char *psm3_env_get(const char *name) return NULL; } -/* _CONSUMED_ALL() is a macro which indicates if strtol() consumed all - of the input passed to it. */ -#define _CONSUMED_ALL(CHAR_PTR) (((CHAR_PTR) != NULL) && (*(CHAR_PTR) == 0)) - // don't document that 3 and 3: and 3:pattern can output hidden params const char *PSM3_VERBOSE_ENV_HELP = - "Enable verbose output of environment variables. 
" - "(0 - none, 1 - changed w/o help, 2 - user help, " - "#: - limit output to rank 0, #:pattern - limit output " - "to processes whose label matches " + "Enable verbose output of environment variables.\n" + " 0 - none\n" + " 1 - only output changed w/o help\n" + " 2 - output all with help,\n" + " #: - limit output to rank 0\n" + " #:pattern - limit output to processes whose label matches\n " #ifdef FNM_EXTMATCH - "extended " + "extended " #endif - "glob pattern"; + "glob pattern"; /* If PSM3_VERBOSE_ENV is set in the environment, we determine * what its verbose level is and print the environment at "INFO" @@ -362,25 +366,24 @@ static int psm3_getenv_is_verblevel(int printlevel) unsigned verb_env_val; if (env) psm3_stats_print_env("PSM3_VERBOSE_ENV", env); - int ret = psm3_parse_val_pattern(env, 0, &verb_env_val); + int ret = psm3_parse_val_pattern_uint(env, 0, &verb_env_val, + PSMI_ENVVAR_FLAG_NOABBREV, 0, 3); psmi_getenv_verblevel = verb_env_val; - if (psmi_getenv_verblevel < 0 || psmi_getenv_verblevel > 3) - psmi_getenv_verblevel = 2; if (psmi_getenv_verblevel > 0) nlevel = 0; /* output at INFO level */ if (ret == -2) - _HFI_ENVDBG(0, "Invalid value for %s ('%s') %-40s Using: %u\n", - "PSM3_VERBOSE_ENV", env, PSM3_VERBOSE_ENV_HELP, verb_env_val); + _HFI_ENVDBG(0, "Invalid value for %s ('%s') Using: %u\nHelp: %s\n", + "PSM3_VERBOSE_ENV", env, verb_env_val, PSM3_VERBOSE_ENV_HELP); else if (psmi_getenv_verblevel == 1) _HFI_ENVDBG(0, " %-25s => '%s' (default was '%s')\n", "PSM3_VERBOSE_ENV", env?env:"", "0"); else if (env && *env) - _HFI_ENVDBG(nlevel, " %-25s %-40s => '%s' (default was '%s')\n", - "PSM3_VERBOSE_ENV", PSM3_VERBOSE_ENV_HELP, env, "0"); + _HFI_ENVDBG(nlevel, " %-25s => '%s' (default was '%s')\nHelp: %s\n", + "PSM3_VERBOSE_ENV", env, "0", PSM3_VERBOSE_ENV_HELP); else /* defaulted */ _HFI_ENVDBG(nlevel, - " %-25s %-40s => '%s'\n", - "PSM3_VERBOSE_ENV", PSM3_VERBOSE_ENV_HELP, "0"); + " %-25s => '%s'\nHelp: %s\n", + "PSM3_VERBOSE_ENV", "0", PSM3_VERBOSE_ENV_HELP); } // printlevel is visibility of env (USER=1 or HIDDEN=2) // so at verbosity 1 and 2 output USER @@ -419,314 +422,647 @@ static int psm3_count_tuples(const char *str) return ret; } -int -MOCKABLE(psm3_getenv)(const char *name, const char *descr, int level, - int type, union psmi_envvar_val defval, +/* _CONSUMED_ALL indicates if strtol() (and friends) consumed all of the input + * passed to it. CHAR_PTR is the output char pointer from strtol + */ +#define _CONSUMED_ALL(CHAR_PTR) (((CHAR_PTR) != NULL) && (*(CHAR_PTR) == 0)) + +/* convert a string to a signed number with basic bounds checking + * returns 0 - valid value and *val updated + * -1 -> empty string, *val unchanged + * -2 -> parse or range error, *val unchanged + */ +static int convert_str_signed(const char *str, long long *val, + long long min, long long max) +{ + char *ep; + long long temp; + + psmi_assert(val != NULL); + if (! str || ! *str) + return -1; + /* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */ + errno = 0; + temp = strtoll(str, &ep, 10); + if (! _CONSUMED_ALL(ep)) { + errno = 0; + temp = strtoll(str, &ep, 16); + if (! 
_CONSUMED_ALL(ep)) + return -2; + } + if (errno || temp < min || temp > max) + return -2; + + *val = temp; + return 0; +} + +/* convert a string to an unsigned number with basic bounds checking + * returns 0 - valid value and *val updated + * -1 -> empty string, *val unchanged + * -2 -> parse or range error, *val unchanged + */ +static int convert_str_unsigned(const char *str, unsigned long long *val, + unsigned long long min, unsigned long long max) +{ + char *ep; + unsigned long long temp; + + psmi_assert(val != NULL); + if (! str || ! *str) + return -1; + /* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */ + errno = 0; + temp = strtoull(str, &ep, 10); + if (! _CONSUMED_ALL(ep)) { + errno = 0; + temp = strtoull(str, &ep, 16); + if (! _CONSUMED_ALL(ep)) + return -2; + } + if (errno || temp < min || temp > max) + return -2; + + *val = temp; + return 0; +} +#undef _CONSUMED_ALL + +// returns: +// 0 - valid value input +// 1 - variable not set, used default +// -1 - invalid value for variable or invalid syntax, used default +int MOCKABLE(psm3_getenv_range)(const char *name, const char *descr, + const char *help, unsigned level_flags, + int type, union psmi_envvar_val defval, union psmi_envvar_val min, + union psmi_envvar_val max, psm3_getenv_check_t check, void *ptr, union psmi_envvar_val *newval) { - int used_default = 0; + int ret = 0; union psmi_envvar_val tval; char *env = psm3_env_get(name); + unsigned level = level_flags & PSMI_ENVVAR_LEVEL_MASK; + char rangestr[80] = ""; // for help + char errstr[512] = ""; // additional info for invalid values + char statserrstr[700] = ""; // add'l info for stats file when invalid input + +#define FORMAT_RANGESTR(FIELD, fmt) \ + do { \ + if ((level_flags & PSMI_ENVVAR_FLAG_NOMIN)) { \ + if ((level_flags & PSMI_ENVVAR_FLAG_NOMAX)) \ + rangestr[0] = '\0'; \ + else \ + snprintf(rangestr, sizeof(rangestr)," Max allowed " fmt "%s",\ + max.FIELD, \ + (level_flags & PSMI_ENVVAR_FLAG_NOABBREV)?"":" (or 'max')");\ + } else if ((level_flags & PSMI_ENVVAR_FLAG_NOMAX)) { \ + snprintf(rangestr, sizeof(rangestr)," Min allowed " fmt "%s", \ + min.FIELD, \ + (level_flags & PSMI_ENVVAR_FLAG_NOABBREV)?"":" (or 'min')");\ + } else { \ + snprintf(rangestr, sizeof(rangestr)," Valid range " fmt "%s" \ + " to " fmt "%s", \ + min.FIELD, \ + (level_flags & PSMI_ENVVAR_FLAG_NOABBREV)?"":" (or 'min')",\ + max.FIELD, \ + (level_flags & PSMI_ENVVAR_FLAG_NOABBREV)?"":" (or 'max')");\ + } \ + } while (0) + +#define _GETENV_CHECK(tval) \ + do { \ + if (check) { \ + if ((*check)(type, tval, ptr, sizeof(errstr), errstr)) { \ + tval = defval; \ + ret = -1; \ + /* errstr now has additional error information */ \ + } \ + } \ + } while (0); /* for verblevel 1 we only output non-default values with no help * for verblevel>1 we promote to info (verblevel=2 promotes USER, * verblevel=3 promotes HIDDEN) and show help. * for verblevel< 1 we don't promote anything and show help */ -#define _GETENV_PRINT(env, used_default, fmt, val, defval) \ +#define _GETENV_PRINT(env, ret, fmt, val, defval) \ do { \ (void)psm3_getenv_is_verblevel(level); \ - if (env && *env && used_default) \ - _HFI_INFO("Invalid value for %s ('%s') %-40s Using: " fmt "\n", \ - name, env, descr, val); \ - else if (used_default && psmi_getenv_verblevel != 1) \ - GETENV_PRINTF(level, "%s%-25s %-40s => " fmt \ - "\n", level > 1 ? "*" : " ", name, \ - descr, val); \ - else if (! 
used_default && psmi_getenv_verblevel == 1) \ + if (ret < 0 && (level_flags & PSMI_ENVVAR_FLAG_FATAL)) { \ + _HFI_ENV_ERROR("Invalid value for %s ('%s')%s\nHelp: %s%s\n%s%s", \ + name, env, errstr, descr, rangestr,\ + help?help:"", help?"\n":""); \ + snprintf(statserrstr, sizeof(statserrstr), \ + "Invalid value ('%s')%s", env, errstr); \ + } else if (ret < 0) { \ + _HFI_ENV_ERROR("Invalid value for %s ('%s')%s Using: " fmt "\nHelp: %s%s\n%s%s", \ + name, env, errstr, val, descr, rangestr,\ + help?help:"", help?"\n":""); \ + snprintf(statserrstr, sizeof(statserrstr), \ + "Invalid value ('%s')%s Using: " fmt, \ + env, errstr, val); \ + } else if (ret > 0 && psmi_getenv_verblevel != 1) \ + GETENV_PRINTF(level, "%s%-25s => " fmt \ + "\nHelp: %s%s\n%s%s", level > 1 ? "*" : " ", name, \ + val, descr, rangestr, \ + help?help:"", help?"\n":"");\ + else if (ret == 0 && psmi_getenv_verblevel == 1) \ GETENV_PRINTF(1, "%s%-25s => " \ fmt " (default was " fmt ")\n", \ level > 1 ? "*" : " ", name, \ val, defval); \ - else if (! used_default && psmi_getenv_verblevel != 1) \ - GETENV_PRINTF(1, "%s%-25s %-40s => " \ - fmt " (default was " fmt ")\n", \ - level > 1 ? "*" : " ", name, descr, \ - val, defval); \ + else if (ret == 0 && psmi_getenv_verblevel != 1) \ + GETENV_PRINTF(1, "%s%-25s => " \ + fmt " (default was " fmt ")\nHelp: %s%s\n%s%s", \ + level > 1 ? "*" : " ", name, \ + val, defval, descr, rangestr, \ + help?help:"", help?"\n":""); \ } while (0) -#define _CONVERT_TO_NUM(DEST,TYPE,STRTOL) \ - do { \ - char *ep; \ - /* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */ \ - DEST = (TYPE)STRTOL(env, &ep, 10); \ - if (! _CONSUMED_ALL(ep)) { \ - DEST = (TYPE)STRTOL(env, &ep, 16); \ - if (! _CONSUMED_ALL(ep)) { \ - used_default = 1; \ - tval = defval; \ - } \ - } \ +#define _CONVERT_TO_NUM(FIELD,TYPE,SIGNED,MIN,MAX) \ + do { \ + if (!(level_flags & (PSMI_ENVVAR_FLAG_NOMIN|PSMI_ENVVAR_FLAG_NOABBREV))\ + && (!strcasecmp(env, "min") || !strcasecmp(env, "minimum")))\ + tval.FIELD = min.FIELD; \ + else if (!(level_flags & (PSMI_ENVVAR_FLAG_NOMAX|PSMI_ENVVAR_FLAG_NOABBREV))\ + && (!strcasecmp(env, "max") || !strcasecmp(env, "maximum")))\ + tval.FIELD = max.FIELD; \ + else { \ + SIGNED long long temp; \ + if (convert_str_##SIGNED(env, &temp, MIN, MAX)) { \ + ret = -1; /* caller checked for empty, so must be invalid */ \ + tval = defval; \ + } else if ((temp < min.FIELD \ + && !(level_flags & PSMI_ENVVAR_FLAG_NOMIN)) \ + || (temp > max.FIELD \ + && !(level_flags & PSMI_ENVVAR_FLAG_NOMAX))) { \ + ret = -1; \ + tval = defval; \ + } else { \ + tval.FIELD = (TYPE)temp; \ + } \ + } \ } while (0) switch (type) { case PSMI_ENVVAR_TYPE_YESNO: - tval.e_int = psm3_parse_str_yesno(env); - if (tval.e_int < 0) { + if (!env || *env == '\0') { tval = defval; - used_default = 1; + ret = 1; + } else { + switch (psm3_parse_str_yesno(env, &tval.e_int)) { + case -1: // empty, use default + psmi_assert(0); // shouldn't happen, checked for empty above + tval = defval; + ret = 1; + break; + case -2: // bad syntax, use default + tval = defval; + ret = -1; + break; + default: // valid input + _GETENV_CHECK(tval); + break; + } } - _GETENV_PRINT(env, used_default, "%s", tval.e_int ? "YES" : "NO", + _GETENV_PRINT(env, ret, "%s", tval.e_int ? "YES" : "NO", defval.e_int ?
"YES" : "NO"); break; case PSMI_ENVVAR_TYPE_STR: if (!env || *env == '\0') { tval = defval; - used_default = 1; - } else + ret = 1; + } else { tval.e_str = env; - _GETENV_PRINT(env, used_default, "'%s'", tval.e_str, defval.e_str); + _GETENV_CHECK(tval); + } + _GETENV_PRINT(env, ret, "'%s'", tval.e_str, defval.e_str); break; case PSMI_ENVVAR_TYPE_INT: if (!env || *env == '\0') { tval = defval; - used_default = 1; + ret = 1; } else { - _CONVERT_TO_NUM(tval.e_int,int,strtol); + _CONVERT_TO_NUM(e_int,int,signed,INT_MIN,INT_MAX); + if (ret == 0) + _GETENV_CHECK(tval); } - _GETENV_PRINT(env, used_default, "%d", tval.e_int, defval.e_int); + FORMAT_RANGESTR(e_int, "%d"); + _GETENV_PRINT(env, ret, "%d", tval.e_int, defval.e_int); break; case PSMI_ENVVAR_TYPE_UINT: case PSMI_ENVVAR_TYPE_UINT_FLAGS: if (!env || *env == '\0') { tval = defval; - used_default = 1; + ret = 1; } else { - _CONVERT_TO_NUM(tval.e_int,unsigned int,strtoul); + _CONVERT_TO_NUM(e_uint,unsigned int,unsigned,0,UINT_MAX); + if (ret == 0) + _GETENV_CHECK(tval); } - if (type == PSMI_ENVVAR_TYPE_UINT_FLAGS) - _GETENV_PRINT(env, used_default, "0x%x", tval.e_uint, + if (type == PSMI_ENVVAR_TYPE_UINT_FLAGS) { + FORMAT_RANGESTR(e_uint, "0x%x"); + _GETENV_PRINT(env, ret, "0x%x", tval.e_uint, defval.e_uint); - else - _GETENV_PRINT(env, used_default, "%u", tval.e_uint, + } else { + FORMAT_RANGESTR(e_uint, "%u"); + _GETENV_PRINT(env, ret, "%u", tval.e_uint, defval.e_uint); + } break; case PSMI_ENVVAR_TYPE_LONG: if (!env || *env == '\0') { tval = defval; - used_default = 1; + ret = 1; } else { - _CONVERT_TO_NUM(tval.e_long,long,strtol); + _CONVERT_TO_NUM(e_long,long,signed,LONG_MIN,LONG_MAX); + if (ret == 0) + _GETENV_CHECK(tval); } - _GETENV_PRINT(env, used_default, "%ld", tval.e_long, defval.e_long); + FORMAT_RANGESTR(e_long, "%ld"); + _GETENV_PRINT(env, ret, "%ld", tval.e_long, defval.e_long); break; case PSMI_ENVVAR_TYPE_ULONG_ULONG: if (!env || *env == '\0') { tval = defval; - used_default = 1; + ret = 1; } else { - _CONVERT_TO_NUM(tval.e_ulonglong,unsigned long long,strtoull); + _CONVERT_TO_NUM(e_ulonglong,unsigned long long,unsigned,0,ULLONG_MAX); + if (ret == 0) + _GETENV_CHECK(tval); } - _GETENV_PRINT(env, used_default, "%llu", + FORMAT_RANGESTR(e_ulonglong, "%llu"); + _GETENV_PRINT(env, ret, "%llu", tval.e_ulonglong, defval.e_ulonglong); break; - case PSMI_ENVVAR_TYPE_STR_VAL_PAT: - { + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_INT: + if (!env || *env == '\0') { + tval = defval; + ret = 1; + } else if (check) { + // check will parse_val_pattern_int and check value returned + // caller must parse again + tval.e_str = env; + _GETENV_CHECK(tval); + } else { + int trash; + // we parse just for syntax check, caller must parse again + switch (psm3_parse_val_pattern_int(env, 0, &trash, level_flags, + (level_flags & PSMI_ENVVAR_FLAG_NOMIN)?INT_MIN:min.e_int, + (level_flags & PSMI_ENVVAR_FLAG_NOMAX)?INT_MAX:max.e_int)) { + case -1: // empty, use default + psmi_assert(0); // shouldn't happen, checked for empty above + tval = defval; + ret = 1; + break; + case -2: // one or more fields with bad syntax, use default + tval = defval; + ret = -1; + break; + default: // valid string + tval.e_str = env; + break; + } + } + FORMAT_RANGESTR(e_int, "%d"); + _GETENV_PRINT(env, ret, "'%s'", tval.e_str, defval.e_str); + break; + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT_FLAGS: + if (!env || *env == '\0') { + tval = defval; + ret = 1; + } else if (check) { + // check will parse_val_pattern_uint and check value returned + // 
caller must parse again + tval.e_str = env; + _GETENV_CHECK(tval); + } else { unsigned trash; // we parse just for syntax check, caller must parse again - if (psm3_parse_val_pattern(env, 0, &trash) < 0) { + switch (psm3_parse_val_pattern_uint(env, 0, &trash, level_flags, + (level_flags & PSMI_ENVVAR_FLAG_NOMIN)?0:min.e_uint, + (level_flags & PSMI_ENVVAR_FLAG_NOMAX)?UINT_MAX:max.e_uint)) { + case -1: // empty, use default + psmi_assert(0); // shouldn't happen, checked for empty above tval = defval; - used_default = 1; - } else + ret = 1; + break; + case -2: // one or more fields with bad syntax, use default + tval = defval; + ret = -1; + break; + default: // valid string tval.e_str = env; - _GETENV_PRINT(env, used_default, "'%s'", tval.e_str, defval.e_str); + break; + } } + if (type == PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT_FLAGS) + FORMAT_RANGESTR(e_uint, "0x%x"); + else + FORMAT_RANGESTR(e_uint, "%u"); + _GETENV_PRINT(env, ret, "'%s'", tval.e_str, defval.e_str); break; case PSMI_ENVVAR_TYPE_STR_TUPLES: - { + if (!env || *env == '\0') { + tval = defval; + ret = 1; + } else if (check) { + // check will parse_str_tuples and check their values + // caller must parse again + tval.e_str = env; + _GETENV_CHECK(tval); + } else { // we parse just for syntax check, caller must parse again int vals[3]; int ntup = psm3_count_tuples(defval.e_str); psmi_assert_always(ntup > 0 && ntup <= 3); - // parse default into vals[] so can show what caller get - (void)psm3_parse_str_tuples(defval.e_str, ntup, vals); switch (psm3_parse_str_tuples(env, ntup, vals)) { case -1: // empty, use default + psmi_assert(0); // shouldn't happen, checked for empty above tval = defval; - used_default = 1; - _GETENV_PRINT(env, 1, "'%s'", tval.e_str, defval.e_str); + ret = 1; break; - case -2: // one or more fields with bad syntax, show what we have - tval.e_str = env; - // only 3 choices, so just bruteforce it - switch (ntup) { - case 1: - _HFI_INFO("Invalid value for %s ('%s') %-40s Using: %d\n", - name, env, descr, vals[0]); - break; - case 2: - _HFI_INFO("Invalid value for %s ('%s') %-40s Using: %d:%d\n", - name, env, descr, vals[0], vals[1]); - break; - case 3: - _HFI_INFO("Invalid value for %s ('%s') %-40s Using: %d:%d:%d\n", - name, env, descr, vals[0], vals[1], vals[2]); - break; - } + case -2: // one or more fields with bad syntax, use default + tval = defval; + ret = -1; break; default: // valid string tval.e_str = env; - _GETENV_PRINT(env, 0, "'%s'", tval.e_str, defval.e_str); break; } } + _GETENV_PRINT(env, ret, "'%s'", tval.e_str, defval.e_str); break; case PSMI_ENVVAR_TYPE_ULONG: case PSMI_ENVVAR_TYPE_ULONG_FLAGS: default: if (!env || *env == '\0') { tval = defval; - used_default = 1; + ret = 1; } else { - _CONVERT_TO_NUM(tval.e_ulong,unsigned long,strtoul); + _CONVERT_TO_NUM(e_ulong,unsigned long,unsigned,0,ULONG_MAX); + if (ret == 0) + _GETENV_CHECK(tval); } - if (type == PSMI_ENVVAR_TYPE_ULONG_FLAGS) - _GETENV_PRINT(env, used_default, "0x%lx", tval.e_ulong, + if (type == PSMI_ENVVAR_TYPE_ULONG_FLAGS) { + FORMAT_RANGESTR(e_ulong, "0x%lx"); + _GETENV_PRINT(env, ret, "0x%lx", tval.e_ulong, defval.e_ulong); - else - _GETENV_PRINT(env, used_default, "%lu", tval.e_ulong, + } else { + FORMAT_RANGESTR(e_ulong, "%lu"); + _GETENV_PRINT(env, ret, "%lu", tval.e_ulong, defval.e_ulong); + } break; } +#undef FORMAT_RANGESTR +#undef _GETENV_CHECK #undef _GETENV_PRINT +#undef _CONVERT_TO_NUM *newval = tval; - if (! 
used_default) + switch (ret) { + case 0: // good input psm3_stats_print_env(name, env); + break; + case -1: // bad input, used default + // _GETENV_PRINT has set statserrstr + psm3_stats_print_env(name, statserrstr); + if (level_flags & PSMI_ENVVAR_FLAG_FATAL) { + // treat syntax or invalid input as fatal + psm3_handle_error(PSMI_EP_NORETURN, PSM2_PARAM_ERR, + "Invalid value for %s: '%s', can't proceed\n", + name, env); + } + break; + case 1: // no input, used default + // nothing special here + // as needed, psm3_stats_initialize will log the stats controls + break; + } + return ret; +} +MOCK_DEF_EPILOGUE(psm3_getenv_range); + +int +MOCKABLE(psm3_getenv)(const char *name, const char *descr, int level, + int type, union psmi_envvar_val defval, + union psmi_envvar_val *newval) +{ + switch (type) { + case PSMI_ENVVAR_TYPE_YESNO: + return psm3_getenv_range(name, descr, NULL, (unsigned)level|PSMI_ENVVAR_FLAG_NOMIN_NOMAX, type, defval, (union psmi_envvar_val)0, (union psmi_envvar_val)1, NULL, NULL, newval); + break; + + case PSMI_ENVVAR_TYPE_STR: + return psm3_getenv_range(name, descr, NULL, (unsigned)level|PSMI_ENVVAR_FLAG_NOMIN_NOMAX, type, defval, (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, NULL, NULL, newval); + break; - return used_default; + case PSMI_ENVVAR_TYPE_INT: + return psm3_getenv_range(name, descr, NULL, (unsigned)level|PSMI_ENVVAR_FLAG_NOMIN_NOMAX, type, defval, (union psmi_envvar_val)INT_MIN, (union psmi_envvar_val)INT_MAX, NULL, NULL, newval); + break; + + case PSMI_ENVVAR_TYPE_UINT: + case PSMI_ENVVAR_TYPE_UINT_FLAGS: + return psm3_getenv_range(name, descr, NULL, (unsigned)level|PSMI_ENVVAR_FLAG_NOMIN_NOMAX, type, defval, (union psmi_envvar_val)0, (union psmi_envvar_val)UINT_MAX, NULL, NULL, newval); + break; + + case PSMI_ENVVAR_TYPE_LONG: + return psm3_getenv_range(name, descr, NULL, (unsigned)level|PSMI_ENVVAR_FLAG_NOMIN_NOMAX, type, defval, (union psmi_envvar_val)LONG_MIN, (union psmi_envvar_val)LONG_MAX, NULL, NULL, newval); + break; + case PSMI_ENVVAR_TYPE_ULONG_ULONG: + return psm3_getenv_range(name, descr, NULL, (unsigned)level|PSMI_ENVVAR_FLAG_NOMIN_NOMAX, type, defval, (union psmi_envvar_val)0, (union psmi_envvar_val)ULLONG_MAX, NULL, NULL, newval); + break; + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_INT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT_FLAGS: + case PSMI_ENVVAR_TYPE_STR_TUPLES: + return psm3_getenv_range(name, descr, NULL, (unsigned)level|PSMI_ENVVAR_FLAG_NOMIN_NOMAX, type, defval, (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, NULL, NULL, newval); + break; + case PSMI_ENVVAR_TYPE_ULONG: + case PSMI_ENVVAR_TYPE_ULONG_FLAGS: + default: + return psm3_getenv_range(name, descr, NULL, (unsigned)level|PSMI_ENVVAR_FLAG_NOMIN_NOMAX, type, defval, (union psmi_envvar_val)0, (union psmi_envvar_val)ULONG_MAX, NULL, NULL, newval); + break; + } } MOCK_DEF_EPILOGUE(psm3_getenv); /* * Parsing int parameters * 0 -> ok, *val updated - * -1 -> empty string - * -2 -> parse error + * -1 -> empty string, *val not updated + * -2 -> parse error, *val not updated */ -int psm3_parse_str_int(const char *string, int *val) +int psm3_parse_str_int(const char *string, int *val, int min, int max) { - char *ep; - long ret; + int ret; + long long temp; - psmi_assert(val != NULL); - if (! string || ! *string) - return -1; - /* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */ - ret = strtol(string, &ep, 10); - if (! _CONSUMED_ALL(ep)) { - ret = strtol(string, &ep, 16); - if (!
_CONSUMED_ALL(ep)) - return -2; - } - *val = ret; + if ((ret = convert_str_signed(string, &temp, min, max))) + return ret; + *val = (int)temp; return 0; } /* * Parsing uint parameters * 0 -> ok, *val updated - * -1 -> empty string - * -2 -> parse error + * -1 -> empty string, *val not updated + * -2 -> parse error, *val not updated */ -int psm3_parse_str_uint(const char *string, unsigned int *val) +int psm3_parse_str_uint(const char *string, unsigned int *val, + unsigned int min, unsigned int max) { - char *ep; - unsigned long ret; + int ret; + unsigned long long temp; - psmi_assert(val != NULL); - if (! string || ! *string) - return -1; - /* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */ - ret = strtoul(string, &ep, 10); - if (! _CONSUMED_ALL(ep)) { - ret = strtoul(string, &ep, 16); - if (! _CONSUMED_ALL(ep)) - return -2; - } - *val = ret; + if ((ret = convert_str_unsigned(string, &temp, min, max))) + return ret; + *val = (unsigned int)temp; return 0; } /* * Parsing long parameters - * -1 -> empty string - * -2 -> parse error + * Returns: + * 0 -> ok, *val updated + * -1 -> empty string, *val not updated + * -2 -> parse error, *val not updated */ -long psm3_parse_str_long(const char *string) +int psm3_parse_str_long(const char *string, long *val, long min, long max) { - char *ep; - long ret; + int ret; + long long temp; - if (! string || ! *string) - return -1; - /* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */ - ret = strtol(string, &ep, 10); - if (! _CONSUMED_ALL(ep)) { - ret = strtol(string, &ep, 16); - if (! _CONSUMED_ALL(ep)) - return -2; - } - return ret; + if ((ret = convert_str_signed(string, &temp, min, max))) + return ret; + *val = (long)temp; + return 0; } /* * Parsing yesno parameters * allows: yes/no, true/false, on/off, 1/0 - * -1 -> empty string - * -2 -> parse error + * Returns: + * 0 -> ok, *val updated + * *val = 0 - no selected + * *val = 1 - yes selected + * -1 -> empty string, *val not updated + * -2 -> parse error, *val not updated */ -int psm3_parse_str_yesno(const char *string) +int psm3_parse_str_yesno(const char *string, int *val) { + psmi_assert(val != NULL); if (! string || ! 
*string) return -1; else if (string[0] == 'Y' || string[0] == 'y' || string[0] == 'T' || string[0] == 't' || ((string[0] == 'O' || string[0] == 'o') - && (string[1] == 'n' || string[1] == 'N'))) - return 1; - else if (string[0] == 'N' || string[0] == 'n' + && (string[1] == 'n' || string[1] == 'N'))) { + *val = 1; + } else if (string[0] == 'N' || string[0] == 'n' || string[0] == 'F' || string[0] == 'f' || ((string[0] == 'O' || string[0] == 'o') - && (string[1] == 'f' || string[1] == 'F'))) - return 0; - else { - char *ep; - unsigned long temp; - temp = strtoul(string, &ep, 0); - if (!_CONSUMED_ALL(ep)) { - return -2; - } else if (temp != 0) { - return 1; + && (string[1] == 'f' || string[1] == 'F'))) { + *val = 0; + } else { + unsigned long long temp; + if (convert_str_unsigned(string, &temp, 0, UINT_MAX)) + return -2; // already checked for empty, so must be invalid value + *val = (temp != 0); + } + return 0; +} + +/* parse int env of the form 'val' or 'val:' or 'val:pattern' + * for PSM3_PRINT_STATS + * Returns: + * 0 - parsed and matches current process, *val set to parsed val + * 0 - parsed and doesn't match current process, *val set to def + * -1 - nothing provided, *val set to def + * -2 - syntax error, *val set to def + * flags PSMI_ENVVAR_FLAG_NOMIN, PSMI_ENVVAR_FLAG_NOMAX and + * PSMI_ENVVAR_FLAG_NOABBREV control whether 'min', 'minimum', 'max' or 'maximum' + * are allowed as input, and indicate whether min and/or max were supplied. + */ +int psm3_parse_val_pattern_int(const char *env, int def, int *val, + unsigned flags, int min, int max) +{ + int ret = 0; + long long temp; + + psmi_assert(val != NULL); + if (!env || ! *env) { + *val = def; + ret = -1; + } else { + char *e = psmi_strdup(NULL, env); + char *p; + + if (flags & PSMI_ENVVAR_FLAG_NOMIN) + min = INT_MIN; + if (flags & PSMI_ENVVAR_FLAG_NOMAX) + max = INT_MAX; + + psmi_assert_always(e != NULL); + if (e == NULL) { // for klocwork + *val = def; + goto done; + } + p = strchr(e, ':'); + if (p) + *p = '\0'; + if (!(flags & (PSMI_ENVVAR_FLAG_NOMIN|PSMI_ENVVAR_FLAG_NOABBREV)) + && (!strcasecmp(e, "min") || !strcasecmp(e, "minimum"))) + *val = min; + else if (!(flags & (PSMI_ENVVAR_FLAG_NOMAX|PSMI_ENVVAR_FLAG_NOABBREV)) + && (!strcasecmp(e, "max") || !strcasecmp(e, "maximum"))) + *val = max; + else if (convert_str_signed(e, &temp, min, max)) { + *val = def; + ret = -2; } else { - return 0; + *val = (int)temp; } + if (ret == 0 && p) { + if (! *(p+1)) { // val: -> val:*:rank0 + if (psm3_get_myrank() != 0) + *val = def; +#ifdef FNM_EXTMATCH + } else if (0 != fnmatch(p+1, psm3_get_mylabel(), FNM_EXTMATCH )) { +#else + } else if (0 != fnmatch(p+1, psm3_get_mylabel(), 0 )) { +#endif + *val = def; + } + } + psmi_free(e); } +done: + return ret; } -/* parse env of the form 'val' or 'val:' or 'val:pattern' +/* parse unsigned env of the form 'val' or 'val:' or 'val:pattern' * for PSM3_VERBOSE_ENV, PSM3_TRACEMASK, PSM3_FI and PSM3_IDENTIFY + * Returns: * 0 - parsed and matches current process, *val set to parsed val * 0 - parsed and doesn't match current process, *val set to def * -1 - nothing provided, *val set to def * -2 - syntax error, *val set to def + * flags PSMI_ENVVAR_FLAG_NOMIN, PSMI_ENVVAR_FLAG_NOMAX and + * PSMI_ENVVAR_FLAG_NOABBREV control whether 'min', 'minimum', 'max' or 'maximum' + * are allowed as input, and indicate whether min and/or max were supplied.
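+ * Example (illustrative, not from this patch): as called above for
+ * PSM3_VERBOSE_ENV with def=0, flags=PSMI_ENVVAR_FLAG_NOABBREV, min=0, max=3:
+ *   '2'        -> *val=2 in every process
+ *   '2:'       -> *val=2 on rank 0 only, def elsewhere
+ *   '2:host1*' -> *val=2 only in processes whose label matches the glob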
*/ -int psm3_parse_val_pattern(const char *env, unsigned def, unsigned *val) +int psm3_parse_val_pattern_uint(const char *env, unsigned def, unsigned *val, + unsigned flags, unsigned min, unsigned max) { int ret = 0; + unsigned long long temp; psmi_assert(val != NULL); if (!env || ! *env) { @@ -734,9 +1070,13 @@ int psm3_parse_val_pattern(const char *env, unsigned def, unsigned *val) *val = def; ret = -1; } else { char *e = psmi_strdup(NULL, env); - char *ep; char *p; + if (flags & PSMI_ENVVAR_FLAG_NOMIN) + min = 0; + if (flags & PSMI_ENVVAR_FLAG_NOMAX) + max = UINT_MAX; + psmi_assert_always(e != NULL); if (e == NULL) { // for klocwork *val = def; @@ -745,11 +1085,19 @@ p = strchr(e, ':'); if (p) *p = '\0'; - *val = (int)strtoul(e, &ep, 0); - if (! _CONSUMED_ALL(ep)) { + if (!(flags & (PSMI_ENVVAR_FLAG_NOMIN|PSMI_ENVVAR_FLAG_NOABBREV)) + && (!strcasecmp(e, "min") || !strcasecmp(e, "minimum"))) + *val = min; + else if (!(flags & (PSMI_ENVVAR_FLAG_NOMAX|PSMI_ENVVAR_FLAG_NOABBREV)) + && (!strcasecmp(e, "max") || !strcasecmp(e, "maximum"))) + *val = max; + else if (convert_str_unsigned(e, &temp, min, max)) { *val = def; ret = -2; - } else if (p) { + } else { + *val = (unsigned)temp; + } + if (ret == 0 && p) { if (! *(p+1)) { // val: -> val:*:rank0 if (psm3_get_myrank() != 0) *val = def; @@ -777,11 +1125,11 @@ int psm3_parse_val_pattern(const char *env, unsigned def, unsigned *val) * It's valid for less than ntup values to be supplied, any unsupplied * fields are not updated in vals[] * Returns: - * 0 - parsed with no errors, vals[] updated - * -1 - empty or NULL string, vals[] unchanged - * -2 - syntax error in one of more of the parameters - * parameters with syntax errors are unchanged, others without - * syntax errors are updated in vals[] + * 0 - parsed with no errors, vals[] updated + * -1 - empty or NULL string, vals[] unchanged + * -2 - syntax error in one or more of the parameters + * parameters with syntax errors are unchanged, others without + * syntax errors are updated in vals[] */ int psm3_parse_str_tuples(const char *string, int ntup, int *vals) { @@ -804,17 +1152,14 @@ int psm3_parse_str_tuples(const char *string, int ntup, int *vals) while (*e && *e != ':') e++; if (e > b) { /* something to parse */ - char *ep; int len = e - b; - long int l; + long long temp; strncpy(buf, b, len); buf[len] = '\0'; - l = strtol(buf, &ep, 0); - if (ep != buf) { /* successful conversion */ - vals[tup_i] = (int)l; - } else { + if (convert_str_signed(buf, &temp, INT_MIN, INT_MAX)) ret = -2; - } + else + vals[tup_i] = (int)temp; } if (*e == ':') e++; /* skip delimiter */ diff --git a/prov/psm3/psm3/utils/utils_mallopt.c b/prov/psm3/psm3/utils/utils_mallopt.c index a821281cb00..830c1bbd22b 100644 --- a/prov/psm3/psm3/utils/utils_mallopt.c +++ b/prov/psm3/psm3/utils/utils_mallopt.c @@ -82,7 +82,8 @@ static void init_mallopt_disable_mmap(void) { // since this occurs before psm3_init, we can't use psm3_env_get // default to NO (0) - if (psm3_parse_str_yesno(getenv("PSM3_DISABLE_MMAP_MALLOC")) > 0) { + int disable = 0; + if (!psm3_parse_str_yesno(getenv("PSM3_DISABLE_MMAP_MALLOC"), &disable) && disable) { if (mallopt(M_MMAP_MAX, 0) && mallopt(M_TRIM_THRESHOLD, -1)) { psm3_malloc_no_mmap = 1; } diff --git a/prov/psm3/src/psmx3_attr.c b/prov/psm3/src/psmx3_attr.c index 7c5a61a8031..fc3663f6133 100644 --- a/prov/psm3/src/psmx3_attr.c +++ b/prov/psm3/src/psmx3_attr.c @@ -272,17 +272,87 @@ static uint64_t psmx3_check_fi_hmem_cap(void) { int gpu = 0;
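+	/* note: FI_HMEM is advertised only when a GPU runtime is enabled via
+	 * PSM3_CUDA / PSM3_ONEAPI_ZE or PSM3_GPUDIRECT and HMEM peer-to-peer
+	 * is not disabled; the envs are read directly here because psm3 has
+	 * not necessarily parsed them yet */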
unsigned int gpudirect = 0; #ifdef PSM_CUDA - (void)psm3_parse_str_int(psm3_env_get("PSM3_CUDA"), &gpu); + (void)psm3_parse_str_int(psm3_env_get("PSM3_CUDA"), &gpu, INT_MIN, INT_MAX); #else /* PSM_ONEAPI */ - (void)psm3_parse_str_int(psm3_env_get("PSM3_ONEAPI_ZE"), &gpu); + (void)psm3_parse_str_int(psm3_env_get("PSM3_ONEAPI_ZE"), &gpu, + INT_MIN, INT_MAX); #endif - (void)psm3_parse_str_uint(psm3_env_get("PSM3_GPUDIRECT"), &gpudirect); + (void)psm3_parse_str_uint(psm3_env_get("PSM3_GPUDIRECT"), &gpudirect, + 0, UINT_MAX); if ((gpu || gpudirect) && !ofi_hmem_p2p_disabled()) return FI_HMEM; #endif /* PSM_CUDA || PSM_ONEAPI */ return 0; } +static uint64_t get_max_inject_size(void) { + unsigned int thresh_rv; + unsigned int temp; + int have_shm = 1; + int have_nic = 1; + int devid_enabled[PTL_MAX_INIT]; + + // check PSM3_DEVICES to determine if PSM3 shm enabled + if ((PSM2_OK == psm3_parse_devices(devid_enabled))) { + have_shm = psm3_device_is_enabled(devid_enabled, PTL_DEVID_AMSH); + have_nic = psm3_device_is_enabled(devid_enabled, PTL_DEVID_IPS); + } + + // figure out the smallest rendezvous threshold (GPU vs CPU ips vs shm) + // If middleware above is not using PSM3 for shm but leaves it in + // PSM3_DEVICES, this could be more restrictive than necessary, + // but it's safe. Note that PSM3_DEVICES can't be set per EP open. + // Also not yet sure which HAL will be selected so must pick most + // conservative ips (NIC) config + thresh_rv = 65536; // default in odd case of PSM3_DEVICES=self + + if (have_nic) { + temp = PSM_MQ_NIC_RNDV_THRESH; + psm3_parse_str_uint(psm3_env_get("PSM3_MQ_RNDV_NIC_THRESH"), &temp, + 0, UINT_MAX); + if (thresh_rv > temp) + thresh_rv = temp; + } + + if (have_shm) { + temp = MQ_SHM_THRESH_RNDV; + psm3_parse_str_uint(psm3_env_get("PSM3_MQ_RNDV_SHM_THRESH"), &temp, + 0, UINT_MAX); + if (thresh_rv > temp) + thresh_rv = temp; + } + +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + if (psmx3_prov_info.caps & FI_HMEM) { + if (have_nic) { + // GPU ips rendezvous threshold + // sockets HAL avoids rendezvous, so this may be overly restrictive + temp = GPU_THRESH_RNDV; + // PSM3_CUDA_THRESH_RNDV deprecated, use PSM3_GPU_THRESH_RNDV if set + psm3_parse_str_uint(psm3_env_get("PSM3_CUDA_THRESH_RNDV"), &temp, + 0, UINT_MAX); + psm3_parse_str_uint(psm3_env_get("PSM3_GPU_THRESH_RNDV"), &temp, + 0, UINT_MAX); + if (thresh_rv > temp) + thresh_rv = temp; + } + + if (have_shm) { + // GPU shm rendezvous threshold + temp = MQ_SHM_GPU_THRESH_RNDV; + psm3_parse_str_uint(psm3_env_get("PSM3_MQ_RNDV_SHM_GPU_THRESH"), &temp, + 0, UINT_MAX); + if (thresh_rv > temp) + thresh_rv = temp; + } + } +#endif + + // messages <= thresh_rv guaranteed to use eager, so thresh_rv + // is the max allowed inject_size.
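+	// Worked example (hypothetical values): with PSM3_MQ_RNDV_NIC_THRESH=8192
+	// and a shm rendezvous threshold of 16000, thresh_rv becomes 8192 and
+	// psmx3_update_prov_info() below caps inject_size at 8192.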
+ return thresh_rv; +} + /* * Possible provider variations: * @@ -496,6 +566,8 @@ void psmx3_update_prov_info(struct fi_info *info, struct psmx3_ep_name *dest_addr) { struct fi_info *p; + unsigned int max_inject_size; + unsigned int inject_size; for (p = info; p; p = p->next) { psmx3_dup_addr(p->addr_format, src_addr, @@ -506,6 +578,15 @@ void psmx3_update_prov_info(struct fi_info *info, psmx3_expand_default_unit(info); + max_inject_size = get_max_inject_size(); + if (psmx3_env.inject_size > max_inject_size) + inject_size = max_inject_size; + else + inject_size = psmx3_env.inject_size; + PSMX3_INFO(&psmx3_prov, FI_LOG_CORE, + "Using inject_size=%u based on FI_PSM3_INJECT_SIZE=%u with max %u\n", + inject_size, psmx3_env.inject_size, max_inject_size); + for (p = info; p; p = p->next) { int unit = ((struct psmx3_ep_name *)p->src_addr)->unit; int port = ((struct psmx3_ep_name *)p->src_addr)->port; @@ -539,7 +620,7 @@ void psmx3_update_prov_info(struct fi_info *info, int addr_index = psmx3_domain_info.addr_index[unit]; args[0].unit = unit_id; - args[1].port = port; + args[1].port = port == PSMX3_DEFAULT_PORT ? 1 : port; args[2].addr_index = addr_index; args[3].length = sizeof(unit_name); @@ -571,7 +652,7 @@ void psmx3_update_prov_info(struct fi_info *info, int addr_index = psmx3_domain_info.addr_index[unit]; args[0].unit = unit_id; - args[1].port = port; + args[1].port = port == PSMX3_DEFAULT_PORT ? 1 : port; args[2].addr_index = addr_index; args[3].length = sizeof(fabric_name); @@ -591,7 +672,7 @@ void psmx3_update_prov_info(struct fi_info *info, } } - p->tx_attr->inject_size = psmx3_env.inject_size; + p->tx_attr->inject_size = inject_size; } } diff --git a/prov/psm3/src/psmx3_cq.c b/prov/psm3/src/psmx3_cq.c index f1a10349dce..b072eb230df 100644 --- a/prov/psm3/src/psmx3_cq.c +++ b/prov/psm3/src/psmx3_cq.c @@ -622,8 +622,10 @@ psmx3_mq_status_copy(struct psm2_mq_req_user *req, void *status_array, int entry data = PSMX3_GET_CQDATA(PSMX3_STATUS_TAG(req)); if (PSMX3_HAS_IMM(PSMX3_GET_FLAGS(PSMX3_STATUS_TAG(req)))) flags |= FI_REMOTE_CQ_DATA; - if (multi_recv_req->offset + PSMX3_STATUS_RCVLEN(req) + - multi_recv_req->min_buf_size > multi_recv_req->len) + len_remaining = multi_recv_req->len - multi_recv_req->offset - + PSMX3_STATUS_RCVLEN(req); + if (len_remaining < multi_recv_req->min_buf_size || + len_remaining == 0) flags |= FI_MULTI_RECV; /* buffer used up */ err = psmx3_cq_rx_complete( status_data->poll_cq, ep->recv_cq, ep->av, @@ -638,7 +640,8 @@ psmx3_mq_status_copy(struct psm2_mq_req_user *req, void *status_array, int entry /* repost multi-recv buffer */ multi_recv_req->offset += PSMX3_STATUS_RCVLEN(req); len_remaining = multi_recv_req->len - multi_recv_req->offset; - if (len_remaining >= multi_recv_req->min_buf_size) { + if (len_remaining >= multi_recv_req->min_buf_size && + len_remaining > 0) { if (len_remaining > PSMX3_MAX_MSG_SIZE) len_remaining = PSMX3_MAX_MSG_SIZE; err = psm3_mq_irecv2(ep->rx->psm2_mq, @@ -786,7 +789,8 @@ psmx3_mq_status_copy(struct psm2_mq_req_user *req, void *status_array, int entry multi_recv_req = PSMX3_CTXT_USER(fi_context); multi_recv_req->offset += PSMX3_STATUS_RCVLEN(req); len_remaining = multi_recv_req->len - multi_recv_req->offset; - if (len_remaining >= multi_recv_req->min_buf_size) { + if (len_remaining >= multi_recv_req->min_buf_size && + len_remaining > 0) { if (len_remaining > PSMX3_MAX_MSG_SIZE) len_remaining = PSMX3_MAX_MSG_SIZE; err = psm3_mq_irecv2(ep->rx->psm2_mq, diff --git a/prov/psm3/src/psmx3_init.c b/prov/psm3/src/psmx3_init.c index 
c263446fd64..c20035a84de 100644 --- a/prov/psm3/src/psmx3_init.c +++ b/prov/psm3/src/psmx3_init.c @@ -320,11 +320,11 @@ static int psmx3_check_multi_ep_cap(void) { uint64_t caps = PSM2_MULTI_EP_CAP; char *s = NULL; + int val = 1; /* if parses as empty (-1) or invalid (-2), use default of 1 */ s = psm3_env_get("PSM3_MULTI_EP"); - /* if parses as empty or invalid (-1), use default of 1 */ - /* psm3 below us will provide warning as needed when it parses it */ - if (psm3_get_capability_mask(caps) == caps && 0 != psm3_parse_str_yesno(s)) + /* psm3 below us will provide warning as needed when it parses it again */ + if (psm3_get_capability_mask(caps) == caps && (psm3_parse_str_yesno(s, &val) || val)) psmx3_env.multi_ep = 1; else psmx3_env.multi_ep = 0; @@ -438,7 +438,7 @@ static int psmx3_update_hfi_info(void) // if parses as empty or invalid (-1), use default of 0 */ // PSM3 below us will provide warning as needed when it parses it s = psm3_env_get("PSM3_MULTIRAIL"); - (void)psm3_parse_str_int(s, &multirail); + (void)psm3_parse_str_int(s, &multirail, INT_MIN, INT_MAX); psmx3_domain_info.num_reported_units = 0; psmx3_domain_info.num_active_units = 0; @@ -699,6 +699,7 @@ static void psmx3_update_nic_info(struct fi_info *info) } } +static int init_calls; static int psmx3_getinfo(uint32_t api_version, const char *node, const char *service, uint64_t flags, const struct fi_info *hints, struct fi_info **info) @@ -740,6 +741,8 @@ static int psmx3_getinfo(uint32_t api_version, const char *node, goto err_out; } + init_calls += 1; + /* when available, default domain and fabric names are a superset * of all individual names, so we can do a substr search as a 1st level * filter @@ -872,6 +875,9 @@ static int psmx3_getinfo(uint32_t api_version, const char *node, *info = prov_info; free(src_addr); free(dest_addr); + if (hints || init_calls >= 2) { + psm3_turn_off_init_cache(); + } return 0; err_out: diff --git a/prov/psm3/src/psmx3_msg.c b/prov/psm3/src/psmx3_msg.c index 3fe17a6bf73..519593def74 100644 --- a/prov/psm3/src/psmx3_msg.c +++ b/prov/psm3/src/psmx3_msg.c @@ -225,7 +225,7 @@ ssize_t psmx3_send_generic(struct fid_ep *ep, const void *buf, size_t len, return -FI_EMSGSIZE; err = psm3_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, - send_flag, &psm2_tag, buf, len); + send_flag|PSM2_MQ_FLAG_INJECT, &psm2_tag, buf, len); if (err != PSM2_OK) return psmx3_errno(err); @@ -374,7 +374,7 @@ ssize_t psmx3_sendv_generic(struct fid_ep *ep, const struct iovec *iov, } err = psm3_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, - send_flag, &psm2_tag, req->buf, len); + send_flag|PSM2_MQ_FLAG_INJECT, &psm2_tag, req->buf, len); free(req); diff --git a/prov/psm3/src/psmx3_tagged.c b/prov/psm3/src/psmx3_tagged.c index 17caec29533..41475dc211c 100644 --- a/prov/psm3/src/psmx3_tagged.c +++ b/prov/psm3/src/psmx3_tagged.c @@ -551,7 +551,7 @@ ssize_t psmx3_tagged_send_generic(struct fid_ep *ep, return -FI_EMSGSIZE; err = psm3_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, - 0, &psm2_tag, buf, len); + PSM2_MQ_FLAG_INJECT, &psm2_tag, buf, len); if (err != PSM2_OK) return psmx3_errno(err); @@ -764,8 +764,8 @@ psmx3_tagged_inject_specialized(struct fid_ep *ep, const void *buf, else PSMX3_SET_TAG(psm2_tag, tag, ep_priv->sep_id, PSMX3_TYPE_TAGGED); - err = psm3_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, 0, - &psm2_tag, buf, len); + err = psm3_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, + PSM2_MQ_FLAG_INJECT, &psm2_tag, buf, len); if (err != PSM2_OK) return psmx3_errno(err); @@ -915,7 +915,7 @@ ssize_t psmx3_tagged_sendv_generic(struct fid_ep 
*ep, } err = psm3_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, - send_flag, &psm2_tag, req->buf, len); + send_flag|PSM2_MQ_FLAG_INJECT, &psm2_tag, req->buf, len); free(req);
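A minimal sketch of calling the new psm3_getenv_range() added above (not part of the patch): the variable name PSM3_EXAMPLE_DEPTH, its default and bounds are hypothetical, and PSMI_ENVVAR_LEVEL_USER is assumed to be one of the existing level flags.

	union psmi_envvar_val depth;
	(void)psm3_getenv_range("PSM3_EXAMPLE_DEPTH",
		"Example queue depth",			/* descr */
		"Deeper queues use more memory",	/* help, new in this API */
		PSMI_ENVVAR_LEVEL_USER,			/* no NOMIN/NOMAX flags, so range is enforced */
		PSMI_ENVVAR_TYPE_UINT,
		(union psmi_envvar_val)256,		/* default when unset or invalid */
		(union psmi_envvar_val)1,		/* min; 'min'/'minimum' accepted unless NOABBREV */
		(union psmi_envvar_val)4096,		/* max; 'max'/'maximum' accepted unless NOABBREV */
		NULL, NULL,				/* optional check callback and its context pointer */
		&depth);
	/* returns 0 for valid input, 1 when defaulted, -1 for invalid input;
	 * with PSMI_ENVVAR_FLAG_FATAL in level_flags, invalid input aborts instead */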
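Similarly, a hypothetical use of the reworked tuple parser, showing that unsupplied fields keep their prior contents:

	int vals[3] = { 4, 8, 16 };	/* defaults for a 3-field tuple */
	/* "32::64" updates vals[0] and vals[2], skips the empty middle field
	 * and returns 0, leaving vals = { 32, 8, 64 }; a malformed field such
	 * as "32:x:64" returns -2 while still storing the valid fields */
	(void)psm3_parse_str_tuples("32::64", 3, vals);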